Sorry I haven't been tuned in for the past couple of days, since I've just switched from Windows to Linux, but you might like this link: http://syntax.cwarn23.info/PHP:_Making_a_search_engine . Also, there is an update to my previous code, which is as follows:
<form method="post">Scan site: <input type="text" name="site" value="http://" style="width:300px">
<input value="Scan" type="submit"></form>
<?php
// Full open tag: the short "<?" form is only parsed when short_open_tag is
// enabled, otherwise the whole script would be sent to the browser as text.
set_time_limit(0); // a crawl can take arbitrarily long
// Run only when the form submitted a single http(s) URL with a host part.
// The value is later handed to file_get_contents(), which would otherwise
// also open local paths and other stream wrappers chosen by the visitor.
if (isset($_POST['site']) && is_string($_POST['site'])
    && preg_match('/^https?:\/\/[^\/\s]/i', $_POST['site'])) {
/* Formats Allowed: URL endings that are treated as crawlable pages.
   Stored as a key => true map so membership is a plain isset() lookup. */
$formats=array_fill_keys(array(
    // static and server-side page extensions
    'html','htm','xhtml','xml','mhtml','xht',
    'mht','asp','aspx','adp','bml','cfm','cgi',
    'ihtml','jsp','las','lasso','lassoapp','pl','php',
    'php1','php2','php3','php4','php5','php6','phtml',
    'shtml','search','query','forum','blog',
    // numeric endings, plain and zero-padded
    '1','2','3','4','5','6','7','8','9','10','11',
    '12','13','14','15','16','17','18','19',
    '20','01','02','03','04','05','06','07',
    '08','09',
    // generic endpoint names
    'go','page','file'
), true);
/**
 * Cuts a URL down to its leading "scheme://host" part.
 *
 * The optional http:// or https:// prefix and everything up to (not
 * including) the next "/" are kept; the rest of that line is dropped.
 * Input that starts with "/" or is empty does not match and is returned
 * unchanged.
 *
 * @param string $ddomain URL or host-like string.
 * @return string The scheme-and-host prefix, or the input when nothing matched.
 */
function domain ($ddomain) {
    $pattern = '/^((http(s)?:\/\/)?([^\/]+))(.*)/';
    $hostpart = preg_replace($pattern, '$1', $ddomain);
    return $hostpart;
}
/**
 * Tells whether a URL answers a body-less (HEAD-style) request with HTTP 200.
 *
 * The numeric status code reported by cURL is used instead of searching the
 * raw header text for "200 OK": HTTP/2 status lines carry no reason phrase,
 * so the text search reported every HTTP/2 page as missing.
 *
 * Redirects are not followed, so a 3xx answer counts as "does not exist".
 *
 * @param string $durl URL to probe.
 * @return bool true only when the response status is exactly 200; false for
 *              any other status, a transfer error, or a failed curl_init().
 */
function url_exists($durl)
{
    $handle = curl_init($durl);
    if (false === $handle)
    {
        return false;
    }
    curl_setopt($handle, CURLOPT_NOBODY, true);         // headers only, no body download
    curl_setopt($handle, CURLOPT_FAILONERROR, true);    // HTTP >= 400 makes curl_exec() fail
    curl_setopt($handle, CURLOPT_RETURNTRANSFER, true); // keep the response out of the page output
    curl_setopt($handle, CURLOPT_USERAGENT,
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.15) Gecko/20080623 Firefox/2.0.0.15');
    $response = curl_exec($handle);
    // The status must be read before the handle is released.
    $status = (false === $response) ? 0 : (int) curl_getinfo($handle, CURLINFO_HTTP_CODE);
    if (PHP_VERSION_ID < 80000) {
        // Handles are objects from PHP 8 on and are freed automatically;
        // older versions need the explicit close to release the resource.
        curl_close($handle);
    }
    return 200 === $status;
}
$f_data=''; // shared page buffer: getlinks() stores the fetched page here, generate() reads it
//below function will only get links within own domain and not links outside the site.
/**
 * Turns one href/src value into an absolute URL.
 *
 * @param string $href     Raw attribute value taken from the page.
 * @param string $root     "scheme://host" of the page being scanned.
 * @param string $pagepath Path of that page (no query string, no fragment).
 * @param string $basedir  Directory part of $pagepath without trailing "/".
 * @return string|null Absolute URL, or null when the value is not something
 *                     that can be crawled (empty, fragment-only, mailto:,
 *                     javascript:, other non-http schemes).
 */
function getlinks_resolve($href, $root, $pagepath, $basedir) {
    // Browsers ignore tabs/newlines inside a URL and whitespace around it.
    $href = trim(preg_replace('/[\r\n\t]+/', '', $href));
    // The fragment is never sent to the server, so it does not name a new page.
    $hash = strpos($href, '#');
    if ($hash !== false) {
        $href = (string) substr($href, 0, $hash);
    }
    if ($href === '') {
        return null;
    }
    // Already absolute (a bare "www." value is treated as absolute as well);
    // the caller's same-domain check decides whether it is kept.
    if (preg_match('/^https?:\/\//i', $href) || strncasecmp($href, 'www.', 4) === 0) {
        return $href;
    }
    // Any other scheme (mailto:, javascript:, tel:, ftp:, data: ...).
    if (preg_match('/^[a-z][a-z0-9+.\-]*:/i', $href)) {
        return null;
    }
    // Protocol-relative link: reuse the scheme of the current site.
    if (substr($href, 0, 2) === '//') {
        return preg_match('/^(https?:)\/\//', $root, $scheme) ? $scheme[1] . $href : null;
    }
    // Query-only link: same page, different query string.
    if ($href[0] === '?') {
        return $root . $pagepath . $href;
    }
    // Keep the query string aside so "." and ".." inside it are left alone.
    $query = '';
    $mark = strpos($href, '?');
    if ($mark !== false) {
        $query = substr($href, $mark);
        $href = substr($href, 0, $mark);
    }
    // "/x" is relative to the site root, everything else to the page's directory.
    $path = ($href[0] === '/') ? $href : $basedir . '/' . $href;
    // Collapse "." and ".." segments; ".." never climbs above the site root.
    $kept = array();
    $segments = explode('/', (string) substr($path, 1));
    $last = count($segments) - 1;
    foreach ($segments as $i => $segment) {
        if ($segment === '..') {
            array_pop($kept);
        }
        if ($segment === '.' || $segment === '..') {
            if ($i === $last) {
                $kept[] = ''; // a trailing "." / ".." names a directory: keep the final "/"
            }
            continue;
        }
        $kept[] = $segment;
    }
    return $root . '/' . implode('/', $kept) . $query;
}

/**
 * Downloads a page and returns the links in it that stay on the same site.
 *
 * The raw page is also stored in the global $f_data so generate() can use it.
 * Every href="..." / src="..." value is resolved against the page URL and kept
 * when it has the same domain() as the page and its path either ends in "/",
 * has no "." in its last segment, or ends in an extension listed in $formats.
 * Duplicates are not removed here.
 *
 * Fixes over the previous version: a host-only page URL such as
 * "http://example.com" no longer loses all of its relative links, "/x" links
 * are resolved against the site root rather than the current directory, and
 * fragment-only / javascript: / mailto: values are no longer returned.
 *
 * @param string $generateurlf URL of the page to download and scan.
 * @return array List of absolute URLs; empty when the page could not be read.
 */
function getlinks($generateurlf) {
    global $formats;
    global $f_data;

    $f_data = file_get_contents($generateurlf);
    if ($f_data === false) {
        // The download failed (PHP has already raised a warning): nothing to scan.
        return array();
    }
    preg_match_all('/(href|src)\=(\"|\')([^\"\'\>]+)/i', $f_data, $media);

    // Split the page URL once: site root, page path, and the page's directory.
    $root = domain($generateurlf);
    $pagepath = (strncmp($generateurlf, $root, strlen($root)) === 0)
        ? (string) substr($generateurlf, strlen($root))
        : '';
    $pagepath = (string) preg_replace('/[?#].*$/s', '', $pagepath);
    $lastslash = strrpos($pagepath, '/');
    $basedir = ($lastslash === false) ? '' : (string) substr($pagepath, 0, $lastslash);

    $datab = array();
    foreach ($media[3] as $dfile) {
        $target = getlinks_resolve($dfile, $root, $pagepath, $basedir);
        if ($target === null || domain($target) !== $root) {
            continue; // not crawlable, or it leaves the site
        }
        // Judge the path alone so a query string cannot hide or fake the extension.
        $path = (string) substr($target, strlen($root));
        $path = (string) preg_replace('/\?.*$/s', '', $path);
        $name = basename($path);
        $dot = strrpos($name, '.');
        $format = ($dot === false) ? '' : strtolower((string) substr($name, $dot + 1));
        if (substr($path, -1) === '/' || $dot === false || isset($formats[$format])) {
            $datab[] = $target;
        }
    }
    return $datab;
}
//=============================================
/* Modify only code between these two lines and $formats variable above. */
/**
 * Hook called once for every crawled page that exists.
 *
 * Prints the page URL and then discards the page contents that getlinks()
 * left in the global $f_data.
 *
 * @param string $url URL of the page that was just crawled.
 * @return void
 */
function generate($url) {
    // The URL comes from the form or from a remote page, so escape it before
    // printing it into HTML. The original appended an empty string here,
    // presumably a line-break tag lost when the code was posted; one URL per
    // line keeps the output readable.
    echo htmlspecialchars($url, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') . "<br />\n";
    global $f_data; //Data of file contents
    //do something with webpage $f_data.
    // unset($f_data) would only drop this function's alias; remove the global
    // itself so the page contents are actually released.
    unset($GLOBALS['f_data']);
}
//=============================================
// Below is what actually process the search engine
// $sites is the crawl queue (it grows while it is being walked);
// $seen remembers every URL that was ever queued.
$sites=array();
$sites[]=stripslashes($_POST['site']);
// Mark the start URL as seen too, otherwise the first link back to it
// would queue and crawl it a second time.
$seen=array($sites[0]=>true);
for ($i=0;isset($sites[$i]);$i++) {
    // Fetch, probe and report the same string: queued URLs did not come from
    // the request, so they are not run through stripslashes() again.
    foreach (getlinks($sites[$i]) AS $val) {
        if (!isset($seen[$val])) {
            $seen[$val]=true;
            $sites[]=$val;
        }
    }
    unset($val);
    // Links are collected even from pages that do not answer 200 (for
    // example redirects); only pages that do are passed to generate().
    if (url_exists($sites[$i])) {
        generate($sites[$i]);
        flush(); // push each result to the browser as soon as it is known
    }
}
}
?>