0

Hello master...
I want to build a web crawler that can detect all files which extension is :
.pdf
.ppt
.xls
.doc
First, insert the domain of a website to search files, than the result will be like this :
domain : www.examle.com
Files :
1. xxx.doc
2. xxx2.doc
3. xxx3.ppt
4. xxx4.pdf
5. xxx5.xls
Found 5 files in www.example.com

Please tell me how to do something like that. Please.

2
Contributors
1
Reply
2
Views
8 Years
Discussion Span
Last Post by cwarn23
0

Try this:

<form method="post">Scan site: <input type="text" name="site" value="http://" style="width:300px">
<input value="Scan" type="submit"></form>
<?
set_time_limit (0);
if (isset($_POST['site']) && !empty($_POST['site'])) {
/* Formats Allowed */
$formats=array('html'=>true,'htm'=>true,'xhtml'=>true,'xml'=>true,'mhtml'=>true,'xht'=>true,
'mht'=>true,'asp'=>true,'aspx'=>true,'adp'=>true,'bml'=>true,'cfm'=>true,'cgi'=>true,
'ihtml'=>true,'jsp'=>true,'las'=>true,'lasso'=>true,'lassoapp'=>true,'pl'=>true,'php'=>true,
'php1'=>true,'php2'=>true,'php3'=>true,'php4'=>true,'php5'=>true,'php6'=>true,'phtml'=>true,
'shtml'=>true,'search'=>true,'query'=>true,'forum'=>true,'blog'=>true,'1'=>true,'2'=>true,
'3'=>true,'4'=>true,'5'=>true,'6'=>true,'7'=>true,'8'=>true,'9'=>true,'10'=>true,'11'=>true,
'12'=>true,'13'=>true,'14'=>true,'15'=>true,'16'=>true,'17'=>true,'18'=>true,'19'=>true,
'20'=>true,'01'=>true,'02'=>true,'03'=>true,'04'=>true,'05'=>true,'06'=>true,'07'=>true,
'08'=>true,'09'=>true,'go'=>true,'page'=>true,'file'=>true);
 
function domain ($ddomain) {
return preg_replace('/^((http(s)?:\/\/)?([^\/]+))(.*)/','$1',$ddomain);
}
 
function url_exists($durl)
		{
		// Version 4.x supported
		$handle   = curl_init($durl);
		if (false === $handle)
			{
			return false;
			}
		curl_setopt($handle, CURLOPT_HEADER, true);
		curl_setopt($handle, CURLOPT_FAILONERROR, true);  // this works
		curl_setopt($handle, CURLOPT_HTTPHEADER, 
Array("User-Agent: Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.15) Gecko/20080623 Firefox/2.0.0.15") );
		curl_setopt($handle, CURLOPT_NOBODY, true);
		curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);
		$connectable = curl_exec($handle);
		curl_close($handle);  
        if (stripos(substr_replace($connectable,'',30),'200 OK')) {
            return true;
            } else {
            return false;
            }
		}
 $fdata='';
//below function will only get links within own domain and not links outside the site.
function getlinks($generateurlf) {
    global $formats;
    global $f_data;
    $f_data=file_get_contents($generateurlf);
    $datac=$f_data;
    preg_match_all('/(href|src)\=(\"|\')([^\"\'\>]+)/i',$datac,$media);
    unset($datac);
    $datac=$media[3];
    unset($media);
    global $global_links;
    $global_links=array();
    $str_start=array('http'=>true,'www.'=>true);
    foreach($datac AS $dfile) {
        $generateurle=$generateurlf;
		$format=strtolower(preg_replace('/(.*)[.]([^.\?]+)(\?(.*))?/','$2',basename($generateurle.$dfile)));
        if (!isset($str_start[substr_replace($dfile,'',4)])) {
            if (substr_replace($generateurle,'',0, -1)!=='/') {
                $generateurle=preg_replace('/(.*)\/[^\/]+/is', "$1", $generateurle);
                } else {
                $generateurle=substr_replace($generateurle,'',-1);
                }
 
            if (substr_replace($dfile,'',1)=='/') {
                if (domain($generateurle)==domain($generateurle.$dfile)) {
                    if (isset($formats[$format]) 
                        || substr($generateurle.$dfile,-1)=='/' || substr_count(basename($generateurle.$dfile),'.')==0) {
                        $global_links[]=$generateurle.$dfile;
                        }
                    }
                } else if (substr($dfile,0,2)=='./') {
                $dfile=substr($dfile,2);
                if (isset($formats[$format])) {$global_links[]=$generateurle.'/'.$dfile;}
                } else if (substr_replace($dfile,'',1)=='.') {
                while (preg_match('/\.\.\/(.*)/i', $dfile)) {
                $dfile=substr_replace($dfile,'',0,3);
                $generateurle=preg_replace('/(.*)\/[^\/]+/i', "$1", $generateurle);
                }
                if (domain($generateurle)==domain($generateurle.'/'.$dfile)) {
                    if (isset($formats[$format]) || substr($generateurle.'/'.$dfile,-1)=='/' 
                        || substr_count(basename($generateurle.'/'.$dfile),'.')==0) {
                        $global_links[]=$generateurle.'/'.$dfile;
                        }
                    }
                } else {
                if (domain($generateurle)==domain($generateurle.'/'.$dfile)) {
                    if (isset($formats[$format]) || substr($generateurle.'/'.$dfile,-1)=='/' 
                        || substr_count(basename($generateurle.'/'.$dfile),'.')==0) {
                        $global_links[]=$generateurle.'/'.$dfile;
                        }
                    }
                }
            } else {
            if (domain($generateurle)==domain($dfile)) {
                if (isset($formats[$format]) || substr($dfile,-1)=='/' || substr_count(basename($dfile),'.')==0) {
                    $global_links[]=$dfile;
                    }
                }
            }
		unset($format);
        }
    unset($datac);
    unset($dfile);
    return $global_links;
    }
 
 
 
 
 
//=============================================
/* Modify only code between these two lines and $formats variable above. */
 
function generate($url) {
    global $f_data; //Data of file contents
    global $global_links;
    global $linksarray;
    $extension=array('pdf'=>true,'ppt'=>true,'xls'=>true,'doc'=>true,'php'=>true);
    foreach ($global_links AS $link) {
        if (!isset($linksarray[$link])) {
            if (preg_match('#^.*\.(pdf|ppt|xls|doc)\??(.*)?$#',$link)) {
                $linksarray[$link]=true;
                echo $link."<br>\n";
                flush();
                }
            }
        }
    unset($f_data,$global_links);
    }
 
 
//=============================================
// Below is what actually process the search engine
$sites=array();
$sites[]=stripslashes($_POST['site']);
for ($i=0;isset($sites[$i]);$i++) {
    foreach (getlinks(stripslashes($sites[$i])) AS $val) {
        if (!isset($sites[$val])) {
            $sites[]=$val;
            $sites[$val]=true;
            }
        } unset($val);
    if (url_exists($sites[$i])) {
        generate($sites[$i]);
        flush();
        }
    }
}
?>
This topic has been dead for over six months. Start a new discussion instead.
Have something to contribute to this discussion? Please be thoughtful, detailed and courteous, and be sure to adhere to our posting rules.