I have just completed a bot that indexes the pages linked from RSS feeds. Below is the script I created for the job; I will alter it to scan 15 RSS feeds. But does anybody know how often the RSS feeds update? Because of my monthly bandwidth limit, the shortest interval at which I could run the bot is about 1 hour, so please advise on what interval the bot should scan at. You may also be able to see the algorithm I have created in the code below. Are there any suggestions on how to improve the algorithm for later sentence retrieval?
Code is:
<?php
// Bootstrap: raise the execution time limit and open the MySQL connection that
// every later mysql_query() call in this script uses implicitly.
// The full "<?php" tag is used because the short "<?" form only works when the
// short_open_tag ini setting is enabled.

// Allow up to 30 minutes (1800 s) for one run.
set_time_limit(1800);

// NOTE(review): the mysql_* extension was removed in PHP 7; this script needs
// PHP 5 or a port to mysqli/PDO.
// Stop immediately when the database is unavailable: without it nothing can be
// stored, yet the crawl below would still download every page.
if (mysql_connect('localhost', 'root', '') === false) {
    die('Could not connect to the MySQL server.');
}
if (!mysql_select_db('botsearch')) {
    die('Could not select the "botsearch" database.');
}
/**
 * Cut a URL down to its leading "scheme://host" part.
 *
 * The pattern captures an optional http:// or https:// prefix together with
 * everything up to the first following slash, and replaces the match with
 * just that capture. Input the pattern does not match is returned unchanged.
 *
 * @param string $ddomain URL (or bare host/path) to shorten.
 * @return string The scheme-and-host prefix of $ddomain.
 */
function domain ($ddomain) {
    $pattern = '/^((http(s)?:\/\/)?([^\/]+))(.*)/';
    $host_part = preg_replace($pattern, '$1', $ddomain);

    return $host_part;
}
/**
 * Probe a URL with a body-less HTTP request and report whether it is usable.
 *
 * The decision is based on the numeric status code reported by cURL rather
 * than on text in the status line, so it does not depend on the reason phrase
 * ("OK", "Found", "Moved Permanently", ...) or on the HTTP version.
 *
 * @param string $durl URL to probe.
 * @return bool True for status 200 or a redirect (301, 302, 303, 307, 308);
 *              false when the handle cannot be created, the transfer fails,
 *              or any other status is returned.
 */
function url_exists($durl)
{
    $handle = curl_init($durl);
    if (false === $handle) {
        return false;
    }

    curl_setopt($handle, CURLOPT_NOBODY, true);         // headers only, no page body
    curl_setopt($handle, CURLOPT_HEADER, true);
    curl_setopt($handle, CURLOPT_RETURNTRANSFER, true); // keep the response out of the output
    curl_setopt($handle, CURLOPT_FAILONERROR, true);    // make curl_exec() fail on HTTP errors (>= 400)
    curl_setopt($handle, CURLOPT_HTTPHEADER,
        array("User-Agent: Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.15) Gecko/20080623 Firefox/2.0.0.15"));

    $response = curl_exec($handle);
    $status = 0;
    if (false !== $response) {
        $status = (int) curl_getinfo($handle, CURLINFO_HTTP_CODE);
    }
    curl_close($handle);

    // Statuses treated as "the page is there": success plus the redirect codes.
    $usable = array(200 => true, 301 => true, 302 => true, 303 => true, 307 => true, 308 => true);

    return isset($usable[$status]);
}
// Build the lookup table that generate() reads through `global $in_wordlist`:
// every `word` value from `noun_list` becomes an array key, so membership is a
// single isset() check.
// The table is initialised under the name that is actually filled and read;
// the script previously initialised an unrelated, unused variable instead.
$in_wordlist = array();
$r = mysql_query('SELECT `word` FROM `noun_list`');
if ($r === false) {
    // Without the word list every sentence would be stored with a score of 0.
    die('Could not load the noun list.');
}
while ($row = mysql_fetch_assoc($r)) {
    $in_wordlist[$row['word']] = true;
}
/**
 * Index one downloaded forum page.
 *
 * Splits every post on the page into sentences, scores each sentence that is
 * not yet stored by the number of its words found in the noun list, writes the
 * sentences and code boxes to `faq_data`, and finally writes one summary row
 * (ten most frequent nouns plus totals) to `faq_datareference`.
 *
 * The page HTML is not a parameter: it is read from the global $f_data, which
 * the caller must fill before every call and which is released at the end.
 * Nothing is indexed when `faq_datareference` already holds a row for $topicid.
 *
 * @param string $url     Page URL; only echoed in the progress line.
 * @param string $topicid Thread id, stored in the `id` column of both tables.
 * @return void
 */
function generate($url,$topicid) {
    global $f_data;      // HTML of the page to index
    global $in_wordlist; // word => true lookup built from `noun_list`

    $topic_sql = mysql_real_escape_string($topicid);

    $r = mysql_query('SELECT * FROM `faq_datareference` WHERE `id`="'.$topic_sql.'"');
    if ($r !== false && mysql_num_rows($r) == 0) {
        $f_data = html_entity_decode($f_data);
        // Break up any literal double marker already present in the page so it
        // cannot be confused with the markers inserted further down.
        $f_data = str_replace('-|code|--|code|-','-|code|- -|code|-',$f_data);
        $f_data = preg_replace('#<blockquote[^>]+>(.*)</blockquote>#U','',$f_data);

        // NOTE(review): the empty search strings in the str_replace() calls below
        // (and the empty second argument of strip_tags() further down) look like
        // markup that was lost when the script was posted. As written they
        // replace nothing. Restore the intended strings before relying on them.
        $tmp = str_replace(array('',''),"\r",$f_data);
        preg_match_all('#<div class="codecontent">(.*)</div>#iUm',$tmp,$code);
        $codeid = 0; // index of the next captured code box in $code[1]
        unset($tmp);
        $f_data = str_replace(array('<br/>',''),'',$f_data);
        $f_data = str_replace('','',$f_data);
        $f_data = str_replace('','',$f_data); //in case of 4 in a row

        // Replace each code box by a marker, then collect the body of every post.
        $f_data = preg_replace('#<pre class="code">(.*)</pre>#iU','-|code|-',$f_data);
        preg_match_all('#<div id="post_message_[0-9]+">(.*)</div> </div> </div>#isU',$f_data,$a_data);

        $val = 0;                   // score of the sentence being built: +1 per noun, +2 per stored code box
        $val2 = 0;                  // part of $val that came from code boxes; kept out of $totalval
        $totalval = 0;              // noun hits over all newly stored sentences
        $abortval = 0;              // sentences found already stored; indexing stops at 4
        $totalcodeval = 0;          // code-box markers encountered
        $did_wordcheck = false;     // whether the most recent long sentence was new
        $nounverb_result = array(); // noun => number of occurrences
        $var = array('', 0);        // [0] last stored string, [1] its match score

        foreach ($a_data[1] AS $adata) {
            if ($abortval >= 4) {
                break;
            }
            $adata = strtolower(str_replace('...','.',strip_tags($adata,'')));
            // Split on sentence-ending punctuation. The previous pattern also had
            // an empty alternative, which matched between every character, so no
            // piece could ever exceed the 16-character minimum below.
            $d_data = preg_split('/[\?.!]/',$adata);

            foreach ($d_data AS $sentence) {
                if (strlen($sentence) > 16 && $abortval < 4) {
                    $sentence_sql = mysql_real_escape_string($sentence);
                    $re = mysql_query('SELECT * FROM `faq_data` WHERE `string`="'.$sentence_sql.'" AND `id`="'.$topic_sql.'"');
                    // NOTE(review): this flag is reset for every long sentence, so
                    // the summary row below depends only on the last one seen.
                    // Confirm that is intended.
                    $did_wordcheck = false;
                    if (mysql_num_rows($re) == 0) {
                        $did_wordcheck = true;
                        foreach (explode(' ',$sentence) AS $word) {
                            if (isset($in_wordlist[$word])) {
                                $val += 1;
                                if (!isset($nounverb_result[$word])) {
                                    $nounverb_result[$word] = 1;
                                } else {
                                    $nounverb_result[$word] += 1;
                                }
                            }
                        }
                        $var[0] = $sentence;
                        $var[1] = $val;
                        $totalval += ($val-$val2);
                        mysql_query('INSERT INTO `faq_data` SET `match`="'.$var[1].'", `string`="'.$sentence_sql.'", `id`="'.$topic_sql.'"');
                    } else {
                        $abortval += 1;
                    }
                    $val = 0;
                    $val2 = 0;
                } else if ($sentence === '-|code|--|code|-') {
                    // Fewer captured code boxes than markers: fall back to an empty string.
                    $snippet = isset($code[1][$codeid]) ? $code[1][$codeid] : '';
                    $snippet_sql = mysql_real_escape_string($snippet);
                    $re = mysql_query('SELECT * FROM `faq_data` WHERE `match`="-1" AND `string`="'.$snippet_sql.'" AND `id`="'.$topic_sql.'"');
                    if ($abortval == 0 || mysql_num_rows($re) == 0) {
                        // NOTE(review): $var[0] becomes the marker itself, so the
                        // UPDATE targets rows whose string is the marker; possibly
                        // the previously stored sentence was meant. Confirm.
                        $var[0] = $sentence;
                        $var[1] += 3;
                        mysql_query('UPDATE `faq_data` SET `match`="'.$var[1].'" WHERE `string`="'.mysql_real_escape_string($var[0]).'" AND `id`="'.$topic_sql.'"');
                        // Code boxes are stored with the sentinel score -1.
                        mysql_query('INSERT INTO `faq_data` SET `match`="-1", `string`="'.$snippet_sql.'", `id`="'.$topic_sql.'"');
                        $codeid += 1;
                        $val += 2;
                    }
                    $val2 = $val;
                    $totalcodeval += 1;
                }
            }
        }

        if ($did_wordcheck == true) {
            // Summary row: the ten most frequent nouns, padded with " " / -1.
            arsort($nounverb_result);
            $sql = 'INSERT INTO `faq_datareference` SET `id`="'.$topic_sql.'"';
            $i = 1;
            foreach ($nounverb_result AS $key=>$count) {
                if ($i > 10) {
                    break;
                }
                $sql .= ', `word'.$i.'`="'.mysql_real_escape_string($key).'", `word'.$i.'num`='.$count;
                $i++;
            }
            for (;$i<11;$i++) {
                $sql .= ', `word'.$i.'`=" ", `word'.$i.'num`=-1';
            }
            $sql .= ', `num_nounsverbs`='.$totalval.', `num_codeboxes`='.$totalcodeval;
            mysql_query($sql);
        }
    }

    // NOTE(review): $url is echoed without HTML escaping.
    echo '<b>Generated Url: </b>'.$url.'';

    // unset() on a name imported with `global` only removes the local alias;
    // going through $GLOBALS releases the page buffer itself.
    unset($GLOBALS['f_data']);
}
// Crawl: for every thread linked from each RSS feed that has no row in
// `faq_datareference` yet, download the thread and pass each of its pages to
// generate(), which reads the page HTML from the global $f_data.
$feed_urls = array(
    'http://www.daniweb.com/forums/rss143.xml',
    // add further feed URLs here
);

foreach ($feed_urls AS $feed_url) {
    $rss = file_get_contents($feed_url);
    if ($rss === false) {
        continue; // feed could not be downloaded; move on to the next one
    }

    // $topic_url[1][n] is the full thread URL, $topic_url[2][n] the value of its t= parameter.
    preg_match_all('#<link><!\[CDATA\[(.*[&?]t=([^\&]++).*)\]\]></link>#U',$rss,$topic_url);

    for ($id=0;isset($topic_url[2][$id]);$id++) {
        $thread_url = $topic_url[1][$id];
        $thread_id = $topic_url[2][$id];

        // Check the database before touching the network, so threads that are
        // already indexed cost no bandwidth.
        $rr = mysql_query('SELECT `id` FROM `faq_datareference` WHERE `id`="'.mysql_real_escape_string($thread_id).'"');
        if ($rr === false || mysql_num_rows($rr) != 0) {
            continue;
        }
        if (!url_exists($thread_url)) {
            continue;
        }

        $f_data = file_get_contents($thread_url);
        if ($f_data === false) {
            continue;
        }

        // Read the page count from the first page before generate() consumes
        // $f_data; threads without a pager have a single page.
        preg_match('#<td class="alt1" nowrap="nowrap"><span>Page 1 of ([0-9]+)</span></td>#i',$f_data,$pages_num);
        $page_count = isset($pages_num[1]) ? (int) $pages_num[1] : 1;

        generate($thread_url,$thread_id);
        flush();

        // NOTE(review): generate() does nothing once `faq_datareference` has a
        // row for the thread id, so these later pages may be skipped whenever
        // page 1 produced a summary row. Confirm the intended behaviour.
        for ($i=2;$i<=$page_count;$i++) {
            // Fetch the page that is being indexed, not the first page again.
            $page_url = 'http://www.daniweb.com/forums/showthread.php?t='.$thread_id.'&page='.$i;
            if (!url_exists($page_url)) {
                continue;
            }
            $f_data = file_get_contents($page_url);
            if ($f_data === false) {
                continue;
            }
            generate($page_url,$thread_id);
            flush();
        }
    }
}
?>
Thanks.