"; $result = mysqli_query($link,$SQL); //echo mysql_numrows($result)."
"; if (mysql_numrows($result)==0){ $Uppdatera="INSERT INTO url (url) VALUES ('".html_entity_decode($url)."')"; //echo $Uppdatera; echo "
".$url."
"; $result = mysqli_query($link,$Uppdatera) or die("Query failed-insert_url_in_db"); Return 1; } else { //echo "
else-------------------".$url."
"; return 0; } }

/**
 * Record a URL in the `leftoverurl` table unless the `url` table already
 * holds a row whose `url` column contains it (substring LIKE match).
 *
 * The entity-decoded URL is echoed when it is inserted; the raw URL is
 * echoed behind an "elseleftover" marker when a match already exists.
 *
 * @param mysqli $link Open mysqli connection.
 * @param string $url  URL as found by the crawler (may contain HTML entities).
 * @return int 1 when a new row was inserted, 0 when a match already existed.
 */
function insert_leftoverurls_in_db($link,$url){
    // The URL comes from crawled pages, so it is escaped before being embedded in SQL.
    $SQL="SELECT * FROM url WHERE url LIKE '%".mysqli_real_escape_string($link,$url)."%'";
    // Stop on a failed lookup instead of handing `false` to mysqli_num_rows().
    $result = mysqli_query($link,$SQL) or die("Query failed-insert_leftoverurls_in_db");

    if (mysqli_num_rows($result)==0){
        // Store the decoded form (e.g. "&amp;" becomes "&").
        $url=html_entity_decode($url);
        $Uppdatera="INSERT INTO leftoverurl (url,crawled_date) VALUES ('".mysqli_real_escape_string($link,$url)."','".date("Y-m-d")."')";
        echo "\n".$url."\n";
        $result = mysqli_query($link,$Uppdatera) or die("Query failed-insert_url_in_db");
        return 1;
    }

    echo "\nelseleftover-------------------".$url."\n";
    return 0;
}

/**
 * Fetch the first row of `url` (lowest url_id) whose crawled_date is 0 and
 * echo its URL followed by a separator line.
 *
 * @param mysqli $link Open mysqli connection.
 * @return array Index 0 holds the url_id and index 1 the url; both are null
 *               when no matching row exists.
 */
function get_url_from_db($link){
    // Only the first row is ever read, so ask the server for just one.
    $SQL="SELECT * FROM url WHERE crawled_date=0 ORDER BY url_id LIMIT 1";
    $result = mysqli_query($link,$SQL) or die("Query failed-get_url_fromDB");
    $RS=mysqli_fetch_array($result);

    $url_result=array(null,null);
    if ($RS){
        $url_result[0]=$RS["url_id"];
        $url_result[1]=$RS["url"];
    }

    echo $url_result[1]."\n_____________________";
    return $url_result;
}
 function get_urls_and_insert_in_db ($link,$urlstring,$baseurl) { $pattern=array(); //$pattern[0]= "()"; $pattern[0]="(href=(\"|)((http://[^'.>/]{2,30}\.[^'.>/]{1,30}\.[^'.>/]{2,30}\.[a-zA-Z]{2,4}|http://[^'./]{1,20}\.[^'./]{1,20}\.[a-z]{2,4}|)([^:'><]{2,50}\.[a-z]{2,4}|)(\?[^> ]{0,100}|))(\"|))"; for ($w=0; $w
data->".$data."pattern".$pattern."company->".$company."url->".$url."

"; for ($i=0; $i< count($matches); $i++) { $crawled_link=$matches[$i][2]; //echo "
".strpos($crawled_link, 'http')."----".$crawled_link."------"; $search_string="http"; if (substr($crawled_link, 0, 4)!=$search_string){ //echo "
ifkkkkkkkkkkkkkkkkkkkkkkkkkkk"; $crawled_link=$baseurl."/".$crawled_link; } //echo $crawled_link; insert_url_in_db($link,$crawled_link); } } } }

/**
 * Stamp a row of the `url` table with today's date (Y-m-d) as its
 * crawled_date. Execution stops via die() if the UPDATE fails.
 *
 * @param mysqli $link   Open mysqli connection.
 * @param mixed  $url_id Value matched against the url_id column.
 * @return int Always 1 when the query succeeded.
 */
function set_crawled_date($link,$url_id) {
    $today = date("Y-m-d");
    $query = "UPDATE url SET crawled_date = '".$today."' WHERE url_id ='".$url_id."'";
    mysqli_query($link,$query) or die("Query failed-set_crawled_date");
    return 1;
}

/**
 * Open $url through the supplied client object and return what
 * get_urlstring_unchanged() produces from the resulting handle.
 *
 * @param string $url URL to open.
 * @param object $be  Object providing fopen() and getLastResponseHeaders();
 *                    presumably an HTTP client created elsewhere - confirm.
 * @return mixed Return value of get_urlstring_unchanged().
 */
function get_urlstring_from_url($url,$be){
    $handle = $be->fopen($url);
    // The headers are requested as before, but the value was never used, so it is not stored.
    $be->getLastResponseHeaders();
    return get_urlstring_unchanged($handle);
}
 function categorise_string($link,$url){ $pattern=array(); //$pattern[0]= "()"; $pattern[0][0]="(kalender)"; $pattern[0][1]="(program)"; $pattern[1][0]="(meny)"; $pattern[1][1]="(matsedel)"; for ($e=0; $eaddHeaderLine("Referer", "www.hubnet.se"); $be->addHeaderLine("Accept-Encoding", "x-compress; x-zip"); $be->addHeaderLine("Content-Type", "application/x-www-form-urlencoded"); //**********************************************************************************"********** //$thisMonth = date("m"); //$year = date("Y"); //$upper_month = $thisMonth+3; //$debugg=""; ///$urlstring=""; //for ($month = $thisMonth; $month <$upper_month; $month++) //{ // if ($month > 12){ // $month=1; // $year = $year +1; // $upper_month = $upper_month -12; // } // $debugg = $debugg.$year; // $debugg = $debugg.$month; // $be->addPostData("year", $year); // $be->addPostData("month", $month); //$be->addPostData("menu1", "index.php?month=3"); //$baseurl="http://www.bonhuset.com"; //$suburl="index.asp"; $baseurl=""; $url_result=array(); $url_result=get_url_from_db($link); $url=$url_result[1]; //echo $url; set_crawled_date($link,$url_result[0]); $pattern="((http://[^/]*\.[a-zA-Z]{2,4}))"; if (preg_match_all($pattern, $url, $matches, PREG_SET_ORDER)) { $baseurl=$matches[0][0]; } //echo $baseurl; if ($baseurl==$url){ //echo "IFoooooooooooooooooooooo
"; $index_alias=array(); $index_alias[0]="index.php"; $index_alias[1]="index.htm"; $index_alias[2]="index.html"; $index_alias[3]="index.asp"; $index_alias[4]="index.aspx"; $index_alias[5]="index.phtml"; for ($q=0; $q"; $urlstring=get_urlstring_from_url($url,$be); get_urls_and_insert_in_db ($link,$urlstring,$baseurl); analyze_urlstring($link,$urlstring,$url); } } else { //echo "elserrrrrrrrrrrrrrrrrrrrrr
"; $urlstring=get_urlstring_from_url($url,$be); get_urls_and_insert_in_db ($link,$urlstring,$baseurl); analyze_urlstring($link,$urlstring,$url); } //$url=$baseurl."/".$suburl; /*$file = $be->fopen($url); $response = $be->getLastResponseHeaders(); $urlstring= get_urlstring_unchanged($file); $pattern= "()"; if (preg_match_all($pattern, $urlstring, $matches, PREG_SET_ORDER)) { //echo "

data->".$data."pattern".$pattern."company->".$company."url->".$url."

"; for ($i=0; $i< count($matches); $i++) { $crawled_link=$matches[$i][1]; if (!strpos($crawled_link, $baseurl)){ $crawled_link=$baseurl."/".$crawled_link; } echo $crawled_link; insert_url_in_db($link,$crawled_link); } }*/ //echo $urlstring; //$debugg = $debugg.$urlstring; //echo $urlstring."

"; //$company=1; //if (strlen($urlstring)>80){ // if(insert_data_in_db($url,$town=1,$category=1,$company,$urlstring, $year, $month, 0, 0, $link, $search_type=1,date("Y-m-d"),$sortorder=1)){ // delete_data_in_db($company,$link); // } //} //else { // send_mail("jerker.bergman@gmail.com","Urlstring error","URLsring to short på ".$url); //} //} ?>