require_once('browser_emulator.php');
require_once('../functions.php');
require_once('database_connection.php');
/**
 * Reads everything that is left on an open stream and returns it as one string.
 *
 * The collected text is passed through change_swedish_charset() (declared
 * outside this file) before it is returned; no tag stripping, splitting or
 * reformatting is applied here.
 *
 * @param resource $file Open, readable stream handle as accepted by fgets().
 * @return string Stream contents after change_swedish_charset()
 *                (assumed to return a string - TODO confirm).
 */
function get_urlstring_unchanged($file){
    $urlstring = "";

    // A failed open hands us false/null instead of a stream. Treat that as
    // "no content" rather than passing a non-resource to fgets().
    if (is_resource($file)) {
        // Reads the data from the opened file.
        // Compare against false explicitly: a final line consisting only of
        // "0" (no trailing newline) is a falsy string and would otherwise end
        // the loop early and be dropped from the result.
        while (($line = fgets($file, 1024)) !== false) {
            $urlstring .= $line;
        }
    }

    return change_swedish_charset($urlstring);
}
/**
 * Stores a URL in the `url` table unless an identical row already exists.
 *
 * Double quotes are stripped from $url and HTML entities are decoded before
 * the value is compared and stored. When a new row is inserted, the
 * quote-stripped (not entity-decoded) URL is echoed between two newlines.
 *
 * Terminates the script via die() if a statement cannot be prepared or run.
 *
 * @param mysqli $link Open mysqli connection.
 * @param string $url  URL to store.
 * @return int 1 when a row was inserted, 0 when the URL was already present.
 */
function insert_url_in_db($link,$url){
    $url = str_replace("\"", "", $url);
    $decoded_url = html_entity_decode($url);

    // The URL is scraped from third-party pages, so it is sent as a bound
    // parameter instead of being concatenated into the SQL text.
    $lookup = mysqli_prepare($link, "SELECT 1 FROM url WHERE url = ? LIMIT 1")
        or die("Query failed-insert_url_in_db");
    mysqli_stmt_bind_param($lookup, "s", $decoded_url);
    mysqli_stmt_execute($lookup) or die("Query failed-insert_url_in_db");
    // store_result() is required before num_rows() reports the real count.
    mysqli_stmt_store_result($lookup);
    $already_stored = mysqli_stmt_num_rows($lookup) > 0;
    mysqli_stmt_close($lookup);

    if ($already_stored) {
        return 0;
    }

    echo "\n".$url."\n";

    $insert = mysqli_prepare($link, "INSERT INTO url (url) VALUES (?)")
        or die("Query failed-insert_url_in_db");
    mysqli_stmt_bind_param($insert, "s", $decoded_url);
    mysqli_stmt_execute($insert) or die("Query failed-insert_url_in_db");
    mysqli_stmt_close($insert);

    return 1;
}
/**
 * Records a URL in the `leftoverurl` table unless the `url` table already
 * holds a row whose url column contains it as a substring.
 *
 * The substring check uses $url as given; the value that is stored (and
 * echoed) is the HTML-entity-decoded form, together with today's date.
 * When a match is found, a marker line with the URL is echoed instead.
 *
 * Terminates the script via die() if a statement cannot be prepared or run.
 *
 * @param mysqli $link Open mysqli connection.
 * @param string $url  URL to record.
 * @return int 1 when a row was inserted, 0 when a matching url row exists.
 */
function insert_leftoverurls_in_db($link,$url){
    // "%" and "_" are common in URLs (percent-encoding, file names) but are
    // wildcards inside LIKE. Escape them with an explicit escape character so
    // the URL is matched literally regardless of the server's SQL mode.
    $like_pattern = "%".strtr($url, array("|" => "||", "%" => "|%", "_" => "|_"))."%";

    // Bound parameters: the URL comes from crawled pages and must not be
    // concatenated into the SQL text.
    $lookup = mysqli_prepare($link, "SELECT 1 FROM url WHERE url LIKE ? ESCAPE '|' LIMIT 1")
        or die("Query failed-insert_leftoverurls_in_db");
    mysqli_stmt_bind_param($lookup, "s", $like_pattern);
    mysqli_stmt_execute($lookup) or die("Query failed-insert_leftoverurls_in_db");
    // store_result() is required before num_rows() reports the real count.
    mysqli_stmt_store_result($lookup);
    $already_known = mysqli_stmt_num_rows($lookup) > 0;
    mysqli_stmt_close($lookup);

    if ($already_known) {
        echo "\nelseleftover-------------------".$url."\n";
        return 0;
    }

    $url = html_entity_decode($url);
    $today = date("Y-m-d");

    echo "\n".$url."\n";

    $insert = mysqli_prepare($link, "INSERT INTO leftoverurl (url,crawled_date) VALUES (?,?)")
        or die("Query failed-insert_leftoverurls_in_db");
    mysqli_stmt_bind_param($insert, "ss", $url, $today);
    mysqli_stmt_execute($insert) or die("Query failed-insert_leftoverurls_in_db");
    mysqli_stmt_close($insert);

    return 1;
}
/**
 * Fetches the first row (lowest url_id) of the `url` table whose crawled_date
 * compares equal to 0 - presumably the next URL that has not been crawled yet.
 *
 * The URL is echoed followed by a newline and a separator line.
 * Terminates the script via die() if the query fails.
 *
 * @param mysqli $link Open mysqli connection.
 * @return array Two-element list: [0] => url_id, [1] => url.
 *               Both elements are null when no matching row exists.
 */
function get_url_from_db($link){
    // Only one row is consumed, so let the server stop after the first match
    // instead of shipping every pending URL to the client.
    $SQL = "SELECT url_id, url FROM url WHERE crawled_date=0 ORDER BY url_id LIMIT 1";
    $result = mysqli_query($link,$SQL) or die("Query failed-get_url_fromDB");

    // mysqli result => mysqli fetch function (the legacy mysql_* API cannot
    // read it). $row is null when nothing is pending.
    $row = mysqli_fetch_assoc($result);
    mysqli_free_result($result);

    $url_result = array(
        isset($row["url_id"]) ? $row["url_id"] : null,
        isset($row["url"]) ? $row["url"] : null,
    );

    echo $url_result[1]."\n_____________________";
    return $url_result;
}
/**
 * Extracts href targets from a page and stores each one via insert_url_in_db().
 *
 * Targets that do not start with "http" are treated as relative and are
 * prefixed with $baseurl and a slash before being stored.
 *
 * NOTE(review): in the reviewed copy of this file the outer loop header and
 * the preg_match_all() call were garbled (text between "<" and ">" was lost).
 * They are reconstructed here from the surviving loop body, the brace
 * structure and the capture index that is read - confirm against version
 * control.
 *
 * @param mysqli $link      Open mysqli connection, passed on to insert_url_in_db().
 * @param string $urlstring Page content to scan for links.
 * @param string $baseurl   Prefix for links that do not start with "http".
 * @return void
 */
function get_urls_and_insert_in_db ($link,$urlstring,$baseurl) {
    $pattern = array();
    // The outer ( ) pair is the regex delimiter, not a group. Capture group 2
    // is the whole link target (host part, path part and query string).
    $pattern[0]="(href=(\"|)((http://[^'.>/]{2,30}\.[^'.>/]{1,30}\.[^'.>/]{2,30}\.[a-zA-Z]{2,4}|http://[^'./]{1,20}\.[^'./]{1,20}\.[a-z]{2,4}|)([^:'><]{2,50}\.[a-z]{2,4}|)(\?[^> ]{0,100}|))(\"|))";

    foreach ($pattern as $regex) {
        if (!preg_match_all($regex, $urlstring, $matches, PREG_SET_ORDER)) {
            continue;
        }
        foreach ($matches as $match) {
            $crawled_link = $match[2];
            // Anything not starting with "http" is resolved against the base URL.
            if (strncmp($crawled_link, "http", 4) !== 0) {
                $crawled_link = $baseurl."/".$crawled_link;
            }
            insert_url_in_db($link,$crawled_link);
        }
    }
}
/**
 * Stamps one row of the `url` table with today's date as its crawled_date.
 *
 * Terminates the script via die() if the UPDATE fails.
 *
 * @param mysqli $link   Open mysqli connection.
 * @param mixed  $url_id Value matched against the url_id column.
 * @return int Always 1 (the script dies before returning on failure).
 */
function set_crawled_date($link,$url_id) {
    $today = date("Y-m-d");
    $update_sql = "UPDATE url SET crawled_date = '{$today}' WHERE url_id ='{$url_id}'";

    if (!mysqli_query($link, $update_sql)) {
        die("Query failed-set_crawled_date");
    }

    return 1;
}
/**
 * Opens $url through the browser emulator and returns the page content as
 * produced by get_urlstring_unchanged().
 *
 * The handle is intentionally not closed here (the original fclose() call is
 * disabled).
 *
 * @param string $url URL to fetch.
 * @param object $be  Browser emulator exposing fopen() and getLastResponseHeaders().
 * @return string Content returned by get_urlstring_unchanged().
 */
function get_urlstring_from_url($url,$be){
    $handle = $be->fopen($url);
    // Response headers are requested right after opening; the value itself is not used.
    $be->getLastResponseHeaders();
    return get_urlstring_unchanged($handle);
}
function categorise_string($link,$url){
$pattern=array();
//$pattern[0]= "()";
$pattern[0][0]="(kalender)";
$pattern[0][1]="(program)";
$pattern[1][0]="(meny)";
$pattern[1][1]="(matsedel)";
for ($e=0; $eaddHeaderLine("Referer", "www.hubnet.se");
// Additional request headers sent by the browser emulator instance $be.
$be->addHeaderLine("Accept-Encoding", "x-compress; x-zip");
$be->addHeaderLine("Content-Type", "application/x-www-form-urlencoded");
//**********************************************************************************"**********
// Disabled legacy code: looped over the next three months and posted
// year/month form fields through the browser emulator.
//$thisMonth = date("m");
//$year = date("Y");
//$upper_month = $thisMonth+3;
//$debugg="";
///$urlstring="";
//for ($month = $thisMonth; $month <$upper_month; $month++)
//{
// if ($month > 12){
// $month=1;
// $year = $year +1;
// $upper_month = $upper_month -12;
// }
// $debugg = $debugg.$year;
// $debugg = $debugg.$month;
// $be->addPostData("year", $year);
// $be->addPostData("month", $month);
//$be->addPostData("menu1", "index.php?month=3");
//$baseurl="http://www.bonhuset.com";
//$suburl="index.asp";
// Take the next URL from the database ([0] => url_id, [1] => url) and stamp
// it with today's crawl date before the page is fetched.
$baseurl="";
$url_result=array();
$url_result=get_url_from_db($link);
$url=$url_result[1];
//echo $url;
set_crawled_date($link,$url_result[0]);
// Extract the "http://host.tld" prefix of the URL; the outer ( ) pair is the
// regex delimiter. $baseurl stays "" when the URL does not match.
$pattern="((http://[^/]*\.[a-zA-Z]{2,4}))";
if (preg_match_all($pattern, $url, $matches, PREG_SET_ORDER)) {
$baseurl=$matches[0][0];
}
//echo $baseurl;
// When the URL is exactly the extracted base URL, a list of common index file
// names is prepared and looped over; otherwise the URL itself is fetched once.
// In both branches the fetched page is scanned for links (stored in the
// database) and passed to analyze_urlstring().
// NOTE(review): the header and first statements of the $q loop are truncated
// in this copy of the file - restore them from version control.
if ($baseurl==$url){
//echo "IFoooooooooooooooooooooo
";
$index_alias=array();
$index_alias[0]="index.php";
$index_alias[1]="index.htm";
$index_alias[2]="index.html";
$index_alias[3]="index.asp";
$index_alias[4]="index.aspx";
$index_alias[5]="index.phtml";
for ($q=0; $q";
$urlstring=get_urlstring_from_url($url,$be);
get_urls_and_insert_in_db ($link,$urlstring,$baseurl);
analyze_urlstring($link,$urlstring,$url);
}
}
else {
//echo "elserrrrrrrrrrrrrrrrrrrrrr
";
$urlstring=get_urlstring_from_url($url,$be);
get_urls_and_insert_in_db ($link,$urlstring,$baseurl);
analyze_urlstring($link,$urlstring,$url);
}
//$url=$baseurl."/".$suburl;
/*$file = $be->fopen($url);
$response = $be->getLastResponseHeaders();
$urlstring= get_urlstring_unchanged($file);
$pattern= "()";
if (preg_match_all($pattern, $urlstring, $matches, PREG_SET_ORDER)) {
//echo "
data->".$data."pattern".$pattern."company->".$company."url->".$url."
";
for ($i=0; $i< count($matches); $i++) {
$crawled_link=$matches[$i][1];
if (!strpos($crawled_link, $baseurl)){
$crawled_link=$baseurl."/".$crawled_link;
}
echo $crawled_link;
insert_url_in_db($link,$crawled_link);
}
}*/
//echo $urlstring;
//$debugg = $debugg.$urlstring;
//echo $urlstring."
";
//$company=1;
//if (strlen($urlstring)>80){
// if(insert_data_in_db($url,$town=1,$category=1,$company,$urlstring, $year, $month, 0, 0, $link, $search_type=1,date("Y-m-d"),$sortorder=1)){
// delete_data_in_db($company,$link);
// }
//}
//else {
// send_mail("jerker.bergman@gmail.com","Urlstring error","URLsring to short p堢.$url);
//}
//}
?>