萬盛學電腦網

 萬盛學電腦網 >> 網絡編程 >> php編程 >> 簡單Spider類

簡單Spider類

<?php
class Spider {

// Presumably the MySQL host, user name, password and database name.
// NOTE(review): no method visible in this excerpt reads these properties;
// connect_to_db() receives its credentials as arguments — confirm usage.
var $mysql_host;
var $mysql_name;
var $mysql_pwd;
var $mysql_db;

var $parentUrl; // URL where the search starts
var $searchNum; // number of levels to search
var $url; // NOTE(review): not referenced by the methods visible here — confirm purpose
var $db; // NOTE(review): presumably the connection handle; not assigned in the visible code

//數據庫連接函數
/**
 * Open a connection to a MySQL server and hand the link back to the caller.
 *
 * The link is only returned; it is not stored on the object.
 *
 * NOTE: mysql_connect() belongs to the legacy ext/mysql API, which was
 * removed in PHP 7.0, so this method requires PHP 5.x or older.
 *
 * @param string $mysql_host server host name
 * @param string $mysql_name user name
 * @param string $mysql_pwd  password
 * @return resource|false whatever mysql_connect() returns (false on failure)
 */
function connect_to_db($mysql_host,$mysql_name,$mysql_pwd){
    return mysql_connect($mysql_host, $mysql_name, $mysql_pwd);
}


//處理url,以符合標準
/**
 * Normalise a URL: make sure it carries a scheme and drop one trailing slash.
 *
 * A URL that does not start with "http://" or "https://" (checked
 * case-insensitively) gets "http://" prepended.
 *
 * @param string $url URL or bare host/path
 * @return string the normalised URL
 */
function dealUrl($url){
    // The scheme test is anchored to the start of the string: a plain
    // substring search would treat "host/go?to=http://x" as already absolute
    // and would turn "https://host" into "http://https://host".
    if (!preg_match('~^https?://~i', $url)) {
        $url = "http://" . $url;
    }

    // Remove a single trailing slash so "http://host/" and "http://host"
    // compare equal.
    if (substr($url, -1) === '/') {
        $url = substr($url, 0, -1);
    }

    return $url;
}

//取一個鏈接下的所有鏈接
/**
 * Collect the links found in the document at $url.
 *
 * The document is read line by line with file(); every href attribute value
 * (unquoted or double-quoted) on a line is extracted. Links that do not start
 * with http:// or https:// are treated as relative and prefixed with
 * "$url/". A link is kept only when it contains one of the known page/domain
 * markers (.php .asp .jsp .htm .com .cn .net .org) and does not contain
 * "_bak". Each distinct link is reported once.
 *
 * @param string $url location of the document (anything file() can open)
 * @return string "succeed" followed by ",<link>" for every link kept; just
 *                "succeed" when the document has no usable links or cannot
 *                be read
 */
function getUrl($url){
    $nextUrl = "succeed";

    // file() returns false when the document cannot be read.
    $fcontents = file($url);
    if (!is_array($fcontents)) {
        return $nextUrl;
    }

    // Group 1: the link itself - at least one URL character, then anything up
    // to whitespace, a parenthesis, '|', '>' or a double quote.
    // Group 2: the remainder of the line, scanned again for further links.
    // PCRE replaces the ereg functions, which were removed in PHP 7.0.
    $hrefPattern = '~href[[:space:]]*=[[:space:]]*"?([[:alnum:]:@/._-]+[^[:space:]()|>"]*)(.*)~is';
    $markers = array(".php", ".asp", ".jsp", ".htm", ".com", ".cn", ".net", ".org");
    $seen = array();

    foreach ($fcontents as $line) {
        // Each match consumes part of the line, so the loop always terminates.
        while (preg_match($hrefPattern, $line, $regs)) {
            $link = $regs[1];
            $line = $regs[2];

            // Only links that start with a scheme are absolute; https counts too.
            if (!preg_match('~^https?://~i', $link)) {
                $link = $url . "/" . $link;
            }

            // Exact-match de-duplication: a substring test would wrongly drop
            // "a.htm" once "a.html" had been collected.
            if (isset($seen[$link])) {
                continue;
            }

            $looksLikePage = false;
            foreach ($markers as $marker) {
                if (strpos($link, $marker) !== false) {
                    $looksLikePage = true;
                    break;
                }
            }

            // Skip backup copies.
            if ($looksLikePage && strpos($link, "_bak") === false) {
                $seen[$link] = true;
                $nextUrl = $nextUrl . "," . $link;
            }
        }
    }

    return $nextUrl;
}


//查詢該URL是否需要重新搜索
function queryUrl($url,$contentDesc,$db){
mysql_select_db("SearchEngine");
$sql="select * from visited where visitedUrl='".$url."' and contentDesc='".$contentDesc."'";
$rs=mysql_query($sql,$db);
if(mysql_fetch_row($rs)){
return false;
}else{
return true;

copyright © 萬盛學電腦網 all rights reserved