2011-11-21 54 views
0

PHP:跟隨頁面中的鏈接,以及每個鏈接之後的鏈接。我有一個函數,使用PHP正則表達式返回指定頁面中的鏈接;現在我想跟隨找到的每個鏈接,再跟隨每個鏈接所指頁面中的鏈接,依此類推……

這是我目前的代碼:

/**
 * Fetch a page and return the distinct links found in its <a href="..."> tags.
 *
 * Each href is normalised to an absolute URL:
 *  - absolute http:// and https:// links are kept unchanged;
 *  - protocol-relative links (//host/path) get the page's scheme;
 *  - every other link is treated as root-relative (a leading "/" is added when
 *    missing) and prefixed with the host of $url, falling back to
 *    $_SERVER['HTTP_HOST'] when $url carries no host (e.g. a local file path);
 *  - empty hrefs, fragment-only hrefs (#...) and non-HTTP schemes such as
 *    javascript: or mailto: are skipped.
 *
 * Note: relative paths are NOT resolved against the page's directory; like the
 * original code they are anchored at the site root.
 *
 * @param string $url Location of the page to scan (anything file_get_contents accepts).
 * @return array List of unique absolute URLs in document order; empty when the
 *               page cannot be read or contains no usable links.
 */
function getLinks($url){
    $content = file_get_contents($url);
    if ($content === false) {
        return array();
    }

    // Work out the scheme/host that relative links are resolved against.
    $scheme = 'http';
    $base = null;
    $parts = parse_url($url);
    if (is_array($parts) && isset($parts['host'])) {
        if (isset($parts['scheme']) && strtolower($parts['scheme']) === 'https') {
            $scheme = 'https';
        }
        $base = $scheme . '://' . $parts['host'];
        if (isset($parts['port'])) {
            $base .= ':' . $parts['port'];
        }
    } elseif (isset($_SERVER['HTTP_HOST'])) {
        $base = 'http://' . $_SERVER['HTTP_HOST'];
    }

    // Capture the href value of every anchor, whether single- or double-quoted.
    // The lookbehind keeps attributes such as data-href from matching.
    $pattern = '/<a\s[^>]*?(?<![\w\-])href\s*=\s*(["\'])(.*?)\1/is';
    if (!preg_match_all($pattern, $content, $matches)) {
        return array();
    }

    $l_clean = array();
    foreach ($matches[2] as $href) {
        // Attribute values are HTML-escaped (e.g. &amp;), so decode them first.
        $f_link = trim(html_entity_decode($href, ENT_QUOTES, 'UTF-8'));

        if ($f_link === '' || $f_link[0] === '#') {
            continue;
        }

        if (preg_match('/^[a-z][a-z0-9+.\-]*:/i', $f_link)) {
            // The link has an explicit scheme: only http(s) is crawlable.
            if (!preg_match('#^https?://#i', $f_link)) {
                continue;
            }
        } elseif (substr($f_link, 0, 2) === '//') {
            $f_link = $scheme . ':' . $f_link;
        } else {
            if ($base === null) {
                // No host is known, so a relative link cannot be made absolute.
                continue;
            }
            if ($f_link[0] !== '/') {
                $f_link = '/' . $f_link;
            }
            $f_link = $base . $f_link;
        }

        if (!in_array($f_link, $l_clean, true)) {
            $l_clean[] = $f_link;
        }
    }

    return $l_clean;
}
+1

可能重複:http://stackoverflow.com/questions/4736906/i-need-help-making-a-website-crawler-using-php - 或 - http://stackoverflow.com/questions/2313107/how-do-i-make-a-simple-crawler-in-php – mario

回答

1

只需以遞歸方式執行,並設置一個深度上限作為終止條件:

/**
 * Crawl outward from a start page and return every distinct link discovered.
 *
 * Pages are visited breadth-first, and both the discovered links and the
 * fetched pages are tracked for the whole crawl, so no page is fetched twice.
 *
 * Depth semantics match the original countdown: the counter is decremented
 * before a page is fetched and the page is skipped once it reaches zero, so a
 * $depth of N fetches N-1 levels of pages (a $depth of 1 or less fetches
 * nothing and yields an empty list).
 *
 * Each href is normalised to an absolute URL:
 *  - absolute http:// and https:// links are kept unchanged;
 *  - protocol-relative links (//host/path) get the scheme of the page they
 *    were found on;
 *  - every other link is treated as root-relative (a leading "/" is added when
 *    missing) and prefixed with the host of the page it was found on, falling
 *    back to $_SERVER['HTTP_HOST'] when that page's URL carries no host;
 *  - empty hrefs, fragment-only hrefs (#...) and non-HTTP schemes such as
 *    javascript: or mailto: are skipped.
 *
 * @param string $url   Location of the start page (anything file_get_contents accepts).
 * @param int    $depth Countdown limiting how far links are followed (see above).
 * @return array List of unique absolute URLs in discovery order.
 */
function getLinks($url, $depth){
    $found = array();    // normalised link => true, in discovery order
    $visited = array();  // page location => true, for pages already fetched
    $queue = array(array($url, $depth));

    // Capture the href value of every anchor, whether single- or double-quoted.
    // The lookbehind keeps attributes such as data-href from matching.
    $pattern = '/<a\s[^>]*?(?<![\w\-])href\s*=\s*(["\'])(.*?)\1/is';

    while ($queue) {
        list($page, $remaining) = array_shift($queue);

        if (--$remaining <= 0 || isset($visited[$page])) {
            continue;
        }
        $visited[$page] = true;

        $content = file_get_contents($page);
        if ($content === false) {
            // Unreadable page (dead link, network error): carry on with the rest.
            continue;
        }

        // Work out the scheme/host that this page's relative links resolve against.
        $scheme = 'http';
        $base = null;
        $parts = parse_url($page);
        if (is_array($parts) && isset($parts['host'])) {
            if (isset($parts['scheme']) && strtolower($parts['scheme']) === 'https') {
                $scheme = 'https';
            }
            $base = $scheme . '://' . $parts['host'];
            if (isset($parts['port'])) {
                $base .= ':' . $parts['port'];
            }
        } elseif (isset($_SERVER['HTTP_HOST'])) {
            $base = 'http://' . $_SERVER['HTTP_HOST'];
        }

        if (!preg_match_all($pattern, $content, $matches)) {
            continue;
        }

        foreach ($matches[2] as $href) {
            // Attribute values are HTML-escaped (e.g. &amp;), so decode them first.
            $f_link = trim(html_entity_decode($href, ENT_QUOTES, 'UTF-8'));

            if ($f_link === '' || $f_link[0] === '#') {
                continue;
            }

            if (preg_match('/^[a-z][a-z0-9+.\-]*:/i', $f_link)) {
                // The link has an explicit scheme: only http(s) is crawlable.
                if (!preg_match('#^https?://#i', $f_link)) {
                    continue;
                }
            } elseif (substr($f_link, 0, 2) === '//') {
                $f_link = $scheme . ':' . $f_link;
            } else {
                if ($base === null) {
                    // No host is known, so a relative link cannot be made absolute.
                    continue;
                }
                if ($f_link[0] !== '/') {
                    $f_link = '/' . $f_link;
                }
                $f_link = $base . $f_link;
            }

            if (!isset($found[$f_link])) {
                $found[$f_link] = true;
                // Breadth-first order means a link is first seen at its
                // shallowest level, so it is queued with the most depth left.
                $queue[] = array($f_link, $remaining);
            }
        }
    }

    return array_keys($found);
}

// Start the crawl at http://myurl.com with a depth argument of 3;
// $links is assigned whatever getLinks() returns.
$links = getLinks("http://myurl.com", 3); 
[我需要幫助:用PHP製作網站爬蟲](http://stackoverflow.com/questions/4736906/i-need-help-making-a-website-crawler-using-php)