2016-03-01 116 views
0

我有這段代碼,嘗試使用 PHP 和 Simple HTML DOM 取得分頁鏈接,但結果不太對。有人可以幫我嗎?(標題:使用 PHP 和 Simple HTML DOM 獲取分頁鏈接)

我得到的結果只是第一個鏈接不斷重複出現。

<?php 
include_once('simple_html_dom.php'); 
/**
 * Download $href with cURL, print the "Next" pagination links found on the
 * page, and recurse into the first one.
 *
 * Output is produced with print_r()/echo; the function has no return
 * statement, so callers receive null.
 *
 * @param string $href URL of the page to download.
 */
function dlPage($href) { 
    $curl = curl_init(); 
    // Certificate verification is switched off for this request.
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, FALSE); 
    curl_setopt($curl, CURLOPT_HEADER, false); 
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true); 
    curl_setopt($curl, CURLOPT_URL, $href); 
    // The requested URL is also sent as the Referer header.
    curl_setopt($curl, CURLOPT_REFERER, $href); 
    // Make curl_exec() return the response body instead of printing it.
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, TRUE); 
    curl_setopt($curl, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.125 Safari/533.4"); 
    $str = curl_exec($curl); 
    curl_close($curl); 

    // Create an empty DOM object
    $dom = new simple_html_dom(); 
    // Parse the HTML string that cURL downloaded
    $dom->load($str); 


    // Collect the href of every anchor whose title attribute is "Next".
    $Next_Link = array(); 
    foreach($dom->find('a[title=Next]') as $element){ 
     $Next_Link[] = $element->href; 
    } 

    print_r($Next_Link); 

    // NOTE(review): index 0 does not exist when no "Next" anchor was found
    // (e.g. presumably on the last page), which raises an undefined-index notice.
    $next_page_url = $Next_Link[0]; 
    if($next_page_url !='') { 
     echo '<br>' . $next_page_url; 
     // Release the parsed DOM before recursing.
     $dom->clear(); 
     unset($dom); 

     // Load the next page from the pagination to collect the next link.
     // NOTE(review): the href is passed on exactly as it appears in the markup;
     // if it contains HTML entities such as &amp; the requested URL will not be
     // the intended one -- possibly why the same link keeps repeating. Confirm.
     dlPage($next_page_url); 
    } 

} 

// Start crawling from the first listing page.
$url = 'https://www.jumia.com.gh/phones/'; 
// dlPage() as defined above has no return statement, so $data is null here.
$data = dlPage($url); 
//print_r($data) 
?> 

我想要得到的是:
mySiteUrl/?facet_is_mpg_child=0&viewType=gridView&page=2
mySiteUrl/?facet_is_mpg_child=0&viewType=gridView&page=3

……一直到分頁中的最後一個鏈接。請幫忙。

回答

0

這是解決方案。請注意我對鏈接使用了 htmlspecialchars_decode:因爲傳給 curl 的 href 不應該像 XML/HTML 中那樣寫成 `&amp;`,而應該是 `&`。按照我的理解,dlPage 應該返回的值是分頁中的最後一個鏈接。

<?php 
include_once('simple_html_dom.php'); 

/**
 * Follow the "Next" pagination links starting at $href and return the URL of
 * the last page that was reached.
 *
 * Each page is downloaded once with cURL and parsed with simple_html_dom.
 * A "Loading From URL:..." line is echoed for every page that is requested.
 *
 * @param string $href           URL of the page to download.
 * @param array  $already_loaded Map of URL => true for pages already visited;
 *                               used to avoid requesting the same page twice.
 *
 * @return string URL of the last page visited (also returned when the
 *                download of $href fails or yields an empty body).
 */
function dlPage($href, $already_loaded = array()) {
    $curl = curl_init();
    // NOTE(review): certificate verification is disabled here; keep this only
    // for quick experiments, not for production use.
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, FALSE);
    curl_setopt($curl, CURLOPT_HEADER, false);
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($curl, CURLOPT_URL, $href);
    curl_setopt($curl, CURLOPT_REFERER, $href);
    // Make curl_exec() return the response body instead of printing it.
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, TRUE);
    curl_setopt($curl, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.125 Safari/533.4");
    $htmlPage = curl_exec($curl);
    curl_close($curl);

    echo "Loading From URL:" . $href . "<br/>\n";
    $already_loaded[$href] = true;

    // curl_exec() returns false on failure; with nothing to parse, treat this
    // page as the end of the pagination instead of crashing in the parser.
    if ($htmlPage === false || $htmlPage === '') {
        return $href;
    }

    // Parse the HTML that cURL already downloaded. (Calling file_get_html($href)
    // here would fetch the same page a second time and may return false.)
    $dom = new simple_html_dom();
    $dom->load($htmlPage);

    $next_page_url = null;
    $items = $dom->find('ul[class="osh-pagination"] li[class="item"] a[title="Next"]');

    foreach ($items as $item) {
        // The href is HTML-escaped in the markup (&amp;); decode it so the
        // query string sent to cURL uses plain "&" separators.
        $link = htmlspecialchars_decode($item->href);
        if (!isset($already_loaded[$link])) {
            $next_page_url = $link;
            break;
        }
    }

    // Release the parsed DOM on every path, not only before recursing.
    unset($items, $item);
    $dom->clear();
    unset($dom);

    if ($next_page_url !== null) {
        // Load the next page from the pagination to collect the next link.
        return dlPage($next_page_url, $already_loaded);
    }

    return $href;
}

// Crawl the pagination from the first listing page and print the URL of the
// last page that dlPage() reports.
$url = 'https://www.jumia.com.gh/phones/';
$data = dlPage($url);
echo "DATA:{$data}\n";

輸出是:

Loading From URL:https://www.jumia.com.gh/phones/<br/> 
Loading From URL:https://www.jumia.com.gh/phones/?facet_is_mpg_child=0&viewType=gridView&page=2<br/> 
Loading From URL:https://www.jumia.com.gh/phones/?facet_is_mpg_child=0&viewType=gridView&page=3<br/> 
Loading From URL:https://www.jumia.com.gh/phones/?facet_is_mpg_child=0&viewType=gridView&page=4<br/> 
Loading From URL:https://www.jumia.com.gh/phones/?facet_is_mpg_child=0&viewType=gridView&page=5<br/> 
DATA:https://www.jumia.com.gh/phones/?facet_is_mpg_child=0&viewType=gridView&page=5 
相關問題