2017-05-29 73 views
3

我寫了一些代碼，用來從 yell.com 解析不同商店的名稱、地址和電話號碼。只要把任意一個搜索結果鏈接提供給我的抓取工具，不管結果分佈在多少個頁面上，它都會解析全部內容。但是，我發現唯一的問題是：它總是跳過第一頁的內容，例如共有 10 頁時，我的抓取工具只會抓取後面的 9 頁。也許只需稍作調整就能解決這個問題。以下是完整的代碼，提前致謝。（標題：抓取工具無法解析第一頁的內容）

Sub YellUK()
    ' Collects name, address parts and phone number of the businesses listed on
    ' yell.com result pages and writes them to the active worksheet (columns 1-5).
    '
    ' The start page is downloaded only to read the links in its pagination bar;
    ' its own listings are NOT written out. Every page fetched afterwards comes
    ' from an <a> inside the "row pagination" element.
    Const mlink = "https://www.yell.com"
    Const ADDRESS_CLASS = "col-sm-10 col-md-11 col-lg-12 businessCapsule--address"

    Dim http As New MSXML2.XMLHTTP60
    Dim startDoc As New HTMLDocument
    Dim pageDoc As New HTMLDocument
    Dim listing As HTMLHtmlElement
    Dim pageLinks As Object
    Dim pageUrl As String
    Dim titleLinks As Object
    Dim addressSpans As Object
    Dim phones As Object
    Dim i, x    ' Variants; x starts Empty, so the first listing lands in row 2

    ' Load the start page and pick up the pagination anchors.
    http.Open "GET", "https://www.yell.com/ucs/UcsSearchAction.do?keywords=pizza&location=United+Kingdom&scrambleSeed=1426936001", False
    http.send
    startDoc.body.innerHTML = http.responseText
    Set pageLinks = startDoc.getElementsByClassName("row pagination")(0).getElementsByTagName("a")

    ' Length - 2 leaves out the last anchor of the pagination bar.
    For i = 0 To pageLinks.Length - 2
        ' hrefs read from an in-memory document come back prefixed with "about:".
        pageUrl = mlink & Replace(pageLinks(i).href, "about:", "")
        http.Open "GET", pageUrl, False
        http.send
        pageDoc.body.innerHTML = http.responseText

        For Each listing In pageDoc.getElementsByClassName("js-LocalBusiness")
            x = x + 1

            ' Column 1: business name (first link inside the title row).
            Set titleLinks = listing.getElementsByClassName("row businessCapsule--title")(0).getElementsByTagName("a")
            If titleLinks.Length Then Cells(x + 1, 1) = titleLinks.Item(0).innerText

            ' Columns 2-4: the 2nd, 3rd and 4th <span> of the address container.
            Set addressSpans = listing.getElementsByClassName(ADDRESS_CLASS)(0).getElementsByTagName("span")
            If addressSpans.Length > 1 Then Cells(x + 1, 2) = addressSpans.Item(1).innerText
            If addressSpans.Length > 2 Then Cells(x + 1, 3) = addressSpans.Item(2).innerText
            If addressSpans.Length > 3 Then Cells(x + 1, 4) = addressSpans.Item(3).innerText

            ' Column 5: the second element carrying the telephone class.
            Set phones = listing.getElementsByClassName("businessCapsule--tel")
            If phones.Length > 1 Then Cells(x + 1, 5) = phones.Item(1).innerText
        Next listing
    Next i
End Sub

下面是存放各個分頁頁碼及其鏈接的元素：

<div class="row pagination"> 
<div class="col-sm-24"> 
&nbsp;<span class="pagination--page is-selected">1</span> 
&nbsp;<a class="pagination--page" rel="nofollow" href="/ucs/UcsSearchAction.do?location=United+Kingdom&amp;keywords=pizza&amp;scrambleSeed=721890588&amp;pageNum=2" data-tracking="DISPLAY:PAGINATION:NUMBER">2</a> 
&nbsp;<a class="pagination--page" rel="nofollow" href="/ucs/UcsSearchAction.do?location=United+Kingdom&amp;keywords=pizza&amp;scrambleSeed=721890588&amp;pageNum=3" data-tracking="DISPLAY:PAGINATION:NUMBER">3</a> 
&nbsp;<a class="pagination--page" rel="nofollow" href="/ucs/UcsSearchAction.do?location=United+Kingdom&amp;keywords=pizza&amp;scrambleSeed=721890588&amp;pageNum=4" data-tracking="DISPLAY:PAGINATION:NUMBER">4</a> 
&nbsp;<a class="pagination--page" rel="nofollow" href="/ucs/UcsSearchAction.do?location=United+Kingdom&amp;keywords=pizza&amp;scrambleSeed=721890588&amp;pageNum=5" data-tracking="DISPLAY:PAGINATION:NUMBER">5</a> 
&nbsp;<a class="pagination--page" rel="nofollow" href="/ucs/UcsSearchAction.do?location=United+Kingdom&amp;keywords=pizza&amp;scrambleSeed=721890588&amp;pageNum=6" data-tracking="DISPLAY:PAGINATION:NUMBER">6</a> 
&nbsp;<a class="pagination--page" rel="nofollow" href="/ucs/UcsSearchAction.do?location=United+Kingdom&amp;keywords=pizza&amp;scrambleSeed=721890588&amp;pageNum=7" data-tracking="DISPLAY:PAGINATION:NUMBER">7</a> 
&nbsp;<a class="pagination--page" rel="nofollow" href="/ucs/UcsSearchAction.do?location=United+Kingdom&amp;keywords=pizza&amp;scrambleSeed=721890588&amp;pageNum=8" data-tracking="DISPLAY:PAGINATION:NUMBER">8</a> 
&nbsp;<a class="pagination--page" rel="nofollow" href="/ucs/UcsSearchAction.do?location=United+Kingdom&amp;keywords=pizza&amp;scrambleSeed=721890588&amp;pageNum=9" data-tracking="DISPLAY:PAGINATION:NUMBER">9</a> 
&nbsp;<a class="pagination--page" rel="nofollow" href="/ucs/UcsSearchAction.do?location=United+Kingdom&amp;keywords=pizza&amp;scrambleSeed=721890588&amp;pageNum=10" data-tracking="DISPLAY:PAGINATION:NUMBER">10</a> 
&nbsp;<a rel="nofollow" class="pagination--next" href="/ucs/UcsSearchAction.do?location=United+Kingdom&amp;keywords=pizza&amp;scrambleSeed=721890588&amp;pageNum=2" data-tracking="DISPLAY:PAGINATION:NEXT">Next</a> 
</div> 
</div> 

回答

1

這裏的問題在於：第一頁已處於選中狀態，因此分頁控件中沒有它對應的錨點（`a` 元素）。解決方法是先處理第一頁，然後再利用分頁鏈接處理其餘頁面。HTH

Option Explicit 

Sub YellUK()
    ' Scrapes every yell.com result page for the search below and writes the
    ' listings to the worksheet via GetPageData.
    '
    ' The start page is the currently selected page, so the pagination bar has
    ' no <a> for it. Its listings are therefore written first, straight from the
    ' document that was already downloaded; the remaining pages are then fetched
    ' through the pagination anchors.
    Const mlink = "https://www.yell.com"
    Const START_URL = "https://www.yell.com/ucs/UcsSearchAction.do?keywords=pizza&location=United+Kingdom&scrambleSeed=1426936001"

    Dim http As New MSXML2.XMLHTTP60
    Dim html As New HTMLDocument      ' start page
    Dim html2 As New HTMLDocument     ' each following page
    Dim pagers As Object              ' elements with class "row pagination"
    Dim page As Object                ' anchors inside the pagination bar
    Dim newlink As String
    Dim i As Long
    Dim x As Long                     ' running listing counter shared with GetPageData

    With http
        .Open "GET", START_URL, False
        .send
        html.body.innerHTML = .responseText
    End With

    ' First page first: it is already loaded and has no pagination anchor.
    GetPageData x, html

    ' A result set that fits on one page has no pagination bar; indexing (0)
    ' on the empty collection and dereferencing it would raise error 91.
    Set pagers = html.getElementsByClassName("row pagination")
    If pagers.Length = 0 Then Exit Sub
    Set page = pagers(0).getElementsByTagName("a")

    ' Remaining pages. Length - 2 skips the last anchor, which in the page's
    ' markup is the "Next" link duplicating one of the numbered links.
    For i = 0 To page.Length - 2
        ' hrefs read from an in-memory document come back prefixed with "about:".
        newlink = mlink & Replace(page(i).href, "about:", "")
        With http
            .Open "GET", newlink, False
            .send
            html2.body.innerHTML = .responseText
        End With
        GetPageData x, html2
    Next i
End Sub

Private Sub GetPageData(ByRef x, ByRef html As HTMLDocument)
    ' Writes one worksheet row per "js-LocalBusiness" element found in html.
    '
    ' x    - running listing counter, incremented once per listing and handed
    '        back to the caller (ByRef) so numbering continues across pages.
    '        Listing number x is written to row x + 1.
    ' html - parsed result page to read the listings from.
    '
    ' Columns: 1 = name, 2-4 = 2nd/3rd/4th <span> of the address container,
    ' 5 = second element with the telephone class. Cells is unqualified, so the
    ' output goes to whichever sheet is active when the macro runs.
    Const TITLE_CLASS = "row businessCapsule--title"
    Const ADDRESS_CLASS = "col-sm-10 col-md-11 col-lg-12 businessCapsule--address"

    Dim post As HTMLHtmlElement
    Dim holders As Object
    Dim spans As Object
    Dim k As Long

    For Each post In html.getElementsByClassName("js-LocalBusiness")
        x = x + 1

        ' Name: guard the container lookup so a listing without a title row
        ' is skipped instead of aborting the whole scrape.
        Set holders = post.getElementsByClassName(TITLE_CLASS)
        If holders.Length > 0 Then
            With holders(0).getElementsByTagName("a")
                If .Length Then Cells(x + 1, 1) = .Item(0).innerText
            End With
        End If

        ' Address: query the span collection once and reuse it for all three
        ' columns (span k goes to column k + 1).
        Set holders = post.getElementsByClassName(ADDRESS_CLASS)
        If holders.Length > 0 Then
            Set spans = holders(0).getElementsByTagName("span")
            For k = 1 To 3
                If spans.Length > k Then Cells(x + 1, k + 1) = spans.Item(k).innerText
            Next k
        End If

        ' Phone: the second matching element holds the number.
        With post.getElementsByClassName("businessCapsule--tel")
            If .Length > 1 Then Cells(x + 1, 5) = .Item(1).innerText
        End With
    Next post
End Sub

編輯：也可以像下面這樣做。當 i = -1 時先構造第一頁的鏈接，之後再依次處理後續各頁。

' Walk all result pages in one loop: i = -1 stands for the first page, which
' has no anchor of its own in the pagination bar; i >= 0 indexes the anchors.
For i = -1 To page.Length - 2
    If i >= 0 Then
        newlink = mlink & Replace(page(i).href, "about:", "")
    Else
        ' Build the page-1 URL from the first anchor by swapping the final
        ' character of its link for "1".
        newlink = mlink & Replace(page(0).href, "about:", "")
        newlink = Left(newlink, Len(newlink) - 1) & "1"
    End If
    Debug.Print i & ", " & newlink ' Prints the links for all the pages

    http.Open "GET", newlink, False
    http.send
    htm.body.innerHTML = http.responseText
    ' Get page data here ...
Next i
+0

謝謝您，先生，有您參與總是令人非常高興。您的解決方案確實取得了全部內容。難道不能把整個過程寫在一個子程序裏嗎？ – SIM

+0

不客氣！這是可以做到的，例如先準備好所有的 'URLs'（包括第一頁），然後遍歷這些 'URLs'；否則就按我寫的那樣做：第一頁當前已被加載，分頁控件中沒有它對應的 'a' 元素（這是合乎邏輯的，因爲分頁控件只包含指向其他頁面的鏈接）。 – dee

+0

先生，能否請您再詳細解釋一下，我該如何在一個子程序中完成整個工作？ – SIM