
Speed up a web crawler written with beautifulsoup4 and Python using multithreading

I am writing a web crawler to extract information from a website. However, because I am using Beautiful Soup 4 on Windows to extract a large amount of data, crawling is very slow. Can anyone tell me how to apply multithreading in my case? Thank you very much in advance for any help. My code is as follows:

import sys

import requests
from bs4 import BeautifulSoup


HTML_PARSER = "html.parser" 



def get_shop_link_list(links):
    global food_id
    list_req = requests.get(links)
    if list_req.status_code == requests.codes.ok:
        soup = BeautifulSoup(list_req.content, HTML_PARSER)
        link_tag = soup.find_all('h2', {'class': 'heavy break-word'})
        for h2 in link_tag:
            print(food_id, end="@")
            link = h2.find('a')
            parse_shop_information(link['href'])
            food_id = food_id + 1
            print("")
        # turn to the next page; find() returns None on the last page,
        # so guard before subscripting
        next_page = soup.find('a', {'class': 'button -small'})
        if next_page is not None and next_page.has_attr('href'):
            get_shop_link_list(next_page['href'])



def parse_shop_information(shop_link):
    req = requests.get(shop_link)
    if req.status_code == requests.codes.ok:
        soup = BeautifulSoup(req.content, HTML_PARSER)

        # restaurant name; round-trip through the console encoding so
        # Japanese text prints safely on a Windows console
        shop_header_tags = soup.find('div', {'class': 'cassette'})
        japname_tag = shop_header_tags.find('p', {'class': 'small'})
        print(japname_tag.get_text(" ", strip=True)
              .encode(sys.stdout.encoding, "replace")
              .decode(sys.stdout.encoding).strip(), end="@")
        name_tag = shop_header_tags.find('h1', {'class': 'jumbo break-word'})
        print(name_tag.get_text(" ", strip=True)
              .encode(sys.stdout.encoding, "replace")
              .decode(sys.stdout.encoding).strip(), end="@")
        # Basic information
        shop_body_tags = soup.find('ul', {'class': 'icon-list -space sentence'})
        information = shop_body_tags.find_all('li')
        for li in information:
            print(li.get_text("|", strip=True)
                  .encode(sys.stdout.encoding, "replace")
                  .decode(sys.stdout.encoding).strip(), end="{")
        # Detail information
        restaurant_detail = soup.find_all("table", {"class": "table sentence"})
        basic_detail = restaurant_detail[0].find_all('tr')
        address = address_pic = access = parking = service = cards = "No record"
        for tr in basic_detail:
            th_text = tr.find('th').get_text(" ", strip=True)
            if th_text == 'Address':
                address = tr.find('p').get_text(" ", strip=True)
                address_pic = tr.find('img')["src"]
            elif th_text == 'Access':
                access = tr.find('td').get_text(" ", strip=True)
            elif th_text == 'Parking':
                parking = tr.find('td').get_text(" ", strip=True)
            elif th_text == 'Service charge':
                service = tr.find('td').get_text(" ", strip=True)
            elif th_text == 'Cards Accepted':
                cards = tr.find('td').get_text(" ", strip=True)
        for value in (address, address_pic, access, parking, service, cards):
            print(value.strip(), end="@")

        # Facility information; the table may be absent, hence the IndexError guard
        try:
            facility_detail = restaurant_detail[1].find_all('tr')
            facility = {}
            for tr in facility_detail:
                th_text = tr.find('th').get_text(" ", strip=True)
                facility[th_text] = tr.find('td').get_text(" ", strip=True)
            for label in ('Seating Capacity', 'Banquet Maximum Party Size',
                          'Reservation Maximum Party Size', 'Smoking',
                          'Wheelchair Accessible Restrooms', 'Kid Friendly',
                          'Language Support', 'Wi-Fi/Plug-in', 'Other'):
                print(facility.get(label, "No record").strip(), end="@")
        except IndexError:
            # keep the column count constant when the table is missing
            print("No record@" * 9, end="")

        # Other details; 'Menu' and 'Delivery/Catering' get their own fields
        try:
            other_detail = restaurant_detail[2].find_all('tr')
            other = {}
            for tr in other_detail:
                th_text = tr.find('th').get_text(" ", strip=True)
                other[th_text] = tr.find('td').get_text(" ", strip=True)
            for label in ('Menu', 'Lunch Service', 'Dress Code', 'Delivery/Catering'):
                print(other.get(label, "No record").strip(), end="@")
        except IndexError:
            print("No record@" * 4, end="")

        # If the shop has a "Menu" tab, crawl its sub-menus as well
        try:
            main_col_tag = soup.find('div', {'class': 'global-navigation'})
            main_col = main_col_tag.find_all('li')
            for li in main_col:
                if li.find('a').get_text() == "Menu":
                    print("{", end="")
                    sub_menu(shop_link)
                    print("}", end="")
        except (IndexError, AttributeError):
            print("No record", end="@")



def sub_menu(link):
    list_req = requests.get(link)
    if list_req.status_code == requests.codes.ok:
        soup = BeautifulSoup(list_req.content, HTML_PARSER)
        all_menu_tag = soup.find('ul', {'class': '-sub-menu hide'})
        menus = all_menu_tag.find_all('li')
        for i, li in enumerate(menus):
            anchor = li.find('a')
            menu_type = anchor.get_text()  # 'type' would shadow the builtin
            print("\"", end="")
            print(menu_type, end="\":[")
            sub_menu_json(anchor['href'])
            if i != len(menus) - 1:
                print("]", end=",")
            else:
                print("]", end="")


def sub_menu_json(link):
    list_req = requests.get(link)
    if list_req.status_code == requests.codes.ok:
        soup = BeautifulSoup(list_req.content, HTML_PARSER)

        def safe(text):
            # round-trip through the console encoding so the text always prints
            return text.encode(sys.stdout.encoding, "replace").decode(sys.stdout.encoding).strip()

        eachfood = soup.find_all('div', {'class': 'cassette normal-colored'})
        for i, div in enumerate(eachfood):
            food_jap_name = "No record"
            food_eng_name = "No record"
            food_price = "No record"
            tax_inclusion = "No record"
            description = "No record"
            if div.find('div', {'class': 'small'}):
                food_jap_name = div.find('div', {'class': 'small'}).get_text(" ", strip=True)
            if div.find('h3', {'class': 'huge'}):
                food_eng_name = div.find('h3', {'class': 'huge'}).get_text(" ", strip=True)
            if div.find('h3', {'class': 'huge abit-spacing'}):
                food_eng_name = div.find('h3', {'class': 'huge abit-spacing'}).get_text(" ", strip=True)
            if div.find('p', {'class': 'small spacing'}):
                food_price = div.find('p', {'class': 'small spacing'}).get_text(" ", strip=True)
            if div.find('span', {'class': '-value'}):
                food_price = div.find('span', {'class': '-value'}).get_text(" ", strip=True)
            if div.find('p', {'class': 'text-right small'}):
                tax_inclusion = div.find('p', {'class': 'text-right small'}).get_text(" ", strip=True)
            if div.find('div', {'class': 'panel -light-silver -in'}):
                description = div.find('div', {'class': 'panel -light-silver -in'}).get_text(" ", strip=True)
            if div.find('div', {'class': 'sake-detail'}):
                description = div.find('div', {'class': 'sake-detail'}).get_text(" ", strip=True)
            print("{\"JpnFoodname\":", end="\"")
            print(safe(food_jap_name), end="\",")
            print("\"EngFoodname\":", end="\"")
            print(safe(food_eng_name), end="\",")
            print("\"Price\":", end="\"")
            print(safe(food_price), end="\",")
            print("\"TaxIncludeExclude\":", end="\"")
            print(safe(tax_inclusion), end="\",")
            print("\"Description\":", end="\"")
            print(safe(description), end="\",")
            if div.find('a') or div.find('img'):
                print("\"ImgUrl\":", end="\"")
                if div.find('img'):
                    food_pic = div.find('img')["src"]
                if div.find('a'):
                    food_pic = div.find('a')['href']
                print(food_pic, end="\"}")
            else:
                print("\"ImgUrl\":\"No record\"", end="}")
            if i != len(eachfood) - 1:
                print(',', end="")


if __name__ == '__main__':
    food_id = 1  # module-level counter updated via `global` in get_shop_link_list
    get_shop_link_list("https://gurunavi.com/en/reg/pf_tokyo/rs/srh/?p=461")

I admit I only glanced at `get_shop_link_list`, but from what I can see you are crawling the site with a recursive approach. I suggest you switch to an iterative one (e.g., create a queue, enqueue links as you discover them, then dequeue and process them one at a time). You should also check that you never visit the same link twice (which, by the way, would blow up your recursive approach). Once the queue is in place, you can spawn several threads that each dequeue and process pages. – GPhilo
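
A minimal sketch of that queue-plus-threads approach (the worker count, the `crawl_worker` name, and the selectors reused from the question are illustrative, not a definitive implementation):

import threading
import queue

import requests
from bs4 import BeautifulSoup

url_queue = queue.Queue()
seen = set()                  # guards against visiting the same link twice
seen_lock = threading.Lock()

def crawl_worker():
    while True:
        url = url_queue.get()
        try:
            resp = requests.get(url)
            if resp.status_code == requests.codes.ok:
                soup = BeautifulSoup(resp.content, "html.parser")
                # ... extract the shop data here, as in the question ...
                next_page = soup.find('a', {'class': 'button -small'})
                if next_page is not None and next_page.has_attr('href'):
                    with seen_lock:
                        if next_page['href'] not in seen:
                            seen.add(next_page['href'])
                            url_queue.put(next_page['href'])
        finally:
            url_queue.task_done()  # mark the job done even if the request failed

url_queue.put("https://gurunavi.com/en/reg/pf_tokyo/rs/srh/?p=461")
for _ in range(8):  # the number of worker threads is configurable
    threading.Thread(target=crawl_worker, daemon=True).start()
url_queue.join()    # block until every queued page has been processed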


I suggest you use **Scrapy**. It is built on Twisted, an asynchronous library, so you can crawl large amounts of data. – Pablo
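
For reference, a bare-bones Scrapy spider for the same listing page might look like the sketch below; the CSS selectors are copied from the question's class names and may not match the live site exactly:

import scrapy

class ShopSpider(scrapy.Spider):
    name = "shops"
    start_urls = ["https://gurunavi.com/en/reg/pf_tokyo/rs/srh/?p=461"]

    def parse(self, response):
        # Scrapy schedules these requests asynchronously on top of Twisted
        for href in response.css("h2.heavy.break-word a::attr(href)").getall():
            yield response.follow(href, callback=self.parse_shop)
        next_page = response.css("a.button.-small::attr(href)").get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)

    def parse_shop(self, response):
        yield {"name": response.css("h1.jumbo.break-word::text").get()}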


Yes, I know Scrapy is much faster, but the site I want to crawl is a bit complicated to handle with it. – Michael

Answer


Here is an outline of what you could do: split the work into separate jobs, put the jobs on a job queue, and start as many processes as you need, passing the job queue to each one.

Each process takes one job at a time from the queue, as long as there are jobs left to process.

This way the number of workers is configurable:

import multiprocessing
import queue  # only needed for the queue.Empty exception

def getData(tasksQ):
    while True:
        try:
            job = tasksQ.get(block=False)
        except queue.Empty:
            break
        else:
            do_work(job)  # placeholder: the actual scraping for one job goes here

tasks = multiprocessing.Queue()

for job in getJob():  # getJob() is a placeholder that yields the jobs to run
    tasks.put(job)

noOfProcesses = 10
processes = []
for i in range(noOfProcesses):
    p = multiprocessing.Process(target=getData, args=(tasks,))
    processes.append(p)

for p in processes:
    p.start()

for p in processes:
    p.join()
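
If you prefer not to manage the queue and the processes by hand, a similar sketch with the standard library's concurrent.futures (assuming do_work and a jobs list as in the outline above) could be:

from concurrent.futures import ProcessPoolExecutor

# map() feeds one job at a time to a pool of 10 worker processes
with ProcessPoolExecutor(max_workers=10) as executor:
    executor.map(do_work, jobs)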

Hope this helps.


It really helped me a lot, thank you. – Michael