2016-11-15 63 views
0

以下是我用來刮取產品信息的代碼。網頁上有很多產品。我把它們都刮掉了,然後轉到下一頁。問題在於scrapy只是選擇頁面上的第一個產品,而不是遍歷頁面上的所有產品。我哪裏錯了?使用scrapy颳去不同的產品信息

import re 
import time 
import sys 
from scrapy.spider import BaseSpider 
from scrapy.selector import Selector 
from scrapy.http import Request 
import parsedatetime 
from datetime import datetime 
from airline_sentiment.items import * 
from airline_sentiment.spiders.crawlerhelper import * 

class TripAdvisorRestaurantBaseSpider(BaseSpider): 
    name = "shoebuy" 

    allowed_domains = ["shoebuy.com"] 
    base_uri = "http://www.shoebuy.com" 
    start_urls = [ 
       base_uri + "/womens-leather-boots/category_2493?cm_sp=cat-_-d_womensboots_tiles_b1_leather-_-092216" 
       ] 


    def parse(self, response): 

     sel = Selector(response) 

     snode_airline = sel.xpath('//*[starts-with(@class, "pt_grid")]/div[starts-with(@class, "pt_product\")]') 

     for snode_restaurant in snode_airline: 
      tripadvisor_item = AirlineSentimentItem() 

      tripadvisor_item['url'] = self.base_uri + clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "pt_info")]/a/@href')) 

      tripadvisor_item['name'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "pt_info")]/a/span[@class="pt_title"]/text()')) 
      tripadvisor_item['price'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "pt_prices")]/span[@class="pt_price"]/text()')) 
      tripadvisor_item['discount'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "pt_prices")]/div[@class="pt_discount"]/span[@class="pt_percent_off"]/text()')) 
      tripadvisor_item['orig_price'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "pt_prices")]/div[@class="pt_discount"]/span[@class="pt_price_orig"]/text()')) 
      tripadvisor_item['stars'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//*[@class="bv-rating-ratio"]/span/span[3]/text()')) 
      tripadvisor_item['reviews'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "bv-inline-rating-container")]/dl/dd[2]/span/text()')) 

      yield Request(url=tripadvisor_item['url'], meta={'tripadvisor_item': tripadvisor_item}, callback=self.parse_fetch_review) 


     next_page_url = clean_parsed_string(get_parsed_string(sel, '//div[@class="paging"]/a[@class="next"]/@href')) 
     if next_page_url and len(next_page_url) > 0: 
      yield Request(url=self.base_uri + next_page_url, meta={'tripadvisor_item': tripadvisor_item}, callback=self.parse_next_page) 

    def parse_next_page(self, response): 
     sel = Selector(response) 

     snode_airline = sel.xpath('//*[starts-with(@class, "pt_grid")]/div[starts-with(@class, "pt_product")]') 

     for snode_restaurant in snode_airline: 

      tripadvisor_item = AirlineSentimentItem() 

      tripadvisor_item['url'] = self.base_uri + clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "pt_info")]/a/@href')) 
      tripadvisor_item['name'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "pt_info")]/a/span[@class="pt_title"]/text()')) 
      tripadvisor_item['price'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "pt_prices")]/span[@class="pt_price"]/text()')) 
      tripadvisor_item['discount'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "pt_prices")]/div[@class="pt_discount"]/span[@class="pt_percent_off"]/text()')) 
      tripadvisor_item['orig_price'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "pt_prices")]/div[@class="pt_discount"]/span[@class="pt_price_orig"]/text()')) 
      tripadvisor_item['stars'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//*[@class="bv-rating-ratio"]/span/span[3]/text()')) 
      tripadvisor_item['reviews'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "bv-inline-rating-container")]/dl/dd[2]/span/text()')) 

      yield Request(url=tripadvisor_item['url'], meta={'tripadvisor_item': tripadvisor_item}, callback=self.parse_fetch_review) 

     next_page_url = clean_parsed_string(get_parsed_string(sel, '//div[@class="paging"]/a[@class="next"]/@href')) 
     if next_page_url and len(next_page_url) > 0: 
      yield Request(url=self.base_uri + next_page_url, meta={'tripadvisor_item': tripadvisor_item}, callback=self.parse_next_page) 

    def parse_fetch_review(self, response): 

     tripadvisor_item = response.meta['tripadvisor_item'] 
     sel = Selector(response) 

     snode_reviews = sel.xpath('//*[starts-with(@class, "product_info_wrapper")]') 

     for snode_review in snode_reviews: 

      tripadvisor_item['img'] = self.base_uri + clean_parsed_string(get_parsed_string(snode_review, '//div[starts-with(@class,"large_thumb")]/img/@src')) 

      tripadvisor_item['desc'] = clean_parsed_string(get_parsed_string(snode_review, '//*[starts-with(@class,"product_information")]/div[1]/span/text()')) 

      tripadvisor_item['brand'] = clean_parsed_string(get_parsed_string(snode_review, '//div[starts-with(@class,"seo_module")]/h3/text()')) 

     yield tripadvisor_item 

回答

0

這是故障線路:

 tripadvisor_item['url'] = self.base_uri + clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "pt_info")]/a/@href')) 

的XPath與..//div開始表示相對節點:

'.//div[starts-with(@class, "pt_info")]/a/@href' 

。因爲沒有相對的XPath使到你的節點(用'。'表示法),你總是在頁面上獲得第一個產品鏈接,作爲每個項目的URL。現在,scrapy具有自動重複URL過濾器,所以發生的事情是您的所有檢索評論請求都會在稍後過濾出來,而您最終得到的只是第一個項目。

鉈;博士:只是.之前你//在親戚的XPath補充。

+0

完美的工作。謝謝。此外,我沒有得到明星和評論價值(我得到None)。我不知道爲什麼我給出的xpath不起作用。如果我能爲它找到一些解決方案,那將是非常好的。 –

+0

@NeelShah它發生是因爲星號和評論是由一些JavaScript調用(ajax)生成的,scrapy不執行任何JavaScript。也許你應該爲此打開一個新的問題,因爲它與當前的無關。 – Granitosaurus