2017-05-04 69 views
1

我想從 amazon.com 抓取評論數超過 800 的產品鏈接,但我從「下一頁」按鈕取得的始終是同一個頁面鏈接:它一遍又一遍地返回第 2 頁,而我應該依次得到第 3、4 頁等。(標題:Python Scrapy 一直從「下一頁」按鈕獲得相同的頁面鏈接)

我已經加入了條件判斷:先以逗號拆分(split)評論數字串(例如 1,020)並轉換成整數,再比較它是否大於 800,並以此爲基礎決定是否訪問該頁面。

這裏是代碼

# -*- coding: utf-8 -*- 
import scrapy 
from amazon.items import AmazonItem 
from urlparse import urljoin 


class AmazonspiderSpider(scrapy.Spider): 
    """Spider from the question: follows product links whose review count is >= 800.

    ``parse`` walks the carousel cards of a listing page and requests the
    product page of each qualifying card; ``parse_link`` fills in the
    category on the item carried in ``response.meta``.
    """
    name = "amazonspider" 
    # NOTE(review): Scrapy normally reads DOWNLOAD_DELAY from the project
    # settings or from ``custom_settings``; a bare class attribute with this
    # name presumably has no effect -- confirm against the Scrapy docs.
    DOWNLOAD_DELAY = 1 
    start_urls = ['https://www.amazon.com/s/ref=lp_165993011_nr_n_0?fst=as%3Aoff&rh=n%3A165793011%2Cn%3A%21165795011%2Cn%3A165993011%2Cn%3A2514571011&bbn=165993011&ie=UTF8&qid=1493778423&rnid=165993011'] 


    def parse(self, response): 
     """Yield product-page requests for well-reviewed cards, then the next listing page.

     For every element matching ``SET_SELECTOR`` whose review-count text
     parses to an integer >= 800, a request to the card's first
     ``a-link-normal`` href is yielded with a fresh ``AmazonItem`` in
     ``meta``. A request for the ``pagnNextLink`` href is yielded back
     into this same callback.
     """


     SET_SELECTOR = '.a-carousel-card.acswidget-carousel__card' 
     for attr in response.css(SET_SELECTOR): 
      #print '\n\n', attr 

      item = AmazonItem() 

      # XPath expressions evaluated relative to the current card.
      review_selector = './/*[@class="acs_product-rating__review-count"]/text()' 
      link_selector = './/*[@class="a-link-normal"]/@href' 

      # Skip cards that have no review-count text at all.
      if attr.xpath(review_selector).extract_first(): 
       # Drop thousands separators ("1,020" -> "1020") before converting to int.
       if int(''.join(attr.xpath(review_selector).extract_first().split(','))) >= 800: 
        url = urljoin(response.url, attr.xpath(link_selector).extract_first()) 
        item['LINKS'] = url 
        # NOTE(review): url is the result of urljoin(); check whether it can
        # ever be falsy here, otherwise this guard is always true.
        if url: 
         yield scrapy.Request(url, callback=self.parse_link, meta={'item': item}) 


      # NOTE(review): this pagination block is indented at the loop-body
      # level, so it runs once per carousel card: the same next-page link is
      # printed and requested on every iteration, and not at all when the
      # page has no matching cards.
      next_page = './/span[@class="pagnRA"]/a[@id="pagnNextLink"]/@href' 
      next_page = response.xpath(next_page).extract_first() 
      print '\n\n', urljoin(response.url, next_page) 
      if next_page: 
       yield scrapy.Request(
        urljoin(response.url, next_page), 
        callback=self.parse 
       ) 
    def parse_link(self, response): 
     """Return a copy of the item from ``response.meta`` with its category set.

     The category is the first text node under ``.cat-link``; when that is
     missing, the first text node under ``.nav-search-label`` is used
     instead (which may itself be ``None``).
     """

     # Build a new AmazonItem from the one passed along by ``parse``.
     item = AmazonItem(response.meta['item']) 

     catselector = '.cat-link ::text' 
     defaultcatselector = '.nav-search-label ::text' 
     cat = response.css(catselector).extract_first() 
     if cat: 
      item['CATAGORY'] = cat 
     else: 
      item['CATAGORY'] = response.css(defaultcatselector).extract_first() 
     return item 

這裏是我在遞歸調用 parse 函數之前打印下一頁鏈接時得到的輸出

herehere

這裏是頁面上「下一頁」按鈕選擇器的截圖:here。我在哪裏出錯了?

回答

2

將下一頁代碼塊移到循環之外。它目前位於 for 循環內部,因此每處理一個產品卡片就會重新提取、打印並請求同一個「下一頁」鏈接,所以你看到的輸出一直是第 2 頁;分頁請求每個頁面只需要發出一次。

class AmazonspiderSpider(scrapy.Spider):
    """Collect Amazon product links that have at least ``MIN_REVIEWS`` reviews.

    ``parse`` handles a listing page: it requests the product page of every
    qualifying carousel card and then, once per page, follows the
    "next page" link. ``parse_link`` handles a product page and fills in
    the category of the item passed along in ``response.meta``.
    """

    name = "amazonspider"
    DOWNLOAD_DELAY = 1
    # Scrapy does not read a bare DOWNLOAD_DELAY class attribute; per-spider
    # settings have to be supplied through ``custom_settings``. The attribute
    # above is kept so existing references to it keep working.
    custom_settings = {'DOWNLOAD_DELAY': DOWNLOAD_DELAY}
    start_urls = ['https://www.amazon.com/s/ref=lp_165993011_nr_n_0?fst=as%3Aoff&rh=n%3A165793011%2Cn%3A%21165795011%2Cn%3A165993011%2Cn%3A2514571011&bbn=165993011&ie=UTF8&qid=1493778423&rnid=165993011']

    # Minimum number of reviews a product needs for its page to be visited.
    MIN_REVIEWS = 800

    def parse(self, response):
        """Yield product requests for well-reviewed cards, then the next page.

        Args:
            response: the listing-page response.

        Yields:
            One ``scrapy.Request`` per qualifying product (handled by
            ``parse_link``), followed by at most one request for the next
            listing page (handled by ``parse`` again).
        """
        card_selector = '.a-carousel-card.acswidget-carousel__card'
        review_selector = './/*[@class="acs_product-rating__review-count"]/text()'
        link_selector = './/*[@class="a-link-normal"]/@href'

        for card in response.css(card_selector):
            review_text = card.xpath(review_selector).extract_first()
            href = card.xpath(link_selector).extract_first()
            # Without a link there is nothing to follow; joining a missing
            # href would just hand back the listing URL itself.
            if not review_text or not href:
                continue

            try:
                # "1,020" -> 1020
                review_count = int(review_text.replace(',', ''))
            except ValueError:
                # Unexpected review text: skip this card rather than abort
                # the whole page.
                continue
            if review_count < self.MIN_REVIEWS:
                continue

            item = AmazonItem()
            item['LINKS'] = response.urljoin(href)
            yield scrapy.Request(
                item['LINKS'],
                callback=self.parse_link,
                meta={'item': item},
            )

        # Pagination belongs outside the loop: it must be issued exactly once
        # per listing page, and also when the page has no matching cards.
        next_page = response.xpath(
            './/span[@class="pagnRA"]/a[@id="pagnNextLink"]/@href'
        ).extract_first()
        if next_page:
            next_url = response.urljoin(next_page)
            self.logger.debug('Next page: %s', next_url)
            yield scrapy.Request(next_url, callback=self.parse)

    def parse_link(self, response):
        """Return the item from ``response.meta`` with its category filled in.

        The category is the first text under ``.cat-link``; if that is
        missing, the first text under ``.nav-search-label`` is used (which
        may be ``None``).
        """
        item = AmazonItem(response.meta['item'])
        category = response.css('.cat-link ::text').extract_first()
        if not category:
            category = response.css('.nav-search-label ::text').extract_first()
        item['CATAGORY'] = category
        return item
相關問題