1
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from asdf.items import AsdfItem
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.loader.processor import TakeFirst
from scrapy.http.request import Request
import scrapy
class ProductLoader(ItemLoader):
default_output_processor = TakeFirst()
class MySpider(BaseSpider):
name = "asdf"
search_text = "midi key synth"
allowed_domains = ["http://www.amazon.com"]
start_urls = ["http://www.amazon.com/s?ie=UTF8&page=1&rh=i%3Aaps%2Ck%3A" + search_text]
def parse(self, response):
#title
view = '//a[contains(@class, "a-link-normal s-access-detail-page a-text-normal")]'
nextPage = '//a[contains(@title, "Next Page")]'
nextPageLink = 'http://www.amazon.com' + response.xpath(nextPage + '/@href').extract()[0]
i = 0
for sel in response.xpath(view):
l = ItemLoader(item=AsdfItem(), selector=sel)
l.add_xpath('title','.//@title')
i+=1
yield l.load_item()
request = Request(nextPageLink, callback=self.parse_page2)
request.meta['item'] = AsdfItem()
yield request
def parse_page2(self, reponse):
#title
view = '//a[contains(@class, "a-link-normal s-access-detail-page a-text-normal")]'
nextPage = '//a[contains(@title, "Next Page")]'
nextPageLink = 'http://www.amazon.com' + response.xpath(nextPage + '/@href').extract()[0]
i = 0
for sel in response.xpath(view):
l = ItemLoader(item=AsdfItem(), selector=sel)
l.add_xpath('title','.//@title')
i+=1
yield l.load_item()
我有一個scrapy bot爬行亞馬遜和尋找標題。爲什麼響應/請求不適用於抓取後續頁面?我通過創建nextPageLink變量並將其推入請求來識別下一頁。爲什麼這不起作用?我怎麼修復它?scrapy請求/響應(爬行到第2,3頁等)
理想情況下,我想抓取所有後續頁面。
下手,你的'允許domains'應該_not_包括協議。嘗試'allowed_domains = ['www.amazon.com']。 – tegancp