0
我想建立這種爬蟲從Craigslist網站得到住房數據,Scrapy履帶不會遞歸爬行下一頁
,但獲取的第一頁後,履帶停止,不進入下一個頁面。
下面是代碼,它的工作原理爲第一頁,但對上帝的愛我不明白爲什麼它不進入下一個頁面。任何見解是非常感謝。我跟着this part from scrapy tutorial
import scrapy
import re
from scrapy.linkextractors import LinkExtractor
class QuotesSpider(scrapy.Spider):
name = "craigslistmm"
start_urls = [
"https://vancouver.craigslist.ca/search/hhh"
]
def parse_second(self,response):
#need all the info in a dict
meta_dict = response.meta
for q in response.css("section.page-container"):
meta_dict["post_details"]= {
"location":
{"longitude":q.css("div.mapAndAttrs div.mapbox div.viewposting::attr(data-longitude)").extract(),
"latitude":q.css("div.mapAndAttrs div.mapbox div.viewposting::attr(data-latitude)").extract()},
"detailed_info": ' '.join(q.css('section#postingbody::text').extract()).strip()
}
return meta_dict
def parse(self, response):
pattern = re.compile("\/([a-z]+)\/([a-z]+)\/.+")
for q in response.css("li.result-row"):
post_urls = q.css("p.result-info a::attr(href)").extract_first()
mm = re.match(pattern, post_urls)
neighborhood= q.css("p.result-info span.result-meta span.result-hood::text").extract_first()
next_url = "https://vancouver.craigslist.ca/"+ post_urls
request = scrapy.Request(next_url,callback=self.parse_second)
#next_page = response.xpath('.//a[@class="button next"]/@href').extract_first()
#follow_url = "https://vancouver.craigslist.ca/" + next_page
#request1 = scrapy.Request(follow_url,callback=self.parse)
#yield response.follow(next_page,callback = self.parse)
request.meta['id'] = q.css("li.result-row::attr(data-pid)").extract_first()
request.meta['pricevaluation'] = q.css("p.result-info span.result-meta span.result-price::text").extract_first()
request.meta["information"] = q.css("p.result-info span.result-meta span.housing::text").extract_first()
request.meta["neighborhood"] =q.css("p.result-info span.result-meta span.result-hood::text").extract_first()
request.meta["area"] = mm.group(1)
request.meta["adtype"] = mm.group(2)
yield request
#yield scrapy.Request(follow_url, callback=self.parse)
next_page = LinkExtractor(allow="s=\d+").extract_links(response)[0]
# = "https://vancouver.craigslist.ca/" + next_page
yield response.follow(next_page.url,callback=self.parse)
只有將由此中獲取一個鏈接,我都試過DIFF方式,類似於你所提到的,但它沒有奏效。 – Bg1850
但我再試着用你的soln並更新 – Bg1850
它對我有用(至少直到我的IP被阻止.. :-)) –