1
我試圖使用 Scrapy 從本網站 transparentnevada.com 抓取所有的 CSV 檔案。
當您導航到某個具體機構的頁面,例如 http://transparentnevada.com/salaries/2016/university-nevada-reno/,並點擊 Download Records 時,頁面上會有多個指向 CSV 檔案的鏈接。我想下載所有這些 CSV。
我的爬蟲可以運行,看起來也抓取到了所有記錄,但沒有下載任何檔案:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http import Request
class Spider2(CrawlSpider):
    """Crawl transparentnevada.com and download every salary CSV linked
    from the 2016 agency pages.

    Flow: start at the /salaries/all/ index, follow links into the 2016
    agency pages, extract the ``.csv`` download links from each agency
    page, and save each CSV body to a local file.
    """

    # Name used to invoke the spider (``scrapy crawl nevada``).
    name = 'nevada'
    # Restrict the crawl to this domain.
    allowed_domains = ['transparentnevada.com']
    # Entry point for the crawl.
    start_urls = ['http://transparentnevada.com/salaries/all/']

    # NOTE: ``LinkExtractor(allow=...)`` patterns are regular expressions,
    # not shell globs.  The original ``'/salaries/all/*'`` merely repeated
    # the trailing slash, and ``'/salaries/2016/*/#'`` could never match:
    # Scrapy strips the ``#fragment`` from extracted links, so the rule
    # that carried ``callback='parse_article'`` never fired and nothing
    # was downloaded.  The callback now lives on the agency-page rule.
    rules = [
        # Follow index / pagination pages under /salaries/all/.
        Rule(LinkExtractor(allow=[r'/salaries/all/']),
             follow=True),
        # 2016 agency pages: harvest the CSV links here.
        Rule(LinkExtractor(allow=[r'/salaries/2016/[^/]+/']),
             callback='parse_article',
             follow=True),
    ]

    # Feed settings only affect yielded items; the CSVs themselves are
    # written directly to disk in save_pdf below.
    custom_settings = {
        'FEED_FORMAT': 'csv',
        'FEED_URI': 'tmp/nevada2.csv',
    }

    def parse_article(self, response):
        """Yield a Request for every ``.csv`` link on an agency page."""
        for href in response.css(
                'div.view-downloads a[href$=".csv"]::attr(href)').extract():
            yield Request(
                url=response.urljoin(href),
                callback=self.save_pdf,
            )

    def save_pdf(self, response):
        """Write the downloaded CSV body to a file named after the URL."""
        # Fall back to a fixed name if the URL ends with '/'
        # (split would otherwise produce an empty filename).
        path = response.url.split('/')[-1] or 'download.csv'
        self.logger.info('Saving CSV %s', path)
        with open(path, 'wb') as f:
            f.write(response.body)
日誌呢?請在 pastebin 上創建日誌並發布鏈接 –