
CrawlSpider does not follow the rules on some websites

I am trying to get started with my first scrapy project and I have run into a strange problem. For some websites my crawler works fine; for others it does not follow the rules for extracting links. I searched on SO and saw that other people have had similar problems, but in their case a malformed allow parameter caused Filtered offsite request errors, which is not happening to me. My log is here: http://pastebin.com/r1pXmeJW (first the failing url, then one that works fine, since I cannot post more than 2 links...).

My spider is controlled by a Python script that uses the scrapy API:

# -*- coding: utf-8 -*- 

from twisted.internet import reactor 
from scrapy.crawler import Crawler 
from scrapy import log, signals 
from scrapy.utils.project import get_project_settings 
from govcrawl.spiders.main_spider import DomainSpider 
import sys, urlparse, re 
from scrapy.contrib.spiders import Rule 
from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor 

args = sys.argv[1].split('§') 
url_id = args[0] 
start_url = args[1] 
url_parts = urlparse.urlparse(start_url) 
allowed_domain = url_parts.netloc 
allowed_path = '/'.join(url_parts.path.split('/')[:-1]) 
cur_state = sys.argv[2] 

spider = DomainSpider(
    start_urls = [start_url],
    allowed_domains = [allowed_domain],
    url_id = url_id,
    cur_state = cur_state,
    rules = (
        Rule(
            LxmlLinkExtractor(
                allow = re.compile(r".*%s.*" % re.escape(allowed_path), re.IGNORECASE),
                allow_domains = [allowed_domain],
                tags = ('a', 'area', 'frame'),
                attrs = ('href', 'src')
            ),
            callback = "parse_items",
            follow = True
        ),
    )
)
settings = get_project_settings() 
crawler = Crawler(settings) 
crawler.signals.connect(reactor.stop, signal = signals.spider_closed) 
crawler.configure() 
crawler.crawl(spider) 
crawler.start() 
log.start() 
reactor.run() 
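For completeness, the script expects the url id and the start url joined by a '§' in its first argument and a state string in its second, so it is invoked as something like python run_spider.py "42§http://www.example.com/agency/" somestate (the script name and the values here are just placeholders).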

And this is my DomainSpider:

import re 
from govcrawl.items import DomainItem 
from scrapy.utils.markup import remove_tags 
from scrapy.contrib.spiders import CrawlSpider 
from scrapy import log 

class DomainSpider(CrawlSpider): 
    name = "govcrawl_main" 

    def parse_start_url(self, response):
        return self.parse_items(response)

    def parse_items(self, response):
        pages_done = self.crawler.stats.get_value('downloader/response_count')
        pages_todo = self.crawler.stats.get_value('scheduler/enqueued') - self.crawler.stats.get_value('downloader/response_count')
        log.msg("URL: %s (%s) Crawled %d pages. To Crawl: %d" % (self.start_urls[0], self.url_id, pages_done, pages_todo), spider = self)
        links = []
        for sel in response.xpath('//a'):
            href = sel.xpath('@href').extract()
            if len(href) > 0:
                href = href[0]
                if href.startswith("http"):
                    links.append(href)
        item = DomainItem()
        item["url"] = response.url
        item["text"] = re.sub(r'\s{2,}', ' ', remove_tags(' '.join(response.xpath('//body//text()').extract()))).strip()
        item["links"] = links
        self.crawler.stats.inc_value('pages_crawled')
        yield item

Any idea how to make the crawler follow the rules on the websites where it fails?


With `ipdb` I was able to put a breakpoint here https://github.com/scrapy/scrapy/blob/master/scrapy/contrib/linkextractors/lxmlhtml.py#L97 and see that the content of the `html` body is not correct and contains no links. I **know** the server is sending me the correct html, so the problem lies somewhere between fetching the data and the caller of the link extractor... – Mikk 2014-11-04 22:56:06
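The observation in the comment can also be reproduced without a debugger by running the link extractor by hand inside scrapy shell on a failing url. A minimal sketch, reusing the extractor arguments from the script above (allowed_path and allowed_domain stand in for the values computed there):

# run inside `scrapy shell <failing-url>`, where `response` is already defined
import re
from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor

extractor = LxmlLinkExtractor(
    allow = re.compile(r".*%s.*" % re.escape(allowed_path), re.IGNORECASE),
    allow_domains = [allowed_domain],
    tags = ('a', 'area', 'frame'),
    attrs = ('href', 'src')
)

# on the failing sites this should come back empty even though the raw
# body clearly contains anchors
print(len(extractor.extract_links(response)))
print(response.body.count('href'))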

Answer


It turns out that the pages returning the wrong results had malformed html code with multiple </html> tags, which the lxml parser does not like. Since scrapy does not allow using a different parser with CrawlSpider, I ended up re-implementing a regular Spider object that behaves more or less like a CrawlSpider:

import urlparse, re 
from scrapy import Spider, log 
from bs4 import BeautifulSoup 
from scrapy.http import Request 
from govcrawl.items import DomainItem 

class DomainSimpleSpider(Spider): 
    name = "govcrawl_simple" 

    def parse(self, response):
        pages_done = self.crawler.stats.get_value('downloader/response_count')
        pages_todo = self.crawler.stats.get_value('scheduler/enqueued') - self.crawler.stats.get_value('downloader/response_count')
        log.msg("URL: %s (%s) Crawled %d pages. To Crawl: %d" % (self.start_urls[0], self.url_id, pages_done, pages_todo), spider = self)
        #import ipdb
        #ipdb.set_trace()
        soup = BeautifulSoup(response._body, "html5lib")
        links = []
        for tag in self.tags:
            for a in soup.find_all(tag):
                for attr in self.attrs:
                    if attr in a.attrs:
                        href = a.attrs[attr]
                        if href.startswith("http"):
                            links.append(href)
                        href = urlparse.urljoin(response.url, href)
                        href_parts = urlparse.urlparse(href.replace('\t', '').replace('\r', '').replace('\n', '').replace(' ', '+'))
                        if re.match(self.allow, href_parts.path) and not self.forbidden_extension(href_parts.path):
                            yield Request(href)
        for script in soup(["script", "style"]):
            script.extract()
        item = DomainItem()
        item["url"] = response.url
        #item["text"] = re.sub(r'\s{2,}', ' ', remove_tags(' '.join(response.xpath('//body//text()').extract()))).strip()
        item["text"] = soup.get_text()
        item["links"] = links
        self.crawler.stats.inc_value('pages_crawled')
        yield item

    def forbidden_extension(self, url):
        url = url.lower()
        return url.endswith((
            "pdf", "jpg", "wmv", "avi", "pptx", "gif", "mp3", "mp4", "wav",
            "mov", "ppt", "xls", "doc", "docx", "xlsx", "flv", "wma", "jpeg",
            "png", "odf", "ods", "zip", "gz", "tar", "7z", "rar", "vob"
        ))
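The parser behaviour blamed here can be checked in isolation before swapping spiders. A small sketch with a toy version of the broken markup (whether the lxml tree really drops the trailing links depends on the installed lxml/libxml2 version, which is exactly what this checks; both tree builders must be installed for BeautifulSoup):

from bs4 import BeautifulSoup

# toy markup with a premature </html>, similar to the malformed pages
broken = ("<html><body><a href='http://example.com/a'>a</a></html>"
          "<a href='http://example.com/b'>b</a></body></html>")

# the lxml-based tree may lose everything after the first </html>,
# while html5lib is expected to recover both anchors
print(len(BeautifulSoup(broken, "lxml").find_all("a")))
print(len(BeautifulSoup(broken, "html5lib").find_all("a")))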

This spider can be controlled by the following Python script:

from twisted.internet import reactor 
from scrapy.crawler import Crawler 
from scrapy import log, signals 
from scrapy.utils.project import get_project_settings 
from govcrawl.spiders.simple_spider import DomainSimpleSpider 
import urlparse, re 

start_url = ... 
url_parts = urlparse.urlparse(start_url) 
allowed_domain = url_parts.netloc 
allowed_path = '/'.join(url_parts.path.split('/')[:-1]) 

spider = DomainSimpleSpider(
    start_urls = [start_url], 
    allowed_domains = [allowed_domain], 
    allow = re.compile(r".*%s.*" % re.escape(allowed_path), re.IGNORECASE), 
    tags = ('a', 'area', 'frame'), 
    attrs = ('href', 'src'), 
    response_type_whitelist = [r"text/html", r"application/xhtml+xml", r"application/xml"] 
) 
settings = get_project_settings() 
crawler = Crawler(settings) 
crawler.signals.connect(reactor.stop, signal = signals.spider_closed) 
crawler.configure() 
crawler.crawl(spider) 
crawler.start() 
log.start() 
reactor.run() 

A few things to note:

  1. I use the html5lib parser from BeautifulSoup instead of lxml. html5lib copes well with multiple </html> tags, but it is an external dependency, so you have to install it.

  2. For some reason the mimetype check does not seem to work, so I added a forbidden_extension function to prevent Requests from being created for non-html files, and I had to add another DownloaderMiddleware that makes use of the spider's response_type_whitelist (see Python Scrapy - mimetype based filter to avoid non-text file downloads for the middleware implementation; a rough sketch follows after this list).

  3. It seems that this spider processes the start page twice, but frankly I do not care enough to fix that.
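For reference, the downloader middleware mentioned in point 2 can be as simple as the following sketch (the class name here is made up, and it still has to be enabled under DOWNLOADER_MIDDLEWARES in the project settings; see the linked question for the original implementation):

import re
from scrapy.exceptions import IgnoreRequest

class ResponseTypeFilterMiddleware(object):
    """Drop responses whose Content-Type does not match the spider's whitelist."""

    def process_response(self, request, response, spider):
        whitelist = getattr(spider, 'response_type_whitelist', None)
        if whitelist is None:
            return response
        content_type = response.headers.get('Content-Type', '')
        if any(re.search(pattern, content_type) for pattern in whitelist):
            return response
        # anything else (pdf, images, ...) is silently dropped
        raise IgnoreRequest("blocked content type: %s" % content_type)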
