
I am using Scrapy to crawl a website and store the internal/external links in my item class.

Is there a way to capture a screenshot of each link at the moment it is scraped?

Note: the website sits behind a login/authorization form.

My code (spider.py):

from scrapy.spider import BaseSpider
from scrapy.contrib.spiders.init import InitSpider
from scrapy.http import Request, FormRequest
from scrapy.selector import HtmlXPathSelector
from tutorial.items import DmozItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
import urlparse
from scrapy import log


class MySpider(CrawlSpider):

    items = []
    failed_urls = []
    duplicate_responses = []

    name = 'myspiders'
    allowed_domains = ['someurl.com']
    login_page = 'http://someurl.com/login_form'
    start_urls = ['http://someurl.com/']

    rules = [Rule(SgmlLinkExtractor(deny=('logged_out', 'logout',)),
                  follow=True, callback='parse_start_url')]

    def start_requests(self):
        # Log in before anything else; the crawl proper starts from
        # check_login_response once authentication succeeds.
        yield Request(
            url=self.login_page,
            callback=self.login,
            dont_filter=False
        )

    def login(self, response):
        """Generate a login request."""
        return FormRequest.from_response(
            response,
            formnumber=1,
            formdata={'username': 'username', 'password': 'password'},
            callback=self.check_login_response)

    def check_login_response(self, response):
        """Check the response returned by the login request to see if we
        are successfully logged in.
        """
        if "Logout" in response.body:
            self.log("Successfully logged in. Let's start crawling! :%s" % response, level=log.INFO)
            self.log("Response Url : %s" % response.url, level=log.INFO)

            # start_urls is a list, so yield one Request per entry
            for url in self.start_urls:
                yield Request(url=url)
        else:
            self.log("Bad times :(", level=log.INFO)

    def parse_start_url(self, response):
        # Scrape data from the page
        hxs = HtmlXPathSelector(response)

        self.log('response came in from : %s' % response, level=log.INFO)

        # only harvest links from the one important page
        if response.url == 'http://someurl.com/medical/patient-info':
            self.log('yes I am here', level=log.INFO)

            # extract and deduplicate all link targets on the page
            urls = hxs.select('//a/@href').extract()
            urls = list(set(urls))

            for url in urls:
                self.log('URL extracted : %s' % url, level=log.INFO)

                item = DmozItem()

                if response.status == 404 or response.status == 500:
                    self.failed_urls.append(response.url)
                    self.log('failed_url : %s' % self.failed_urls, level=log.INFO)
                    item['failed_urls'] = self.failed_urls
                else:
                    if url.startswith('http'):
                        if url.startswith('http://someurl.com'):
                            item['internal_link'] = url

                            # Need to capture screenshot of the extracted url here

                            self.log('internal_link :%s' % url, level=log.INFO)
                        else:
                            item['external_link'] = url

                            # Need to capture screenshot of the extracted url here

                            self.log('external_link :%s' % url, level=log.INFO)

                self.items.append(item)

            # urls were already deduplicated above; Items are not hashable,
            # so the original list(set(self.items)) would raise TypeError
            return self.items
        else:
            self.log('did not receive expected response', level=log.INFO)

Update: I am working on a virtual machine (logged in via PuTTY).

Answer


You could take a look at a rendering server like Splash.
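For reference, a minimal sketch of how the spider above might use Splash's render.png HTTP endpoint, assuming a Splash instance is listening on localhost:8050; the helper names request_screenshot and save_screenshot are made up for illustration. At each "Need to capture screenshot" placeholder the spider would yield self.request_screenshot(url) (making parse_start_url yield its items instead of returning the list). Note that Splash fetches the page with its own session, so for pages behind the login form the session cookies would still have to be forwarded somehow.

import urllib

from scrapy.http import Request

# Splash renders the page in a real browser engine and returns a PNG
SPLASH_PNG = 'http://localhost:8050/render.png?url=%s&width=1024'

def request_screenshot(self, url):
    # hypothetical helper method for MySpider: ask Splash to render
    # the extracted url and hand the PNG to save_screenshot
    return Request(SPLASH_PNG % urllib.quote(url, safe=''),
                   callback=self.save_screenshot,
                   dont_filter=True)  # don't let the dupefilter drop repeats

def save_screenshot(self, response):
    # response.body is the raw PNG produced by Splash
    filename = 'screenshot-%s.png' % abs(hash(response.url))
    with open(filename, 'wb') as f:
        f.write(response.body)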


Is it possible to use Selenium in my code to take screenshots of the scraped URLs (which also require authentication)? – user2722127
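A bare-bones sketch of the Selenium route the comment asks about, assuming Firefox is available; on a display-less VM like the one mentioned in the update it would additionally need a virtual display such as Xvfb, or a headless driver like PhantomJS. Selenium keeps its own cookie jar, so the login form has to be submitted again in the browser (the form field names here are assumed from the spider's formdata).

from selenium import webdriver

def capture_screenshot(url, path):
    driver = webdriver.Firefox()
    try:
        # replay the login so the browser session is authenticated
        driver.get('http://someurl.com/login_form')
        driver.find_element_by_name('username').send_keys('username')
        password = driver.find_element_by_name('password')
        password.send_keys('password')
        password.submit()

        # render the extracted link and save a PNG of the viewport
        driver.get(url)
        driver.save_screenshot(path)
    finally:
        driver.quit()

capture_screenshot('http://someurl.com/medical/patient-info', 'patient-info.png')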


Splash looks promising! Thanks, Paul –