
I am using Scrapy (0.22) to crawl a website, but Scrapy fails to download the images locally. I need to do three things:

  1. I need the categories and subcategories of the images
  2. I need to download the images and store them locally
  3. I need to store the product category, subcategory and image URL in Mongo

But right now I am stuck: I use a pipeline to download the images, but my code does not work and no images are saved locally.

Also, since I want to store the information in Mongo, can anyone give me some advice on the Mongo schema?

My code is as follows:

settings.py

BOT_NAME = 'tutorial' 

SPIDER_MODULES = ['tutorial.spiders'] 
NEWSPIDER_MODULE = 'tutorial.spiders' 

ITEM_PIPELINES = {'tutorial.pipelines.TutorialPipeline': 1} 
IMAGES_STORE = '/ttt' 

items.py

from scrapy.item import Item, Field 

class TutorialItem(Item):
    # define the fields for your item here like:
    # name = Field()
    catname = Field()
    caturl = Field()
    image_urls = Field()
    images = Field()

pipelines.py

from scrapy.contrib.pipeline.images import ImagesPipeline 
from scrapy.exceptions import DropItem 
from scrapy.http import Request 
from pprint import pprint as pp 

class TutorialPipeline(object):

    def get_media_requests(self, item, info):
        pp('**********************===================*******************')
        for image_url in item['image_urls']:
            yield Request(image_url)

spider.py

from scrapy.http import Request
from scrapy.selector import Selector
from scrapy.spider import Spider

from tutorial.items import TutorialItem

class BaiduSpider(Spider):
    name = 'baidu'
    start_urls = [
        'http://giphy.com/categories'
    ]

    domain = 'http://giphy.com'

    def parse(self, response):
        selector = Selector(response)

        topCategorys = selector.xpath('//div[@id="None-list"]/a')

        items = []
        for tc in topCategorys:
            item = TutorialItem()
            item['catname'] = tc.xpath('./text()').extract()[0]
            item['caturl'] = tc.xpath('./@href').extract()[0]
            if item['catname'] == u'ALL':
                continue
            reqUrl = self.domain + '/' + item['caturl']
            yield Request(url=reqUrl, meta={'caturl': reqUrl},
                          callback=self.getSecondCategory)

    def getSecondCategory(self, response):
        selector = Selector(response)

        secondCategorys = selector.xpath('//div[@class="grid_9 omega featured-category-tags"]/div/a')

        items = []
        for sc in secondCategorys:
            item = TutorialItem()
            item['catname'] = sc.xpath('./div/h4/text()').extract()[0]
            item['caturl'] = sc.xpath('./@href').extract()[0]
            items.append(item)

            reqUrl = self.domain + item['caturl']
            yield Request(url=reqUrl, meta={'caturl': reqUrl},
                          callback=self.getImages)

    def getImages(self, response):
        selector = Selector(response)

        images = selector.xpath('//*[contains(@class, "hoverable-gif")]')

        items = []
        for image in images:
            item = TutorialItem()
            item['image_urls'] = image.xpath('./a/figure/img/@src').extract()[0]
            items.append(item)

        return items

Also, there is no error output; it is exactly as follows:

2014-12-21 13:49:56+0800 [scrapy] INFO: Enabled item pipelines: TutorialPipeline 
2014-12-21 13:49:56+0800 [baidu] INFO: Spider opened 
2014-12-21 13:49:56+0800 [baidu] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min) 
2014-12-21 13:49:56+0800 [scrapy] DEBUG: Telnet console listening on 0.0.0.0:6023 
2014-12-21 13:49:56+0800 [scrapy] DEBUG: Web service listening on 0.0.0.0:6080 
2014-12-21 13:50:07+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com/categories> (referer: None) 
2014-12-21 13:50:08+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/science/> (referer: http://giphy.com/categories) 
2014-12-21 13:50:08+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/sports/> (referer: http://giphy.com/categories) 
2014-12-21 13:50:08+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/news-politics/> (referer: http://giphy.com/categories) 
2014-12-21 13:50:09+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/transportation/> (referer: http://giphy.com/categories) 
2014-12-21 13:50:09+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/interests/> (referer: http://giphy.com/categories) 
2014-12-21 13:50:09+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/memes/> (referer: http://giphy.com/categories) 
2014-12-21 13:50:09+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/tv/> (referer: http://giphy.com/categories) 
2014-12-21 13:50:09+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/gaming/> (referer: http://giphy.com/categories) 
2014-12-21 13:50:10+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/nature/> (referer: http://giphy.com/categories) 
2014-12-21 13:50:10+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/emotions/> (referer: http://giphy.com/categories) 
2014-12-21 13:50:10+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/movies/> (referer: http://giphy.com/categories) 
2014-12-21 13:50:10+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/holiday/> (referer: http://giphy.com/categories) 
2014-12-21 13:50:11+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/reactions/> (referer: http://giphy.com/categories) 
2014-12-21 13:50:11+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/music/> (referer: http://giphy.com/categories) 
2014-12-21 13:50:11+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/decades/> (referer: http://giphy.com/categories) 
2014-12-21 13:50:12+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com/search/the-colbert-report/> (referer: http://giphy.com//categories/news-politics/) 
2014-12-21 13:50:12+0800 [baidu] DEBUG: Scraped from <200 http://giphy.com/search/the-colbert-report/> 
    {'image_urls': u'http://media1.giphy.com/media/2BDLDXFaEiuBy/200_s.gif'} 
2014-12-21 13:50:12+0800 [baidu] DEBUG: Scraped from <200 http://giphy.com/search/the-colbert-report/> 
    {'image_urls': u'http://media2.giphy.com/media/WisjAI5QGgsrC/200_s.gif'} 
2014-12-21 13:50:12+0800 [baidu] DEBUG: Scraped from <200 http://giphy.com/search/the-colbert-report/> 
    {'image_urls': u'http://media3.giphy.com/media/ZgDGEMihlZXCo/200_s.gif'} 
............. 

Please provide the errors/results you are getting. It will help answerers better triage your code. – kartikg3


@kartikg3 There are no errors; everything seems to work fine. –

Answer


As far as I can see, there is no need to override ImagesPipeline at all, since you are not modifying its behavior; you could simply enable the stock one (see the settings sketch after the list below). But since you did override it, you should do it properly.
When overriding ImagesPipeline, two methods should be overridden:

  • get_media_requests(item, info) returns a Request for each URL in image_urls. This part you did correctly.

  • item_completed(results, item, info) is called when all the image requests for a single item have completed (finished downloading, or failed for some reason). From the official documentation:

    The item_completed() method must return the output that will be sent to subsequent item pipeline stages, so you must return (or drop) the item, as you do in any pipeline.
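
If you truly do not need custom behavior, a minimal alternative sketch (untested; it reuses the Scrapy 0.22 import path already used in your pipelines.py) is to enable the built-in pipeline directly in settings.py:

ITEM_PIPELINES = {'scrapy.contrib.pipeline.images.ImagesPipeline': 1}
IMAGES_STORE = '/ttt'  # must point to an existing, writable directory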

So, to make your custom images pipeline work, you need to override the item_completed() method, like this:

def item_completed(self, results, item, info): 
    image_paths = [x['path'] for ok, x in results if ok] 
    if not image_paths: 
     raise DropItem("Item contains no images") 
    item['image_paths'] = image_paths 
    return item 
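
Put together, the custom pipeline would look roughly like this (an untested sketch). Two things to note: the class must subclass ImagesPipeline (your TutorialPipeline subclasses object, so the image-downloading machinery never runs, which is why nothing is saved locally), and TutorialItem needs an image_paths = Field() for the assignment above to work:

from scrapy.contrib.pipeline.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy.http import Request

class TutorialPipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        # schedule one download request per image URL
        for image_url in item['image_urls']:
            yield Request(image_url)

    def item_completed(self, results, item, info):
        # keep only the paths of the successfully downloaded images
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item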

Furthermore, there are other problems in your code that prevent it from working as expected:

  1. You are not actually creating any useful items.
    If you look at your parse() and getSecondCategory() functions, you will notice that you neither return nor yield any items. Although you have the items list ready, apparently intending to use it to store items, it is never used to actually pass the items further along the processing path. At one point you simply yield a Request for the next page, and when the function finishes, your items are discarded.

  2. You are not using the caturl information passed through the meta dictionary. You pass it in both parse() and getSecondCategory(), but you never collect it in the callback functions, so it is ignored as well.

So basically the only thing that works is the image pipeline, once you fix it as suggested above. To fix the remaining issues, follow the guideline below (bear in mind it is not tested; it is just a guideline for your consideration):

def parse(self, response):
    selector = Selector(response)
    topCategorys = selector.xpath('//div[@id="None-list"]/a')

    for tc in topCategorys:
        # no need to create the item just yet,
        # only get the category and the url so we can
        # continue the work in our callback
        catname = tc.xpath('./text()').extract()[0]
        caturl = tc.xpath('./@href').extract()[0]
        if catname == u'ALL':
            continue
        reqUrl = self.domain + '/' + caturl

        # pass the category name in the meta so we can retrieve it
        # from the response in the callback function
        yield Request(url=reqUrl, meta={'catname': catname},
                      callback=self.getSecondCategory)

def getSecondCategory(self, response):
    selector = Selector(response)
    secondCategorys = selector.xpath('//div[@class="grid_9 omega featured-category-tags"]/div/a')

    # retrieve the category name from the response
    # meta dictionary, which was copied from our request
    catname = response.meta['catname']

    for sc in secondCategorys:
        # still no need to create the item,
        # since we are just trying to get to
        # the subcategory
        subcatname = sc.xpath('./div/h4/text()').extract()[0]
        subcaturl = sc.xpath('./@href').extract()[0]

        reqUrl = self.domain + '/' + subcaturl

        # this time pass both the category and the subcategory
        # so we can read them both in the callback function
        yield Request(url=reqUrl, meta={'catname': catname, 'subcatname': subcatname},
                      callback=self.getImages)

def getImages(self, response):
    selector = Selector(response)

    # retrieve the category and subcategory names
    catname = response.meta['catname']
    subcatname = response.meta['subcatname']

    images = selector.xpath('//*[contains(@class, "hoverable-gif")]')

    for image in images:
        # now could be a good time to create the items
        item = TutorialItem()

        # fill in the item's category information. You can concatenate
        # the category and subcategory if you like, or you can
        # add another field in your TutorialItem called subcatname
        item['catname'] = catname + ":" + subcatname
        # or alternatively:
        # item['catname'] = catname
        # item['subcatname'] = subcatname

        # image_urls must be a list, so use extract() rather than extract()[0]
        item['image_urls'] = image.xpath('./a/figure/img/@src').extract()

        # no need to store the items in a list and return
        # it later; we can just yield the items as they are created
        yield item
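
Finally, regarding the Mongo schema question: a simple structure is one document per image, carrying the category, the subcategory and the image URLs/paths. Below is a minimal pipeline sketch using pymongo (untested; the database and collection names are assumptions). Register it in ITEM_PIPELINES with a higher order than the images pipeline, e.g. {'tutorial.pipelines.TutorialPipeline': 1, 'tutorial.pipelines.MongoPipeline': 2}, so that image_paths is already filled in when the item arrives:

import pymongo

class MongoPipeline(object):

    def __init__(self):
        # connection details are assumptions - adjust to your setup
        client = pymongo.MongoClient('localhost', 27017)
        self.collection = client['giphy']['images']

    def process_item(self, item, spider):
        # one document per item: catname, image_urls and image_paths
        self.collection.insert(dict(item))
        return item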