2017-03-06

Scrapy - scraping different web pages in a single Scrapy script

I am building a web application that scrapes a long list of shoes from different websites. Here are my two separate Scrapy scripts:

http://store.nike.com/us/en_us/pw/mens-clearance-soccer-shoes/47Z7puZ896Zoi3

from scrapy import Spider
from scrapy.http import Request


class ShoesSpider(Spider):
    name = "shoes"
    allowed_domains = ["store.nike.com"]
    start_urls = ['http://store.nike.com/us/en_us/pw/mens-clearance-soccer-shoes/47Z7puZ896Zoi3']

    def parse(self, response):
        shoes = response.xpath('//*[@class="grid-item-image-wrapper sprite-sheet sprite-index-0"]/a/@href').extract()
        for shoe in shoes:
            yield Request(shoe, callback=self.parse_shoes)

    def parse_shoes(self, response):
        url = response.url
        name = response.xpath('//*[@itemprop="name"]/text()').extract_first()
        price = response.xpath('//*[@itemprop="price"]/text()').extract_first()
        price = price.replace('$', '')
        shoe_type = response.css('.exp-product-subtitle::text').extract_first()

        sizes = response.xpath('//*[@class="nsg-form--drop-down exp-pdp-size-dropdown exp-pdp-dropdown two-column-dropdown"]/option')
        sizes = sizes.xpath('text()[not(parent::option/@class="exp-pdp-size-not-in-stock selectBox-disabled")]').extract()
        sizes = [s.strip() for s in sizes]
        yield {
            'url': url,
            'name': name,
            'price': price,
            'sizes': sizes,
            'shoe_type': shoe_type
        }

http://www.dickssportinggoods.com/products/clearance-soccer-cleats.jsp

from scrapy import Spider
from scrapy.http import Request


class ShoesSpider(Spider):
    name = "shoes"
    allowed_domains = ["dickssportinggoods.com"]
    start_urls = ['http://www.dickssportinggoods.com/products/clearance-soccer-cleats.jsp']

    def parse(self, response):
        shoes = response.xpath('//*[@class="fplpTitle header4"]/a/@href').extract()
        for shoe in shoes:
            yield Request(shoe, callback=self.parse_shoes)

    def parse_shoes(self, response):
        sizes = response.xpath('//*[@class="swatches clearfix"]/input/@value').extract()
        if sizes == []:
            pass
        url = response.url
        name = response.xpath('.//*[@id="PageHeading_3074457345618261107"]/h1/text()').extract_first()
        price = response.xpath('.//*[@itemprop="price"]/text()').extract_first()
        #shoe_type = response.css('.exp-product-subtitle::text').extract_first()
        yield {
            'url': url,
            'name': name,
            'price': price,
            'sizes': sizes,
            'shoe_type': ''
        }

How can I combine the two of them? I have read the Scrapy documentation and did not see this mentioned; it only covers how to scrape several addresses starting from a root address. Thanks.

Answers


Put both of your domains in allowed_domains and both of your URLs in start_urls, then use a simple if-else to decide which part of the code to execute:

from scrapy import Spider
from scrapy.http import Request


class ShoesSpider(Spider):
    name = "shoes"
    allowed_domains = ["store.nike.com", "dickssportinggoods.com"]
    start_urls = ['http://store.nike.com/us/en_us/pw/mens-clearance-soccer-shoes/47Z7puZ896Zoi3',
                  'http://www.dickssportinggoods.com/products/clearance-soccer-cleats.jsp']

    def parse(self, response):
        # Pick the product-link XPath that matches the site being parsed
        if "store.nike.com" in response.url:
            shoes = response.xpath('//*[@class="grid-item-image-wrapper sprite-sheet sprite-index-0"]/a/@href').extract()
        elif "dickssportinggoods.com" in response.url:
            shoes = response.xpath('//*[@class="fplpTitle header4"]/a/@href').extract()

        for shoe in shoes:
            yield Request(shoe, callback=self.parse_shoes)

    def parse_shoes(self, response):
        url = response.url

        if "store.nike.com" in response.url:
            name = response.xpath('//*[@itemprop="name"]/text()').extract_first()
            price = response.xpath('//*[@itemprop="price"]/text()').extract_first()
            price = price.replace('$', '')
            shoe_type = response.css('.exp-product-subtitle::text').extract_first()

            sizes = response.xpath('//*[@class="nsg-form--drop-down exp-pdp-size-dropdown exp-pdp-dropdown two-column-dropdown"]/option')
            sizes = sizes.xpath('text()[not(parent::option/@class="exp-pdp-size-not-in-stock selectBox-disabled")]').extract()
            sizes = [s.strip() for s in sizes]
            yield {
                'url': url,
                'name': name,
                'price': price,
                'sizes': sizes,
                'shoe_type': shoe_type
            }
        elif "dickssportinggoods.com" in response.url:
            sizes = response.xpath('//*[@class="swatches clearfix"]/input/@value').extract()
            if sizes == []:
                pass  # note: this empty-size check has no effect as written
            name = response.xpath('.//*[@id="PageHeading_3074457345618261107"]/h1/text()').extract_first()
            price = response.xpath('.//*[@itemprop="price"]/text()').extract_first()
            #shoe_type = response.css('.exp-product-subtitle::text').extract_first()

            yield {
                'url': url,
                'name': name,
                'price': price,
                'sizes': sizes,
                'shoe_type': ''
            }
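If it helps, here is a minimal sketch of one way to run the combined spider from a plain Python script and write every yielded item to a single file. The module name shoes_spider.py and the output filename are assumptions, not part of the answer; the FEEDS setting is the newer export mechanism, while older Scrapy versions use FEED_URI/FEED_FORMAT for the same purpose.

# Sketch only: assumes the combined spider above is saved as shoes_spider.py.
from scrapy.crawler import CrawlerProcess

from shoes_spider import ShoesSpider  # hypothetical module containing the spider

process = CrawlerProcess(settings={
    # Newer Scrapy releases: collect all items from both sites in one JSON file.
    # On older versions, use FEED_URI / FEED_FORMAT instead.
    "FEEDS": {"shoes.json": {"format": "json"}},
})
process.crawl(ShoesSpider)
process.start()  # blocks until both start_urls and their product pages are crawled

Inside a Scrapy project, the command-line equivalent is simply scrapy crawl shoes -o shoes.json.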

I see, thank you very much. – tadm123


@tadm123 Great! I did not run your code, so if mine has any syntax errors or other problems, please edit my answer with your complete working code so it can help future readers. – Umair


Just the 'ellif' typo, but you've fixed it now. Thanks for showing how to do this; I have something like 10 different shoe scripts and have been trying to combine them all. – tadm123
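Since the comment above mentions folding roughly ten per-site scripts into one spider, a dispatch-table layout (a sketch, not part of the accepted answer) can keep parse() short as stores are added: each domain maps to its own link XPath and item-parser method. The two XPaths below come from the answer; the helper names (SITES, site_for, parse_nike_shoe, parse_dsg_shoe) are illustrative.

from scrapy import Spider
from scrapy.http import Request


class ShoesSpider(Spider):
    name = "shoes"
    start_urls = [
        'http://store.nike.com/us/en_us/pw/mens-clearance-soccer-shoes/47Z7puZ896Zoi3',
        'http://www.dickssportinggoods.com/products/clearance-soccer-cleats.jsp',
    ]

    # One entry per store: XPath for the product links plus the parser for a product page.
    # New stores are added here instead of growing an if/elif chain.
    SITES = {
        "store.nike.com": {
            "links": '//*[@class="grid-item-image-wrapper sprite-sheet sprite-index-0"]/a/@href',
            "parser": "parse_nike_shoe",
        },
        "dickssportinggoods.com": {
            "links": '//*[@class="fplpTitle header4"]/a/@href',
            "parser": "parse_dsg_shoe",
        },
    }

    def site_for(self, url):
        # Return the config entry whose domain appears in the URL, or None.
        for domain, cfg in self.SITES.items():
            if domain in url:
                return cfg
        return None

    def parse(self, response):
        site = self.site_for(response.url)
        if site is None:
            return
        for shoe in response.xpath(site["links"]).extract():
            yield Request(shoe, callback=getattr(self, site["parser"]))

    def parse_nike_shoe(self, response):
        # The Nike extraction logic from the answer would go here.
        ...

    def parse_dsg_shoe(self, response):
        # The Dick's Sporting Goods extraction logic from the answer would go here.
        ...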


You do not have to specify the allowed_domains variable. If you omit it, the spider has no domain restriction.
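As a small illustration of that point, a spider can be declared with only start_urls; without allowed_domains, Scrapy's OffsiteMiddleware does not filter requests by domain, so links to any host will be followed. This is a sketch, not part of either answer.

from scrapy import Spider


class ShoesSpider(Spider):
    name = "shoes"
    # No allowed_domains attribute: requests are not filtered by domain,
    # so the spider may follow links to any host it encounters.
    start_urls = [
        'http://store.nike.com/us/en_us/pw/mens-clearance-soccer-shoes/47Z7puZ896Zoi3',
        'http://www.dickssportinggoods.com/products/clearance-soccer-cleats.jsp',
    ]

    def parse(self, response):
        # Site-specific extraction (as in the accepted answer) would go here.
        ...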