import scrapy 
from universities.items import UniversitiesItem 


def clean_full_name(full_name): 
    sp = full_name.split(',') 
    last_name = sp[0].strip() 
    first_name = sp[1].replace('\r\n', '').strip() 
    first_name = ' '.join(first_name.split()[:-1]).strip() 
    return ' '.join([last_name, first_name]) 
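# A worked example with a hypothetical input, inferred from the splitting
# logic above: clean_full_name('Zhao, Wei PhD\r\n') drops the trailing
# credential token and returns 'Zhao Wei'.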


class DerexlUniversity(scrapy.Spider): 
    name = 'drexel_university' 
    allowed_domains = ['drexel.edu'] 
    start_urls = ['http://drexel.edu/search?t=people&q='] 

    def __init__(self):
        self.last_name = ''

    def parse(self, response):
        with open('kw.txt') as file_object:
            last_names = file_object.readlines()

        for ln in ['Chong', 'Zhao']:
            self.last_name = ln.strip()
            print('-----------------------------------------------------')
            print("scraping last name: ", self.last_name)
            query = response.url + self.last_name
            yield scrapy.Request(query, callback=self.parse_item)

    def parse_item(self, response):
        self.logger.info('This is item page %s', self.last_name)
        result_rows = response.xpath('//table//tr[@class="result-row"]')
        result_details = response.xpath('//table//tr[@class="result-details"]')

        for row, detail in zip(result_rows, result_details):
            full_name = row.xpath('.//span[@class="fullname"]/text()').extract_first()
            if full_name:
                full_name = clean_full_name(full_name)
                if self.last_name in full_name.split():
                    item = UniversitiesItem()
                    item['fullname'] = full_name
                    item['university'] = 'Drexel University'
                    try:
                        item['email'] = row.xpath('.//span[@class="email-address"]/a/@href').extract_first()[7:]
                        item['phone'] = row.xpath('.//span[@class="phone-numbers"]/a/@href').extract_first()[4:]

                        person_detail = detail.xpath('.//span[@class="person-detail"]/text()').extract()
                    except ValueError:
                        pass
                    else:
                        person_detail_clean = ', '.join([pd.strip() for pd in person_detail[0].split(',')][1:])
                        item['person_detail'] = person_detail_clean

                    yield item

There are two keywords in the for loop, 'Chong' and 'Zhao'. I am trying to save the results to a CSV file; a new item is yielded on each pass of the for loop inside parse_item. However, only the 'Zhao' results are being saved, and I cannot figure out why. In short: Scrapy is saving only one of the items yielded in the loop.
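For context, a minimal sketch of what universities/items.py presumably looks like; the field names are assumptions inferred from the keys assigned in parse_item above, not the asker's actual file:

import scrapy

# Hypothetical Item definition; field names are guessed from the keys
# assigned in parse_item().
class UniversitiesItem(scrapy.Item):
    fullname = scrapy.Field()
    university = scrapy.Field()
    email = scrapy.Field()
    phone = scrapy.Field()
    person_detail = scrapy.Field()

With an Item like this, the CSV is normally produced with Scrapy's built-in feed exports, e.g. scrapy crawl drexel_university -o results.csv.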


I checked; 'Chong' never returns any results. –


[http://drexel.edu/search?t=people&q=chong](http://drexel.edu/search?t=people&q=chong) has 166 matches. – user8314628

Answer


Your problem is with self.last_name. You should not use a spider-level attribute across responses; use response.meta instead. The following worked for me:

class DerexlUniversity(scrapy.Spider): 
    name = 'drexel_university' 
    allowed_domains = ['drexel.edu'] 
    start_urls = ['http://drexel.edu/search?t=people&q='] 

    def parse(self, response):
        # with open('kw.txt') as file_object:
        #     last_names = file_object.readlines()

        for ln in ['Chong', 'Zhao']:
            last_name = ln.strip()
            print('-----------------------------------------------------')
            print("scraping last name: ", last_name)
            query = response.url + last_name
            # carry the keyword with the request instead of storing it on the spider
            yield scrapy.Request(query, callback=self.parse_item, meta=dict(last_name=last_name))

    def parse_item(self, response):
        # read the per-request value back out of the response
        last_name = response.meta['last_name']
        self.logger.info('This is item page %s', last_name)
        result_rows = response.xpath('//table//tr[@class="result-row"]')
        result_details = response.xpath('//table//tr[@class="result-details"]')

        for row, detail in zip(result_rows, result_details):
            full_name = row.xpath('.//span[@class="fullname"]/text()').extract_first()
            if full_name:
                full_name = clean_full_name(full_name)
                if last_name in full_name.split():
                    item = {}
                    item['fullname'] = full_name
                    item['university'] = 'Drexel University'
                    try:
                        item['email'] = row.xpath('.//span[@class="email-address"]/a/@href').extract_first()[7:]
                        item['phone'] = row.xpath('.//span[@class="phone-numbers"]/a/@href').extract_first()[4:]

                        person_detail = detail.xpath('.//span[@class="person-detail"]/text()').extract()
                    except ValueError:
                        pass
                    else:
                        person_detail_clean = ', '.join([pd.strip() for pd in person_detail[0].split(',')][1:])
                        item['person_detail'] = person_detail_clean

                    yield item

Cool, it works. But I still don't understand why a variable on the spider instance doesn't work. – user8314628


Because by the time parse_item reads self.last_name, the last value assigned to it is 'Zhao'. Scrapy does not run your callbacks in sequence, so you should not share a variable between methods like this. That is why you should use meta, which is passed along with each response. –
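To make that concrete, here is a small standalone sketch (plain Python, no Scrapy, all names made up) that mimics how the scheduler queues every request before any callback runs, and why an attribute on the spider instance therefore holds only its final value:

# Toy model of the scheduling behaviour described above: every "request" is
# queued before any callback runs, so an instance attribute written in the
# loop holds only its final value ('Zhao') by the time callbacks execute.
class ToySpider:
    def __init__(self):
        self.last_name = ''
        self.queue = []   # stands in for Scrapy's scheduler

    def parse(self):
        for ln in ['Chong', 'Zhao']:
            self.last_name = ln                     # shared spider state: overwritten each pass
            self.queue.append(('http://drexel.edu/search?t=people&q=' + ln,
                               {'last_name': ln}))  # per-request "meta": one copy per request

    def parse_item(self, url, meta):
        print(url, '| self.last_name =', self.last_name,
              '| meta last_name =', meta['last_name'])

spider = ToySpider()
spider.parse()                    # all requests are queued first...
for url, meta in spider.queue:    # ...callbacks only run afterwards
    spider.parse_item(url, meta)
# Both lines print self.last_name == 'Zhao', while meta still carries each keyword.

As a side note, newer Scrapy releases (1.7+) also accept a cb_kwargs argument on Request, which delivers such per-request values to the callback as keyword arguments instead of going through response.meta.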