2017-10-12 278 views
0
def get_user_data(self,start_url): 
    html = self.session.get(url=start_url,headers=self.headers,cookies=self.cookies).content 
    selector = etree.fromstring(html,etree.HTMLParser(encoding='utf-8')) 
    all_user = selector.xpath('//div[contains(@class,"c") and contains(@id,"M")]') 
    for i in all_user: 
     user_id = i.xpath('./div[1]/a[@class="nk"]/@href')[0] 
     content = i.xpath('./div[1]/span[1]')[0] 
     contents = content.xpath('string(.)') 
     times = i.xpath('./div/span[@class="ct"]/text()')[0] 
     if len(i.xpath('./div[3]')): 
      imgages = i.xpath('./div[2]/a/img/@src') 
      praise_num = i.xpath('./div[3]/a[2]/text()') 
      transmit_num = i.xpath('./div[3]/a[3]/text()') 
     elif len(i.xpath('./div[2]')): 
      imgages = i.xpath('./div[2]/a/img/@src') 
      praise_num = i.xpath('./div[2]/a[3]/text()') 
      transmit_num = i.xpath('./div[2]/a[4]/text()') 
     else : 
      imgages = '' 
      praise_num = i.xpath('./div[1]/a[2]/text()') 
      transmit_num = i.xpath('./div[1]/a[3]/text()') 
     try: 
      if re.search('from',times.encode().decode('utf-8')): 
       month_day, time, device = times.split(maxsplit=2) 
       self.data['mobile_phone'] = device 
      else: 
       time,device = times.split(maxsplit=1) 
       self.data['month_day'] = '' 
      self.data['create_time'] = month_day + ' ' + time 
     except Exception as e: 
      print('failure:',e) 
     self.data['crawl_time'] = datetime.strftime(datetime.now(),'%Y-%m-%d %H:%M:%S') 
     self.data['user_id'] = user_id 
     self.data['contents'] = contents.encode().decode('utf-8').replace('\u200b','') 
     self.data['imgages'] = imgages 
     self.data['praise_num'] = praise_num 
     self.data['transmit_num'] = transmit_num 
    with open('a.txt','a',encoding='utf-8') as f: 
     f.write(json.dumps(self.data)+'\n') 

我試圖抓住每一頁數據並將其保存到data.But我寫錯了,因爲我只在'a.txt'的每個頁面上保存了一塊數據,那麼我該如何編寫才能在'a.txt'中正確保存每一頁數據?Python只保存一行數據

回答

0

寫操作外的for循環這就是爲什麼它只是將最後一次迭代數據保存到文件

with open('a.txt','a',encoding='utf-8') as f: 
    f.write(json.dumps(self.data)+'\n') 
-1

你在循環的每次迭代中覆蓋的self.data各種值。

相反,self.data應該是一個列表。您應該在每次迭代中創建一個新字典,並在末尾將其附加到數據。

self.data = [] 
for i in all_user: 
    values = {} 
    ... 
    values['crawl_time'] = ... 
    values['user_id'] = ... 
    ... 
    self.data.append(values)