2017-04-21 360 views
0

嗨,我嘗試使用下面的代碼抓取 digg.com 首頁上的圖像。問題是 0.jpg 到 6.jpg 是正常的,但從 7.jpg 到 47.jpg 全部都是損壞的圖像,不知道爲什麼。

這是代碼。 Github上的位置:https://github.com/kenpeter/py_mm

# os 
import os 
# http request 
import requests 
# 
import pprint 

import time 

# import html from lxml 
from lxml import html 

# Pretty-printer kept at module level for ad-hoc debug printing.
pp = pprint.PrettyPrinter(indent=4)
# Module-level page number, starts at 0.
global_page_num = 0

# write to file 
def download_image(img_urls):
    """Download every URL in ``img_urls`` into ``img/<index>.jpg``.

    Files are numbered by the position of the URL in ``img_urls``, starting
    at 0. A URL whose response has a non-success HTTP status is reported and
    skipped, so an error page is never saved under an image file name.

    Args:
        img_urls: Sized sequence of image URLs to fetch.

    Returns:
        None. Progress is printed to stdout.
    """
    if not img_urls:
        # Nothing to fetch; do not create the output directory either.
        return

    amount = len(img_urls)

    # Every file goes into the same directory, so create it once up front.
    os.makedirs('img', exist_ok=True)

    for index, value in enumerate(img_urls):
        filename = 'img/%s.jpg' % index

        print('--- start ---')
        print('filename: %s' % filename)
        print('Downloading: %s out of %s' % (index, amount))

        # Fetch before opening the file so that a failed request does not
        # leave an empty file behind; the timeout keeps one stalled
        # connection from blocking the whole run.
        response = requests.get(value, timeout=30)
        if not response.ok:
            print('Skipping %s: HTTP status %s' % (value, response.status_code))
            continue

        with open(filename, 'wb') as f:
            f.write(response.content)


def get_page_number(num):
    """Fetch the digg.com front page, download its thumbnails, return their URLs.

    Args:
        num: Page number supplied by the caller. The body does not read it;
            the front page URL is always requested.

    Returns:
        The list of ``src`` attribute values of the thumbnail ``<img>``
        elements, in the order the XPath query yields them.
    """
    page_bytes = requests.get('http://digg.com').content
    tree = html.fromstring(page_bytes)

    thumb_srcs = tree.xpath("//div[@class='digg-story__image--thumb']/a/img/@src")
    # Story descriptions are queried as well but are not used any further.
    descriptions = tree.xpath("//div[@itemprop='description']/text()")

    download_image(thumb_srcs)
    return thumb_srcs


if __name__ == '__main__':
    # The page number is hard-coded for now instead of being read from the
    # user at start-up.
    page_number = 4
    get_page_number(num=page_number)

回答

0

圖像「損壞」的原因是:頁面內的標記方式中途發生了變化,圖像的真實地址開始「藏」在 data-src 屬性中,而不是你的代碼所抓取的 src 屬性中。下面是所抓取網頁源代碼中的一個例子,其中同時包含這兩個屬性:

<img 
class="digg-story__image-img js--digg-story__image-img lazy-image-img need-offset" 
data-src="http://static.digg.com/images/f0b92c2d8a2c4b7f829abbc0e58a408c_2oijd0Z_1_www_large_thumb.jpeg" 
src="http://static.digg.com/static/fe/944294/images/x_455x248.png" 
width="312" 
height="170" 
alt="" 
/> 

換句話說,在創建圖像 URL 列表時,您需要同時檢查 src 和 data-src 這兩個屬性,並讓 data-src 優先於 src。

下面的代碼可以解決這個問題,並下載正確的圖像:

# os 
import os 
# http request 
import requests 
# 
import pprint 

import time 

# import html from lxml 
from lxml import html 

# Pretty-printer kept at module level for ad-hoc debug printing.
pp = pprint.PrettyPrinter(indent=4)
# Module-level page number, starts at 0.
global_page_num = 0

# write to file 
def download_image(img_urls):
    """Download every URL in ``img_urls`` into ``img/<index>.jpg``.

    Files are numbered by the position of the URL in ``img_urls``, starting
    at 0. A URL whose response has a non-success HTTP status is reported and
    skipped, so an error page is never saved under an image file name.

    Args:
        img_urls: Sized sequence of image URLs to fetch.

    Returns:
        None. Progress is printed to stdout.
    """
    if not img_urls:
        # Nothing to fetch; do not create the output directory either.
        return

    amount = len(img_urls)

    # Every file goes into the same directory, so create it once up front.
    os.makedirs('img', exist_ok=True)

    for index, value in enumerate(img_urls):
        filename = 'img/%s.jpg' % index

        print('--- start ---')
        print('filename: %s' % filename)
        print('Downloading: %s out of %s' % (index, amount))

        # Fetch before opening the file so that a failed request does not
        # leave an empty file behind; the timeout keeps one stalled
        # connection from blocking the whole run.
        response = requests.get(value, timeout=30)
        if not response.ok:
            print('Skipping %s: HTTP status %s' % (value, response.status_code))
            continue

        with open(filename, 'wb') as f:
            f.write(response.content)


def get_page_number(num):
    """Fetch the digg.com front page, download its thumbnails, return their URLs.

    Each thumbnail ``<img>`` contributes at most one URL: its ``data-src``
    attribute when present, otherwise its ``src`` attribute. URLs that point
    at the ``x_455x248.png`` placeholder are dropped. The result follows the
    order of the images on the page, so the numbered files written by
    ``download_image`` follow page order too.

    Args:
        num: Page number supplied by the caller. The body does not read it;
            the front page URL is always requested.

    Returns:
        List of resolved image URLs, which has also been passed to
        ``download_image``.
    """
    # File name of the stand-in image that occupies ``src`` on images whose
    # real address is carried in ``data-src``.
    placeholder = 'x_455x248.png'

    # The timeout keeps a stalled connection from hanging the script.
    response = requests.get('http://digg.com', timeout=30).content
    selector = html.fromstring(response)

    img_urls = []
    for img in selector.xpath("//div[@class='digg-story__image--thumb']/a/img"):
        # Resolve per element so data-src wins over src and no image is
        # listed twice.
        url = img.get('data-src') or img.get('src')
        if url and placeholder not in url:
            img_urls.append(url)

    download_image(img_urls)

    return img_urls


if __name__ == '__main__':
    # The page number is hard-coded for now instead of being read from the
    # user at start-up.
    page_number = 4
    get_page_number(num=page_number)