
UnicodeEncodeError in urllib2

I'm getting a UnicodeEncodeError while crawling Wikipedia and dumping pages as JSON files. Here are my code snippet and the error message. It looks like the character 'é' is causing the problem, but I don't know how to fix it.

import urllib2
import json

# List of philosopher's names: mergel list
# print mergel
i = 0
for name in mergel:
    # Use the API to get the page content in a format that we like.
    # https://en.wikipedia.org/w/api.php?action=query&titles=Spider-Man&prop=revisions&rvprop=content&format=json
    # set the parameters (https://www.mediawiki.org/wiki/API:Tutorial)
    i = i + 1
    baseurl = "https://en.wikipedia.org/w/api.php?"
    action = "action=query"
    titlename = name.replace(" ", "_")
    print titlename
    title = "titles=" + titlename
    content = "prop=revisions&rvprop=content"
    dataformat = "format=json"

    # construct the query
    query = "%s%s&%s&%s&%s" % (baseurl, action, title, content, dataformat)
    print query
    wikiresponse = urllib2.urlopen(query)
    wikisource = wikiresponse.read()
    # print wikisource
    wikijson = json.loads(wikisource)
    jsonfilename = './json/' + titlename + '.json'
    with open(jsonfilename, 'w') as outfile:
        json.dump(wikijson, outfile)

Error message:

Tenzin_Gyatso 
https://en.wikipedia.org/w/api.php?action=query&titles=Tenzin_Gyatso&prop=revisions&rvprop=content&format=json 
Claude_Lévi-Strauss 
https://en.wikipedia.org/w/api.php?action=query&titles=Claude_Lévi-Strauss&prop=revisions&rvprop=content&format=json 
--------------------------------------------------------------------------- 
UnicodeEncodeError      Traceback (most recent call last) 
<ipython-input-203-8430fc805550> in <module>() 
    21  query = "%s%s&%s&%s&%s" % (baseurl, action, title, content, dataformat) 
    22  print query 
---> 23  wikiresponse = urllib2.urlopen(query) 
    24  wikisource = wikiresponse.read() 
    25 #  print wikisource 

/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in urlopen(url, data, timeout, cafile, capath, cadefault, context) 
    152  else: 
    153   opener = _opener 
--> 154  return opener.open(url, data, timeout) 
    155 
    156 def install_opener(opener): 

/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in open(self, fullurl, data, timeout) 
    429    req = meth(req) 
    430 
--> 431   response = self._open(req, data) 
    432 
    433   # post-process response 

/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in _open(self, req, data) 
    447   protocol = req.get_type() 
    448   result = self._call_chain(self.handle_open, protocol, protocol + 
--> 449         '_open', req) 
    450   if result: 
    451    return result 

/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in _call_chain(self, chain, kind, meth_name, *args) 
    407    func = getattr(handler, meth_name) 
    408 
--> 409    result = func(*args) 
    410    if result is not None: 
    411     return result 

/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in https_open(self, req) 
    1238   def https_open(self, req): 
    1239    return self.do_open(httplib.HTTPSConnection, req, 
-> 1240     context=self._context) 
    1241 
    1242   https_request = AbstractHTTPHandler.do_request_ 

/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in do_open(self, http_class, req, **http_conn_args) 
    1192 
    1193   try: 
-> 1194    h.request(req.get_method(), req.get_selector(), req.data, headers) 
    1195   except socket.error, err: # XXX what error? 
    1196    h.close() 

/Users/sundong/anaconda/lib/python2.7/httplib.pyc in request(self, method, url, body, headers) 
    1051  def request(self, method, url, body=None, headers={}): 
    1052   """Send a complete request to the server.""" 
-> 1053   self._send_request(method, url, body, headers) 
    1054 
    1055  def _set_content_length(self, body, method): 

/Users/sundong/anaconda/lib/python2.7/httplib.pyc in _send_request(self, method, url, body, headers) 
    1091   for hdr, value in headers.iteritems(): 
    1092    self.putheader(hdr, value) 
-> 1093   self.endheaders(body) 
    1094 
    1095  def getresponse(self, buffering=False): 

/Users/sundong/anaconda/lib/python2.7/httplib.pyc in endheaders(self, message_body) 
    1047   else: 
    1048    raise CannotSendHeader() 
-> 1049   self._send_output(message_body) 
    1050 
    1051  def request(self, method, url, body=None, headers={}): 

/Users/sundong/anaconda/lib/python2.7/httplib.pyc in _send_output(self, message_body) 
    891    msg += message_body 
    892    message_body = None 
--> 893   self.send(msg) 
    894   if message_body is not None: 
    895    #message_body was not a string (i.e. it is a file) and 

/Users/sundong/anaconda/lib/python2.7/httplib.pyc in send(self, data) 
    867     datablock = data.read(blocksize) 
    868   else: 
--> 869    self.sock.sendall(data) 
    870 
    871  def _output(self, s): 

/Users/sundong/anaconda/lib/python2.7/ssl.pyc in sendall(self, data, flags) 
    719    count = 0 
    720    while (count < amount): 
--> 721     v = self.send(data[count:]) 
    722     count += v 
    723    return amount 

/Users/sundong/anaconda/lib/python2.7/ssl.pyc in send(self, data, flags) 
    685      self.__class__) 
    686    try: 
--> 687     v = self._sslobj.write(data) 
    688    except SSLError as x: 
    689     if x.args[0] == SSL_ERROR_WANT_READ: 

UnicodeEncodeError: 'ascii' codec can't encode character u'\xe9' in position 43: ordinal not in range(128) 

However, the following simple, direct code, which does not take the title from a list, works without any problem:

import urllib2 
import json 
query = 'https://en.wikipedia.org/w/api.php?action=query&titles=Claude_Lévi-Strauss&prop=revisions&rvprop=content&format=json' 
wikiresponse = urllib2.urlopen(query) 
wikisource = wikiresponse.read() 
wikijson = json.loads(wikisource) 
jsonfilename = './json/'+'Claude_Lévi-Strauss'+'.json' 
with open(jsonfilename, 'w') as outfile: 
    json.dump(wikijson, outfile) 

I found that my code works after adding the line 'title = title.encode('utf-8')' right after 'title = "titles=" + titlename'. However, I don't understand why that makes it work. – SUNDONG
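For reference, here is a minimal sketch of the fix described in the comment above, using the variable names from the question's loop; only the encode() line is new. Encoding the Unicode title to UTF-8 bytes before it is concatenated into the URL means urllib2 never has to ASCII-encode u'é' (the answer below shows a more robust approach using quote()):

titlename = name.replace(" ", "_")
title = "titles=" + titlename
title = title.encode('utf-8')  # turn the Unicode string into UTF-8 bytes
                               # so the query stays a plain bytestring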


While it's not an answer to your question: using urllib2 to perform HTTP requests is not very pythonic. I suggest using [requests](http://www.python-requests.org/en/latest/) to make your life easier, and it may well solve your problem as a side effect. Regarding your actual problem, you might need to encode your title using the 'idna' encoding, see https://docs.python.org/2/library/codecs.html#python-specific-encodings – Michael Aquilina


@MichaelAquilina: 'idna' has nothing to do with it: the en.wikipedia.org domain name is pure ASCII. Also, there is no need to use 'requests' to make a simple http request here. The issue is probably the same as in [how to POST non-ASCII characters using httplib when content-type is "application/xml"](http://stackoverflow.com/q/7993175/4279), i.e., mixing bytestrings and Unicode strings. – jfs

Answer


Do not mix Unicode and bytestrings: use Unicode strings to work with text in Python.

Do not create URLs by hand; use urllib functions such as quote() and urlencode(). Also consider the urlparse module functions, e.g., urljoin() and urlunsplit().
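For instance, here is a minimal sketch (not part of the answer's code below) of building the same query with urllib.urlencode(), which percent-encodes every parameter for you; the parameter names and values are taken from the question:

# -*- coding: utf-8 -*-
from urllib import urlencode

name = u"Claude Lévi-Strauss"
params = urlencode({
    'action': 'query',
    'prop': 'revisions',
    'rvprop': 'content',
    'format': 'json',
    'titles': name.encode('utf-8'),  # pass UTF-8 bytes so they can be percent-encoded
})
query = "https://en.wikipedia.org/w/api.php?" + params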

You have already requested json format; there is no need to parse it, just dump it immediately in the same format. You can use shutil.copyfileobj() to copy file-like objects. You can inspect the resulting file later to make sure it was downloaded correctly.

Putting it all together, here is how to save a wiki page with a given title to a file in JSON format:

#!/usr/bin/env python2 
# -*- coding: utf-8 -*- 
import os 
from contextlib import closing 
from urllib import quote 
from urllib2 import urlopen 
from shutil import copyfileobj 

def urlretrieve(url, filename, chunksize=8096):
    with closing(urlopen(url)) as response, open(filename, 'wb') as file:
        copyfileobj(response, file, chunksize)

#XXX for name in mergel: 
name = u"Claude Lévi-Strauss" #NOTE: Unicode string 
urlretrieve("https://en.wikipedia.org/w/api.php?" 
      "action=query&prop=revisions&rvprop=content&format=json&" 
      "titles=" + quote(name.encode('utf-8')), 
      os.path.join('json', name + '.json')) 

Notes:

  • You don't need .replace(' ', '_') in this case.

  • The os.path.join('json', name + '.json') line mixes bytestrings ('json', '.json') and Unicode (type(name) == unicode). That is fine here because both 'json' and '.json' are ASCII-only literals in the source code.

  • The # -*- coding: utf-8 -*- encoding declaration only affects how characters that appear literally in your Python source code are interpreted; it is merely incidental that the query string also uses the same encoding in this particular case. The encoding of your source code has nothing to do with the character encodings that may be used for filenames, for transferring data over http, for writing Unicode text to a terminal, etc. (all of these encodings may differ from one another).

  • In principle, you could use urllib.urlretrieve(url, filename) here instead of urlopen + copyfileobj, but urllib.urlretrieve() behaves differently from urllib2.urlopen() in Python 2 (see the short sketch after this list).
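To illustrate that alternative, here is a minimal sketch; as far as I know, the main behavioral difference is that urllib.urlretrieve() does not raise on an HTTP error status such as 404, it simply saves whatever body the server returned, so the resulting file should be checked afterwards:

# -*- coding: utf-8 -*-
import os
from urllib import quote, urlretrieve  # note: urllib, not urllib2

name = u"Claude Lévi-Strauss"
url = ("https://en.wikipedia.org/w/api.php?"
       "action=query&prop=revisions&rvprop=content&format=json&"
       "titles=" + quote(name.encode('utf-8')))
# Unlike urllib2.urlopen(), this does not raise for an HTTP error response;
# inspect the saved file to make sure it contains the expected JSON.
urlretrieve(url, os.path.join('json', name + '.json'))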

Here is the same code using requests:

#!/usr/bin/env python2 
# -*- coding: utf-8 -*-  
import os 
from urllib import quote 
import requests # $ pip install requests 

def urlretrieve(url, filename, chunksize=8096):
    r = requests.get(url, stream=True)
    r.raise_for_status()  # raise on http error
    with open(filename, 'wb') as f:
        for chunk in r.iter_content(chunksize):
            f.write(chunk)

#XXX for name in mergel: 
name = u"Claude Lévi-Strauss" #NOTE: Unicode string 
urlretrieve("https://en.wikipedia.org/w/api.php?" 
      "action=query&prop=revisions&rvprop=content&format=json&" 
      "titles=" + quote(name.encode('utf-8')), 
      os.path.join('json', name + '.json')) 

Regarding "the simple & direct code, which does not take the title from a list, works without any problem":

Your code uses non-ASCII bytestring literals (they are illegal in Python 3). There is no encoding error because all the data are bytes already. The problem with bytestrings is that they break if different environments use different character encodings, and they do differ (you can't expect everything to use utf-8, however desirable that might be). Also, the query part should be properly percent-encoded, e.g., é should be sent as '%C3%A9'.
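A quick illustration of that last point, using urllib.quote() on the UTF-8 bytes of the title (the expected output is shown in the comment):

# -*- coding: utf-8 -*-
from urllib import quote

name = u"Claude Lévi-Strauss"
print quote(name.encode('utf-8'))  # -> Claude%20L%C3%A9vi-Strauss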


Unrelated: to download several pages at once, you can use a thread pool:

from multiprocessing.dummy import Pool # use threads 

def download(name): 
    urlretrieve("https://en.wikipedia.org/w/api.php?" 
       "action=query&prop=revisions&rvprop=content&format=json&" 
       "titles=" + quote(name.encode('utf-8')), 
       os.path.join('json', name + '.json')) 

pool = Pool(4) # download 4 titles concurrently 
for _ in pool.imap_unordered(download, mergel, chunksize=100): 
    pass 

It is polite to set the maxlag query parameter and respect the Retry-After http header. There are several wrappers around the Wikipedia API that can do this for you.
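Here is a rough, illustrative sketch (not from the answer) of what honoring maxlag and Retry-After could look like with requests; the maxlag value of 5 and the retry count are arbitrary choices, and the check assumes the API reports lag via an error object with code 'maxlag':

# -*- coding: utf-8 -*-
import time
import requests  # $ pip install requests

API = "https://en.wikipedia.org/w/api.php"

def polite_get(params, maxlag=5, retries=3):
    """Query the MediaWiki API, backing off when it reports replication lag."""
    params = dict(params, maxlag=maxlag, format='json')
    data = {}
    for _ in range(retries):
        r = requests.get(API, params=params)
        r.raise_for_status()  # raise on plain http errors
        data = r.json()
        if data.get('error', {}).get('code') != 'maxlag':
            return data       # not lagged: done
        # the server is lagged; wait as long as it asks before retrying
        time.sleep(int(r.headers.get('Retry-After', 5)))
    return data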