I got this code from https://github.com/davidadamojr/TextRank and I am running into the following problem. I tried to apply utf-8 by changing the call to "keyphrases = decode('utf-8').extractKeyphrases(text)", but it fails with:

UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 4: ordinal not in range(128)

Here is the code:
"""
From this paper: http://acl.ldc.upenn.edu/acl2004/emnlp/pdf/Mihalcea.pdf
External dependencies: nltk, numpy, networkx
Based on https://gist.github.com/voidfiles/1646117
"""
import nltk
import itertools
from operator import itemgetter
import networkx as nx
import sys
import os
# apply syntactic filters based on POS tags
def filter_for_tags(tagged, tags=['NN', 'JJ', 'NNP']):
    return [item for item in tagged if item[1] in tags]
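# sanity check (not part of the original repo):
# filter_for_tags([('cat', 'NN'), ('runs', 'VBZ')]) -> [('cat', 'NN')],
# since 'VBZ' is not in the tag whitelist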
def normalize(tagged):
    return [(item[0].replace('.', ''), item[1]) for item in tagged]
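# sanity check (not part of the original repo):
# normalize([('U.S.', 'NNP')]) -> [('US', 'NNP')]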
def unique_everseen(iterable, key=None):
    "List unique elements, preserving order. Remember all elements ever seen."
    # unique_everseen('AAAABBBCCDAABBB') --> A B C D
    # unique_everseen('ABBCcAD', str.lower) --> A B C D
    seen = set()
    seen_add = seen.add
    if key is None:
        for element in itertools.ifilterfalse(seen.__contains__, iterable):
            seen_add(element)
            yield element
    else:
        for element in iterable:
            k = key(element)
            if k not in seen:
                seen_add(k)
                yield element
def lDistance(firstString, secondString):
    "Function to find the Levenshtein distance between two words/sentences - gotten from http://rosettacode.org/wiki/Levenshtein_distance#Python"
    if len(firstString) > len(secondString):
        firstString, secondString = secondString, firstString
    distances = range(len(firstString) + 1)
    for index2, char2 in enumerate(secondString):
        newDistances = [index2 + 1]
        for index1, char1 in enumerate(firstString):
            if char1 == char2:
                newDistances.append(distances[index1])
            else:
                newDistances.append(1 + min((distances[index1], distances[index1 + 1], newDistances[-1])))
        distances = newDistances
    return distances[-1]
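# sanity check (not part of the original repo): the classic example
# lDistance('kitten', 'sitting') == 3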
def buildGraph(nodes):
    "nodes - list of hashables that represents the nodes of the graph"
    gr = nx.Graph()  # initialize an undirected graph
    gr.add_nodes_from(nodes)
    nodePairs = list(itertools.combinations(nodes, 2))
    # add edges to the graph (weighted by Levenshtein distance)
    for pair in nodePairs:
        firstString = pair[0]
        secondString = pair[1]
        levDistance = lDistance(firstString, secondString)
        gr.add_edge(firstString, secondString, weight=levDistance)
    return gr
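# sanity check (not part of the original repo): buildGraph(['cat', 'dog'])
# yields a single edge ('cat', 'dog') with weight lDistance('cat', 'dog') == 3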
def extractKeyphrases(text):
    # tokenize the text using nltk
    wordTokens = nltk.word_tokenize(text)
    # assign POS tags to the words in the text
    tagged = nltk.pos_tag(wordTokens)
    textlist = [x[0] for x in tagged]
    tagged = filter_for_tags(tagged)
    tagged = normalize(tagged)
    unique_word_set = unique_everseen([x[0] for x in tagged])
    word_set_list = list(unique_word_set)
    # this will be used to determine adjacent words in order to construct keyphrases with two words
    graph = buildGraph(word_set_list)
    # pageRank - initial value of 1.0, error tolerance of 0.0001
    calculated_page_rank = nx.pagerank(graph, weight='weight')
    # most important words in descending order of importance
    keyphrases = sorted(calculated_page_rank, key=calculated_page_rank.get, reverse=True)
    # the number of keyphrases returned will be relative to the size of the text (a third of the number of vertices)
    aThird = len(word_set_list) / 3
    keyphrases = keyphrases[0:aThird + 1]
    # take keyphrases with multiple words into consideration as done in the paper -
    # if two words are adjacent in the text and are selected as keywords, join them together
    modifiedKeyphrases = set([])
    dealtWith = set([])  # keeps track of individual keywords that have been joined to form a keyphrase
    i = 0
    j = 1
    while j < len(textlist):
        firstWord = textlist[i]
        secondWord = textlist[j]
        if firstWord in keyphrases and secondWord in keyphrases:
            keyphrase = firstWord + ' ' + secondWord
            modifiedKeyphrases.add(keyphrase)
            dealtWith.add(firstWord)
            dealtWith.add(secondWord)
        else:
            if firstWord in keyphrases and firstWord not in dealtWith:
                modifiedKeyphrases.add(firstWord)
            # if this is the last word in the text, and it is a keyword,
            # it definitely has no chance of being a keyphrase at this point
            if j == len(textlist) - 1 and secondWord in keyphrases and secondWord not in dealtWith:
                modifiedKeyphrases.add(secondWord)
        i = i + 1
        j = j + 1
    return modifiedKeyphrases
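# usage sketch (hypothetical, assumes `text` is already decoded unicode):
# extractKeyphrases(u'Compatibility of systems of linear constraints ...')
# returns a set of single keywords plus joined two-word keyphrases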
def extractSentences(text):
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    sentenceTokens = sent_detector.tokenize(text.strip())
    graph = buildGraph(sentenceTokens)
    calculated_page_rank = nx.pagerank(graph, weight='weight')
    # most important sentences in descending order of importance
    sentences = sorted(calculated_page_rank, key=calculated_page_rank.get, reverse=True)
    # return a ~100 word summary
    summary = ' '.join(sentences)
    summaryWords = summary.split()
    summaryWords = summaryWords[0:101]
    summary = ' '.join(summaryWords)
    return summary
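# usage sketch (hypothetical): extractSentences(u'First sentence. Second sentence. ...')
# returns the top-ranked sentences joined together and truncated to roughly 100 words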
def writeFiles(summary, keyphrases, fileName):
    "outputs the keyphrases and summaries to appropriate files"
    print "Generating output to " + 'keywords/' + fileName
    keyphraseFile = open('keywords/' + fileName, 'w')
    for keyphrase in keyphrases:
        keyphraseFile.write(keyphrase + '\n')
    keyphraseFile.close()
    print "Generating output to " + 'summaries/' + fileName
    summaryFile = open('summaries/' + fileName, 'w')
    summaryFile.write(summary)
    summaryFile.close()
    print "-"
# retrieve each of the articles
articles = os.listdir("articles")
for article in articles:
    print 'Reading articles/' + article
    articleFile = open('articles/' + article, 'r')
    text = articleFile.read()
    keyphrases = decode('utf-8').extractKeyphrases(text)  # <-- my failed attempt at decoding
    summary = extractSentences(text)
    writeFiles(summary, keyphrases, article)
The error:
Reading articles/1.txt
Traceback (most recent call last):
File "C:\Users\DELL\Desktop\python\s\fyp\Relevancy\test\TextRank-master\textrank.py", line 166, in <module>
keyphrases = extractKeyphrases(text).setdefaultencoding("utf-8")
File "C:\Users\DELL\Desktop\python\s\fyp\Relevancy\test\TextRank-master\textrank.py", line 72, in extractKeyphrases
wordTokens = nltk.word_tokenize(text)
File "C:\Python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\__init__.py", line 93, in word_tokenize
return [token for sent in sent_tokenize(text)
File "C:\Python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\__init__.py", line 82, in sent_tokenize
return tokenizer.tokenize(text)
File "C:\Python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 1270, in tokenize
return list(self.sentences_from_text(text, realign_boundaries))
File "C:\Python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 1318, in sentences_from_text
return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
File "C:\Python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 1309, in span_tokenize
return [(sl.start, sl.stop) for sl in slices]
File "C:\Python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 1348, in _realign_boundaries
for sl1, sl2 in _pair_iter(slices):
File "C:\Python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 354, in _pair_iter
prev = next(it)
File "C:\Python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 1324, in _slices_from_text
if self.text_contains_sentbreak(context):
File "C:\Python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 1369, in text_contains_sentbreak
for t in self._annotate_tokens(self._tokenize_words(text)):
File "C:\Python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 1504, in _annotate_second_pass
for t1, t2 in _pair_iter(tokens):
File "C:\Python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 354, in _pair_iter
prev = next(it)
File "C:\Python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 621, in _annotate_first_pass
for aug_tok in tokens:
File "C:\Python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 586, in _tokenize_words
for line in plaintext.split('\n'):
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 4: ordinal not in range(128)
Any ideas? (Sorry for my bad English.)
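Update: from the traceback, the failure happens inside nltk.word_tokenize, so it seems the bytes have to be decoded into unicode before they ever reach nltk, not afterwards. A minimal sketch of the kind of fix I am trying now, assuming the article files are saved as UTF-8 (variable names match the code above):

import codecs

articles = os.listdir("articles")
for article in articles:
    print 'Reading articles/' + article
    # codecs.open decodes while reading, so `text` is a unicode object
    articleFile = codecs.open('articles/' + article, 'r', encoding='utf-8')
    text = articleFile.read()
    articleFile.close()
    keyphrases = extractKeyphrases(text)
    summary = extractSentences(text)
    writeFiles(summary, keyphrases, article)

Presumably writeFiles would then also have to encode the unicode back to bytes before writing, e.g. keyphraseFile.write((keyphrase + '\n').encode('utf-8')), or open the output files with codecs.open as well.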