
Plotting a graph from a list of information in Python

Hey everyone, I'm very new to Python and to coding in general. For an assignment I need to read a txt file, count the words, sort them, and plot them as a graph. I've managed to do everything except the part where I put it all into a graph. Here is my code, along with a sample of the list that stores the items that need to be plotted.

import nltk 
from nltk.tokenize import word_tokenize 
from collections import Counter 

# Read the whole file into a single string 
with open("en.txt") as file:  
    data = file.read() 

# Split the text into tokens (words and punctuation) 
word_tokenize_list = word_tokenize(data) 

# Count how often each token occurs 
counts = Counter(word_tokenize_list) 
print(counts)      

Counter({',': 54224, 'the': 45990, '.': 42529, 'of': 25608, 'to': 24869, 'a': 21351, 'and': 17807, 'in': 17037, "'s": 10335, 'that': 8990, 'for': 8936, '$': 8218, '``': 7733, 'The': 7724, 'is': 7695, "''": 7510, 'said': 6462, 'on': 5718, '%': 5613, 'it': 5177, 'by': 5035, 'from': 4939, 'million': 4883}) 

My list is very big, BTW. All I'm asking for is a hint about what I could use, because plt.plot doesn't work for me in this case.


Stack Overflow is **not** a code-writing service. Please don't ask for code to be written for you. – MarkyPython


I'm not asking for code to be written for me. I'm asking what I should use when I need to plot something like this, because a simple plt.plot doesn't work for me in this case. –


You should clarify your question, then. – MarkyPython

Answers


The most useful plot for this is most likely a bar chart, which can be plotted directly from the dictionary using this answer:

import matplotlib.pyplot as plt 

counts = {',': 54224, 'the': 45990, '.': 42529, 'of': 25608, 'to': 24869, 'a': 21351, 'and': 17807, 'in': 17037, "'s": 10335, 'that': 8990, 'for': 8936, '$': 8218, '``': 7733, 'The': 7724, 'is': 7695, "''": 7510, 'said': 6462, 'on': 5718, '%': 5613, 'it': 5177, 'by': 5035, 'from': 4939, 'million': 4883} 

#Plot bars with the values from the dict and label them with the keys 
plt.bar(range(len(counts)), list(counts.values()), align='center') 
plt.xticks(range(len(counts)), list(counts.keys())) 

#Rotate the labels by 90 degrees so you can read them 
locs, labels = plt.xticks() 
plt.setp(labels, rotation=90) 

plt.show() 

It looks like this:

[bar chart of the token counts]
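
Since the question mentions that the full list is very big, a minimal variation of the same idea is to plot only the top N tokens via Counter.most_common. This is just a sketch: counts below stands in for the Counter built in the question, and top_n is a hypothetical cutoff.

import matplotlib.pyplot as plt 
from collections import Counter 

# Stand-in for the Counter built from word_tokenize_list in the question 
counts = Counter({',': 54224, 'the': 45990, '.': 42529, 'of': 25608, 'to': 24869, 
                  'a': 21351, 'and': 17807, 'in': 17037, "'s": 10335, 'that': 8990}) 

top_n = 20                               # hypothetical cutoff; adjust to taste 
most_common = counts.most_common(top_n)  # list of (token, count) pairs, highest first 
tokens = [t for t, _ in most_common] 
freqs = [c for _, c in most_common] 

plt.bar(range(len(tokens)), freqs, align='center') 
plt.xticks(range(len(tokens)), tokens, rotation=90) 
plt.tight_layout() 
plt.show() 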


Matplotlib is a very widely used plotting library for Python.

You will probably want to sort your Counter data first, based on some ranking criterion. Below are two possible solutions for your data:

from collections import Counter 
import matplotlib.pyplot as plt 

data = Counter({',': 54224, 'the': 45990, '.': 42529, 'of': 25608, 'to': 24869, 'a': 21351, 'and': 17807, 'in': 17037, "'s": 10335, 'that': 8990, 'for': 8936, '$': 8218, '``': 7733, 'The': 7724, 'is': 7695, "''": 7510, 'said': 6462, 'on': 5718, '%': 5613, 'it': 5177, 'by': 5035, 'from': 4939, 'million': 4883})  
xaxis = range(len(data)) 

keys_freq = [] 
values_freq = [] 

keys_length = [] 
values_length = [] 

# Rank depending on frequency 
for key, value in data.most_common()[::-1]: 
    keys_freq.append(key) 
    values_freq.append(value) 

# Rank depending on word length 
for key in sorted(data.keys(), key=lambda x: (len(x), x)): 
    keys_length.append(key) 
    values_length.append(data[key]) 

fig = plt.figure() 

plt.subplot(211) 
plt.bar(xaxis, values_freq, align='center') 
plt.xticks(xaxis, keys_freq) 
locs, labels = plt.xticks() 
plt.setp(labels, rotation=90) 

plt.subplot(212) 
plt.bar(xaxis, values_length, align='center') 
plt.xticks(xaxis, keys_length) 
locs, labels = plt.xticks() 
plt.setp(labels, rotation=90) 

fig.tight_layout() 
plt.show() 

which gives you:

[Matplotlib screenshot: two bar charts, one ranked by frequency and one by word length]


Finally, here is what I did with the help of a friend:

#Importing all the necessary libraries 
from collections import Counter 
import matplotlib.pyplot as plt 
import numpy as np 
import string 

#Opening/reading/editing file 

filename=raw_input('Filename (e.g. yourfile.txt): ') 
cond=raw_input('What do you want to count? \n A) Words.\n B) Characters and  Punctuation. \n Choice: ') 
file=open(filename,'r') 
#'r' allows us to read the file 
text=file.read() 
#This allows us to view the entire text and assign it as a gigantic string 
text=text.lower() 
'''We make the entire text lowercase to account for any words that are capitalised due to sentence structure''' 
if cond in ['A','a','A)','a)']: 
    punctuation=['!', '#', '"', '%', '$', "''", '&', ')', '(', '+', '*', '--', ',', '/', '.', ';', ':', '=', '<', '?', '>', '@', '[', ']', '\\', '_', '^', '`', '{', '}', '|', '~'] 
    text="".join(l for l in text if l not in punctuation) 
    '''Hyphenated words are safe, since the text uses '--' as the dash.''' 
    #Splitting the text into separate words, thus creating a big string array. 
    text=text.split() 
    #We then use the Counter function to calculate the frequency of each word appearing in the text. 
    count=Counter(text) 
    '''This is not enough, since count is now a Counter keyed by specific strings. We use the .most_common method to create a list in which each element contains a word and its frequency.''' 
    count=count.most_common() 
    #Create empty arrays, replace the zeros with our frequency values and plot them. Along with the experimental data, we will take the averaged proportionality constant (K) and plot the curve y=K/x 
    y=np.arange(len(count)) 
    x=np.arange(1,len(count)+1) 
    yn=["" for m in range(len(count))] 
    '''It is important to run the range from 1 up to len(count), since the value 'Rank' always starts from 1.''' 
    for i in range(len(count)): 
        y[i]=count[i][1] 
        yn[i]=count[i][0] 
    K,Ks=round(np.average(x*y),2),round(np.std(x*y),2) 
    plt.plot(x,y,color='red',linewidth=3) 
    plt.plot(x,K/x,color='green',linewidth=2) 
    plt.xlabel('Rank') 
    plt.ticklabel_format(style='sci', axis='x', scilimits=(0,0)) 
    plt.ticklabel_format(style='sci', axis='y', scilimits=(0,0)) 
    plt.plot(0,0,'o',alpha=0) 
    plt.ylabel('Frequency') 
    plt.grid(True) 
    plt.title("Testing Zipf's Law: the relationship between the frequency and rank of a word in a text") 
    plt.legend(['Experimental data', r'y=K/x, K=%s, $\delta_{K}$ = %s'%(K,Ks), 'Most used word=%s, least used=%s'%(count[0],count[-1])], loc='best',numpoints=1) 
    plt.show() 
elif cond in ['B','b','B)','b)']: 
    text=text.translate(None, string.whitespace) 
    count=Counter(text) 
    count=count.most_common() 
    y=np.arange(len(count)) 
    x=np.arange(1,len(count)+1) 
    yn=["" for m in range(len(count))] 
    for i in range(len(count)): 
        y[i]=count[i][1] 
        yn[i]=count[i][0] 
    K,Ks=round(np.average(x*y),2),round(np.std(x*y),2) 
    plt.plot(x,y,color='red',linewidth=3) 
    plt.plot(x,K/x,color='green',linewidth=2) 
    plt.xlabel('Rank') 
    plt.ticklabel_format(style='sci', axis='x', scilimits=(0,0)) 
    plt.ticklabel_format(style='sci', axis='y', scilimits=(0,0)) 
    plt.plot(0,0,'o',alpha=0) 
    plt.ylabel('Frequency') 
    plt.grid(True) 
    plt.title("Testing Zipf's Law: the relationship between the frequency and rank of a character/punctuation, in a text") 
    plt.legend(['Experimental data', r'y=K/x, K=%s, $\delta_{K}$ = %s'%(K,Ks), 'Most used character=%s, least used=%s'%(count[0],count[-1])], loc='best',numpoints=1) 
    plt.show()
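
A common way to eyeball Zipf's law is to draw the same rank-frequency data on log-log axes, where y=K/x appears as a straight line with slope -1. A minimal sketch, assuming count is the (word, frequency) list produced by most_common() above (a small toy Counter stands in for it here):

import numpy as np 
import matplotlib.pyplot as plt 
from collections import Counter 

# Toy stand-in for the (word, frequency) list built with most_common() above 
count = Counter('the quick brown fox jumps over the lazy dog and the fox ran'.split()).most_common() 

x = np.arange(1, len(count) + 1)                  # rank, starting at 1 
y = np.array([c for _, c in count], dtype=float)  # frequency 

plt.loglog(x, y, 'o-', label='Experimental data') 
plt.loglog(x, y[0] / x, '--', label='y=K/x with K=f(1)')  # ideal Zipf line anchored at the top rank 
plt.xlabel('Rank (log scale)') 
plt.ylabel('Frequency (log scale)') 
plt.grid(True, which='both') 
plt.legend(loc='best') 
plt.show() 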