0
我有一個程序,它先將文本分爲句子,再將句子分爲單詞,然後統計各詞性(parts of speech)的數量,並將數據寫入 csv 文件。問題是這樣的:我需要按類別劃分句子。輸入是一組句子,我想根據每個句子末尾的標點符號來確定它的類型。如果是肯定句(陳述句),那麼 csv 中的標誌爲 0;如果是疑問句,那麼標誌爲 1。我該怎麼做?也就是說,如何按類型(疑問句/肯定句)劃分句子?
這是代碼:
# -*- coding: utf-8 -*-
import json
import pymorphy2
import csv
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import re
# with open('kuprin.txt', 'r') as myfile:
# text = myfile.read().replace('\n', '')
# Sample input. To process a real document, read its text into `text` instead.
text = "Hi!How are you?My name is Jack.What is your name?"

# Split the raw text into sentences.
sentences = sent_tokenize(text)

# Morphological analyzer used further down to look up each word's part of speech.
morph = pymorphy2.MorphAnalyzer()

# Drop duplicate sentences but keep them in their original order.  A plain
# set() iterates in an arbitrary order, which made the ids assigned to the
# sentences later on differ from run to run.
s = list(dict.fromkeys(sentences))

# Preview the word-level tokenisation of every sentence.  A separate loop
# variable is used so the `sentences` list is not overwritten.
for sentence in s:
    words = word_tokenize(sentence)
    print(words)
# Part-of-speech counters kept for every sentence, in output column order.
pos_tags = (
    "ADJF", "NOUN", "INTJ", "ADJS", "COMP", "VERB",
    "INFN", "PRTF", "PRTS", "GRND", "NUMR",
    "ADVB", "NPRO", "PRED", "PREP", "CONJ", "PRCL",
)

# Characters that may follow the sentence-final '?' (closing quotes/brackets
# and a trailing '!' as in "What?!") and must be ignored when classifying.
trailing_noise = '"\'”’»)]!'

# One record per sentence: id, the sentence text, one counter per POS tag and
# FLAG (1 = interrogative sentence, 0 = anything else).
json_data = []
i = 0
for item in s:
    if item == '':
        continue

    # Key order matters: the rows written out later follow insertion order.
    data = {"id": i, "sentences": item}
    for tag in pos_tags:
        data[tag] = 0

    # Classify the sentence by its closing punctuation mark.  Previously FLAG
    # was hard-coded to 0, so questions were never marked.
    ending = item.rstrip().rstrip(trailing_noise)
    data["FLAG"] = 1 if ending.endswith('?') else 0

    # NOTE(review): words are split on single spaces, so punctuation stays
    # attached to the neighbouring word ("name?") — confirm this is intended.
    for word in item.split(' '):
        # Only the first (top-ranked) parse returned by the analyzer is used.
        pos = morph.parse(word)[0].tag.POS
        if pos is None:
            # The analyzer reported no part of speech for this token.
            continue
        print(word + "---" + str(pos))
        data[pos] += 1

    json_data.append(data)
    i += 1
# Echo every per-sentence record for inspection.
for el in json_data:
    print(el)

# Persist the records as pretty-printed JSON.  ensure_ascii=False writes
# non-ASCII characters verbatim, so the file is opened explicitly as UTF-8
# instead of relying on the platform's default encoding.
with open('test.json', 'w', encoding='utf-8') as f:
    json.dump(json_data, f, ensure_ascii=False, sort_keys=False, indent=4,
              separators=(',', ': '))
txt_file = r"test.json"
csv_file = r"test.csv"

# Write one CSV row per sentence record.
# - The csv.reader that used to be opened on the JSON file was never read and
#   only leaked a file handle, so it is gone.
# - newline='' is what the csv module expects for files it writes to;
#   without it, extra blank lines appear on platforms that translate "\n".
# - The `with` block guarantees the file is flushed and closed.
with open(csv_file, 'w', newline='', encoding='utf-8') as csv_handle:
    out_csv = csv.writer(csv_handle)

    # The header must list every key of a record, in the same order, so that
    # it lines up with the rows below ("FLAG" used to be missing).
    out_csv.writerow(
        ["id", "sentences", "ADJF", "NOUN", "INTJ", "ADJS", "COMP", "VERB",
         "INFN", "PRTF", "PRTS", "GRND", "NUMR",
         "ADVB", "NPRO", "PRED", "PREP", "CONJ", "PRCL", "FLAG"])

    for el in json_data:
        # Record values come out in insertion order, matching the header.
        csv_str = list(el.values())
        print(csv_str)
        out_csv.writerow(csv_str)