0
所以,我現在想建立一份愛沙尼亞語單詞列表,約 20M 個小寫單詞。要取得詞表的輸入資料,可以使用愛沙尼亞語語料庫(corpus of Estonian)。語料庫文件採用文本編碼倡議(TEI)格式。我嘗試使用正則表達式來查找單詞。如何建立一個詞彙表?
這就是我做出來的東西:效率低下;MVC 全都亂了套;如果單詞的哈希集合放不進內存,程序就會崩潰;它不知道輸入的編碼,所以像 š 這樣的字母可能會出問題;它不顯示預計完成時間;有些控件用的是默認名稱,有些則不是;它沒有使用多任務(不知道是否應該用);它還用了一些奇怪的修補手法並大量鎖定界面,好讓程序看起來不像「凍結」了。至少它很短,短到你幾乎不會注意到它沒有註釋。
優點是,它幾乎可以從 .tei、.txt、.csv、sgml、xhtml 或任何類似格式的輸入中讀出單詞,而且錯誤不多。
現在你知道我想做什麼,我怎麼嘗試過(有什麼問題),並且我只是試圖找出如何做到這一點(用最少的手工勞動)。
圖片示例:
代碼示例與 GUI:
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.Data.SqlClient;
using System.IO;
using System.Text.RegularExpressions;
namespace Reader
{
/// <summary>
/// Main window of the word-list builder. Files dropped onto the list view are
/// queued (grouped by directory); <see cref="starter"/> then reads every checked
/// file, strips markup, extracts lower-cased words and merges them into the
/// output word list (one word per line).
/// </summary>
public partial class Form1 : Form
{
    // Matches markup tags, "%name;" references and any remaining "&name;" /
    // "&#nnn;" entities. "<[^>]*>" matches the same spans as the lazy
    // "<(.|\n)*?>" form (from '<' to the first following '>') without the
    // per-character alternation.
    private static readonly Regex MarkupPattern =
        new Regex(@"<[^>]*>|%[a-zA-Z0-9]+?;|&[#a-zA-Z0-9]+?;");

    // Every run of characters that is not a letter of the Estonian alphabet
    // separates two words.
    private static readonly Regex WordSeparatorPattern =
        new Regex("[^A-Za-zšžõäöüŠŽÕÄÖÜ]+");

    // Entities for Estonian letters, written in lower case because the text is
    // lower-cased before they are decoded. They must be decoded before
    // MarkupPattern is applied: that pattern deletes every entity, which would
    // otherwise cut words apart at each such letter.
    private static readonly string[,] EstonianEntities =
    {
        { "&auml;", "ä" },
        { "&ouml;", "ö" },
        { "&uuml;", "ü" },
        { "&otilde;", "õ" },
        { "&scaron;", "š" },
        { "&zcaron;", "ž" }
    };

    // True while items are added in bulk, so that the per-item ItemChecked
    // events do not recompute the totals for every single item.
    private bool ignorechecking = false;

    public Form1()
    {
        InitializeComponent();
    }

    /// <summary>Accepts the drag only when it carries a list of files.</summary>
    private void listView1_DragEnter(object sender, DragEventArgs e)
    {
        if (e.Data.GetDataPresent(DataFormats.FileDrop, false))
        {
            e.Effect = DragDropEffects.All;
        }
    }

    /// <summary>
    /// Adds every dropped file that exists to the list view, checked, grouped
    /// by its directory and labelled with its size in KB.
    /// </summary>
    private void listView1_DragDrop(object sender, DragEventArgs e)
    {
        string[] files = (string[])e.Data.GetData(DataFormats.FileDrop, false);
        if (files == null)
        {
            return;
        }

        setguiLock(true);
        this.loading.Visible = true;
        ignorechecking = true;
        try
        {
            // Start from the groups already shown so that a second drop from
            // the same directory reuses its group instead of adding a
            // duplicate one with the same header.
            Dictionary<string, ListViewGroup> groupsByDirectory = new Dictionary<string, ListViewGroup>();
            foreach (ListViewGroup existing in listView1.Groups)
            {
                groupsByDirectory[existing.Header] = existing;
            }

            int processed = 0;
            foreach (string file in files)
            {
                progresslabel.Text = string.Format("Progress: \t[ {0}/{1} ]", processed++, files.Length);
                // The work runs on the UI thread; pump messages so the label repaints.
                Application.DoEvents();
                if (!File.Exists(file))
                {
                    continue;
                }

                FileInfo info = new FileInfo(file);
                ListViewGroup group;
                if (!groupsByDirectory.TryGetValue(info.DirectoryName, out group))
                {
                    group = new ListViewGroup(info.DirectoryName, HorizontalAlignment.Left);
                    groupsByDirectory.Add(info.DirectoryName, group);
                    listView1.Groups.Add(group);
                }

                ListViewItem item = new ListViewItem(info.Name);
                group.Items.Add(item);
                item.Checked = true;
                // Divide as long: casting the length to int first overflows
                // for files of 2 GiB and more.
                item.SubItems.Add((info.Length / 1024) + " KB");
                listView1.Items.Add(item);
            }
        }
        finally
        {
            // Always release the GUI, even when a file could not be inspected.
            setguiLock(false);
            ignorechecking = false;
            this.loading.Visible = false;
            updatechecked();
        }
    }

    /// <summary>Refreshes the totals whenever an item is (un)checked.</summary>
    private void listView1_ItemChecked(object sender, ItemCheckedEventArgs e)
    {
        updatechecked();
    }

    /// <summary>
    /// Shows the number of checked items in <c>text1</c> and the sum of their
    /// sizes (parsed back from the "N KB" sub-item) in <c>text2</c>.
    /// Does nothing while <c>ignorechecking</c> is set.
    /// </summary>
    private void updatechecked()
    {
        if (ignorechecking)
        {
            return;
        }

        long size = 0;
        int count = 0;
        foreach (ListViewItem item in this.listView1.Items)
        {
            if (!item.Checked)
            {
                continue;
            }
            count++;
            // The sub-item holds "<kilobytes> KB"; parse as long to match the
            // 64-bit value stored when the item was created.
            size += long.Parse(item.SubItems[1].Text.Split(' ')[0]);
        }

        this.text1.Text = count.ToString();
        this.text2.Text = size + " KB";
    }

    /// <summary>
    /// Writes every word of <paramref name="d"/> to <paramref name="filename"/>,
    /// each followed by a single "\n". The words are streamed to the file
    /// instead of being concatenated into one large string first.
    /// </summary>
    private void putHashset(HashSet<string> d, string filename)
    {
        using (StreamWriter writer = new StreamWriter(filename, false))
        {
            foreach (string key in d)
            {
                writer.Write(key);
                writer.Write('\n');
            }
        }
    }

    /// <summary>
    /// Reads a word list written by <see cref="putHashset"/> (one word per line).
    /// Blank lines and surrounding whitespace are dropped, so neither the
    /// trailing line break nor the " " placeholder written by
    /// <see cref="button2_Click"/> ends up as a word, and CR/LF line endings do
    /// not leave a '\r' attached to the words.
    /// </summary>
    private HashSet<string> getHashset(string filename)
    {
        HashSet<string> loaded = new HashSet<string>();
        using (StreamReader reader = new StreamReader(filename))
        {
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                string word = line.Trim();
                if (word.Length > 0)
                {
                    loaded.Add(word);
                }
            }
        }
        return loaded;
    }

    /// <summary>Rebuilds the full path of a list item from its group header (the directory) and its text (the file name).</summary>
    private static string getItemPath(ListViewItem item)
    {
        return item.Group.Header + "\\" + item.SubItems[0].Text;
    }

    /// <summary>Unchecks and removes every list item whose full path equals <paramref name="fullfilename"/>.</summary>
    private void removefilefromlistview(string fullfilename)
    {
        // Iterate over a snapshot because matching items are removed from the
        // collection inside the loop.
        foreach (ListViewItem item in this.listView1.Items.Cast<ListViewItem>().ToList())
        {
            if (string.Equals(fullfilename, getItemPath(item), StringComparison.Ordinal))
            {
                item.Checked = false;
                this.listView1.Items.Remove(item);
            }
        }
    }

    /// <summary>Unchecks and removes every item that is still checked.</summary>
    private void removeCheckedItems()
    {
        foreach (ListViewItem item in this.listView1.Items.Cast<ListViewItem>().ToList())
        {
            if (item.Checked)
            {
                item.Checked = false;
                this.listView1.Items.Remove(item);
            }
        }
    }

    /// <summary>
    /// Lower-cases <paramref name="text"/>, decodes the Estonian letter entities,
    /// strips markup and remaining entities, and adds every word longer than one
    /// character to <paramref name="words"/>.
    /// </summary>
    private static void addWords(HashSet<string> words, string text)
    {
        // Invariant lower-casing: the result must not depend on the UI culture
        // of the machine the tool runs on.
        StringBuilder data = new StringBuilder(text.ToLowerInvariant());
        for (int i = 0; i < EstonianEntities.GetLength(0); i++)
        {
            data.Replace(EstonianEntities[i, 0], EstonianEntities[i, 1]);
        }

        string stripped = MarkupPattern.Replace(data.ToString(), "");
        foreach (string word in WordSeparatorPattern.Split(stripped))
        {
            if (word.Length > 1)
            {
                words.Add(word);
            }
        }
    }

    /// <summary>
    /// Returns true and the output path when the output box is filled in;
    /// otherwise tells the user and returns false.
    /// </summary>
    private bool tryGetOutputPath(out string path)
    {
        path = output.Text;
        if (path.Trim().Length > 0)
        {
            return true;
        }
        MessageBox.Show(this, "Please enter an output file name first.");
        return false;
    }

    /// <summary>
    /// Processes every checked file: merges its words into the word list stored
    /// in the output file (created when missing) and removes the processed
    /// entries from the list view. Start and end times are shown in
    /// <c>time1</c> / <c>time2</c>.
    /// </summary>
    private void starter(object sender, EventArgs e)
    {
        string outputfile;
        if (!tryGetOutputPath(out outputfile))
        {
            return;
        }

        setguiLock(true);
        try
        {
            this.time2.Text = "";
            this.time1.Text = String.Format("{0:d/M/yyyy HH:mm:ss}", DateTime.Now);

            // A set, so that a file listed twice is only read once.
            HashSet<string> filenames = new HashSet<string>();
            foreach (ListViewItem item in this.listView1.Items)
            {
                if (!item.Checked)
                {
                    continue;
                }
                string file = getItemPath(item);
                if (File.Exists(file))
                {
                    filenames.Add(file);
                }
            }

            // Continue an existing word list instead of overwriting it.
            HashSet<string> words = File.Exists(outputfile)
                ? getHashset(outputfile)
                : new HashSet<string>();

            int filenamesnr = filenames.Count;
            int filenamesi = 0;
            foreach (string path in filenames)
            {
                progresslabel.Text = string.Format("Progress: \t[ {0}/{1} ]", filenamesi++, filenamesnr);
                // The work runs on the UI thread; pump messages so the label repaints.
                Application.DoEvents();

                // NOTE(review): Encoding.UTF7 is kept unchanged here, but it is
                // unlikely to be the real encoding of the corpus files and it
                // gives '+' a special meaning. Confirm the actual encoding
                // (e.g. UTF-8 or a Baltic/Latin code page) and replace it.
                addWords(words, File.ReadAllText(path, Encoding.UTF7));
                removefilefromlistview(path);
            }

            progresslabel.Text = "Progress:";
            putHashset(words, outputfile);

            // Checked entries that are still listed were skipped above because
            // their file no longer exists; drop them as well.
            removeCheckedItems();
            this.time2.Text = String.Format("{0:d/M/yyyy HH:mm:ss}", DateTime.Now);
        }
        finally
        {
            // Never leave the window locked when reading or writing failed.
            setguiLock(false);
        }
    }

    /// <summary>
    /// Locks (<paramref name="value"/> = true) or unlocks the input controls and
    /// shows the progress label only while locked.
    /// </summary>
    private void setguiLock(bool value)
    {
        bool enabled = !value;

        // The form is switched off while the controls change and switched back
        // on afterwards - presumably to force a refresh so the window does not
        // look frozen (TODO confirm).
        this.Enabled = false;
        this.button1.Enabled = enabled;
        this.listView1.Enabled = enabled;
        this.output.Enabled = enabled;
        this.openoutput.Enabled = enabled;
        this.progresslabel.Visible = value;
        this.Enabled = true;
    }

    /// <summary>Opens the output file with its associated program, creating it first when it does not exist.</summary>
    private void button2_Click(object sender, EventArgs e)
    {
        string path;
        if (!tryGetOutputPath(out path))
        {
            return;
        }
        if (!File.Exists(path))
        {
            File.WriteAllText(path, " ");
        }
        System.Diagnostics.Process.Start(path);
    }
}
}
vanilla數據庫有什麼問題? – Juliet 2010-09-19 21:55:02
您是否要求我們只查看您的代碼並修復它?我會說幾個更加嚴格定義的問題會更加適合,比如「如何在內存中存儲20萬個單詞並搜索它們」等。關於問題的主題,您可以使用SQLite3或其他數據庫來存儲文件,然後使用SQL搜索和連接來修改您的數據。 – 2010-09-19 22:10:41
@Callum Rogers - 你說得對。 – Margus 2010-09-20 09:31:06