2010-09-19 154 views
0

所以,現在我想愛沙尼亞語單詞列表約20米的小寫字母。要獲取wordlist的輸入,可以使用corpus of Estonian。語料庫文件採用文本編碼倡議(TEI)格式。我嘗試使用正則表達式來查找單詞。如何建立一個詞彙表

這就是我所做的:效率低下,mcv全部搞砸了,如果單詞的哈希集合不適合內存,它就會剎車,它不知道輸入編碼 - 所以可能像字母š會產生問題,但不會顯示預計完成時間,一些控件有默認名稱,有些則沒有,它不使用多任務(不知道是否應該),它使用了一些奇怪的修復和大量的鎖定界面,以便它看起來不會被「凍結」。至少它很短,你幾乎沒有注意到沒有評論。

上行是,它幾乎可以從.tei,.txt,.csv,smgl,xhtml或任何類似的格式輸入中讀取沒有太多錯誤的單詞。

現在你知道我想做什麼,我怎麼嘗試過(有什麼問題),並且我只是試圖找出如何做到這一點(用最少的手工勞動)。

形象的例子:

alt text

代碼示例& Gui

using System; 
using System.Collections.Generic; 
using System.ComponentModel; 
using System.Data; 
using System.Drawing; 
using System.Linq; 
using System.Text; 
using System.Windows.Forms; 
using System.Data.SqlClient; 
using System.IO; 
using System.Text.RegularExpressions; 

namespace Reader 
{ 
    public partial class Form1 : Form 
    { 
     public Form1() 
     { 
      InitializeComponent(); 
     } 


     private void listView1_DragEnter(object sender, DragEventArgs e) 
     { 
      if (e.Data.GetDataPresent(DataFormats.FileDrop, false) == true) 
      { 
       e.Effect = DragDropEffects.All; 
      } 
     } 

     private void listView1_DragDrop(object sender, DragEventArgs e) 
     { 
      setguiLock(true); 
      this.loading.Visible = true; 
      ignorechecking = true; 
      string[] files = (string[])e.Data.GetData(DataFormats.FileDrop, false); 
      Dictionary<String, ListViewGroup> listviewgroups = new Dictionary<string,ListViewGroup>(); 

      int filenamesi = 0; 

      foreach (string file in files) 
      { 
       progresslabel.Text = string.Format("Progress: \t[ {0}/{1} ]", filenamesi++, files.Length); 
       Application.DoEvents(); 
       if (File.Exists(file)) 
       { 
        FileInfo ff = new System.IO.FileInfo(file); 
        if (!listviewgroups.ContainsKey(ff.DirectoryName)) 
        { 
         listviewgroups.Add(ff.DirectoryName, new ListViewGroup(ff.DirectoryName, HorizontalAlignment.Left)); 
         listView1.Groups.Add(listviewgroups[ff.DirectoryName]); 
        } 
        ListViewItem item = new ListViewItem(ff.Name); 
        listviewgroups[ff.DirectoryName].Items.Add(item); 
        item.Checked = true; 

        item.SubItems.Add("" +((int)ff.Length/1024)+" KB"); 

        // item.Group.Header = ff.DirectoryName; 
        // listviewgroups[ff.DirectoryName].Items.Add(item); 
        listView1.Items.Add(item); 
       } 
      } 
      setguiLock(false); 
      ignorechecking = false; 
      this.loading.Visible = false; 
      updatechecked(); 
     } 

     private void listView1_ItemChecked(object sender, ItemCheckedEventArgs e) 
     { 
      updatechecked(); 
     } 
     private bool ignorechecking = false; 
     private void updatechecked(){ 
      if (ignorechecking) 
       return; 
      long size = 0; 
      int count = 0; 
      foreach (ListViewItem item in this.listView1.Items) 
      { 
       if (item.Checked) 
       { 
        count++; 
        size += Int32.Parse(item.SubItems[1].Text.Split(" ".ToArray())[0]); 
       } 
      } 
      this.text1.Text = ""+count; 
      this.text2.Text = ""+size + " KB"; 
     } 
     private void putHashset(HashSet<string> d, string filename) 
     { 
      StringBuilder sb = new StringBuilder(); 
      foreach (string key in d) 
       sb.Append(key).Append("\n"); 

      File.WriteAllText(filename, sb.ToString()); 
     } 
     private HashSet<string> getHashset(string filename) 
     { 
      return new HashSet<string>(new Regex("\\n+").Split(File.ReadAllText(filename))); 
     } 

     private void removefilefromlistview(string fullfilename) { 
      foreach (ListViewItem item in this.listView1.Items) 
      { 
       String file = item.Group.Header + "\\" + item.SubItems[0].Text; 
       if (fullfilename.CompareTo(file) == 0) 
       { 
        item.Checked = false; 
        this.listView1.Items.Remove(item); 
       } 
      } 
     } 
     private void starter(object sender, EventArgs e) 
     { 
      HashSet<string> filenames = new HashSet<string>(); 
      StringBuilder data = null; 

      setguiLock(true); 
      this.time2.Text = ""; 
      this.time1.Text = String.Format("{0:d/M/yyyy HH:mm:ss}", DateTime.Now); 

      foreach (ListViewItem item in this.listView1.Items) { 
       if (item.Checked) { 
        String file = item.Group.Header + "\\" + item.SubItems[0].Text; 
        if (File.Exists(file)) 
         filenames.Add(file); 
       } 
      } 

      string outputfile = output.Text; 
      HashSet<string> words = null; 
      if (File.Exists(output.Text)) 
       words = getHashset(outputfile); 
      else 
       words = new HashSet<string>(); 

      int filenamesnr = filenames.Count; 
      int filenamesi = 0; 
      foreach (String str in filenames){ 
       progresslabel.Text = string.Format("Progress: \t[ {0}/{1} ]", filenamesi++, filenamesnr); 
       Application.DoEvents(); 
       data = new StringBuilder(System.IO.File.ReadAllText(str, Encoding.UTF7).ToLower()); 

       data = data.Replace("&auml;", "ä"); 
       data = data.Replace("&ouml;", "ö"); 
       data = data.Replace("&uuml;", "ü"); 
       data = data.Replace("&otilde;", "õ"); 

       String sdata = new Regex(@"<(.|\n)*?>|%[a-zA-Z0-9]+?;|&[#a-zA-Z0-9]+?;").Replace(data.ToString(), ""); 

       foreach (string word in new Regex("[^A-Za-zšžõäöüŠŽÕÄÖÜ]+").Split(sdata)) 
        if(word.Length>1) 
          words.Add(word); 

       removefilefromlistview(str); 
      } 
      progresslabel.Text = "Progress:"; 
      putHashset(words, outputfile); 

      foreach (ListViewItem item in this.listView1.Items) 
       if (item.Checked) 
       { 
        item.Checked = false; 
        listView1.Items.Remove(item); 
       } 

      this.time2.Text = String.Format("{0:d/M/yyyy HH:mm:ss}", DateTime.Now); 
      setguiLock(false); 
     } 

     private void setguiLock(bool value){ 
      if(value){ 
       this.Enabled = false; 
       this.button1.Enabled = false; 
       this.listView1.Enabled = false; 
       this.output.Enabled = false; 
       this.openoutput.Enabled = false; 
       this.progresslabel.Visible = true; 
       this.Enabled = true; 
      }else{ 
       this.Enabled = false; 
       this.openoutput.Enabled = true; 
       this.output.Enabled = true; 
       this.listView1.Enabled = true; 
       this.button1.Enabled = true; 
       this.progresslabel.Visible = false; 
       this.Enabled = true; 
      } 
     } 

     private void button2_Click(object sender, EventArgs e) 
     { 
      if (!File.Exists(output.Text)) 
        File.WriteAllText(output.Text, " "); 
      System.Diagnostics.Process.Start(output.Text); 
     } 
    } 
} 
+1

vanilla數據庫有什麼問題? – Juliet 2010-09-19 21:55:02

+2

您是否要求我們只查看您的代碼並修復它?我會說幾個更加嚴格定義的問題會更加適合,比如「如何在內存中存儲20萬個單詞並搜索它們」等。關於問題的主題,您可以使用SQLite3或其他數據庫來存儲文件,然後使用SQL搜索和連接來修改您的數據。 – 2010-09-19 22:10:41

+0

@Callum Rogers - 你的權利。 – Margus 2010-09-20 09:31:06

回答

1

你需要獲得這份工作的合適工具。像這樣的語言語料庫中的數據和標記數量意味着您需要一個適當的XML感知索引解決方案。例子包括eXist,XAIRA,CQP ...