2017-09-25 111 views
0

我試圖在一個使用 pandas 的類中實現 Peter Norvig 的拼寫檢查器(spell checker),這個類包含從 SQL 數據庫中提取的單詞。數據是用戶查詢,其中經常包含一些拼寫錯誤,我希望這個類能返回最有可能的(拼寫正確的)查詢。(標題:pandas 中的拼寫檢查程序)

該類使用一個返回 pandas DataFrame 的數據庫查詢進行初始化。例如:

query  count 
0 foo bar  1864 
1 super foo  73 
2 bar of foos 1629 
3 crazy foos  940 

下面的大部分代碼直接取自 Peter 的實現,但我對類所做的修改似乎無法正常工作。我的猜測是,這與去掉了 Counter 功能有關(WORDS = Counter(words(open('big.txt').read()))),但我不確定從 DataFrame 中獲得相同功能的最佳方式。

以下是當前的類:

class _SpellCheckClient(object):
    """Suggest the most likely known query for a possibly misspelled one.

    Candidates are strings within one or two character edits of the input
    and are ranked by how often each one occurs in `self.df`, which is read
    through its `query` and `count` columns.
    """

    def __init__(self, team, table, dremel_connection):
        """Load the query-frequency table through `dremel_connection`.

        Args:
            team: Not used by this class.
            table: Not used by this class (see the note below).
            dremel_connection: Object exposing `ExecuteQuery(sql)`. The
                result is assumed to be a DataFrame with `query` and
                `count` columns -- TODO confirm against the connection API.
        """
        # NOTE(review): the statement selects from a relation literally
        # named `table`; the `table` argument is never substituted into it.
        self.df = dremel_connection.ExecuteQuery(
            'SELECT query, COUNT(query) AS count FROM table GROUP BY 1;')

    def expected_word(self, word):
        """Return the candidate correction for `word` with the highest
        probability."""
        return max(self._candidates(word), key=self._probability)

    def _probability(self, query):
        """Return the share of the total count that belongs to `query`.

        Returns:
            A float in [0, 1]; 0.0 when `query` is absent from the table or
            the table has no counts at all.
        """
        total = self.df['count'].sum()
        if not total:
            return 0.0
        # Summing collapses the selection to a scalar, so the value is
        # usable as a `max` key even when `query` has no matching row.
        matched = self.df.loc[self.df['query'] == query, 'count'].sum()
        return float(matched) / float(total)

    def _candidates(self, word):
        """Return the possible corrections for `word`.

        The first non-empty option wins: `word` itself if it is known, then
        known strings one edit away, then known strings two edits away, and
        finally `word` unchanged as a last resort.
        """
        return (self._known([word])
                or self._known(self._one_edits_from_word(word))
                or self._known(self._two_edits_from_word(word))
                or [word])

    def _known(self, query):
        """Return the subset of the iterable `query` found in the `query`
        column of `self.df`."""
        # Build the lookup set once per call so each membership test is O(1).
        known_queries = set(self.df['query'])
        return set(w for w in query if w in known_queries)

    def _one_edits_from_word(self, word):
        """Return the set of strings exactly one edit away from `word`.

        An edit is a deletion, an adjacent transposition, a replacement or
        an insertion; replacements and insertions draw from `LETTERS`.
        """
        # `range` (not `xrange`) keeps this working on Python 2 and 3.
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        deletes = [left + right[1:] for left, right in splits if right]
        transposes = [left + right[1] + right[0] + right[2:]
                      for left, right in splits
                      if len(right) > 1]
        replaces = [left + center + right[1:]
                    for left, right in splits
                    if right for center in LETTERS]
        inserts = [left + center + right
                   for left, right in splits
                   for center in LETTERS]
        return set(deletes + transposes + replaces + inserts)

    def _two_edits_from_word(self, word):
        """Return a generator over the strings two edits away from `word`."""
        return (e2 for e1 in self._one_edits_from_word(word)
                for e2 in self._one_edits_from_word(e1))

提前感謝!

回答

0

對於正在尋找答案的人,以下是對我有效的做法:

def _words(df):
    """Aggregate the `count` column per space-separated word of `query`.

    Every row's `query` string is expanded into one indicator column per
    distinct word; the indicators are then weighted by the row's `count`
    and summed. The result is a Series indexed by word.
    """
    word_indicators = df['query'].str.get_dummies(sep=' ')
    per_query_counts = df['count']
    return word_indicators.transpose().dot(per_query_counts)


class _SpellCheckClient(object):
    """Suggest the most likely known word for a possibly misspelled one.

    Candidates are strings within one or two character edits of the input
    and are ranked by their share of the per-word totals in `self.words`.
    """

    def __init__(self, team, table, database_connection):
        """Build the per-word totals from an already-loaded table.

        Args:
            team: Not used by this class.
            table: Not used by this class.
            database_connection: Stored as `self.df` and handed straight to
                `_words`, so it must already be the DataFrame of queries
                and counts rather than a live connection.
        """
        self.df = database_connection
        self.words = _words(self.df)

    def expected_word(self, query):
        """Return the candidate correction for `query` with the highest
        probability."""
        return max(self._candidates(query), key=self._probability)

    def _probability(self, query):
        """Return the share of the total word count that belongs to `query`.

        Returns:
            A float in [0, 1]; 0.0 when `query` is not in `self.words` or
            the totals sum to zero.
        """
        total = self.words.sum()
        if not total:
            return 0.0
        # One scalar lookup and division: this runs once per candidate, so
        # avoid normalising the entire Series on every call.
        return float(self.words.get(query, 0)) / float(total)

    def _candidates(self, query):
        """Return the possible corrections for `query`.

        The first non-empty option wins: known strings one edit away, then
        known strings two edits away, then `query` unchanged. The query
        itself is not tried as a known word before the edit sets.
        """
        return (self._known(self._one_edits_from_query(query))
                or self._known(self._two_edits_from_query(query))
                or [query])

    def _known(self, query):
        """Return the subset of the iterable `query` that has a non-zero
        total in `self.words`."""
        return {w for w in query if self.words.get(w)}

    def _one_edits_from_query(self, query):
        """Return the set of strings exactly one edit away from `query`.

        An edit is a deletion, an adjacent transposition, a replacement or
        an insertion; replacements and insertions draw from `LETTERS`.
        """
        # `range` (not `xrange`) keeps this working on Python 2 and 3.
        splits = [(query[:i], query[i:]) for i in range(len(query) + 1)]
        deletes = [left + right[1:] for left, right in splits if right]
        transposes = [left + right[1] + right[0] + right[2:]
                      for left, right in splits
                      if len(right) > 1]
        replaces = [left + center + right[1:]
                    for left, right in splits
                    if right for center in LETTERS]
        inserts = [left + center + right
                   for left, right in splits
                   for center in LETTERS]
        return set(deletes + transposes + replaces + inserts)

    def _two_edits_from_query(self, query):
        """Return a generator over the strings two edits away from
        `query`."""
        return (e2 for e1 in self._one_edits_from_query(query)
                for e2 in self._one_edits_from_query(e1))