2015-08-08 55 views
1

你好,我正在嘗試使用Okapi BM25算法計算文檔之間的相似度。我的問題是:使用BM25算法計算文檔相似度時,爲什麼Lucene不會返回任何匹配結果?

但我遇到了查詢類型的問題。除非使用默認的Queryparser,否則我無法獲得結果。

其基本思想是:先爲目標文檔建立索引,然後用每個源文檔的內容構造一個查詢,再用該查詢在索引中檢索,從而將源文檔與目標文檔進行比較。

這是一個非常簡約的方法,但我必須讓它工作。如果我在做一些愚蠢的事情,請糾正我。

我的代碼看起來如下:

package de.paul.bm25; 

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class DocumentSimilarityBM25 { 

    Analyzer analyzer; 
    Directory index; 
    IndexWriterConfig config; 
    IndexWriter writer; 
    IndexReader reader; 
    IndexSearcher searcher; 
    Similarity similarity = new DefaultSimilarity(); 
    String FIELD_CONTENT = "CONTENT"; 

    public DocumentSimilarityBM25() throws IOException { 
     analyzer = new KeywordAnalyzer(); 
     index = new RAMDirectory(); 
     config = new IndexWriterConfig(analyzer); 
     writer = new IndexWriter(index, config); 
     similarity = new BM25Similarity(); 
    } 

    public void start() { 
     try { 
      index(); 
      List<TopDocs> candidates = search(); 
      printResults(candidates); 
     } catch (IOException | ParseException e) { 
      e.printStackTrace(); 
     } 
    } 

    String[] srcDocuments = new String[]{ 
     "apples are tastefull", 
     "apples and oranges grow an trees", 
     "banana are yellow and very sweet", 
     "this is a zero" 
    }; 

    String[] trgDocuments = new String[]{ 
     "apples oranges and banana", 
     "apples grow on appletrees", 
     "bananes have much suga. " + 
     "so they are high caloric", 
     "bananas have a curvy form", 
     "oranges have the orangecolor and are bigger than apples" 
    }; 

    private void index() throws IOException { 
     for(String target :trgDocuments) { 
      addDoc(createDoc(target)); 
     } 
     System.out.println("Number of indexed Files:" + writer.maxDoc()); 
     writer.close(); 
    } 

    private Query createQuery(Document doc) { 
     final DisjunctionMaxQuery qry = new DisjunctionMaxQuery(0.0f); 
     BooleanQuery bQuery = new BooleanQuery(); 
     PhraseQuery pQuery = new PhraseQuery(); 
     //MultiPhraseQuery mPhrase = new MultiPhraseQuery(); 

     String content = doc.get(FIELD_CONTENT); 
     String[] terms = content.split("\\s"); 
     for(String term : terms) { 
      pQuery = new PhraseQuery(); 
      pQuery.add(new Term(FIELD_CONTENT, term)); 
      bQuery.add(pQuery, Occur.SHOULD); 
     } 

     qry.add(bQuery); 
     return qry; 
    } 

    private List<TopDocs> search() throws IOException, ParseException { 
     List<TopDocs> candidates = new ArrayList<>(); 
     //Query query = new org.apache.lucene.queryparser.classic.QueryParser(FIELD_CONTENT, analyzer).parse(srcDocument); 
     reader = DirectoryReader.open(index); 
     searcher = new IndexSearcher(reader); 
     searcher.setSimilarity(similarity); 

     for(String source : srcDocuments) { 
      Query query = createQuery(createDoc(source)); 

      System.out.println("Query:"+query.toString()); 
      TopDocs candidate = searcher.search(query, reader.maxDoc()); 
      candidates.add(candidate); 
     } 

     return candidates; 
    } 

    private void printResults(List<TopDocs> candidates) throws IOException { 
     for(TopDocs candidate : candidates) { 
      prinCandidate(candidate); 
     } 
     reader.close(); 
    } 

    private void prinCandidate(TopDocs candidate) throws IOException { 
     float maxScore = candidate.getMaxScore(); 
     ScoreDoc[] hits = candidate.scoreDocs; 

     System.out.println("Found " + hits.length + " hits."); 
     System.out.println("MaxScore:" + maxScore); 

     for (int i = 0; i < hits.length; ++i) { 
      int docId = hits[i].doc; 
      Document d = searcher.doc(docId); 
      float score = hits[i].score; 

      System.out.println((i + 1) 
        + ". Score: " + score 
        + " " + d.get(FIELD_CONTENT) + "\t" 
      ); 
     } 
    } 

    private void addDoc(Document doc) throws IOException { 
     writer.addDocument(doc); 
     writer.commit(); 
    } 

    private Document createDoc(String content) throws IOException { 
     Document doc = new Document(); 
     doc.add(new TextField(FIELD_CONTENT, content, Field.Store.YES)); 
     return doc; 
    } 

} 

回答

1

問題出在你使用的分析器(Analyzer)上。KeywordAnalyzer會將整個字段索引爲單個標記。它適用於關鍵字、唯一標識符、零件號碼等不應被拆分的內容。

而您正在嘗試搜索的是普通文本,需要先被切分成單詞。改爲使用StandardAnalyzer,您將看到結果:

public DocumentSimilarityBM25() throws IOException { 
    analyzer = new StandardAnalyzer(); 
    index = new RAMDirectory(); 
    ... 
+0

謝謝!你是對的。如果標記化、詞幹化、停用詞過濾等任務已經事先完成,那麼使用KeywordAnalyzer是否可行,或者是否有用? – PaulSchell

+0

我不會推薦它,但可以肯定,它是*可能的*,只要你把每個標記分別添加到字段中(例如 `doc.add(new StringField("field", "apples", Field.Store.YES)); doc.add(new StringField("field", "oranges", Field.Store.YES)); doc.add(new StringField("field", "and", Field.Store.YES));` ……)(注意:`StringField` 會跳過分析,所以基本上和使用 `KeywordAnalyzer` 相同)。但是,如果你這樣做,短語查詢和跨度查詢將不起作用。此外,該字段的存儲表示也可能有點難以處理。再次說明,這不是我會推薦的方法。 – femtoRgon