0
我試圖使用Lucene(5.4.1)MoreLikeThis來標記(分類)文本。這是一種工作,但我得到的結果很差,我認爲這個問題與查詢對象有關。Lucene正在限制查詢條件
示例波紋管的作品,但最高的topdoc不是我所期望的。通過調試查詢對象,它僅顯示content:erro
。從一個完整的葡萄牙語短語(參見示例),查詢只用一個詞建立。
我沒有使用停用詞或任何其他類型的過濾器。
那麼爲什麼lucene只挑選erro作爲查詢條件?
至Init,主要對象
Analyzer analyzer = new PortugueseAnalyzer();
Directory indexDir = new RAMDirectory();
IndexWriterConfig config = new IndexWriterConfig(analyzer);
config.setOpenMode(OpenMode.CREATE_OR_APPEND);
索引
try (IndexWriter indexWriter = new IndexWriter(indexDir, config)) {
FieldType type = new FieldType();
type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
type.setStored(true);
type.setStoreTermVectors(true);
Document doc = new Document();
doc.add(new StringField("id", "880b2bbc", Store.YES));
doc.add(new Field("content", "erro", type));
doc.add(new Field("tag", "atag", type));
indexWriter.addDocument(doc);
indexWriter.commit();
}
要搜索
try (IndexReader idxReader = DirectoryReader.open(indexDir)) {
IndexSearcher indexSearcher = new IndexSearcher(idxReader);
MoreLikeThis mlt = new MoreLikeThis(idxReader);
mlt.setMinTermFreq(0);
mlt.setMinDocFreq(0);
mlt.setFieldNames(new String[] { "content" });
mlt.setAnalyzer(analyzer);
Reader sReader = new StringReader("Melhorias no controle de sessão no sistema qquercoisa quando expira, ao logar novamente no sistema é exibido o erro "xpto");
Query query = mlt.like("content", sReader);
TopDocs topDocs = indexSearcher.search(query, 3);
}