2012-01-08 73 views

我使用的是 Lucene 3.5.0,我想輸出每個文檔的術語向量。例如,我想知道每個術語在所有文檔中以及在每個特定文檔中的出現頻率。我的索引代碼如下。

(問題標題:如何在 Lucene 3.5.0 中提取文檔術語向量)

import java.io.FileFilter; 
import java.io.FileReader; 
import java.io.IOException; 

import java.io.File; 
import java.io.FileReader; 
import java.io.BufferedReader; 

import org.apache.lucene.index.IndexWriter; 
import org.apache.lucene.document.Field; 
import org.apache.lucene.document.Document; 
import org.apache.lucene.store.RAMDirectory; 
import org.apache.lucene.analysis.standard.StandardAnalyzer; 
import org.apache.lucene.store.Directory; 
import org.apache.lucene.store.FSDirectory; 
import org.apache.lucene.util.Version; 

public class Indexer { 
public static void main(String[] args) throws Exception { 
     if (args.length != 2) { 
     throw new IllegalArgumentException("Usage: java " + Indexer.class.getName() + " <index dir> <data dir>"); 

    String indexDir = args[0]; 
    String dataDir = args[1]; 
    long start = System.currentTimeMillis(); 
    Indexer indexer = new Indexer(indexDir); 
    int numIndexed; 
    try { 
     numIndexed = indexer.index(dataDir, new TextFilesFilter()); 
    } finally { 
    long end = System.currentTimeMillis(); 
    System.out.println("Indexing " + numIndexed + " files took " + (end - start) + " milliseconds"); 

private IndexWriter writer; 

public Indexer(String indexDir) throws IOException { 
    Directory dir = FSDirectory.open(new File(indexDir)); 
    writer = new IndexWriter(dir, 
     new StandardAnalyzer(Version.LUCENE_35), 

public void close() throws IOException { 

public int index(String dataDir, FileFilter filter) throws Exception { 
    File[] files = new File(dataDir).listFiles(); 
    for (File f: files) { 
     if (!f.isDirectory() && 
     !f.isHidden() && 
     f.exists() && 
     f.canRead() && 
     (filter == null || filter.accept(f))) { 
      BufferedReader inputStream = new BufferedReader(new FileReader(f.getName())); 
      String url = inputStream.readLine(); 
      indexFile(f, url); 
    return writer.numDocs(); 

private static class TextFilesFilter implements FileFilter { 
    public boolean accept(File path) { 
     return path.getName().toLowerCase().endsWith(".txt"); 

protected Document getDocument(File f, String url) throws Exception { 
    Document doc = new Document(); 
    doc.add(new Field("contents", new FileReader(f))); 
    doc.add(new Field("urls", url, Field.Store.YES, Field.Index.NOT_ANALYZED)); 
    doc.add(new Field("filename", f.getName(), Field.Store.YES, Field.Index.NOT_ANALYZED)); 
    doc.add(new Field("fullpath", f.getCanonicalPath(), Field.Store.YES, Field.Index.NOT_ANALYZED)); 
    return doc; 

private void indexFile(File f, String url) throws Exception { 
    System.out.println("Indexing " + f.getCanonicalPath()); 
    Document doc = getDocument(f, url); 




首先,如果只是想知道文檔中術語的頻率,您並不需要存儲術語向量。Lucene 本身就會存儲這些數字,以用於 TF-IDF 計算。您可以通過調用 IndexReader.termDocs(term) 並遍歷結果來訪問此信息。



謝謝,它解決了我的問題 – orezvani 2012-01-09 08:00:35


它有助於找到tf-idf嗎? – orezvani 2012-02-08 07:30:05


對不起,你的意思是? – 2012-02-08 14:46:34



* Sums the term frequency vector of each document into a single term frequency map 
* @param indexReader the index reader, the document numbers are specific to this reader 
* @param docNumbers document numbers to retrieve frequency vectors from 
* @param fieldNames field names to retrieve frequency vectors from 
* @param stopWords terms to ignore 
* @return a map of each term to its frequency 
* @throws IOException 
private Map<String,Integer> getTermFrequencyMap(IndexReader indexReader, List<Integer> docNumbers, String[] fieldNames, Set<String> stopWords) 
throws IOException { 
    Map<String,Integer> totalTfv = new HashMap<String,Integer>(1024); 

    for (Integer docNum : docNumbers) { 
     for (String fieldName : fieldNames) { 
      TermFreqVector tfv = indexReader.getTermFreqVector(docNum, fieldName); 
      if (tfv == null) { 
       // ignore empty fields 

      String terms[] = tfv.getTerms(); 
      int termCount = terms.length; 
      int freqs[] = tfv.getTermFrequencies(); 

      for (int t=0; t < termCount; t++) { 
       String term = terms[t]; 
       int freq = freqs[t]; 

       // filter out single-letter words and stop words 
       if (StringUtils.length(term) < 2 || 
        stopWords.contains(term)) { 
        continue; // stop 

       Integer totalFreq = totalTfv.get(term); 
       totalFreq = (totalFreq == null) ? freq : freq + totalFreq; 
       totalTfv.put(term, totalFreq); 

    return totalTfv; 

PS:您必須事先將每個字段配置為存儲術語頻率向量!例如:`@Field(index = Index.TOKENIZED, termVector = TermVector.YES) public String getAbstract() { return this.abstract_; }` – 2012-01-17 21:44:17


非常感謝你,有沒有什麼方法可以計算這些數字內的tf-idf值? http://stackoverflow.com/questions/9189179/extract-tf-idf-vectors-with-lucene – orezvani 2012-02-08 07:17:04


不適用於lucene 4.x – Umingo 2014-10-26 22:01:10