
Calculating TF-IDF scores with Lucene

This is my program for computing the TF-IDF value of each term in a collection of documents. It works correctly, but computing the "IDF" value (finding the number of documents that contain a particular term) takes a very long time.

Is there a more efficient way to find the number of documents that contain a particular term?

freq = termsFreq.getTermFrequencies();
terms = termsFreq.getTerms();

int noOfTerms = terms.length;
score = new float[noOfTerms];
DefaultSimilarity simi = new DefaultSimilarity();

for (i = 0; i < noOfTerms; i++) {
    // This call is the bottleneck: it runs a full search per term
    int noofDocsContainTerm = noOfDocsContainTerm(terms[i]);
    float tf = simi.tf(freq[i]);
    float idf = simi.idf(noofDocsContainTerm, noOfDocs);
    score[i] = tf * idf;
}

////

public int noOfDocsContainTerm(String querystr) throws CorruptIndexException, IOException, ParseException {
    QueryParser qp = new QueryParser(Version.LUCENE_35, "docuemnt", new StandardAnalyzer(Version.LUCENE_35));
    Query q = qp.parse(querystr);

    int hitsPerPage = docNames.length; // minimum number of search results
    IndexSearcher searcher = new IndexSearcher(ramMemDir, true);
    TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true);
    searcher.search(q, collector);

    ScoreDoc[] hits = collector.topDocs().scoreDocs;
    return hits.length; // number of documents that matched the term
}
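
For reference, Lucene 3.x's DefaultSimilarity computes the two factors used above as follows (these are the library's defaults; a custom Similarity may differ):

    tf(freq) = sqrt(freq)
    idf(docFreq, numDocs) = 1 + ln(numDocs / (docFreq + 1))

So the only expensive input in the loop is docFreq, the number of documents containing the term.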

Hello, I have been trying to implement a program that calculates the TF-IDF values of the documents in a collection, and I have been stuck for several days. If you don't mind, could you share how you did it? Thanks. – fuschia 2015-02-10 16:28:56


@fuschia see the answer I posted – Kasun 2015-02-16 07:43:59

Answers




import java.io.*;
import java.util.*;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.*;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.*;
import org.apache.lucene.store.NIOFSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;


/*
 * Date: April 14, 2012   Author: Kasun Perera   Changes: Created
 */

/**
 * Contains methods for indexing documents with Lucene and for calculating
 * TF-IDF weights.
 */
public class DocIndexer { 

private String docNames[]; 
private String docIDS[]; 
private String pathToIndex; 
private String pathToDocumentCollection; 
private String fiboTermList[]; //marked up fibo terms 
private String taxoTermList[]; // marked up taxonomy terms 
private RAMDirectory ramMemDir; 
private String fileNames[]; 
private byte files[][]; 
private String filesInText[]; 
int noOfWordsOfDOc[]; 
int noOfSentencesOfDoc[]; 
ArrayList<String> ArrLstSentencesOfDoc[]; 
String removedTermsOfDOc[][]; 
int freqAfterRemovalOfDoc[][]; 
//int queryDocIndex ; 
private int curDocNo; 
private final int maxTerms = 1000000; 




/**
 * Constructor used when the indexing directory is a RAM directory. A RAM
 * directory is needed because the Stratoes server doesn't allow access to
 * local files.
 *
 * @param pathToIndex path to the document index
 * @param pathToDocumentCollection path to the document collection
 */
public DocIndexer(String pathToIndex, String pathToDocumentCollection) {
    this.pathToIndex = pathToIndex;
    this.pathToDocumentCollection = pathToDocumentCollection;
}




/** 
* Count the number of words in a given String 
* 
* @param line- Input String 
* @return - number of words in the input String 
*/ 
private int wordCount(String line) { 
    int numWords = 0; 
    int index = 0; 
    boolean prevWhiteSpace = true; 
    while (index < line.length()) { 
     char c = line.charAt(index++); 
     boolean currWhiteSpace = Character.isWhitespace(c); 
     if (prevWhiteSpace && !currWhiteSpace) { 
      numWords++; 
     } 
     prevWhiteSpace = currWhiteSpace; 
    } 
    return numWords; 
} 

/*
 * Given its path, this method reads a text file and returns its contents.
 */
public static String fileReader(String filename) throws IOException {
    StringBuilder filetext = new StringBuilder();
    BufferedReader reader = new BufferedReader(new FileReader(new File(filename)));
    String line;

    while ((line = reader.readLine()) != null) {
        filetext.append(" ").append(line);
    }

    reader.close();
    return filetext.toString();
}

/**
 * Indexes each document in the collection using its content. A "docid"
 * field is indexed as well, because Lucene doesn't retrieve documents in
 * the order they were indexed.
 *
 * @throws IOException
 */
public void indexDocs() throws IOException { 
    File folder = new File(pathToDocumentCollection); 
    File[] listOfFiles = folder.listFiles(); 
    int noOfFiles = listOfFiles.length; 
    System.out.println("Number of files : " + noOfFiles); 

    IndexWriter iW; 
    int indexDocCount = 0; 
    try { 
     NIOFSDirectory dir = new NIOFSDirectory(new File(pathToIndex)); 
     iW = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_36, new WhitespaceAnalyzer(Version.LUCENE_36))); 

     for (int i = 0; i < noOfFiles; i++) {
      if (listOfFiles[i].isFile()) {
       String docName = listOfFiles[i].getName();
       System.out.println("doc name: " + docName + ", length: " + listOfFiles[i].length());
       if (listOfFiles[i].length() > 1) {
        String filesInText = fileReader(pathToDocumentCollection + docName);
        System.out.println("Added to index : " + docName);

        // Strip numbers before indexing the content
        StringReader strRdElt = new StringReader(filesInText.replaceAll("\\d+(?:[.,]\\d+)*\\s*", ""));
        // Use the file name (without the extension) as a unique doc id
        StringReader docId = new StringReader(docName.substring(0, docName.length() - 4));

        org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();
        doc.add(new Field("doccontent", strRdElt, Field.TermVector.YES));
        doc.add(new Field("docid", docId, Field.TermVector.YES));
        iW.addDocument(doc);
        indexDocCount++;
       }
      }
     }

     System.out.println("no of documents added to index : " + indexDocCount); 

     iW.close(); 
     // dir.close() ; 
    } catch (CorruptIndexException e) { 
     e.printStackTrace(); 
    } catch (IOException e) { 
     e.printStackTrace(); 
    } 
} 



/**
 * Calculates the TF-IDF score for each term in the indexed documents.
 *
 * @param numberOfDocs number of documents in the index
 * @return map from document id to a (term -> TF-IDF score) map
 * @throws CorruptIndexException
 * @throws ParseException
 */
public HashMap<Integer, HashMap> tfIdfScore(int numberOfDocs) throws CorruptIndexException, ParseException { 

    HashMap<Integer, HashMap> scoreMap = new HashMap<Integer, HashMap>();


    try { 

     IndexReader re = IndexReader.open(NIOFSDirectory.open(new File(pathToIndex)), true);
     // Total number of documents, read from the index itself
     int noOfDocs = re.numDocs();

     for (int k = 0; k < numberOfDocs; k++) {
      HashMap<String, Float> wordMap = new HashMap<String, Float>();

      // Term vectors of the content field and of the "docid" field
      TermFreqVector termsFreq = re.getTermFreqVector(k, "doccontent");
      TermFreqVector termsFreqDocId = re.getTermFreqVector(k, "docid");

      int aInt = Integer.parseInt(termsFreqDocId.getTerms()[0]);
      int freq[] = termsFreq.getTermFrequencies();
      String terms[] = termsFreq.getTerms();

      int noOfTerms = terms.length;
      DefaultSimilarity simi = new DefaultSimilarity();
      for (int i = 0; i < noOfTerms; i++) {
       // docFreq() reads the document frequency straight from the
       // index -- far cheaper than running a search per term
       int noofDocsContainTerm = re.docFreq(new Term("doccontent", terms[i]));
       float tf = simi.tf(freq[i]);
       float idf = simi.idf(noofDocsContainTerm, noOfDocs);
       wordMap.put(terms[i], (tf * idf));
      }
      scoreMap.put(aInt, wordMap);
     }

     re.close();


    } catch (IOException e) {
     e.printStackTrace();
    }


    return scoreMap; 
} 


public HashMap<Integer, HashMap> getTFIDF() throws IOException, CorruptIndexException, ParseException {
    // Read the document count from the index instead of relying on a
    // docNames array that is never initialised
    IndexReader re = IndexReader.open(NIOFSDirectory.open(new File(pathToIndex)), true);
    int noOfDocs = re.numDocs();
    re.close();

    return tfIdfScore(noOfDocs);
}

}
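
A minimal usage sketch (the paths are placeholders; the caller must handle the declared IOException/ParseException):

    DocIndexer indexer = new DocIndexer("/path/to/index", "/path/to/docs/");
    indexer.indexDocs();                                   // build the index once
    HashMap<Integer, HashMap> scores = indexer.getTFIDF(); // docid -> (term -> TF-IDF)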


If you have a term and want its document frequency, i.e. the number of documents that contain the term: call the IndexReader.terms(Term) method. It gives you a TermEnum object. Then call the TermEnum.docFreq() method. It gives you the term's document frequency in the index.
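
A minimal sketch of this (Lucene 3.x API; the index path, field name, and term are placeholders):

    IndexReader reader = IndexReader.open(NIOFSDirectory.open(new File("/path/to/index")), true);
    Term term = new Term("doccontent", "example");

    // The one-call shortcut: document frequency straight from the reader
    int df = reader.docFreq(term);

    // The TermEnum route described above
    TermEnum termEnum = reader.terms(term); // positioned at the first term >= the given one
    if (termEnum.term() != null && termEnum.term().equals(term)) {
        df = termEnum.docFreq();
    }
    termEnum.close();
    reader.close();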