2014-12-03 117 views
-1

使用 PDFBox 庫將 PDF 逐頁轉換爲 HTML

public class ExtractText {

/**
 * Not instantiable: the constructor is private and the class is used
 * through its static methods only.
 */
private ExtractText()
{
    // intentionally empty - static class
}


/**
 * Loads the PDF named by {@code pdfFile}, saves its embedded font programs
 * under {@code imgDestination}, writes the text of every page to its own
 * file ({@code f5 + pageNumber + ".html"}) and prints the document
 * information to standard output.
 *
 * <p>When {@code toConsole} is set the page text is written to standard
 * output instead of the per-page files.
 *
 * <p>Any exception raised while processing is printed with its stack trace
 * and not rethrown; the writer and the document are always closed.
 *
 * @param args command-line arguments; not read by this method
 * @throws Exception if closing the writer or the document fails
 */
public static void main(String[] args) throws Exception
{
    if (l != null)
    {
        System.out.println("HERE" + l.length);
        deleteSubs(op);
        System.out.println("Then" + l.length);
    }
    else
    {
        System.out.println("WHERE");
    }

    File y = new File(imgDes);
    if (!y.exists())
    {
        y.mkdirs();
    }

    File z = new File(imgDestination);
    if (!z.exists())
    {
        z.mkdirs();
    }

    // outputFile may legitimately still be null here: a default name is
    // derived from the PDF name further down, so guard before new File(...).
    if (outputFile != null)
    {
        File fr = new File(outputFile);
        if (!fr.isDirectory())
        {
            fr.delete();
        }
    }

    // Defaults to text files
    String ext = ".txt";
    Writer output = null;
    PDDocument document = null;
    try
    {
        try
        {
            URL url = new URL(pdfFile);

            document = PDDocument.load(url, force);

            String fileName = url.getFile();
            if (outputFile == null && fileName.length() > 4)
            {
                outputFile = new File(fileName.substring(0, fileName.length() - 4) + ext).getName();
            }
        }
        catch (MalformedURLException e)
        {
            // pdfFile is not a URL: fall back to loading it as a file name
            document = PDDocument.load(pdfFile, force);

            if (outputFile == null && pdfFile.length() > 4)
            {
                outputFile = pdfFile.substring(0, pdfFile.length() - 4) + ext;
            }
        }

        if (document.isEncrypted())
        {
            StandardDecryptionMaterial sdm = new StandardDecryptionMaterial(password);
            document.openProtection(sdm);
            AccessPermission ap = document.getCurrentAccessPermission();
            if (!ap.canExtractContent())
            {
                throw new IOException("You do not have permission to extract text");
            }
        }

        if ((encoding == null) && (toHTML))
        {
            encoding = "UTF-8";
        }

        if (toConsole)
        {
            output = new OutputStreamWriter(System.out);
        }
        else
        {
            // NOTE(review): outputFile is still opened as before, but in file
            // mode the page text now goes to the per-page files, so this file
            // stays empty - drop it if the single-file output is not wanted.
            output = openWriter(outputFile);
        }

        PDFTextStripper4 stripper = null;
        if (toHTML)
        {
            stripper = new PDFText2HTML(encoding);
        }
        else
        {
            stripper = new PDFTextStripper4(encoding);
        }

        // NOTE(review): as in the original, nothing is extracted unless the
        // imgDestination directory exists - confirm that this is intended.
        File f = new File(imgDestination);
        if (f.exists())
        {
            extractEmbeddedFonts();
            writePages(document, stripper, output);
            printDocumentInformation(document);
        }
    }
    catch (Exception e)
    {
        e.printStackTrace();
    }
    finally
    {
        if (output != null)
        {
            output.close();
        }
        if (document != null)
        {
            document.close();
        }
    }
}

/**
 * Opens a writer on {@code path}, using {@code encoding} when it is set and
 * the platform default encoding otherwise.
 */
private static Writer openWriter(String path) throws IOException
{
    FileOutputStream stream = new FileOutputStream(path);
    if (encoding == null)
    {
        //use default encoding
        return new OutputStreamWriter(stream);
    }
    try
    {
        return new OutputStreamWriter(stream, encoding);
    }
    catch (IOException e)
    {
        // unsupported encoding name: do not leak the already opened stream
        stream.close();
        throw e;
    }
}

/**
 * Writes each page of {@code document} separately: to {@code console} when
 * {@code toConsole} is set, otherwise to {@code f5 + pageNumber + ".html"}.
 */
private static void writePages(PDDocument document, PDFTextStripper4 stripper, Writer console)
        throws IOException
{
    // these options do not change between pages, so set them once
    stripper.setForceParsing(force);
    stripper.setSortByPosition(true);
    stripper.setShouldSeparateByBeads(separateBeads);

    int pageCount = document.getDocumentCatalog().getAllPages().size();
    // the text stripper counts pages from 1, so iterate 1..pageCount
    for (int page = 1; page <= pageCount; ++page)
    {
        System.out.println("I am in for loop");
        stripper.setStartPage(page);
        stripper.setEndPage(page);

        if (toConsole)
        {
            // keep the shared console writer open; it is closed by main
            stripper.writeText(document, console);
            console.flush();
        }
        else
        {
            Writer pageWriter = openWriter(f5 + page + ".html");
            try
            {
                stripper.writeText(document, pageWriter);
            }
            finally
            {
                pageWriter.close();
            }
        }
    }
}

/**
 * Re-opens {@code pdfFile} and saves the embedded font program of every
 * Type 1 (".pfb") and TrueType (".ttf") font found in the page resources
 * under {@code imgDestination}.
 */
private static void extractEmbeddedFonts() throws IOException
{
    PDDocument pd = PDDocument.load(pdfFile);
    try
    {
        int i = 0;
        List<PDPage> li = pd.getDocumentCatalog().getAllPages();
        for (PDPage page : li)
        {
            PDResources pdr = page.getResources();
            if (pdr == null)
            {
                // page carries no resources of its own
                continue;
            }

            Map<String, PDFont> m = pdr.getFonts();
            for (PDFont pdd : m.values())
            {
                System.out.println("----------" + pdd.getBaseFont());

                // Only a dictionary-backed descriptor can hold an embedded
                // font file; anything else (or null) has nothing to save.
                PDFontDescriptor fd = pdd.getFontDescriptor();
                PDFontDescriptorDictionary dict = null;
                if (fd instanceof PDFontDescriptorDictionary)
                {
                    dict = (PDFontDescriptorDictionary) fd;
                    // NOTE(review): re-sets the descriptor that was just read;
                    // kept from the original, looks redundant - confirm.
                    pdd.setFontDescriptor(dict);
                }
                System.out.println("tititititi" + pdd.getFontEncoding());

                if (pdd.isType1Font())
                {
                    PDStream pst = (dict == null) ? null : dict.getFontFile3();
                    System.out.println("In If " + pst);
                    if (pst != null)
                    {
                        saveFontProgram(pst, imgDestination + pdd.getBaseFont().toString() + ".pfb");
                        i++;
                        System.out.println(i);
                    }
                }
                else if (pdd.isTrueTypeFont())
                {
                    PDStream pst = (dict == null) ? null : dict.getFontFile2();
                    System.out.println("In Else-if" + pst);
                    if (pst != null)
                    {
                        saveFontProgram(pst, imgDestination + pdd.getBaseFont().toString() + ".ttf");
                        i++;
                        System.out.println(i);
                    }
                }
                else if (pdd.isSymbolicFont())
                {
                    System.out.println("Symbol.......");
                }
                else
                {
                    System.out.println("In Else");
                }
            }
        }
    }
    finally
    {
        pd.close();
    }
}

/**
 * Copies the content of {@code program} into the file {@code targetPath},
 * closing both streams even when the copy fails.
 */
private static void saveFontProgram(PDStream program, String targetPath) throws IOException
{
    java.io.InputStream in = program.createInputStream();
    try
    {
        FileOutputStream fos = new FileOutputStream(new File(targetPath));
        try
        {
            IOUtils.copy(in, fos);
        }
        finally
        {
            fos.close();
        }
    }
    finally
    {
        in.close();
    }
}

/**
 * Prints the page count and the document information entries of
 * {@code document} to standard output.
 */
private static void printDocumentInformation(PDDocument document) throws IOException
{
    PDDocumentInformation info = document.getDocumentInformation();
    System.out.println("Page Count=" + document.getNumberOfPages());
    System.out.println("Title=" + info.getTitle());
    System.out.println("Author=" + info.getAuthor());
    System.out.println("Subject=" + info.getSubject());
    System.out.println("Keywords=" + info.getKeywords());
    System.out.println("Creator=" + info.getCreator());
    System.out.println("Producer=" + info.getProducer());
    System.out.println("Creation Date=" + info.getCreationDate());
    System.out.println("Modification Date=" + info.getModificationDate());
    System.out.println("Trapped=" + info.getTrapped());
}


/**
 * Deletes {@code op} together with everything below it: sub-directories
 * are emptied recursively, plain files are removed directly, and finally
 * {@code op} itself is deleted.
 *
 * @param op the file or directory to remove
 */
private static void deleteSubs(File op)
{
    File[] children = op.listFiles();
    System.out.print("In delete folder");

    // listFiles() gives null when there is nothing to iterate over
    // (for example when op is not a directory), so guard in the loop header
    for (int idx = 0; children != null && idx < children.length; idx++)
    {
        File child = children[idx];
        if (!child.isDirectory())
        {
            child.delete();
            continue;
        }
        deleteSubs(child);
    }

    op.delete();
}

}

現在我可以把整個 PDF 轉換成一個 HTML 文件(只提取文本,不包含圖片),但我希望 PDF 的每一頁各自生成一個單獨的 HTML 文件。任何這方面的解決方案都會對我非常有幫助,謝謝。

回答

0

答案就在你的問題裏:只需設置

// Restrict extraction to a single page: start and end are the same page number p
stripper.setStartPage(p); 
    stripper.setEndPage(p); 

所以,你的循環應該類似這樣:

int pageCount = document.getDocumentCatalog().getAllPages().size();
// The text stripper counts pages from 1, so iterate 1..pageCount
for (int p = 1; p <= pageCount; ++p)
{
    //... your options
    stripper.setStartPage(p);
    stripper.setEndPage(p);
    // writeText expects a Writer, so wrap the per-page stream;
    // use the same encoding that was passed to the stripper
    Writer pageWriter = new OutputStreamWriter(
            new FileOutputStream(new File(f5+p+".html")), "UTF-8");
    try
    {
        stripper.writeText(document, pageWriter);
    }
    finally
    {
        // close even when extraction fails so the file handle is not leaked
        pageWriter.close();
    }
}

順便說一句,如果你遇到有關排序比較的異常,請使用 setSortByPosition(false),或者等待 1.8.8 版本,該版本已修復這個問題。

+0

先生,先拆分我的 PDF,然後再將每個頁面轉換爲 .html,這是一種可行的方式嗎? – user095736 2014-12-03 12:15:26

+0

當然可以,如果你喜歡既麻煩又慢的方式 :-) 下載命令行應用程序,並使用 PDFSplit 命令: https://pdfbox.apache.org/commandline/#pdfSplit 。但我提出的解決方案會更快。 – 2014-12-03 12:18:20

+0

for(int p = 0; p user095736 2014-12-03 12:21:10