2017-05-05 118 views
1

我用下面的代碼取得了 PDF 的 XMP 元數據,輸出如下所示。如何從這個輸出中取得每個屬性的值(例如「pdf:Keywords」)?也就是說,如何使用 Java 從 PDMetadata(xmpmeta)中查找 <pdf:Keywords>?

// Parse the PDF and print the raw XMP packet stored in its document catalog.
// Both the input stream and the parsed document hold resources, so each is
// closed in a finally block even when parsing or reading the metadata fails.
InputStream in = new FileInputStream(pdfFile);
try
{
    PDFParser parser = new PDFParser(in);
    parser.parse();
    PDDocument document = parser.getPDDocument();
    try
    {
        PDMetadata metadata = document.getDocumentCatalog().getMetadata();
        // The catalog may carry no metadata stream at all; print nothing in that case.
        if (metadata != null)
        {
            System.out.println(metadata.getInputStreamAsString());
        }
    }
    finally
    {
        document.close();
    }
}
finally
{
    in.close();
}

輸出如下:

metadata<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?> 
<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="Adobe XMP Core 5.2-c001 63.139439, 2010/09/27-13:37:26  "> 
    <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> 
     <rdf:Description rdf:about="" 
      xmlns:pdf="http://ns.adobe.com/pdf/1.3/"> 
     <pdf:Producer>Acrobat Distiller 7.0 (Windows)</pdf:Producer> 
     <pdf:Keywords>F3392|4|Name 2016|02|2016|04|Sub111 |Three Hours|30|(5X1=5),(5X2=10), (3X5=15)&#xD;&#xA;</pdf:Keywords> 
     </rdf:Description> 
     <rdf:Description rdf:about="" 
      xmlns:pdfx="http://ns.adobe.com/pdfx/1.3/"> 
     <pdfx:semester>02</pdfx:semester> 
     </rdf:Description> 
    </rdf:RDF> 
</x:xmpmeta>   
<?xpacket end="w"?> 

回答

0

你需要 Apache PDFBox 以及它已棄用的 JempBox 庫(Adobe XMP(TM) 規範的一個實現;PDFBox 現在改用 XmpBox,但它更加嚴格,無法處理大多數 PDF 的元數據)。

// Load the PDF and print its metadata: the XMP packet when the catalog has one,
// otherwise the classic document-information dictionary.
// The document is closed in a finally block so it is released even when
// reading the metadata throws.
PDDocument document = PDDocument.load(inputStream);
try
{
    PDDocumentCatalog cat = document.getDocumentCatalog();
    PDMetadata metadata = cat.getMetadata();

    if (metadata != null)
    {
        try
        {
            XMPMetadata meta = XMPMetadata.load(metadata.exportXMPMetadata());

            // Generic access to the Dublin Core schema through the base XMPSchema API.
            // NOTE(review): the names passed to getBagList() carry no namespace prefix
            // ("author", "creator"); they may never match a "dc:"-qualified element.
            // Confirm against the JempBox documentation.
            XMPSchema xs = meta.getDublinCoreSchema();
            if (xs != null)
            {
                System.out.println("About: " + xs.getAbout());
                List<String> bagList = xs.getBagList("author");
                if (bagList != null)
                {
                    System.out.println("BagList: ");
                    ShowString(bagList);
                }
                List<String> creators = xs.getBagList("creator");
                if (creators != null)
                {
                    System.out.println("Creators: ");
                    ShowString(creators);
                }
            }

            // Typed access to the Dublin Core schema.
            XMPSchemaDublinCore dc = meta.getDublinCoreSchema();
            if (dc != null)
            {
                System.out.println("Title: " + dc.getTitle());
                System.out.println("Description:" + dc.getDescription());
                System.out.println("Source:" + dc.getSource());
                System.out.println("Identifier:" + dc.getIdentifier());
                System.out.println("Coverage:" + dc.getCoverage());
                System.out.println("About:" + dc.getAbout());
                List<String> relationships = dc.getRelationships();
                if (relationships != null)
                {
                    System.out.println("Relationships: ");
                    ShowString(relationships);
                }
                List<String> contributors = dc.getContributors();
                if (contributors != null)
                {
                    System.out.println("Contributors: ");
                    ShowString(contributors);
                }
                List<String> creators = dc.getCreators();
                if (creators != null)
                {
                    System.out.println("Creators: ");
                    ShowString(creators);
                }
                List<Calendar> dates = dc.getDates();
                if (dates != null)
                {
                    System.out.println("Dates: ");
                    ShowCalendar(dates);
                }
                List<String> subjects = dc.getSubjects();
                if (subjects != null)
                {
                    System.out.println("Subjects: ");
                    ShowString(subjects);
                }
                List<String> publishers = dc.getPublishers();
                if (publishers != null)
                {
                    System.out.println("Publishers: ");
                    ShowString(publishers);
                }
                List<String> languages = dc.getLanguages();
                if (languages != null)
                {
                    System.out.println("Languages: ");
                    ShowString(languages);
                }
            }

            // Adobe PDF schema: this is where <pdf:Keywords> and <pdf:Producer> live.
            XMPSchemaPDF pdf = meta.getPDFSchema();
            if (pdf != null)
            {
                System.out.println("Keywords:" + pdf.getKeywords());
                System.out.println("PDF Version:" + pdf.getPDFVersion());
                System.out.println("PDF Producer:" + pdf.getProducer());
            }

            // XMP Basic schema.
            XMPSchemaBasic basic = meta.getBasicSchema();
            if (basic != null)
            {
                // Guard the date properties: dereferencing a missing date would throw a
                // NullPointerException and skip every remaining line of output.
                Calendar createDate = basic.getCreateDate();
                Calendar modifyDate = basic.getModifyDate();
                System.out.println("Create Date:" + (createDate != null ? createDate.getTime() : null));
                System.out.println("Modify Date:" + (modifyDate != null ? modifyDate.getTime() : null));
                System.out.println("Creator Tool:" + basic.getCreatorTool());
                System.out.println("Label:" + basic.getLabel());
                System.out.println("About:" + basic.getAbout());
                System.out.println("Nickname:" + basic.getNickname());
                System.out.println("Title:" + basic.getTitle());
            }
        }
        catch (Exception e)
        {
            // Best effort: a malformed XMP packet is reported, not propagated.
            System.err.println("An error occurred while parsing the meta data: "
                    + e.getMessage());
        }
    }
    else
    {
        // No XMP packet: fall back to the document-information dictionary.
        PDDocumentInformation info = document.getDocumentInformation();
        System.out.println("Title:" + info.getTitle());
        System.out.println("Author:" + info.getAuthor());
        System.out.println("Subject:" + info.getSubject());
        System.out.println("Keywords:" + info.getKeywords());
        System.out.println("Creator:" + info.getCreator());
        System.out.println("Producer:" + info.getProducer());
        // Either date may be missing from the dictionary; print null instead of
        // failing with a NullPointerException.
        Calendar creationDate = info.getCreationDate();
        Calendar modificationDate = info.getModificationDate();
        System.out.println("Creation Date:" + (creationDate != null ? creationDate.getTime() : null));
        System.out.println("Modification Date:" + (modificationDate != null ? modificationDate.getTime() : null));
        System.out.println("Trapped:" + info.getTrapped());
    }
}
finally
{
    document.close();
}

Direct download of PDFBox jar

PDFBox Maven link

Direct download of JempBox jar

JempBox Maven link