2016-08-02 51 views
0

iTextSharp的GetTextFromPage不返回。這個問題出現在iTextSharp 5.5.8和5.5.9中,我的測試代碼如下:

{
    // Test harness: opens the PDF, extracts the text of every page and then
    // dumps all AcroForm field values, collecting everything (including any
    // exception text) into `actual`.
    PdfReader pdfReader = null;
    StringBuilder actual = new StringBuilder();

    try
    {
        pdfReader = new PdfReader(@"Quotation for Macbook 6-16.pdf");
    }
    catch (iTextSharp.text.exceptions.BadPasswordException bpe)
    {
        actual.AppendLine(string.Format("Exception: Bad Password {0}", bpe));
    }
    catch (Exception ex)
    {
        actual.AppendLine(string.Format("Exception: PDFReader {0}", ex));
    }

    // If the reader could not be created the failure has already been logged
    // above; skip the extraction instead of dereferencing a null reader.
    if (pdfReader != null)
    {
        try
        {
            int pages = pdfReader.NumberOfPages;
            for (int page = 1; page <= pages; page++)
            {
                try
                {
                    String s = PdfTextExtractor.GetTextFromPage(pdfReader, page);
                    actual.AppendLine(string.Format("{0}", s));
                }
                catch (Exception ex)
                {
                    // Keep going: one bad page should not hide the others.
                    actual.AppendLine(string.Format("Exception PDF Page {0}: {1}", page, ex));
                }
            }

            foreach (var field in pdfReader.AcroFields.Fields)
            {
                actual.AppendLine(string.Format("{0}: {1}", field.Key, pdfReader.AcroFields.GetField(field.Key)));
            }
        }
        finally
        {
            // Release the underlying file even if extraction fails part-way.
            pdfReader.Close();
        }
    }
}

我已經用GetTextFromPage處理了成千上萬個PDF文件,但遇到某個特定的PDF時,調用根本不返回。我從GitHub下載了源代碼並單步調試了這個文件的處理過程,看起來是LineDashPattern.cs中的InitFirst在下面這個while條件上陷入了無限循環:

 // Positions the pattern at the element in which the given dash phase falls
 // and stores that (possibly partial) element in currentElem.
 // Does nothing when the dash array is empty.
 private void InitFirst(float phase) { 
     if (dashArray.Size > 0) { 
      // Consume the phase one dash-array element at a time, wrapping around
      // the array and counting every element that was stepped over.
      // NOTE(review): per the report around this snippet, with a very large
      // phase (about 6.4E+8) subtracting an element such as 28.8 does not
      // change `phase` at float precision, so this loop never terminates.
      // An array whose elements are all zero would not reduce `phase` either.
      while (phase > 0) { 
       phase -= dashArray.GetAsNumber(currentIndex).FloatValue; 
       currentIndex = (currentIndex + 1) % DashArray.Size; 
       elemOrdinalNumber++; 
      } 

      if (phase < 0) { 
       // Overshot: the phase ends inside the element just consumed, so step
       // back to it and start with only its remaining length (-phase).
       --elemOrdinalNumber; 
       --currentIndex; 
       currentElem = new DashArrayElem(-phase, IsEven(elemOrdinalNumber)); 
      } else { 
       // The phase ends exactly on an element boundary (or was zero): start
       // with the full element at the current index.
       currentElem = new DashArrayElem(dashArray.GetAsNumber(currentIndex).FloatValue, 
        IsEven(elemOrdinalNumber)); 
      } 
     } 
    } 

傳入的相位(phase)是6.44245E+8,dashArray中有兩個元素:28.8和9.6。相位這麼大時,第一個while循環會卡住,因爲在float的精度下,減去28.8不足以讓相位的值發生變化。

我對內部知識不夠了解,或者我會考慮進行更改。

我真的只對提取文本感興趣,所以如果有一個設置我可以實現來過濾出對我來說也適用的行處理。

+0

好的。作爲循環條件的浮點運算是不可取的...... – mkl

+0

我糾正並測試了這個問題,並將修改後的LineDashPattern.cs文件發送到[email protected]。修訂版本基本上將模式長度除以階段,並通過現有例程的其餘部分處理剩餘部分。 – Lee

+0

我會建議你在這裏發佈修改後的代碼,以便其他需要修復的人也能在手邊。 iText 7是目前主要使用的版本,因此可能需要一些時間才能在官方發行版中應用iText 5.5.x修復程序。 – mkl

回答

1

我更新了LineDashPattern.cs文件。我正在使用iTextSharp,據我所知5.5.9是最新版本,所以iText 7可能是Java。

無論如何,這裏是我更新的代碼。我添加了一個elts(線元素的總和)作爲該類中的私有字段,更新了dashArray屬性設置例程以基於當前的dashArray更新elts,並最終更新了InitFirst方法以將該階段除以elts做一個批量在一個語句中的計算然後落入原始代碼中以找到實際元素。

我不清楚通常傳入這個例程的相位值有多大,但即使原來的循環能夠正常遞減相位,以我遇到的這個值也要循環將近1700萬次,所以這個修改應該快得多;而且處理這個PDF時該方法會被多次調用,性能提升就更明顯,更不用說還修復了這個錯誤。完整的文件代碼如下:

/* 
* $Id$ 
* 
* This file is part of the iText (R) project. 
* Copyright (c) 1998-2016 iText Group NV 
* Authors: Bruno Lowagie, Paulo Soares, et al. 
* 
* This program is free software; you can redistribute it and/or modify 
* it under the terms of the GNU Affero General Public License version 3 
* as published by the Free Software Foundation with the addition of the 
* following permission added to Section 15 as permitted in Section 7(a): 
* FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY 
* ITEXT GROUP. ITEXT GROUP DISCLAIMS THE WARRANTY OF NON INFRINGEMENT 
* OF THIRD PARTY RIGHTS 
* 
* This program is distributed in the hope that it will be useful, but 
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 
* or FITNESS FOR A PARTICULAR PURPOSE. 
* See the GNU Affero General Public License for more details. 
* You should have received a copy of the GNU Affero General Public License 
* along with this program; if not, see http://www.gnu.org/licenses or write to 
* the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 
* Boston, MA, 02110-1301 USA, or download the license from the following URL: 
* http://itextpdf.com/terms-of-use/ 
* 
* The interactive user interfaces in modified source and object code versions 
* of this program must display Appropriate Legal Notices, as required under 
* Section 5 of the GNU Affero General Public License. 
* 
* In accordance with Section 7(b) of the GNU Affero General Public License, 
* a covered work must retain the producer line in every PDF that is created 
* or manipulated using iText. 
* 
* You can be released from the requirements of the license by purchasing 
* a commercial license. Buying such a license is mandatory as soon as you 
* develop commercial activities involving the iText software without 
* disclosing the source code of your own applications. 
* These activities include: offering paid services to customers as an ASP, 
* serving PDFs on the fly in a web application, shipping iText with a closed 
* source product. 
* 
* For more information, please contact iText Software Corp. at this 
* address: [email protected] 
*/ 

using System.util; 
using iTextSharp.awt.geom; 

namespace iTextSharp.text.pdf.parser { 

    /** 
    * Represents the line dash pattern. The line dash pattern shall control the pattern 
    * of dashes and gaps used to stroke paths. It shall be specified by a dash array and 
    * a dash phase. 
    * 
    * @since 5.5.6 
    */ 
    public class LineDashPattern {

        private PdfArray dashArray;
        private float dashPhase;

        // Index into dashArray of the element that Next() will advance from.
        private int currentIndex;
        // 1-based ordinal of the current element; only its parity is used
        // (see IsEven) to tell dashes from gaps.
        private int elemOrdinalNumber = 1;
        private DashArrayElem currentElem;

        /**
        * Creates new {@link LineDashPattern} object.
        * @param dashArray The dash array. See {@link #getDashArray()}
        * @param dashPhase The dash phase. See {@link #getDashPhase()}
        */
        public LineDashPattern(PdfArray dashArray, float dashPhase) {
            this.dashArray = new PdfArray(dashArray);
            this.dashPhase = dashPhase;
            InitFirst(dashPhase);
        }

        /**
        * Getter and setter for the dash array.
        *
        * The dash array's elements are numbers that specify the lengths of
        * alternating dashes and gaps; the numbers are nonnegative. The
        * elements are expressed in user space units.
        *
        * The setter only replaces the array; call {@link #reset()} to
        * re-position the pattern afterwards.
        *
        * @return The dash array.
        */
        public PdfArray DashArray {
            get { return dashArray; }
            set { dashArray = value; }
        }

        /**
        * Getter and setter for the dash phase.
        *
        * The dash phase shall specify the distance into the dash pattern at which
        * to start the dash. The elements are expressed in user space units.
        *
        * @return The dash phase.
        */
        public float DashPhase {
            get { return dashPhase; }
            set { dashPhase = value; }
        }

        /**
        * Calculates and returns the next element which is either gap or dash.
        * @return The next dash array's element (the element that was current
        *         before this call); the pattern is advanced by one element
        *         when the dash array is not empty.
        */
        public DashArrayElem Next() {
            DashArrayElem ret = currentElem;

            if (dashArray.Size > 0) {
                currentIndex = (currentIndex + 1) % DashArray.Size;
                currentElem = new DashArrayElem(dashArray.GetAsNumber(currentIndex).FloatValue,
                    IsEven(++elemOrdinalNumber));
            }

            return ret;
        }

        /**
        * Checks whether the dashed pattern is solid or not. It's solid when the
        * size of a dash array is even and sum of all the units off in the array
        * is 0.<br/>
        * For example: [3 0 4 0 5 0 6 0] (sum is 0), [3 0 4 0 5 1] (sum is 1).
        */
        public bool IsSolid() {
            if (dashArray.Size % 2 != 0) {
                return false;
            }

            float unitsOffSum = 0;

            // Odd indices hold the "off" (gap) lengths.
            for (int i = 1; i < dashArray.Size; i += 2) {
                unitsOffSum += dashArray.GetAsNumber(i).FloatValue;
            }

            return Util.Compare(unitsOffSum, 0) == 0;
        }

        /**
        * Resets the dash array so that the {@link #next()} method will start
        * from the beginning of the dash array.
        */
        public void Reset() {
            currentIndex = 0;
            elemOrdinalNumber = 1;
            InitFirst(dashPhase);
        }

        /**
        * Positions the pattern at the element in which the given phase falls
        * and stores that (possibly partial) element as the current one.
        * Does nothing when the dash array is empty.
        *
        * Whole repetitions of the pattern are skipped arithmetically instead
        * of being walked element by element, so a huge phase neither takes
        * millions of iterations nor stalls when a single element is too small
        * to change the phase at floating point precision.
        *
        * @param phase The distance into the dash pattern at which to start.
        */
        private void InitFirst(float phase) {
            int size = dashArray.Size;
            if (size == 0) {
                return;
            }

            // Length of one full repetition of the pattern. Computed here
            // (not cached) so it can never be out of sync with the array.
            double patternLength = 0;
            for (int i = 0; i < size; i++) {
                patternLength += dashArray.GetAsNumber(i).FloatValue;
            }

            double remaining = phase;

            if (remaining > 0) {
                if (patternLength > 0) {
                    // Reduce modulo TWO repetitions: this drops any even
                    // number of repetitions (which cannot change the dash/gap
                    // parity) without an integer cast that could overflow.
                    remaining %= 2 * patternLength;
                    if (remaining >= patternLength) {
                        // One odd repetition is left; account for its elements.
                        remaining -= patternLength;
                        elemOrdinalNumber += size;
                    }

                    // Walk the final, partial repetition element by element.
                    while (remaining > 0) {
                        remaining -= dashArray.GetAsNumber(currentIndex).FloatValue;
                        currentIndex = (currentIndex + 1) % size;
                        elemOrdinalNumber++;
                    }
                } else {
                    // Zero-length pattern: no element can consume the phase,
                    // so start at the first element instead of looping forever.
                    remaining = 0;
                }
            }

            if (remaining < 0) {
                // Overshot: the phase ends inside the element just consumed.
                // Step back to it (wrapping, so the index stays in range) and
                // start with only the part of it that is left.
                --elemOrdinalNumber;
                currentIndex = (currentIndex + size - 1) % size;
                currentElem = new DashArrayElem((float)(-remaining), IsEven(elemOrdinalNumber));
            } else {
                // The phase ends exactly on an element boundary.
                currentElem = new DashArrayElem(dashArray.GetAsNumber(currentIndex).FloatValue,
                    IsEven(elemOrdinalNumber));
            }
        }

        private bool IsEven(int num) {
            return (num % 2) == 0;
        }

        /**
        * A single element of the dash pattern: a length and a flag telling
        * whether it is a gap or a dash.
        */
        public class DashArrayElem {

            private float val;
            private bool isGap;

            public DashArrayElem(float val, bool isGap) {
                this.val = val;
                this.isGap = isGap;
            }

            public float Value
            {
                get { return val; }
                set { val = value; }
            }

            public bool IsGap
            {
                get { return isGap; }
                set { isGap = value; }
            }
        }
    }
}