2016-10-04 157 views
3

我參考了好幾個教程,但始終無法讓下面這段代碼運行起來。我已經按需要把 StringIO 換成了 BytesIO(我認爲這是必要的?)。環境:pdfminer,Python 3.5。

我不確定爲什麼 `banana` 沒有打印出任何內容;我猜這些錯誤可能只是誤導(red herring)?問題是不是出在我照着 Python 2.7 的教程、並試圖把它改寫成 Python 3?

errors: File "/Users/foo/PycharmProjects/Try/Pdfminer.py", line 28, in <module> 
    banana = convert("A1.pdf") 
    File "/Users/foo/PycharmProjects/Try/Pdfminer.py", line 19, in convert 
    infile = file(fname, 'rb') 
NameError: name 'file' is not defined 

腳本

from io import BytesIO 

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter 
from pdfminer.converter import TextConverter 
from pdfminer.layout import LAParams 
from pdfminer.pdfpage import PDFPage 

def convert(fname, pages=None):
    """Extract the text of a PDF file using pdfminer.

    Args:
        fname: Path of the PDF file to read.
        pages: Optional iterable of page numbers to extract. A falsy value
            (``None`` or an empty iterable) results in an empty set being
            passed to ``PDFPage.get_pages``, i.e. no page filter.

    Returns:
        The contents of the ``BytesIO`` buffer that the ``TextConverter``
        wrote the extracted text into.

    Raises:
        OSError: If ``fname`` cannot be opened for reading.
    """
    pagenums = set(pages) if pages else set()

    # `open` replaces the Python 2-only builtin `file`, which raises
    # NameError on Python 3. The input is opened first so that a missing
    # file fails before any converter state is set up; the `with` block
    # closes both the file and the buffer even when parsing fails.
    with open(fname, 'rb') as infile, BytesIO() as output:
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        interpreter = PDFPageInterpreter(manager, converter)
        try:
            for page in PDFPage.get_pages(infile, pagenums):
                interpreter.process_page(page)
        finally:
            converter.close()
        # Read the buffer before the `with` block closes it.
        return output.getvalue()

# Convert the sample document and print whatever convert() returned.
banana = convert("A1.pdf") 
print(banana) 

下面這個變體也出現了同樣的問題:

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter 
from pdfminer.converter import TextConverter 
from pdfminer.layout import LAParams 
from pdfminer.pdfpage import PDFPage 
from io import BytesIO 

def convert_pdf_to_txt(path):
    """Extract the text of every page of a PDF file using pdfminer.

    Args:
        path: Path of the PDF file to read.

    Returns:
        The contents of the ``BytesIO`` buffer that the ``TextConverter``
        (configured with the ``utf-8`` codec) wrote the text into.

    Raises:
        OSError: If ``path`` cannot be opened for reading.
    """
    # `open` replaces the Python 2-only builtin `file`, which raises
    # NameError on Python 3. The `with` block guarantees that the input
    # file and the buffer are closed even if page processing fails.
    with open(path, 'rb') as fp, BytesIO() as retstr:
        rsrcmgr = PDFResourceManager()
        device = TextConverter(
            rsrcmgr, retstr, codec='utf-8', laparams=LAParams()
        )
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        try:
            # Empty page set and maxpages=0 are the same values the
            # original passed: no page filter and no page limit requested.
            pages = PDFPage.get_pages(
                fp,
                set(),
                maxpages=0,
                password="",
                caching=True,
                check_extractable=True,
            )
            for page in pages:
                interpreter.process_page(page)
        finally:
            device.close()
        # Read the buffer only after the device is closed, and before the
        # `with` block closes the buffer itself.
        return retstr.getvalue()

# Convert the sample document and print whatever convert_pdf_to_txt() returned.
Banana = convert_pdf_to_txt("A1.pdf") 
print(Banana) 

我嘗試搜索過這個問題(大部分 pdfminer 代碼來自 this 和 this 這兩處鏈接),但沒有找到答案。

如有任何見解,不勝感激。

乾杯

+0

請通過點贊(upvote)或接受我的答案來確認 – animal

回答

4

Python 3.5 的解決方案:你需要 pdfminer.six。在 Win10 上,我可以很容易地用下面的命令

pip install pdfminer.six 

安裝它。安裝之後,您可以用

pdfminer.__version__ 

檢查已安裝的版本。我還沒有對它做過深入測試,但我可以運行下面的代碼,把 PDF 轉換成文本(PDF → text)以及 HTML(PDF → HTML)。

0

pdfminer 不支持 Python 3.5,它只適用於 Python 2.6 或更新的 Python 2 版本。我也遇到過同樣的問題;請嘗試改用 Python 2.6,應該可以解決你的問題。

4

改進方案(費爾南德斯2016)

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter 
from pdfminer.converter import HTMLConverter,TextConverter,XMLConverter 
from pdfminer.layout import LAParams 
from pdfminer.pdfpage import PDFPage 
import io 

def convert(case, fname, pages=None):
    """Convert a PDF file to plain text or HTML using pdfminer.

    Args:
        case: Output format selector: ``'text'`` writes through a
            ``TextConverter`` into an ``io.StringIO``; ``'HTML'`` writes
            through an ``HTMLConverter`` into an ``io.BytesIO``.
        fname: Path of the PDF file to read.
        pages: Optional iterable of page numbers to convert. A falsy value
            results in an empty set being passed to ``PDFPage.get_pages``,
            i.e. no page filter.

    Returns:
        The buffer contents: ``str`` for ``'text'``, ``bytes`` for
        ``'HTML'``.

    Raises:
        ValueError: If ``case`` is neither ``'text'`` nor ``'HTML'``.
        OSError: If ``fname`` cannot be opened for reading.
    """
    # Reject unknown formats up front; previously this fell through both
    # `if` branches and failed later with an UnboundLocalError.
    if case not in ('text', 'HTML'):
        raise ValueError(
            "case must be 'text' or 'HTML', got {!r}".format(case)
        )

    pagenums = set(pages) if pages else set()
    codec = 'utf-8'

    # The `with` block closes the input file even when conversion fails.
    with open(fname, 'rb') as infile:
        manager = PDFResourceManager()
        if case == 'text':
            output = io.StringIO()
            converter = TextConverter(
                manager, output, codec=codec, laparams=LAParams()
            )
        else:
            output = io.BytesIO()
            converter = HTMLConverter(
                manager, output, codec=codec, laparams=LAParams()
            )

        with output:
            interpreter = PDFPageInterpreter(manager, converter)
            pdf_pages = PDFPage.get_pages(
                infile, pagenums, caching=True, check_extractable=True
            )
            for page in pdf_pages:
                interpreter.process_page(page)
            # Close the converter BEFORE reading the buffer: HTMLConverter
            # writes its closing markup in close(), so reading first would
            # return truncated HTML.
            converter.close()
            return output.getvalue()

#//////////// main /////////////////////// 
filePDF = 'myDir//myPDF.pdf'  # input
fileHTML = 'myDir//myHTML.html' # output
fileTXT = 'myDir//myTXT.txt'  # output

case = "HTML"

# convert() returns bytes for HTML and str for text, so the output file
# mode has to match the selected format.
if case == 'HTML':
    targetPath, targetMode = fileHTML, "wb"
elif case == 'text':
    targetPath, targetMode = fileTXT, "w"
else:
    # Fail with a clear message instead of a NameError further down.
    raise ValueError("case must be 'text' or 'HTML', got {!r}".format(case))

convertedPDF = convert(case, filePDF, pages=[0,1])

# The `with` block closes the output file even if the write fails.
with open(targetPath, targetMode) as fileConverted:
    fileConverted.write(convertedPDF)
#print(convertedPDF)