Pdfminer 与 Python 3.5

问题描述:

我照着网上的几个教程做了,但还是无法让这段代码运行。我已经按要求把 StringIO 换成了 BytesIO(我想这是必须的?)。Pdfminer / Python 3.5

我不确定为什么 `banana` 没有打印出任何内容,我猜这些错误可能只是干扰项(red herring)?是不是因为我照着一个 Python 2.7 的教程,并试图把它改写成 Python 3?

errors: File "/Users/foo/PycharmProjects/Try/Pdfminer.py", line 28, in <module> 
    banana = convert("A1.pdf") 
    File "/Users/foo/PycharmProjects/Try/Pdfminer.py", line 19, in convert 
    infile = file(fname, 'rb') 
NameError: name 'file' is not defined 

脚本

from io import BytesIO 

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter 
from pdfminer.converter import TextConverter 
from pdfminer.layout import LAParams 
from pdfminer.pdfpage import PDFPage 

def convert(fname, pages=None):
    """Extract the text of a PDF file with pdfminer and return it as bytes.

    Args:
        fname: Path of the PDF file to read.
        pages: Optional iterable of page numbers forwarded to
            ``PDFPage.get_pages``. When it is ``None`` or empty, an empty
            set is forwarded instead.

    Returns:
        The contents that ``TextConverter`` wrote into an in-memory
        ``BytesIO`` buffer, as a ``bytes`` object.
    """
    pagenums = set(pages) if pages else set()

    output = BytesIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    # Python 3 has no ``file`` builtin (calling it raises NameError), so the
    # PDF is opened with ``open``; the ``with`` block closes the handle even
    # if processing a page raises.
    with open(fname, 'rb') as infile:
        for page in PDFPage.get_pages(infile, pagenums):
            interpreter.process_page(page)

    converter.close()
    text = output.getvalue()
    # Release the buffer only after its contents have been copied out.
    output.close()
    return text

banana = convert("A1.pdf")
# convert() returns bytes (it collects output in a BytesIO), so decode before
# printing; otherwise Python 3 shows the b'...' repr instead of the text.
print(banana.decode("utf-8"))

下面这个变体也出现了同样的问题:

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter 
from pdfminer.converter import TextConverter 
from pdfminer.layout import LAParams 
from pdfminer.pdfpage import PDFPage 
from io import BytesIO 

def convert_pdf_to_txt(path):
    """Extract the text of the PDF at ``path`` and return it as bytes.

    Args:
        path: Path of the PDF file to read.

    Returns:
        The contents that ``TextConverter`` (created with
        ``codec='utf-8'``) wrote into an in-memory ``BytesIO`` buffer.
    """
    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Python 3 has no ``file`` builtin (calling it raises NameError), so the
    # PDF is opened with ``open``; the ``with`` block closes the handle even
    # if processing a page raises.
    with open(path, 'rb') as fp:
        # Empty page set, maxpages=0 and an empty password are the same
        # values the function has always passed; pdfminer documents
        # maxpages=0 as "no page limit".
        page_iter = PDFPage.get_pages(
            fp,
            set(),
            maxpages=0,
            password="",
            caching=True,
            check_extractable=True,
        )
        for page in page_iter:
            interpreter.process_page(page)

    text = retstr.getvalue()

    device.close()
    retstr.close()
    return text

Banana = convert_pdf_to_txt("A1.pdf")
# convert_pdf_to_txt() returns UTF-8 encoded bytes (codec='utf-8' into a
# BytesIO), so decode before printing to see the text rather than b'...'.
print(Banana.decode("utf-8"))

我试着搜索过这个问题(大部分 pdfminer 代码来自 this 和 this 这两个链接),但没有找到答案。

如能指点一二,不胜感激。

干杯

+0

如果我的回答有帮助,请点赞或采纳以示确认 – animal

Python 3.5 的解决方案:你需要 pdfminer.six。在 Win10 上,我可以很容易地通过

pip install pdfminer.six 

安装它,您可以用

pdfminer.__version__ 

检查已安装的版本。我还没有对它做深入测试,但我可以运行下面的代码,完成 PDF → 文本 和 PDF → HTML 的转换。

pdfminer 不支持 Python 3.5,它只适用于 Python 2.6 或更新的 2.x 版本。我也遇到过同样的问题;改用 Python 2.6 试试,应该可以解决你的问题。

改进方案(费尔南德斯2016)

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter 
from pdfminer.converter import HTMLConverter,TextConverter,XMLConverter 
from pdfminer.layout import LAParams 
from pdfminer.pdfpage import PDFPage 
import io 

def convert(case, fname, pages=None):
    """Convert a PDF file to plain text or to HTML with pdfminer.

    Args:
        case: Output format selector. ``'text'`` collects the output in an
            ``io.StringIO`` (returned as ``str``); ``'HTML'`` collects it in
            an ``io.BytesIO`` (returned as ``bytes``).
        fname: Path of the PDF file to read.
        pages: Optional iterable of page numbers forwarded to
            ``PDFPage.get_pages``. When it is ``None`` or empty, an empty
            set is forwarded instead.

    Returns:
        The converted document: ``str`` for ``'text'``, ``bytes`` for
        ``'HTML'``.

    Raises:
        ValueError: If ``case`` is neither ``'text'`` nor ``'HTML'``.
    """
    pagenums = set(pages) if pages else set()
    manager = PDFResourceManager()
    codec = 'utf-8'

    if case == 'text':
        output = io.StringIO()
        converter = TextConverter(manager, output, codec=codec, laparams=LAParams())
    elif case == 'HTML':
        output = io.BytesIO()
        converter = HTMLConverter(manager, output, codec=codec, laparams=LAParams())
    else:
        # Fail with a clear message instead of an UnboundLocalError further
        # down when neither branch has created a converter.
        raise ValueError("unsupported case %r: expected 'text' or 'HTML'" % (case,))

    interpreter = PDFPageInterpreter(manager, converter)

    # The ``with`` block closes the PDF even if processing a page raises.
    with open(fname, 'rb') as infile:
        for page in PDFPage.get_pages(infile, pagenums, caching=True, check_extractable=True):
            interpreter.process_page(page)

    # Close the converter BEFORE reading the buffer: HTMLConverter writes its
    # footer (closing tags) in close(), so reading first would return
    # truncated HTML.
    converter.close()
    convertedPDF = output.getvalue()
    output.close()
    return convertedPDF

#//////////// main ///////////////////////
filePDF = 'myDir//myPDF.pdf'  # input
fileHTML = 'myDir//myHTML.html' # output
fileTXT = 'myDir//myTXT.txt'  # output

# Output format to produce: "HTML" or "text".
case = "HTML"

if case == 'HTML':
    convertedPDF = convert('HTML', filePDF, pages=[0, 1])
    # The HTML result is bytes, so the target file is opened in binary mode.
    fileConverted = open(fileHTML, "wb")
elif case == 'text':
    convertedPDF = convert('text', filePDF, pages=[0, 1])
    # The text result is str; write it as UTF-8 rather than relying on the
    # platform default encoding, which may not cover every extracted character.
    fileConverted = open(fileTXT, "w", encoding="utf-8")
else:
    raise ValueError("unsupported case %r: expected 'HTML' or 'text'" % (case,))

# The ``with`` block closes the output file even if the write fails.
with fileConverted:
    fileConverted.write(convertedPDF)
#print(convertedPDF)