使用PDFMiner处理单个页面
问题描述:
我有一些PDF文档,无法用PyPDF提取文本,只能使用PDFMiner。以下代码可以正常工作,它会遍历整个文档,然后返回PDF中的所有文本。有没有办法只处理PDF的某些页面?我手头的PDF都有2000-3000多页,而我只需要每隔一页处理一次。(标题:使用PDFMiner处理单个页面)
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
def convert_pdf_to_txt(path):
    """Extract the text of every page of a PDF file using PDFMiner.

    Args:
        path: Filesystem path of the PDF document to read.

    Returns:
        The text PDFMiner wrote to the in-memory buffer for all pages,
        encoded with the 'utf-8' codec.

    The input file, the converter device and the buffer are always closed,
    even if PDFMiner raises while parsing a page.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # open() replaces the Python 2-only file() builtin; the with-block
        # guarantees the handle is released on every exit path.
        with open(path, 'rb') as fp:
            # An empty page-number set and maxpages=0 are passed exactly as
            # in the original call, which iterated the whole document.
            pages = PDFPage.get_pages(
                fp,
                set(),
                maxpages=0,
                password="",
                caching=True,
                check_extractable=True,
            )
            for page in pages:
                interpreter.process_page(page)
        # Read the buffer before the device and buffer are closed below.
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()
答
你不能使用enumerate,在遍历所有页面的同时获得页码和页面内容吗?如果您只需要每隔一页,请使用取模运算。如果您只想要特定页面,请使用范围判断。
例子:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
def convert_pdf_to_txt(path):
    """Extract the text of every other page of a PDF file using PDFMiner.

    Pages are numbered with ``enumerate`` starting at 0. Only pages whose
    index is even are run through the interpreter; odd-indexed pages are
    skipped. Progress messages are printed for every page, and an extra
    message is printed for indices 5 through 10 to show how a page range
    can be selected.

    Args:
        path: Filesystem path of the PDF document to read.

    Returns:
        The text PDFMiner wrote to the in-memory buffer for the processed
        (even-indexed) pages, encoded with the 'utf-8' codec.

    The input file, the converter device and the buffer are always closed,
    even if PDFMiner raises while parsing a page.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # open() replaces the Python 2-only file() builtin; the with-block
        # guarantees the handle is released on every exit path.
        with open(path, 'rb') as fp:
            pages = PDFPage.get_pages(
                fp,
                set(),
                maxpages=0,
                password="",
                caching=True,
                check_extractable=True,
            )
            for pagenumber, page in enumerate(pages):
                # print(x) with one argument behaves the same on Python 2
                # and Python 3, unlike the bare print statement.
                print(pagenumber)
                if pagenumber % 2 == 0:
                    print("even page number")
                    interpreter.process_page(page)
                else:
                    print("odd page number")
                # Example of selecting a contiguous range of page indices.
                if 5 <= pagenumber <= 10:
                    print("pages 5 to 10")
        # Read the buffer before the device and buffer are closed below.
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()
谢谢,这正是我一直在寻找的。 – user2665140