PDFMiner 与 Python 3.5
Pdfminer python 3.5
我已经按照一些教程操作,但仍然无法让这段代码运行。我已经做了从 StringIO 到 BytesIO 的必要替换(我想是这样?)
我不确定为什么 'banana' 什么都没有打印出来,我觉得这些错误可能只是干扰项(red herring)?这是否与我照着 Python 2.7 的教程、并试图把它改写成 Python 3 有关?
errors: File "/Users/foo/PycharmProjects/Try/Pdfminer.py", line 28, in <module>
banana = convert("A1.pdf")
File "/Users/foo/PycharmProjects/Try/Pdfminer.py", line 19, in convert
infile = file(fname, 'rb')
NameError: name 'file' is not defined
脚本
from io import BytesIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
def convert(fname, pages=None):
    """Extract the text of a PDF via pdfminer and return the buffer contents.

    fname: path of the PDF file to read.
    pages: optional iterable of page numbers; when falsy, an empty set is
        passed to PDFPage.get_pages (presumably meaning "all pages" --
        confirm against the pdfminer documentation).

    Returns the value held by the BytesIO buffer after conversion.
    """
    if not pages:
        pagenums = set()
    else:
        pagenums = set(pages)
    output = BytesIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)
    # NOTE: file() is a Python 2 builtin. Under Python 3 this is the line that
    # raises "NameError: name 'file' is not defined"; open() is the replacement.
    infile = file(fname, 'rb')
    for page in PDFPage.get_pages(infile, pagenums):
        interpreter.process_page(page)
    infile.close()
    converter.close()
    text = output.getvalue()
    # NOTE: no parentheses -- this only references the method and never calls it.
    output.close
    return text


# Driver: convert the sample PDF and print the result.
banana = convert("A1.pdf")
print(banana)
同样的事情发生在这个变体上:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import BytesIO
def convert_pdf_to_txt(path):
    """Extract the text of the PDF at *path* via pdfminer.

    Builds a TextConverter that writes into a BytesIO buffer, feeds every page
    yielded by PDFPage.get_pages through a PDFPageInterpreter, and returns the
    buffer contents.
    """
    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    # NOTE: file() is a Python 2 builtin. Under Python 3 this line raises
    # NameError; open() is the replacement.
    fp = file(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0  # presumably 0 means "no page limit" -- confirm in pdfminer docs
    caching = True
    pagenos=set()
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
        interpreter.process_page(page)
    # Read the buffer before it is closed below.
    text = retstr.getvalue()
    fp.close()
    device.close()
    retstr.close()
    return text


# Driver: convert the sample PDF and print the result.
Banana = convert_pdf_to_txt("A1.pdf")
print(Banana)
我试过搜索这个问题(大部分 pdfminer 代码来自这里或这里),但没有成功。
如有任何见解,我们将不胜感激。
干杯
pdfminer 不支持 Python 3.5。它仅适用于 Python 2(2.6 或更高的 2.x 版本)。我也遇到过同样的问题;请尝试改用 Python 2.6,这样可以解决你的问题。
Python 3.5 的解决方案:您需要 pdfminer.six。在 Win10 下,我可以使用以下命令轻松安装它:
pip install pdfminer.six
您可以通过以下属性查看已安装的版本:
pdfminer.__version__
我还没有深入测试过,但是我可以运行以下代码完成 pdf→text 和 pdf→html 的转换。
改进的解决方案(2016 年 12 月)
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import HTMLConverter,TextConverter,XMLConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import io
def convert(case, fname, pages=None):
    """Convert a PDF file to plain text or HTML with pdfminer.six.

    Args:
        case: output format selector, either 'text' or 'HTML'.
        fname: path of the PDF file to read.
        pages: optional iterable of page numbers to convert; when falsy an
            empty set is passed to PDFPage.get_pages (presumably meaning
            "all pages" -- confirm against the pdfminer.six documentation).

    Returns:
        str for case == 'text' (the converter writes into a StringIO),
        bytes for case == 'HTML' (the converter writes into a BytesIO).

    Raises:
        ValueError: if *case* is neither 'text' nor 'HTML'.
    """
    pagenums = set(pages) if pages else set()
    manager = PDFResourceManager()
    codec = 'utf-8'
    caching = True

    if case == 'text':
        output = io.StringIO()
        converter = TextConverter(manager, output, codec=codec, laparams=LAParams())
    elif case == 'HTML':
        output = io.BytesIO()
        converter = HTMLConverter(manager, output, codec=codec, laparams=LAParams())
    else:
        # Fail early with a clear message instead of hitting an unbound
        # `converter` a few lines further down.
        raise ValueError("case must be 'text' or 'HTML', got %r" % (case,))

    # The context managers release the buffer and the input file even when
    # pdfminer raises while parsing.
    with output:
        interpreter = PDFPageInterpreter(manager, converter)
        with open(fname, 'rb') as infile:
            for page in PDFPage.get_pages(infile, pagenums, caching=caching, check_extractable=True):
                interpreter.process_page(page)
        # Close the converter BEFORE reading the buffer: HTMLConverter emits
        # its closing markup in close(), so reading earlier truncates the HTML.
        converter.close()
        return output.getvalue()
#//////////// main ///////////////////////
# Convert the first two pages of filePDF and save the result as HTML or text,
# depending on `case`.
filePDF = 'myDir//myPDF.pdf'  # input
fileHTML = 'myDir//myHTML.html'  # output
fileTXT = 'myDir//myTXT.txt'  # output
case = "HTML"
if case == 'HTML':
    convertedPDF = convert('HTML', filePDF, pages=[0, 1])
    # The HTML result is bytes, so the file is opened in binary mode.
    # Binary mode must not receive an `encoding` argument (open() raises
    # ValueError otherwise).
    with open(fileHTML, "wb") as fileConverted:
        fileConverted.write(convertedPDF)
elif case == 'text':
    convertedPDF = convert('text', filePDF, pages=[0, 1])
    # The text result is str, so it is written through a UTF-8 text file.
    with open(fileTXT, "w", encoding="utf-8") as fileConverted:
        fileConverted.write(convertedPDF)
else:
    raise ValueError("case must be 'text' or 'HTML', got %r" % (case,))
#print(convertedPDF)
在 Python 3.7 下,我试用了下面的方法,效果非常好!
这是我使用的代码:
def convert_pdf_to_txt(path_to_file):
    """Return the text extracted from the PDF at *path_to_file*.

    Every page yielded by PDFPage.get_pages is processed by a
    PDFPageInterpreter whose TextConverter device writes into an in-memory
    text buffer; the buffer contents are returned as a str.

    The pdfminer names used here (PDFResourceManager, PDFPageInterpreter,
    TextConverter, LAParams, PDFPage) must already be imported by the caller's
    module.
    """
    # Local import: the function uses StringIO but the snippet carries no
    # import lines of its own.
    from io import StringIO

    rsrcmgr = PDFResourceManager()
    codec = 'utf-8'
    laparams = LAParams()
    password = ""
    maxpages = 0  # presumably 0 means "no page limit" -- confirm in pdfminer docs
    caching = True
    pagenos = set()

    # Context managers / finally guarantee the buffer, the device and the
    # input file are released even if parsing raises.
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(path_to_file, 'rb') as fp:
                for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password, caching=caching, check_extractable=True):
                    interpreter.process_page(page)
            # Read the buffer while it is still open.
            return retstr.getvalue()
        finally:
            device.close()
函数 file() 在 Python 2.7 中是内置函数,但在 Python 3.5 中不是内置函数。
您应该将 file() 更改为 open()。
我已经按照一些教程操作,但仍然无法让这段代码运行。我已经做了从 StringIO 到 BytesIO 的必要替换(我想是这样?)
我不确定为什么 'banana' 什么都没有打印出来,我觉得这些错误可能只是干扰项(red herring)?这是否与我照着 Python 2.7 的教程、并试图把它改写成 Python 3 有关?
errors: File "/Users/foo/PycharmProjects/Try/Pdfminer.py", line 28, in <module>
banana = convert("A1.pdf")
File "/Users/foo/PycharmProjects/Try/Pdfminer.py", line 19, in convert
infile = file(fname, 'rb')
NameError: name 'file' is not defined
脚本
from io import BytesIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
def convert(fname, pages=None):
    """Extract the text of a PDF via pdfminer and return the buffer contents.

    fname: path of the PDF file to read.
    pages: optional iterable of page numbers; when falsy, an empty set is
        passed to PDFPage.get_pages (presumably meaning "all pages" --
        confirm against the pdfminer documentation).

    Returns the value held by the BytesIO buffer after conversion.
    """
    if not pages:
        pagenums = set()
    else:
        pagenums = set(pages)
    output = BytesIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)
    # NOTE: file() is a Python 2 builtin. Under Python 3 this is the line that
    # raises "NameError: name 'file' is not defined"; open() is the replacement.
    infile = file(fname, 'rb')
    for page in PDFPage.get_pages(infile, pagenums):
        interpreter.process_page(page)
    infile.close()
    converter.close()
    text = output.getvalue()
    # NOTE: no parentheses -- this only references the method and never calls it.
    output.close
    return text


# Driver: convert the sample PDF and print the result.
banana = convert("A1.pdf")
print(banana)
同样的事情发生在这个变体上:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import BytesIO
def convert_pdf_to_txt(path):
    """Extract the text of the PDF at *path* via pdfminer.

    Builds a TextConverter that writes into a BytesIO buffer, feeds every page
    yielded by PDFPage.get_pages through a PDFPageInterpreter, and returns the
    buffer contents.
    """
    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    # NOTE: file() is a Python 2 builtin. Under Python 3 this line raises
    # NameError; open() is the replacement.
    fp = file(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0  # presumably 0 means "no page limit" -- confirm in pdfminer docs
    caching = True
    pagenos=set()
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
        interpreter.process_page(page)
    # Read the buffer before it is closed below.
    text = retstr.getvalue()
    fp.close()
    device.close()
    retstr.close()
    return text


# Driver: convert the sample PDF and print the result.
Banana = convert_pdf_to_txt("A1.pdf")
print(Banana)
我试过搜索这个问题(大部分 pdfminer 代码来自这里或这里),但没有成功。
如有任何见解,我们将不胜感激。
干杯
pdfminer 不支持 Python 3.5。它仅适用于 Python 2(2.6 或更高的 2.x 版本)。我也遇到过同样的问题;请尝试改用 Python 2.6,这样可以解决你的问题。
Python 3.5 的解决方案:您需要 pdfminer.six。在 Win10 下,我可以使用以下命令轻松安装它:
pip install pdfminer.six
您可以通过以下属性查看已安装的版本:
pdfminer.__version__
我还没有深入测试过,但是我可以运行以下代码完成 pdf→text 和 pdf→html 的转换。
改进的解决方案(2016 年 12 月)
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import HTMLConverter,TextConverter,XMLConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import io
def convert(case, fname, pages=None):
    """Convert a PDF file to plain text or HTML with pdfminer.six.

    Args:
        case: output format selector, either 'text' or 'HTML'.
        fname: path of the PDF file to read.
        pages: optional iterable of page numbers to convert; when falsy an
            empty set is passed to PDFPage.get_pages (presumably meaning
            "all pages" -- confirm against the pdfminer.six documentation).

    Returns:
        str for case == 'text' (the converter writes into a StringIO),
        bytes for case == 'HTML' (the converter writes into a BytesIO).

    Raises:
        ValueError: if *case* is neither 'text' nor 'HTML'.
    """
    pagenums = set(pages) if pages else set()
    manager = PDFResourceManager()
    codec = 'utf-8'
    caching = True

    if case == 'text':
        output = io.StringIO()
        converter = TextConverter(manager, output, codec=codec, laparams=LAParams())
    elif case == 'HTML':
        output = io.BytesIO()
        converter = HTMLConverter(manager, output, codec=codec, laparams=LAParams())
    else:
        # Fail early with a clear message instead of hitting an unbound
        # `converter` a few lines further down.
        raise ValueError("case must be 'text' or 'HTML', got %r" % (case,))

    # The context managers release the buffer and the input file even when
    # pdfminer raises while parsing.
    with output:
        interpreter = PDFPageInterpreter(manager, converter)
        with open(fname, 'rb') as infile:
            for page in PDFPage.get_pages(infile, pagenums, caching=caching, check_extractable=True):
                interpreter.process_page(page)
        # Close the converter BEFORE reading the buffer: HTMLConverter emits
        # its closing markup in close(), so reading earlier truncates the HTML.
        converter.close()
        return output.getvalue()
#//////////// main ///////////////////////
# Convert the first two pages of filePDF and save the result as HTML or text,
# depending on `case`.
filePDF = 'myDir//myPDF.pdf'  # input
fileHTML = 'myDir//myHTML.html'  # output
fileTXT = 'myDir//myTXT.txt'  # output
case = "HTML"
if case == 'HTML':
    convertedPDF = convert('HTML', filePDF, pages=[0, 1])
    # The HTML result is bytes, so the file is opened in binary mode.
    # Binary mode must not receive an `encoding` argument (open() raises
    # ValueError otherwise).
    with open(fileHTML, "wb") as fileConverted:
        fileConverted.write(convertedPDF)
elif case == 'text':
    convertedPDF = convert('text', filePDF, pages=[0, 1])
    # The text result is str, so it is written through a UTF-8 text file.
    with open(fileTXT, "w", encoding="utf-8") as fileConverted:
        fileConverted.write(convertedPDF)
else:
    raise ValueError("case must be 'text' or 'HTML', got %r" % (case,))
#print(convertedPDF)
在 Python 3.7 下,我试用了下面的方法,效果非常好!
这是我使用的代码:
def convert_pdf_to_txt(path_to_file):
    """Return the text extracted from the PDF at *path_to_file*.

    Every page yielded by PDFPage.get_pages is processed by a
    PDFPageInterpreter whose TextConverter device writes into an in-memory
    text buffer; the buffer contents are returned as a str.

    The pdfminer names used here (PDFResourceManager, PDFPageInterpreter,
    TextConverter, LAParams, PDFPage) must already be imported by the caller's
    module.
    """
    # Local import: the function uses StringIO but the snippet carries no
    # import lines of its own.
    from io import StringIO

    rsrcmgr = PDFResourceManager()
    codec = 'utf-8'
    laparams = LAParams()
    password = ""
    maxpages = 0  # presumably 0 means "no page limit" -- confirm in pdfminer docs
    caching = True
    pagenos = set()

    # Context managers / finally guarantee the buffer, the device and the
    # input file are released even if parsing raises.
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(path_to_file, 'rb') as fp:
                for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password, caching=caching, check_extractable=True):
                    interpreter.process_page(page)
            # Read the buffer while it is still open.
            return retstr.getvalue()
        finally:
            device.close()
函数 file() 在 Python 2.7 中是内置函数,但在 Python 3.5 中不是内置函数。
您应该将 file() 更改为 open()。