我想使用 PDFMiner 将 PDF 中的文本提取到 .txt 文件。我找到了一段代码,但不知道该如何使用它。
I want to extract text from a PDF to a .txt file using PDFMiner. I have found the code, but I have no idea how to use it.
这是我在这里找到的代码。我不知道如何使用它。有人可以引导我完成此操作并帮助我转换示例 pdf 吗?
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
def convert_pdf_to_txt(path):
    """Extract the text of a PDF file and return it as a single string.

    Args:
        path: Path of the PDF to read. The file is opened in binary mode.

    Returns:
        Everything the TextConverter wrote into the in-memory buffer while
        the pages were being processed.

    Any exception raised while opening or parsing the file propagates to the
    caller; the file handle, the converter and the buffer are closed first.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    try:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                               laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # open() replaces the Python-2-only file() builtin, and the
            # with-block closes the handle even if a page fails to parse.
            with open(path, 'rb') as fp:
                # NOTE(review): an empty page set and maxpages=0 presumably
                # mean "every page" to pdfminer -- confirm against its docs.
                pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True)
                for page in pages:
                    interpreter.process_page(page)
            # Read the buffer before the converter and buffer are closed.
            return retstr.getvalue()
        finally:
            device.close()
    finally:
        retstr.close()
如果您使用 pdfminer,可以参考该教程页面上的代码并阅读其说明(https://www.binpress.com/tutorial/manipulating-pdfs-with-python/167),代码如下:
from cStringIO import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
def convert(fname, pages=None):
    """Extract text from a PDF file and return it as a string.

    Args:
        fname: Path of the PDF to read. The file is opened in binary mode.
        pages: Optional iterable of page numbers handed to
            PDFPage.get_pages. When omitted or empty, an empty set is
            passed instead.
            NOTE(review): pdfminer presumably treats the empty set as
            "all pages" and the numbers as zero-based -- confirm.

    Returns:
        The text the TextConverter wrote into the in-memory buffer.

    Any exception raised while opening or parsing the file propagates to the
    caller; the file handle, the converter and the buffer are closed first.
    """
    pagenums = set(pages) if pages else set()

    output = StringIO()
    try:
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            # open() replaces the Python-2-only file() builtin, and the
            # with-block closes the handle even if a page fails to parse.
            with open(fname, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        # As before, the buffer is read after the converter is closed.
        return output.getvalue()
    finally:
        # The original wrote `output.close` without parentheses, which only
        # looked the method up and never closed the buffer.
        output.close()
我认为您在使用时不会遇到任何问题:
`convert(fname, pages=None)` 函数会替你完成 PDF 的转换,并返回提取出的文本。
使用如下:
# Convert the PDF and keep the value convert() returns (the extracted text).
some_variable = convert("filename.pdf")
# Show the extracted text on standard output.
print(some_variable)
# Do something with your variable from here on.
使用您的示例 pdf:
我终于找到了解决这个问题的方法。最好用的库是 PDFMiner,只需对 pdf2txt.py 稍作修改即可有效使用。pdf2txt.py 位于 pdfminer/tools 目录下。
在终端中使用以下命令安装 PDFMiner:
pip install PDfminer
from cStringIO import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import re
def convert(fname, pages=None, out_path="Output_1.txt"):
    """Extract text from a PDF, print it, and save a compacted copy to disk.

    Args:
        fname: Path of the PDF to read. The file is opened in binary mode.
        pages: Optional iterable of page numbers handed to
            PDFPage.get_pages. When omitted or empty, an empty set is
            passed, as the original hard-coded ``pages = None`` did.
            NOTE(review): pdfminer presumably treats the empty set as
            "all pages" -- confirm.
        out_path: Where the text is written. Defaults to "Output_1.txt",
            the name the original hard-coded.

    Returns:
        None. The extracted text is printed to standard output, then every
        run of two or more whitespace characters is collapsed to a single
        space and the result is written to ``out_path``.

    Any exception raised while reading, parsing or writing propagates to the
    caller; every handle opened so far is closed first.
    """
    pagenums = set(pages) if pages else set()

    output = StringIO()
    try:
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            # open() replaces the Python-2-only file() builtin, and the
            # with-block closes the handle even if a page fails to parse.
            with open(fname, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        text = output.getvalue()
    finally:
        # The original wrote `output.close` without parentheses, which never
        # closed the buffer.
        output.close()

    # Call form of print works on both Python 2 and 3 for a single argument.
    print(text)

    # Write content to the .txt file. The raw string keeps the same pattern
    # while avoiding the invalid-escape warning for "\s".
    text = re.sub(r"\s\s+", " ", text)
    with open(out_path, "w") as text_file:
        text_file.write(text)
# Example run: this convert() prints the text of "xyz.pdf" and writes the
# whitespace-compacted copy to Output_1.txt.
convert("xyz.pdf")
这是我在这里找到的代码。我不知道如何使用它。有人可以引导我完成此操作并帮助我转换示例 pdf 吗?
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
def convert_pdf_to_txt(path):
    """Extract the text of a PDF file and return it as a single string.

    Args:
        path: Path of the PDF to read. The file is opened in binary mode.

    Returns:
        Everything the TextConverter wrote into the in-memory buffer while
        the pages were being processed.

    Any exception raised while opening or parsing the file propagates to the
    caller; the file handle, the converter and the buffer are closed first.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    try:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                               laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # open() replaces the Python-2-only file() builtin, and the
            # with-block closes the handle even if a page fails to parse.
            with open(path, 'rb') as fp:
                # NOTE(review): an empty page set and maxpages=0 presumably
                # mean "every page" to pdfminer -- confirm against its docs.
                pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True)
                for page in pages:
                    interpreter.process_page(page)
            # Read the buffer before the converter and buffer are closed.
            return retstr.getvalue()
        finally:
            device.close()
    finally:
        retstr.close()
如果您使用 pdfminer,可以参考该教程页面上的代码并阅读其说明(https://www.binpress.com/tutorial/manipulating-pdfs-with-python/167),代码如下:
from cStringIO import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
def convert(fname, pages=None):
    """Extract text from a PDF file and return it as a string.

    Args:
        fname: Path of the PDF to read. The file is opened in binary mode.
        pages: Optional iterable of page numbers handed to
            PDFPage.get_pages. When omitted or empty, an empty set is
            passed instead.
            NOTE(review): pdfminer presumably treats the empty set as
            "all pages" and the numbers as zero-based -- confirm.

    Returns:
        The text the TextConverter wrote into the in-memory buffer.

    Any exception raised while opening or parsing the file propagates to the
    caller; the file handle, the converter and the buffer are closed first.
    """
    pagenums = set(pages) if pages else set()

    output = StringIO()
    try:
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            # open() replaces the Python-2-only file() builtin, and the
            # with-block closes the handle even if a page fails to parse.
            with open(fname, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        # As before, the buffer is read after the converter is closed.
        return output.getvalue()
    finally:
        # The original wrote `output.close` without parentheses, which only
        # looked the method up and never closed the buffer.
        output.close()
我认为您在使用时不会遇到任何问题:
`convert(fname, pages=None)` 函数会替你完成 PDF 的转换,并返回提取出的文本。
使用如下:
# Convert the PDF and keep the value convert() returns (the extracted text).
some_variable = convert("filename.pdf")
# Show the extracted text on standard output.
print(some_variable)
# Do something with your variable from here on.
使用您的示例 pdf:
我终于找到了解决这个问题的方法。最好用的库是 PDFMiner,只需对 pdf2txt.py 稍作修改即可有效使用。pdf2txt.py 位于 pdfminer/tools 目录下。
在终端中使用以下命令安装 PDFMiner:
pip install PDfminer
from cStringIO import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import re
def convert(fname, pages=None, out_path="Output_1.txt"):
    """Extract text from a PDF, print it, and save a compacted copy to disk.

    Args:
        fname: Path of the PDF to read. The file is opened in binary mode.
        pages: Optional iterable of page numbers handed to
            PDFPage.get_pages. When omitted or empty, an empty set is
            passed, as the original hard-coded ``pages = None`` did.
            NOTE(review): pdfminer presumably treats the empty set as
            "all pages" -- confirm.
        out_path: Where the text is written. Defaults to "Output_1.txt",
            the name the original hard-coded.

    Returns:
        None. The extracted text is printed to standard output, then every
        run of two or more whitespace characters is collapsed to a single
        space and the result is written to ``out_path``.

    Any exception raised while reading, parsing or writing propagates to the
    caller; every handle opened so far is closed first.
    """
    pagenums = set(pages) if pages else set()

    output = StringIO()
    try:
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            # open() replaces the Python-2-only file() builtin, and the
            # with-block closes the handle even if a page fails to parse.
            with open(fname, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        text = output.getvalue()
    finally:
        # The original wrote `output.close` without parentheses, which never
        # closed the buffer.
        output.close()

    # Call form of print works on both Python 2 and 3 for a single argument.
    print(text)

    # Write content to the .txt file. The raw string keeps the same pattern
    # while avoiding the invalid-escape warning for "\s".
    text = re.sub(r"\s\s+", " ", text)
    with open(out_path, "w") as text_file:
        text_file.write(text)
# Example run: this convert() prints the text of "xyz.pdf" and writes the
# whitespace-compacted copy to Output_1.txt.
convert("xyz.pdf")