python pdfminer 将 pdf 文件转换为一大块字符串,单词之间没有空格
python pdfminer converts pdf file into one chunk of string with no spaces between words
我使用以下代码将 pdf 文件转换为文本文件,该代码主要取自 DuckPuncher 对帖子 “Extracting text from a PDF file using PDFMiner in python?” 的回答:
def convert_pdf_to_txt(path):
    """Extract the text of the PDF at *path* and return it as one string.

    Every page yielded by ``PDFPage.get_pages`` is fed through a
    ``TextConverter`` that writes into an in-memory buffer.

    NOTE: ``LAParams()`` is used with its defaults. For PDFs that mark their
    text as figures, the result may come out without spaces between words;
    enabling ``all_texts`` on the layout parameters is the reported remedy.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The extracted text.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    try:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # The context manager closes the file even if a page fails to parse.
            with open(path, 'rb') as fp:
                # pagenos=set() and maxpages=0 presumably mean "all pages" --
                # confirm against the pdfminer documentation.
                pages = PDFPage.get_pages(
                    fp,
                    set(),
                    maxpages=0,
                    password="",
                    caching=True,
                    check_extractable=True,
                )
                for page in pages:
                    interpreter.process_page(page)
        finally:
            device.close()
        # Read the buffer before it is closed; `text` avoids shadowing `str`.
        text = retstr.getvalue()
    finally:
        retstr.close()
    return text
pdf 是用以下代码下载并存储到我的本地目录中的,这一步运作良好。
import requests
# Download the PDF and store it in the local directory.
url = 'link_to_the_pdf'
file_name = './name.pdf'
# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of saving an error page as a ".pdf".
response.raise_for_status()
with open(file_name, 'wb') as f:
    f.write(response.content)
但是,对于某些 pdf,convert_pdf_to_txt() 返回的内容几乎是一大块字符串,单词之间没有空格。例如,从 http://www.ece.rochester.edu/~gsharma/papers/LocalImageRegisterEI2005.pdf 下载以下 pdf 并应用 convert_pdf_to_txt() 函数后,我得到了一个文本文件,其中的单词没有用空格分隔。文本文件的摘录是
3Predominantmethodsinthelattergrouparefromcomputervisionarea,e.g.,plane+p
arallax4methodfor3-Dscenestructurecomputation.Inthispaper,weproposeanewlocalimageregistrationtechnique,inthefirstclass,basedonadaptivefilteringtechniques.Adaptivefiltershavebeenutilizedsuccessfullyforsystemidentificationpurposesin1-D.
有人可以帮我解决这个问题吗?是这个特定 pdf 的格式导致了问题还是其他原因,因为对于其他一些 pdf,convert_pdf_to_txt() 函数工作正常。
根据这个 thread,一些 pdf 会将整段文本标记为图形(figure),而默认情况下 PDFMiner 不会对图形中的文本执行布局分析。要覆盖此行为,需要将 all_texts 参数设置为 True。
这是一个基于 this post 对我有用的例子。
import io
import pdfminer
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
# Perform layout analysis for all text, including text inside figures.
# The option is passed to the constructor rather than set afterwards.
laparams = pdfminer.layout.LAParams(all_texts=True)
def extract_text_from_pdf(pdf_path):
    """Extract the text of the PDF at *pdf_path*.

    Pages are rendered through a ``TextConverter`` configured with the
    module-level ``laparams`` and collected in an in-memory buffer.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The extracted text, or ``None`` when no text was extracted.
    """
    resource_manager = PDFResourceManager()
    # The buffer and the converter are released even if a page fails to parse.
    with io.StringIO() as text_buffer:
        converter = TextConverter(resource_manager, text_buffer, laparams=laparams)
        try:
            page_interpreter = PDFPageInterpreter(resource_manager, converter)
            with open(pdf_path, 'rb') as fh:
                for page in PDFPage.get_pages(fh,
                                              caching=True,
                                              check_extractable=True):
                    page_interpreter.process_page(page)
            # Must be read before the buffer is closed.
            text = text_buffer.getvalue()
        finally:
            converter.close()

    # Keep the original contract: empty output is reported as None.
    return text or None
# Example usage: extract the text of a file named 'test.pdf'.
text = extract_text_from_pdf('test.pdf')
我使用以下代码将 pdf 文件转换为文本文件,该代码主要取自 DuckPuncher 对帖子 “Extracting text from a PDF file using PDFMiner in python?” 的回答:
def convert_pdf_to_txt(path):
    """Extract the text of the PDF at *path* and return it as one string.

    Every page yielded by ``PDFPage.get_pages`` is fed through a
    ``TextConverter`` that writes into an in-memory buffer.

    NOTE: ``LAParams()`` is used with its defaults. For PDFs that mark their
    text as figures, the result may come out without spaces between words;
    enabling ``all_texts`` on the layout parameters is the reported remedy.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The extracted text.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    try:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # The context manager closes the file even if a page fails to parse.
            with open(path, 'rb') as fp:
                # pagenos=set() and maxpages=0 presumably mean "all pages" --
                # confirm against the pdfminer documentation.
                pages = PDFPage.get_pages(
                    fp,
                    set(),
                    maxpages=0,
                    password="",
                    caching=True,
                    check_extractable=True,
                )
                for page in pages:
                    interpreter.process_page(page)
        finally:
            device.close()
        # Read the buffer before it is closed; `text` avoids shadowing `str`.
        text = retstr.getvalue()
    finally:
        retstr.close()
    return text
pdf 是用以下代码下载并存储到我的本地目录中的,这一步运作良好。
import requests
# Download the PDF and store it in the local directory.
url = 'link_to_the_pdf'
file_name = './name.pdf'
# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of saving an error page as a ".pdf".
response.raise_for_status()
with open(file_name, 'wb') as f:
    f.write(response.content)
但是,对于某些 pdf,convert_pdf_to_txt() 返回的内容几乎是一大块字符串,单词之间没有空格。例如,从 http://www.ece.rochester.edu/~gsharma/papers/LocalImageRegisterEI2005.pdf 下载以下 pdf 并应用 convert_pdf_to_txt() 函数后,我得到了一个文本文件,其中的单词没有用空格分隔。文本文件的摘录是
3Predominantmethodsinthelattergrouparefromcomputervisionarea,e.g.,plane+p arallax4methodfor3-Dscenestructurecomputation.Inthispaper,weproposeanewlocalimageregistrationtechnique,inthefirstclass,basedonadaptivefilteringtechniques.Adaptivefiltershavebeenutilizedsuccessfullyforsystemidentificationpurposesin1-D.
有人可以帮我解决这个问题吗?是这个特定 pdf 的格式导致了问题还是其他原因,因为对于其他一些 pdf,convert_pdf_to_txt() 函数工作正常。
根据这个 thread,一些 pdf 会将整段文本标记为图形(figure),而默认情况下 PDFMiner 不会对图形中的文本执行布局分析。要覆盖此行为,需要将 all_texts 参数设置为 True。
这是一个基于 this post 对我有用的例子。
import io
import pdfminer
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
# Perform layout analysis for all text, including text inside figures.
# The option is passed to the constructor rather than set afterwards.
laparams = pdfminer.layout.LAParams(all_texts=True)
def extract_text_from_pdf(pdf_path):
    """Extract the text of the PDF at *pdf_path*.

    Pages are rendered through a ``TextConverter`` configured with the
    module-level ``laparams`` and collected in an in-memory buffer.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The extracted text, or ``None`` when no text was extracted.
    """
    resource_manager = PDFResourceManager()
    # The buffer and the converter are released even if a page fails to parse.
    with io.StringIO() as text_buffer:
        converter = TextConverter(resource_manager, text_buffer, laparams=laparams)
        try:
            page_interpreter = PDFPageInterpreter(resource_manager, converter)
            with open(pdf_path, 'rb') as fh:
                for page in PDFPage.get_pages(fh,
                                              caching=True,
                                              check_extractable=True):
                    page_interpreter.process_page(page)
            # Must be read before the buffer is closed.
            text = text_buffer.getvalue()
        finally:
            converter.close()

    # Keep the original contract: empty output is reported as None.
    return text or None
# Example usage: extract the text of a file named 'test.pdf'.
text = extract_text_from_pdf('test.pdf')