使用 Python PDFMiner 将多个 PDF 提取到文本文件的循环脚本
Loop script to extract multiple PDFs to text files using Python PDFMiner
感谢您的帮助。我发现这个示例脚本可以将 PDF 提取到文本文件:
https://gist.github.com/vinovator/c78c2cb63d62fdd9fb67
这个脚本可以正常工作，而且可能是我找到的提取效果最准确的一个。我想修改它，让它循环处理多个 PDF，并把每个 PDF 的文本分别写入一个与该 PDF 同名的文本文件。但我一直没能做到：结果要么只写出一个文本文件，要么把我要提取的 PDF 本身覆盖掉。有人能帮我遍历某个文件夹中的所有 PDF，并把它们分别提取到与各自 PDF 同名的独立文本文件中吗？
在此先感谢您的帮助!
import os
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
# From PDFInterpreter import both PDFResourceManager and PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
# Import this to raise exception whenever text extraction from PDF is not allowed
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
from pdfminer.converter import PDFPageAggregator
# Extract the text of a single PDF with PDFMiner and write it, UTF-8 encoded,
# to a text file. Input and output paths are fixed by my_file / log_file.
base_path = "C://some_folder"

my_file = os.path.join(base_path + "/" + "test_pdf.pdf")
log_file = os.path.join(base_path + "/" + "pdf_log.txt")

password = ""
# Text fragments are collected here and joined once after the last page.
text_parts = []

# Open the PDF in binary mode. The parser reads from the file while pages are
# processed, so all PDF work stays inside the "with" block, which also closes
# the file if parsing raises.
with open(my_file, "rb") as fp:
    # Create parser object to parse the pdf content
    parser = PDFParser(fp)

    # Store the parsed content in PDFDocument object
    document = PDFDocument(parser, password)

    # Check if document is extractable, if not abort
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    # Create PDFResourceManager object that stores shared resources such as
    # fonts or images
    rsrcmgr = PDFResourceManager()

    # Set parameters for layout analysis
    laparams = LAParams()

    # Use a page aggregator as the device so that each processed page can be
    # retrieved as a layout of LT objects
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)

    # Create interpreter object to process page content from PDFDocument.
    # The interpreter is connected to the resource manager and the device.
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Now that everything is set up, process the document page by page
    for page in PDFPage.create_pages(document):
        # The interpreter processes the page stored in the PDFDocument object
        interpreter.process_page(page)
        # The device returns the layout produced for that page
        layout = device.get_result()
        # Of the LT objects in the layout, only LTTextBox and LTTextLine
        # are of interest here
        for lt_obj in layout:
            if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                text_parts.append(lt_obj.get_text())

extracted_text = "".join(text_parts)

# print (extracted_text.encode("utf-8"))

# Write the extracted text as UTF-8 bytes
with open(log_file, "wb") as my_log:
    my_log.write(extracted_text.encode("utf-8"))
print("Done !!")
脚本作者在开头用两个变量指定了输入文件和输出文件：my_file 和 log_file。
您可以把脚本改写成一个以这两个路径为参数并执行提取的函数，然后在循环中多次调用该函数。
# import statements as in the original script

base_path = "C://some_folder"

# Define a pair of tuples with your file names
my_files = ("pdf1.pdf", "pdf2.pdf")
log_files = ("log1.txt", "log2.txt")

# These list comprehensions take each of the file names listed above
# and generate the complete file path
my_files = [os.path.join(base_path, x) for x in my_files]
log_files = [os.path.join(base_path, x) for x in log_files]


# Function to extract the file
def extract(my_file, log_file):
    """Extract the text of one PDF and write it to a text file.

    This is the body of the original script turned into a function.

    Args:
        my_file: Path of the PDF to read.
        log_file: Path of the text file to write (UTF-8 encoded).

    Raises:
        PDFTextExtractionNotAllowed: If the document is not extractable.
    """
    text_parts = []

    # The parser reads from the file while pages are processed, so all PDF
    # work stays inside the "with" block.
    with open(my_file, "rb") as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser, "")

        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # Keep only the text-bearing layout objects of the page
            for lt_obj in device.get_result():
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    text_parts.append(lt_obj.get_text())

    with open(log_file, "wb") as my_log:
        my_log.write("".join(text_parts).encode("utf-8"))


# Loop through the file names. The two lists are parallel, so zip() pairs
# each PDF path with its output path.
for pdf_path, txt_path in zip(my_files, log_files):
    extract(pdf_path, txt_path)
您还应该查看 os.path.join 的文档，因为您的用法不是最佳实践（先用 "/" 拼接字符串再传给 join，换操作系统时可能出错）。
假设您有以下目录结构:
script.py
pdfs
├─a.pdf
├─b.pdf
└─c.pdf
txts
其中 script.py 是您的 Python 脚本，pdfs 是存放 PDF 文档的文件夹，而 txts 是用来存放提取出的文本文件的空文件夹。
我们可以使用 pathlib.Path.glob 找出给定目录中所有 PDF 文档的路径。然后遍历这些路径：对每个路径，打开相应的 PDF 文档、解析它、提取文本，并将文本保存到 txts 文件夹中的同名文本文件里。
def main():
    """Extract the text of every PDF in "pdfs" into a same-named file in "txts".

    Each "pdfs/<name>.pdf" is parsed with PDFMiner and the text of its
    LTTextBox / LTTextLine layout objects is written to "txts/<name>.txt".
    Documents whose ``is_extractable`` flag is false are skipped.

    Returns:
        0, which the entry-point guard passes to ``sys.exit``.
    """
    from pathlib import Path
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfpage import PDFPage
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.layout import LAParams, LTTextBox, LTTextLine
    from pdfminer.converter import PDFPageAggregator

    # Create the output folder if it is missing so the write below cannot
    # fail on a fresh checkout.
    output_dir = Path("txts")
    output_dir.mkdir(exist_ok=True)

    for path in Path("pdfs").glob("*.pdf"):
        with path.open("rb") as pdf_file:
            parser = PDFParser(pdf_file)
            document = PDFDocument(parser, "")
            if not document.is_extractable:
                continue

            manager = PDFResourceManager()
            params = LAParams()
            device = PDFPageAggregator(manager, laparams=params)
            interpreter = PDFPageInterpreter(manager, device)

            # Collect fragments and join once instead of repeated "+=".
            parts = []
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
                for obj in device.get_result():
                    if isinstance(obj, (LTTextBox, LTTextLine)):
                        parts.append(obj.get_text())

        # Explicit UTF-8 so the result does not depend on the platform's
        # default encoding.
        target = output_dir / "{}.txt".format(path.stem)
        with target.open("w", encoding="utf-8") as txt_file:
            txt_file.write("".join(parts))

    return 0


if __name__ == "__main__":
    import sys

    sys.exit(main())
感谢您的帮助。我发现这个示例脚本可以将 PDF 提取到文本文件: https://gist.github.com/vinovator/c78c2cb63d62fdd9fb67
这个脚本可以正常工作，而且可能是我找到的提取效果最准确的一个。我想修改它，让它循环处理多个 PDF，并把每个 PDF 的文本分别写入一个与该 PDF 同名的文本文件。但我一直没能做到：结果要么只写出一个文本文件，要么把我要提取的 PDF 本身覆盖掉。有人能帮我遍历某个文件夹中的所有 PDF，并把它们分别提取到与各自 PDF 同名的独立文本文件中吗？
在此先感谢您的帮助!
import os
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
# From PDFInterpreter import both PDFResourceManager and PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
# Import this to raise exception whenever text extraction from PDF is not allowed
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
from pdfminer.converter import PDFPageAggregator
# Extract the text of a single PDF with PDFMiner and write it, UTF-8 encoded,
# to a text file. Input and output paths are fixed by my_file / log_file.
base_path = "C://some_folder"

my_file = os.path.join(base_path + "/" + "test_pdf.pdf")
log_file = os.path.join(base_path + "/" + "pdf_log.txt")

password = ""
# Text fragments are collected here and joined once after the last page.
text_parts = []

# Open the PDF in binary mode. The parser reads from the file while pages are
# processed, so all PDF work stays inside the "with" block, which also closes
# the file if parsing raises.
with open(my_file, "rb") as fp:
    # Create parser object to parse the pdf content
    parser = PDFParser(fp)

    # Store the parsed content in PDFDocument object
    document = PDFDocument(parser, password)

    # Check if document is extractable, if not abort
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    # Create PDFResourceManager object that stores shared resources such as
    # fonts or images
    rsrcmgr = PDFResourceManager()

    # Set parameters for layout analysis
    laparams = LAParams()

    # Use a page aggregator as the device so that each processed page can be
    # retrieved as a layout of LT objects
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)

    # Create interpreter object to process page content from PDFDocument.
    # The interpreter is connected to the resource manager and the device.
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Now that everything is set up, process the document page by page
    for page in PDFPage.create_pages(document):
        # The interpreter processes the page stored in the PDFDocument object
        interpreter.process_page(page)
        # The device returns the layout produced for that page
        layout = device.get_result()
        # Of the LT objects in the layout, only LTTextBox and LTTextLine
        # are of interest here
        for lt_obj in layout:
            if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                text_parts.append(lt_obj.get_text())

extracted_text = "".join(text_parts)

# print (extracted_text.encode("utf-8"))

# Write the extracted text as UTF-8 bytes
with open(log_file, "wb") as my_log:
    my_log.write(extracted_text.encode("utf-8"))
print("Done !!")
脚本作者在开头用两个变量指定了输入文件和输出文件：my_file 和 log_file。
您可以把脚本改写成一个以这两个路径为参数并执行提取的函数，然后在循环中多次调用该函数。
# import statements as in the original script

base_path = "C://some_folder"

# Define a pair of tuples with your file names
my_files = ("pdf1.pdf", "pdf2.pdf")
log_files = ("log1.txt", "log2.txt")

# These list comprehensions take each of the file names listed above
# and generate the complete file path
my_files = [os.path.join(base_path, x) for x in my_files]
log_files = [os.path.join(base_path, x) for x in log_files]


# Function to extract the file
def extract(my_file, log_file):
    """Extract the text of one PDF and write it to a text file.

    This is the body of the original script turned into a function.

    Args:
        my_file: Path of the PDF to read.
        log_file: Path of the text file to write (UTF-8 encoded).

    Raises:
        PDFTextExtractionNotAllowed: If the document is not extractable.
    """
    text_parts = []

    # The parser reads from the file while pages are processed, so all PDF
    # work stays inside the "with" block.
    with open(my_file, "rb") as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser, "")

        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # Keep only the text-bearing layout objects of the page
            for lt_obj in device.get_result():
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    text_parts.append(lt_obj.get_text())

    with open(log_file, "wb") as my_log:
        my_log.write("".join(text_parts).encode("utf-8"))


# Loop through the file names. The two lists are parallel, so zip() pairs
# each PDF path with its output path.
for pdf_path, txt_path in zip(my_files, log_files):
    extract(pdf_path, txt_path)
您还应该查看 os.path.join 的文档，因为您的用法不是最佳实践（先用 "/" 拼接字符串再传给 join，换操作系统时可能出错）。
假设您有以下目录结构:
script.py
pdfs
├─a.pdf
├─b.pdf
└─c.pdf
txts
其中 script.py 是您的 Python 脚本，pdfs 是存放 PDF 文档的文件夹，而 txts 是用来存放提取出的文本文件的空文件夹。
我们可以使用 pathlib.Path.glob 找出给定目录中所有 PDF 文档的路径。然后遍历这些路径：对每个路径，打开相应的 PDF 文档、解析它、提取文本，并将文本保存到 txts 文件夹中的同名文本文件里。
def main():
    """Extract the text of every PDF in "pdfs" into a same-named file in "txts".

    Each "pdfs/<name>.pdf" is parsed with PDFMiner and the text of its
    LTTextBox / LTTextLine layout objects is written to "txts/<name>.txt".
    Documents whose ``is_extractable`` flag is false are skipped.

    Returns:
        0, which the entry-point guard passes to ``sys.exit``.
    """
    from pathlib import Path
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfpage import PDFPage
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.layout import LAParams, LTTextBox, LTTextLine
    from pdfminer.converter import PDFPageAggregator

    # Create the output folder if it is missing so the write below cannot
    # fail on a fresh checkout.
    output_dir = Path("txts")
    output_dir.mkdir(exist_ok=True)

    for path in Path("pdfs").glob("*.pdf"):
        with path.open("rb") as pdf_file:
            parser = PDFParser(pdf_file)
            document = PDFDocument(parser, "")
            if not document.is_extractable:
                continue

            manager = PDFResourceManager()
            params = LAParams()
            device = PDFPageAggregator(manager, laparams=params)
            interpreter = PDFPageInterpreter(manager, device)

            # Collect fragments and join once instead of repeated "+=".
            parts = []
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
                for obj in device.get_result():
                    if isinstance(obj, (LTTextBox, LTTextLine)):
                        parts.append(obj.get_text())

        # Explicit UTF-8 so the result does not depend on the platform's
        # default encoding.
        target = output_dir / "{}.txt".format(path.stem)
        with target.open("w", encoding="utf-8") as txt_file:
            txt_file.write("".join(parts))

    return 0


if __name__ == "__main__":
    import sys

    sys.exit(main())