遍历文件 (PDF) 以运行一个函数
Iterate Over Files (PDFs) to Run a Function
我正在尝试从目录 (path
) 读取 PDF 文件以从每个 PDF 中提取单独的图像并写入同一目录。但是,我无法对每个文件执行以下功能,因为我的脚本只解析目录中的最后一个文件。我使用的代码如下所示:
pip install pymupdf
import os
import PyPDF2
import fitz # from pymupdf
import glob
path = "C:\Users\mdl518\Desktop\"
# Question code as posted: for the PDFs matched under `path`, write each embedded image into `path`.
# NOTE(review): indentation was lost in this copy, so the intended nesting of the statements below cannot be confirmed from the text.
def pdf_extract():
for filename in glob.glob(os.path.join(path, "*.pdf"), recursive=True): # recursive=True only affects patterns containing "**", so this matches *.pdf directly inside `path`
with open(os.path.join(os.getcwd(), filename),'rb') as f: # opens the file, but the handle `f` is not used by any statement below
pdf_document=fitz.open(filename) # the document is opened by name, independently of `f`
for current_page in range(len(pdf_document)): # iterate over the total number of pages in each PDF
for image in pdf_document.getPageImageList(current_page):
xref=image[0] # the first item of each image entry is used as the cross-reference number (xref) of that image
pix=fitz.Pixmap(pdf_document, xref) # build a pixmap from the image identified by xref
if pix.n < 5: # pixmaps with n < 5 are written directly
pix.writeImage(os.path.join(path,"page%s-%s.jpg") % (current_page, xref)) # output name uses only the page index and xref, not the PDF's name
else:
pix1 = fitz.Pixmap(fitz.csRGB, pix) # otherwise build an RGB (fitz.csRGB) pixmap from pix and write that instead
pix1.writeImage(os.path.join(path,"page%s-%s.jpg") % (current_page, xref)) # same naming scheme as the branch above
pix1 = None # drop the reference to the converted pixmap
pix = None # drop the reference to the pixmap
pdf_extract()
我曾尝试使用 glob
、os.listdir()
和 os.walk()
来解析各个 PDF,但我得到的最好结果只是把最后一个 PDF 文件中的图像提取出来并写入该文件路径。有没有更简单的方法来解决这个问题,或者只需对我的“glob”语句稍作调整?非常感谢任何帮助!
有两个问题
with open(os.path.join(os.getcwd(), filename),'rb') as f: # open/read the PDF files
不需要,f
从不使用
- 主要问题是您覆盖了每个文件的图像
pix.writeImage(os.path.join(path,"page%s-%s.jpg") % (current_page, xref))
pix1.writeImage(os.path.join(path,"page%s-%s.jpg") % (current_page, xref))
current_page
& xref
不一定对每个文件都是唯一的
更新
- 代码更新为使用
pathlib
(标准库的一部分),因为它将路径视为带有方法的对象,而不像 glob
和 os
那样将路径视为字符串。另请参阅 Python 3's pathlib Module: Taming the File System
- 在保存路径中添加了
_{file.stem}
,以创建一个唯一的文件名
- 使用
f-strings
进行字符串格式化。另请参阅 PEP 498 - Literal String Interpolation
from pathlib import Path
import PyPDF2
import fitz
def pdf_extract(path_to_files: str):
    """Extract every embedded image from each PDF found under a directory.

    Searches ``path_to_files`` recursively for ``*.pdf`` files and writes each
    image next to its source PDF as ``page{page}-{xref}_{pdf stem}.jpg``.
    Including the PDF's stem in the name keeps images from different PDFs
    from overwriting one another.

    Args:
        path_to_files: Directory to search, given as a string path.
    """
    for file in Path(path_to_files).rglob('*.pdf'):  # rglob descends into sub-directories
        # The context manager closes each document before the next one is opened.
        with fitz.open(file) as pdf:
            for current_page in range(len(pdf)):
                # get_page_images / save are the current PyMuPDF names for the
                # deprecated getPageImageList / writeImage.
                for image in pdf.get_page_images(current_page):
                    xref = image[0]  # first item of an image entry is its xref
                    pix = fitz.Pixmap(pdf, xref)
                    if pix.n >= 5:
                        # Convert to RGB first so the pixmap can be saved.
                        pix = fitz.Pixmap(fitz.csRGB, pix)
                    target = file.parent / f'page{current_page}-{xref}_{file.stem}.jpg'
                    pix.save(str(target))
# Directory that is searched for PDF files; the raw string keeps the backslashes literal.
path_to_files = r'C:\Users\mdl518\Desktop' # do not include the trailing backslash '\'
# Run the extraction on that directory.
pdf_extract(path_to_files)
我正在尝试从目录 (path
) 读取 PDF 文件以从每个 PDF 中提取单独的图像并写入同一目录。但是,我无法对每个文件执行以下功能,因为我的脚本只解析目录中的最后一个文件。我使用的代码如下所示:
pip install pymupdf
import os
import PyPDF2
import fitz # from pymupdf
import glob
path = "C:\Users\mdl518\Desktop\"
# Question code as posted: for the PDFs matched under `path`, write each embedded image into `path`.
# NOTE(review): indentation was lost in this copy, so the intended nesting of the statements below cannot be confirmed from the text.
def pdf_extract():
for filename in glob.glob(os.path.join(path, "*.pdf"), recursive=True): # recursive=True only affects patterns containing "**", so this matches *.pdf directly inside `path`
with open(os.path.join(os.getcwd(), filename),'rb') as f: # opens the file, but the handle `f` is not used by any statement below
pdf_document=fitz.open(filename) # the document is opened by name, independently of `f`
for current_page in range(len(pdf_document)): # iterate over the total number of pages in each PDF
for image in pdf_document.getPageImageList(current_page):
xref=image[0] # the first item of each image entry is used as the cross-reference number (xref) of that image
pix=fitz.Pixmap(pdf_document, xref) # build a pixmap from the image identified by xref
if pix.n < 5: # pixmaps with n < 5 are written directly
pix.writeImage(os.path.join(path,"page%s-%s.jpg") % (current_page, xref)) # output name uses only the page index and xref, not the PDF's name
else:
pix1 = fitz.Pixmap(fitz.csRGB, pix) # otherwise build an RGB (fitz.csRGB) pixmap from pix and write that instead
pix1.writeImage(os.path.join(path,"page%s-%s.jpg") % (current_page, xref)) # same naming scheme as the branch above
pix1 = None # drop the reference to the converted pixmap
pix = None # drop the reference to the pixmap
pdf_extract()
我曾尝试使用 glob
、os.listdir()
和 os.walk()
来解析各个 PDF,但我得到的最好结果只是把最后一个 PDF 文件中的图像提取出来并写入该文件路径。有没有更简单的方法来解决这个问题,或者只需对我的“glob”语句稍作调整?非常感谢任何帮助!
有两个问题
with open(os.path.join(os.getcwd(), filename),'rb') as f: # open/read the PDF files
不需要,f
从不使用
- 主要问题是您覆盖了每个文件的图像
pix.writeImage(os.path.join(path,"page%s-%s.jpg") % (current_page, xref))
pix1.writeImage(os.path.join(path,"page%s-%s.jpg") % (current_page, xref))
current_page
& xref
不一定对每个文件都是唯一的
更新
- 代码更新为使用
pathlib
(标准库的一部分),因为它将路径视为带有方法的对象,而不像 glob
和 os
那样将路径视为字符串。另请参阅 Python 3's pathlib Module: Taming the File System
- 在保存路径中添加了
_{file.stem}
,以创建一个唯一的文件名
- 使用
f-strings
进行字符串格式化。另请参阅 PEP 498 - Literal String Interpolation
from pathlib import Path
import PyPDF2
import fitz
def pdf_extract(path_to_files: str):
    """Extract every embedded image from each PDF found under a directory.

    Searches ``path_to_files`` recursively for ``*.pdf`` files and writes each
    image next to its source PDF as ``page{page}-{xref}_{pdf stem}.jpg``.
    Including the PDF's stem in the name keeps images from different PDFs
    from overwriting one another.

    Args:
        path_to_files: Directory to search, given as a string path.
    """
    for file in Path(path_to_files).rglob('*.pdf'):  # rglob descends into sub-directories
        # The context manager closes each document before the next one is opened.
        with fitz.open(file) as pdf:
            for current_page in range(len(pdf)):
                # get_page_images / save are the current PyMuPDF names for the
                # deprecated getPageImageList / writeImage.
                for image in pdf.get_page_images(current_page):
                    xref = image[0]  # first item of an image entry is its xref
                    pix = fitz.Pixmap(pdf, xref)
                    if pix.n >= 5:
                        # Convert to RGB first so the pixmap can be saved.
                        pix = fitz.Pixmap(fitz.csRGB, pix)
                    target = file.parent / f'page{current_page}-{xref}_{file.stem}.jpg'
                    pix.save(str(target))
# Directory that is searched for PDF files; the raw string keeps the backslashes literal.
path_to_files = r'C:\Users\mdl518\Desktop' # do not include the trailing backslash '\'
# Run the extraction on that directory.
pdf_extract(path_to_files)