在 Python 中使用 Adobe Reader 的“导出为文本”功能

Using Adobe Reader's "Export as text" function in Python

我想将大量 PDF 转换为文本文件。格式非常重要,而似乎只有 Adobe Reader 能正确保留格式(PDFMiner 和 PyPDF2 都做不到)。

有没有办法自动执行 Adobe Reader 中的 "export as text" 功能?

以下代码将对一个文件执行您想要的操作。我建议将脚本组织成几个小函数,然后循环调用这些函数来处理许多文件。您需要使用 pip 或其他工具安装 keyboard 库。

import pathlib as pl
import os
import keyboard
import time
import io


# Key name kept as a module-level setting; nothing in this part of the
# script reads it (presumably intended as an abort hotkey -- TODO confirm).
KILL_KEY = 'esc'
# The PDF to convert. Edit this path to point at your own file.
read_path  = pl.Path("C:/Users/Sam/Downloads/WS-1401-IP.pdf")
####################################################################


# The text file is written next to the PDF: same folder, same base name,
# with the final suffix replaced by ".txt".
write_path = read_path.with_suffix(".txt")
# True when the target already exists, i.e. saving will trigger an
# overwrite confirmation that has to be answered.
overwrite_file = os.path.exists(write_path)

# Keystroke sequence sent to the PDF viewer:
#   alt      -- activate keyboard shortcuts
#   f        -- open the File menu
#   v        -- select the "save as text" option
#   <path>   -- type the output path into the save dialog
#   alt+s    -- press the Save button
#   y        -- confirm the overwrite prompt (only if the file existed)
#   ctrl+w   -- close the document
#
# The fixed sleeps give the viewer time to react between keystrokes;
# increase them if the machine or the viewer is slow.


def _mtime_ns(path):
    """Return the modification time of *path* in ns, or None if unavailable."""
    try:
        return os.stat(path).st_mtime_ns
    except OSError:
        return None


# Snapshot the target before saving. When the file already exists, a plain
# existence check cannot tell whether the export happened, so the wait
# below looks for a *changed* modification time instead.
stamp_before = _mtime_ns(write_path)

os.startfile(read_path)
time.sleep(1)
keyboard.press_and_release('alt')
time.sleep(1)
keyboard.press_and_release('f')  # -- open file menu
time.sleep(1)
keyboard.press_and_release('v')  # -- select "save as text" option
time.sleep(1)
keyboard.write(str(write_path))
time.sleep(1)
keyboard.press_and_release('alt+s')
time.sleep(2)
if overwrite_file:
    # Answer "yes" to the overwrite confirmation.
    keyboard.press_and_release('y')

# Wait (up to about 5 seconds) for the viewer to create or rewrite the file.
for _ in range(5):
    time.sleep(1)
    stamp_now = _mtime_ns(write_path)
    if stamp_now is not None and stamp_now != stamp_before:
        break
else:
    # The loop ran out without seeing a new or modified output file.
    raise ValueError(
        f"program probably saved to somewhere other than {write_path}\n"
    )

keyboard.press_and_release('ctrl+w')  # close the file