将 PDF 文件转换为 .txt python 3

Convert PDF file to .txt python 3

from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import os
import sys, getopt

#converts pdf, returns its text content as a string
def convert(fname, pages=None):
    if not pages:
        pagenums = set()
        pagenums = set(pages)

    output = StringIO
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    filepath = open(fname, 'rb')
    for page in PDFPage.get_pages(filepath, pagenums):
    text = output.getvalue()
    return text 

def convertMultiple(pdfDir, txtDir):
    if pdfDir == "": pdfDir = os.getcwd() + "\" #if no pdfDir passed in 
    for pdf in os.listdir(pdfDir): #iterate through pdfs in pdf directory
        fileExtension = pdf.split(".")[-1]
        if fileExtension == "pdf":
            pdfFilename = pdfDir + pdf 
            text = convert(pdfFilename) #get string of text content of pdf
            textFilename = txtDir + pdf + ".txt"
            textFile = open(textFilename, "w") #make text file
            textFile.write(text) #write text to text file

pdfDir = (r"FK_EPPS")
txtDir = (r"FK_txt")
convertMultiple(pdfDir, txtDir)

我尝试将多个名为 FK_EPPS 的 pdf 文件转换为 txt 文件,并将其写入名为 FK_txt 的不同文件夹中。但是它说没有这样的文件或目录。我把文件夹正好放在那些路径中。我尝试找到解决方案,但仍然存在错误。你能帮我看看为什么会这样吗?

/usr/local/lib/python2.7/dist-packages/pdfminer/__init__.py:20: UserWarning: On January 1st, 2020, pdfminer.six will stop supporting Python 2. Please upgrade to Python 3. For more information see https://github.com/pdfminer/pdfminer.six/issues/194
  warnings.warn('On January 1st, 2020, pdfminer.six will stop supporting Python 2. Please upgrade to Python 3. For '
Traceback (most recent call last):
  File "/home/a1-re/Documents/pdftotext/1.py", line 44, in <module>
    convertMultiple(pdfDir, txtDir)
  File "/home/a1-re/Documents/pdftotext/1.py", line 36, in convertMultiple
    text = convert(pdfFilename) #get string of text content of pdf
  File "/home/a1-re/Documents/pdftotext/1.py", line 21, in convert
    filepath = file(fname, 'rb')
IOError: [Errno 2] No such file or directory: 'pdf1831150030.pdf'

(您显示的回溯不可能是正确的。对于您的示例输入,错误应该在开头包含 FK_EPPS。)

您忘记了路径和文件名必须使用适合您的 OS 的 separator 相互分隔。

如果您在 convert 函数的开头打印出 fname 的值,您可能会立即看到这一点。您在文本输出文件名上犯了同样的错误,但这很难注意到,因为它不会产生错误,而只会创建错误的文件名。