使用 Python 从 PDF 中仅提取特定文本

Extract only specific text from PDF using Python

只需要使用 python 从具有不同 PDF 结构的发票 PDF 文件中提取特定文本,并将输出数据存储到特定的 excel 列中。所有 PDF 文件的结构不同,但内容值相同。

试图解决它,但无法仅提取特定的文本值。

示例 PDF 文件:

Click to view the sample file

需要从整个 PDF 文件中提取发票 ID、签发日期、主题、应付金额。

到目前为止我使用的脚本:

import PyPDF2
import re

# Read the text of the first page. The context manager guarantees that the
# file handle is closed even if PyPDF2 raises while parsing.
with open('test.pdf', 'rb') as pdfFileObj:
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    pageObj = pdfReader.getPage(0)
    text = str(pageObj.extractText())

# Collect every double-quoted substring found in the page text.
quotes = re.findall(r'"[^"]*"', text)
print(quotes)

你有一个非常好的pdf文档,因为你的pdf有表单字段,所以你可以直接使用它们来读取数据:

import PyPDF2


# Read the form text fields while the file is open; the context manager
# closes the file handle afterwards, even if PyPDF2 raises.
with open('test.pdf', 'rb') as pdfFileObj:
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    fields = pdfReader.getFormTextFields()

# Each lookup raises KeyError if the PDF has no form field with that name.
print(fields["Invoice ID"])
print(fields["Issue Date"])
print(fields["Subject"])
print(fields["Amount Due"])

编辑:我把您需要的数据(来自这里:How to extract only specific text from PDF file using python)整合进了一个小脚本,其中包含 3 种解析 PDF 的方式(分别对应您的 3 个 PDF)。难点在于您的 PDF 之间差异很大,而不同的库在不同的 PDF 上各有优势,所以我认为您必须把它们结合起来使用。思路是依次尝试所有解析函数,直到得到结果为止。希望这对您来说是一个好的起点。如果您还有更多结构不同的 PDF,可能需要修改正则表达式,并且可能需要把每个字段的所有正则表达式存放在一个列表中,供不同的函数使用;这样您就有 3 个解析函数,以及在其中 2 个函数中使用的 4 个正则表达式列表。

import PyPDF2
import re
import os

from io import StringIO

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser


def parse_pdf_by_regex_2(filename: str) -> dict:
    """Extract invoice fields from a PDF using pdfminer text plus regexes.

    The text of every page is rendered into an in-memory buffer and four
    patterns are searched in it.

    Args:
        filename: Path of the PDF file to parse.

    Returns:
        A dict with the keys ``invoice_id``, ``issue_date``, ``subject`` and
        ``amount`` (first capture group of the respective pattern), or an
        empty dict if at least one pattern does not match. In the latter case
        a message is printed.
    """
    text_buffer = StringIO()
    with open(filename, 'rb') as pdf_file:
        document = PDFDocument(PDFParser(pdf_file))
        resource_manager = PDFResourceManager()
        converter = TextConverter(resource_manager, text_buffer, laparams=LAParams())
        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        for pdf_page in PDFPage.create_pages(doc=document):
            page_interpreter.process_page(pdf_page)

    extracted_text = text_buffer.getvalue()

    # Output key -> pattern; insertion order fixes the order of the result.
    # Note that "issue_date" is filled from the "Due Date:" label and
    # "subject" from the "IRN:" label.
    field_patterns = {
        "invoice_id": re.compile(r"Invoice No.:\s*(\w+)\s"),
        "issue_date": re.compile(r"Due Date: (\d{2}\.\d{2}\.\d{4})"),
        "subject": re.compile(r"IRN:\s*(\d+)"),
        "amount": re.compile(r"([\d,.]+) \n\nTotal Invoice Value\(in words\)"),
    }

    found = {}
    for key, pattern in field_patterns.items():
        hit = pattern.search(extracted_text)
        if hit is None:
            # A single missing field invalidates the whole result.
            print("Not all elements have been found")
            return {}
        found[key] = hit.group(1)
    return found


def parse_pdf_by_form_fields(filename: str) -> dict:
    """Extract invoice fields from the form text fields of a PDF.

    Args:
        filename: Path of the PDF file to parse.

    Returns:
        A dict with the keys ``invoice_id``, ``issue_date``, ``subject`` and
        ``amount``, or an empty dict if reading the form fields raises
        ``TypeError`` or if any of the expected field names is missing.
    """
    # Output key -> name of the form field in the PDF.
    wanted_fields = {
        "invoice_id": "Invoice ID",
        "issue_date": "Issue Date",
        "subject": "Subject",
        "amount": "Amount Due",
    }

    with open(filename, 'rb') as pdf_file:
        reader = PyPDF2.PdfFileReader(pdf_file)
        try:
            form_values = reader.getFormTextFields()
        except TypeError:
            # Presumably raised when the PDF has no form fields at all
            # (TODO confirm against the PyPDF2 version in use).
            return {}

    try:
        # All four fields are required; you could also accept partial results
        # if only some values are missing, but that depends on your data.
        return {key: form_values[label] for key, label in wanted_fields.items()}
    except KeyError:
        return {}


def parse_pdf_by_regex(filename: str) -> dict:
    """Extract invoice fields from PyPDF2-extracted text using regexes.

    The text of all pages is concatenated and four patterns are searched
    in it.

    Args:
        filename: Path of the PDF file to parse.

    Returns:
        A dict with the keys ``invoice_id``, ``issue_date``, ``subject`` and
        ``amount`` (first capture group of the respective pattern), or an
        empty dict if at least one pattern does not match.
    """
    with open(filename, 'rb') as file:
        pdf_reader = PyPDF2.PdfFileReader(file)
        text_data = "".join(
            pdf_reader.getPage(page_no).extractText()
            for page_no in range(pdf_reader.getNumPages())
        )

    regex_invoice_no = re.compile(r"Invoice Number\s*(INV-\d+)")
    regex_order_no = re.compile(r"Order Number(\d+)")
    regex_due_date = re.compile(r"Due Date(\S+ \d{1,2}, \d{4})")
    # The dollar sign must be escaped: a bare `$` is the end-of-string anchor,
    # so the pattern could never match an amount such as "$12.50".
    regex_total_due = re.compile(r"Total Due(\$\d+\.\d{1,2})")

    try:
        # `search` returns None for a missing field, so `.group` raises
        # AttributeError, which is treated as "this strategy does not apply".
        return {"invoice_id": regex_invoice_no.search(text_data).group(1),
                "issue_date": regex_due_date.search(text_data).group(1),
                "subject": regex_order_no.search(text_data).group(1),
                "amount": regex_total_due.search(text_data).group(1)}
    except AttributeError:
        return {}


def parse_pdf(filename: str) -> dict:
    """Parse a PDF by trying each strategy until one yields data.

    The strategies are tried in this order: form fields, PyPDF2 text with
    regexes, pdfminer text with regexes.

    Args:
        filename: Path of the PDF file to parse.

    Returns:
        The first non-empty dict returned by a strategy, or an empty dict
        (after printing "No data found") if every strategy returns nothing.
    """
    # Pass the `filename` argument on to the strategies; relying on a
    # module-level loop variable would raise NameError (or parse the wrong
    # file) whenever this function is called from anywhere else.
    strategies = (parse_pdf_by_form_fields, parse_pdf_by_regex, parse_pdf_by_regex_2)
    for strategy in strategies:
        # Hint: ':=' is available since Python 3.8
        if data := strategy(filename=filename):
            return data

    print("No data found")
    return {}


if __name__ == '__main__':
    # Run the parser over every entry of the current directory whose name
    # starts with "testfile" and print what was extracted.
    for fname in os.listdir("."):
        if not fname.startswith("testfile"):
            continue
        print(f"check {fname}")
        result = parse_pdf(filename=fname)
        print(result)