将 PDF 数据提取到 Dataframe 中

Extracting PDF Data into a Dataframe

我正在尝试获取此数据并将其转换为 pandas 中的数据框:

我正在使用 camelot,它虽然能“运行”,但我只得到了 2 列。代码如下:

import camelot


# Read the PDF with camelot's 'stream' flavor and print the first table found.
pdf_path = 'Inventory_Summary.pdf'
tables = camelot.read_pdf(pdf_path, flavor='stream')
first_table = tables[0]
print(first_table)

实际情况是:它把左侧的所有内容都当成了第 1 列,而第 2 列中只有被涂黑的那部分信息。

我只想将日期下方的信息放入数据框中

如果您能提供任何帮助,那就太好了!

谢谢!

-小吉弗

你的表格来源看起来很规整,非常适合通过设定感兴趣区域来提取;另外,你也可以把在 python 中调用 poppler 的 pdftotext 作为备选方案(我自己不用这种方式)。你没有提供可用于测试的最小输入,所以我用了一个质量较差的类似文件来演示:当你需要可靠的固定区域时,可以采用下面这样的做法;最坏的情况下,可以把输入重新打印(re-print)成一个新的 pdf 再处理。

下面是一个类似的、质量较差的来源文件(不是我的文件,所以我无法控制 pdf 中被裁剪到页面之外的数据;不过如果需要,我可以通过调整宽度把这些隐藏的数据裁掉)。

这样就可以在屏幕上得到所需的输出(包括隐藏列);也可以改为输出到文本文件,在提取之后再添加字符分隔符(例如生成 csv 文件),或者更简单地作为按列对齐的纯文本直接导入 excel。

pdftotext -nopgbrk -x 0 -y 120 -W 1000 -H 300 -fixed 3.8 inventory.pdf -

在命令行中运行 pdftotext -h 即可查看 pdftotext 的各个选项。

我就是这样解决的...

import PyPDF2
import pandas as pd
import numpy as np
 
 
# Column accumulators. Each list collects the words that follow one row label
# in the extracted page text; they are stacked side by side at the end.
lines = []  # never filled by this script; kept so the module-level name exists
sites = []
kinds = []
total_offqc_wip_inv = []
total_offqc_scale_inv = []
total_offqc_truck_inv = []
total_offqc_rail_inv = []
total_offqc_boat_inv = []


def _values_after_label(page_text, label):
    """Return the non-label words of each line of *page_text* holding *label*.

    A line matches when *label* occurs anywhere in it (substring test). Each
    matching line is split on whitespace and every word that is itself one of
    the label's words is dropped, so only the remaining values are returned,
    in reading order.
    """
    label_words = label.split()
    values = []
    for line in page_text.splitlines():
        if label not in line:
            continue
        values.extend(word for word in line.split() if word not in label_words)
    return values


# (row label, list that receives its values) for the rows read from pages
# 0 and 3 of the report.
_ROWS_ON_PAGES_0_AND_3 = (
    ('Site', sites),
    ('All Shifts', kinds),
    ('Total OffQc WIP Inv', total_offqc_wip_inv),
    ('Total OffQc Scale Inv', total_offqc_scale_inv),
    ('Total OffQc Truck Inv', total_offqc_truck_inv),
)

# Same, for the rows read from pages 1 and 4.
_ROWS_ON_PAGES_1_AND_4 = (
    ('Total OffQc Rail Inv', total_offqc_rail_inv),
    ('Total OffQc Boat Inv', total_offqc_boat_inv),
)

# Zero-based page index -> rows to pull from that page. Pages 2 and 5 hold
# nothing this script uses, so they are not extracted at all.
_ROWS_BY_PAGE = (
    (0, _ROWS_ON_PAGES_0_AND_3),
    (1, _ROWS_ON_PAGES_1_AND_4),
    (3, _ROWS_ON_PAGES_0_AND_3),
    (4, _ROWS_ON_PAGES_1_AND_4),
)

# The context manager closes the file even if reading or extraction raises.
# NOTE(review): PyPDF2 3.x removed PdfFileReader/getPage/extractText in favour
# of PdfReader/pages/extract_text -- confirm the installed version before
# switching the calls below.
with open('PDFs/Inventory_Summary.pdf', 'rb') as pdfFileObj:
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    # Extract the text while the file is still open; only plain strings are
    # needed afterwards.
    page_texts = [
        (pdfReader.getPage(page_index).extractText().strip(), rows)
        for page_index, rows in _ROWS_BY_PAGE
    ]

# No slicing on a hard-coded report timestamp is done here: the sliced text
# was never used, and the lookup failed on any page lacking that exact string.
for page_text, rows in page_texts:
    for label, column in rows:
        column.extend(_values_after_label(page_text, label))

# Presumably the 'Site' rows carry no entry for the totals column, so one is
# added by hand to keep the columns the same length -- verify against the PDF.
sites.append("Total")

# column_stack needs every list to have the same length and raises ValueError
# otherwise; the values stay strings exactly as extracted.
d = np.column_stack([
    sites,
    kinds,
    total_offqc_wip_inv,
    total_offqc_scale_inv,
    total_offqc_truck_inv,
    total_offqc_rail_inv,
    total_offqc_boat_inv,
])

df = pd.DataFrame(d)