如何将捕获的数据从 PDF 导出到 DataFrame? [正则表达式]
How to export captured data from PDF into a DataFrame? [RegEx]
import re
import pdfplumber
import pandas as pd
from collections import namedtuple
path = r"C:\Users\x\Documents\Python Scripts\Files\x.pdf"
Line = namedtuple('Line', 'print_date order_no pos item_no issue qty UM price req_date line_amt')
with pdfp.open(path) as pdf:
page = pdf.pages[2]
text = page.extract_text()
new_vend_re = re.compile(r'^\d{1,}\s[A-Z].*')
for line in text.split('\n'):
if new_vend_re.match(line):
print(line)
这会找到并打印如下内容:
53 AB839-11 0002 31.00 EA 58.5300 1814.43
有些页面和页面必须提取类似的值。那只是一个例子。
执行处理的剩余代码:
line_items = []
with pdfplumber.open(path) as pdf:
pages = pdf.pages
for page in pdf.pages:
text = page.extract_text()
for line in text.split('\n'):
line = new_vend_re.search(line)
if line:
pos = line.group(1)
item_no = line.group(2)
issue = line.group(3)
qty = line.group(4)
UM = line.group(5)
price = line.group(6)
amt = line.group(7)
line_items.append(Inv(pos, item_no, issue, qty, UM, price, amt))
df = pd.DataFrame(line_items)
df.head()
我有这段代码,但它似乎无法将提取的数据放入它们各自的元组中。
我的程序基本上应该遍历具有多个页面的 PDF,并精确地从正则表达式中提取的各种项目中获取值,并将它们放入元组中,但我的代码由于某种原因不起作用。
您的正则表达式是错误的 - 它以 "^\d+"
开头 - 意思是行首后跟数字。
您文件中的行以 "(......)"
开头 - 更改正则表达式:
import re
from collections import namedtuple
Inv = namedtuple('Inv', 'pos, item_no, issue, qty, UM, price, amt')
new_vend_re = re.compile(r'\d+\s[A-Z].*')
text = "some\nmore (53 AB839-11 0002 31.00 EA 58.5300 1814.43) things \ntext\n"
line_items = []
for line in text.split('\n'):
searched = new_vend_re.search(line)
if searched:
print(line)
# get the matched part of the line and remove ( ) from start/end
m = searched.group(0).strip("()")
# now its as simple as splitting it into variables
pos, item_no, issue, qty, UM, price, amt, *crap = m.split()
# and use a namedtuple that works with that amount of data
line_items.append(Inv(pos, item_no, issue, qty, UM, price, amt))
if crap:
print(crap, "that was also captured but not used")
print(*line_items)
import pandas as pd
df = pd.DataFrame(line_items)
print(df.head())
输出:
# line
more (53 AB839-11 0002 31.00 EA 58.5300 1814.43) things
# crap catchall
['things'] that was also captured but not used
# named tuple
Inv(pos='53', item_no='AB839-11', issue='0002', qty='31.00', UM='EA', price='58.5300', amt='1814.43)')
# df
pos item_no issue qty UM price amt
0 53 AB839-11 0002 31.00 EA 58.5300 1814.43)
import re
import pdfplumber
import pandas as pd
from collections import namedtuple
path = r"C:\Users\x\Documents\Python Scripts\Files\x.pdf"
Line = namedtuple('Line', 'print_date order_no pos item_no issue qty UM price req_date line_amt')
with pdfp.open(path) as pdf:
page = pdf.pages[2]
text = page.extract_text()
new_vend_re = re.compile(r'^\d{1,}\s[A-Z].*')
for line in text.split('\n'):
if new_vend_re.match(line):
print(line)
这会找到并打印如下内容:
53 AB839-11 0002 31.00 EA 58.5300 1814.43
有些页面和页面必须提取类似的值。那只是一个例子。 执行处理的剩余代码:
line_items = []
with pdfplumber.open(path) as pdf:
pages = pdf.pages
for page in pdf.pages:
text = page.extract_text()
for line in text.split('\n'):
line = new_vend_re.search(line)
if line:
pos = line.group(1)
item_no = line.group(2)
issue = line.group(3)
qty = line.group(4)
UM = line.group(5)
price = line.group(6)
amt = line.group(7)
line_items.append(Inv(pos, item_no, issue, qty, UM, price, amt))
df = pd.DataFrame(line_items)
df.head()
我有这段代码,但它似乎无法将提取的数据放入它们各自的元组中。 我的程序基本上应该遍历具有多个页面的 PDF,并精确地从正则表达式中提取的各种项目中获取值,并将它们放入元组中,但我的代码由于某种原因不起作用。
您的正则表达式是错误的 - 它以 "^\d+"
开头 - 意思是行首后跟数字。
您文件中的行以 "(......)"
开头 - 更改正则表达式:
import re
from collections import namedtuple
Inv = namedtuple('Inv', 'pos, item_no, issue, qty, UM, price, amt')
new_vend_re = re.compile(r'\d+\s[A-Z].*')
text = "some\nmore (53 AB839-11 0002 31.00 EA 58.5300 1814.43) things \ntext\n"
line_items = []
for line in text.split('\n'):
searched = new_vend_re.search(line)
if searched:
print(line)
# get the matched part of the line and remove ( ) from start/end
m = searched.group(0).strip("()")
# now its as simple as splitting it into variables
pos, item_no, issue, qty, UM, price, amt, *crap = m.split()
# and use a namedtuple that works with that amount of data
line_items.append(Inv(pos, item_no, issue, qty, UM, price, amt))
if crap:
print(crap, "that was also captured but not used")
print(*line_items)
import pandas as pd
df = pd.DataFrame(line_items)
print(df.head())
输出:
# line
more (53 AB839-11 0002 31.00 EA 58.5300 1814.43) things
# crap catchall
['things'] that was also captured but not used
# named tuple
Inv(pos='53', item_no='AB839-11', issue='0002', qty='31.00', UM='EA', price='58.5300', amt='1814.43)')
# df
pos item_no issue qty UM price amt
0 53 AB839-11 0002 31.00 EA 58.5300 1814.43)