如何使用 python 从 pdf 文件中使用 camelot 提取 table 名称和 table?
How to extract table name along with table using camelot from pdf files using python?
我正在尝试使用 python 中的 camelot 从 pdf 文件中提取 table 和 table 名称。虽然我知道如何使用 camelot 提取 tables(这非常简单),但我很难找到任何关于如何提取 table 名称的帮助。目的是提取这些信息,并向用户直观地展示各个 table 及其名称,以便用户从列表中选择相关的 table。
我试过提取 tables,然后也从 pdf 中提取文本。这两方面我都成功了,但在将 table 名称与对应的 table 关联起来这一点上却没有成功。
def tables_from_pdfs(filespath):
    """Extract the Camelot tables of every ``*.pdf`` file in a directory.

    Args:
        filespath: Directory that is searched (non-recursively) for files
            matching ``*.pdf``.

    Returns:
        A ``(dictionary, keys)`` tuple. ``keys`` lists the PDF file names
        without their extension, in the order ``glob`` returned them.
        ``dictionary`` maps each of those names to a dict whose keys are
        zero-based page indexes and whose values are whatever
        ``camelot.read_pdf`` returned for that page.
    """
    pdffiles = glob.glob(os.path.join(filespath, "*.pdf"))
    print(pdffiles)
    dictionary = {}
    keys = []
    for file in pdffiles:
        print(file)
        # Only the page count is needed here, so the handle is closed as soon
        # as it has been read instead of being left open for each file.
        with open(file, 'rb') as handle:
            n = PyPDF2.PdfFileReader(handle).getNumPages()
        print(n)
        tables_dict = {}
        for i in range(n):
            # Camelot numbers pages from 1, so index i corresponds to page
            # i + 1; passing str(i) asked for page "0" and never reached the
            # last page.
            tables_dict[i] = camelot.read_pdf(file, pages=str(i + 1))
        # Drop only the trailing extension; str.replace would also remove a
        # ".pdf" that happens to occur in the middle of the file name.
        tail = os.path.splitext(os.path.basename(file))[0]
        keys.append(tail)
        dictionary[tail] = tables_dict
    return dictionary, keys
预期结果是 table 和 pdf 文件中所述的 table 的名称。例如:
Table 在 pdf 名称的第 x 页上:Table 1. Blah Blah blah
'''Table'''
Tables 由 camelot API 中的 TableList 和 Table 类表示,相关文档见:
https://camelot-py.readthedocs.io/en/master/api.html#camelot.core.TableList
摘自该网页:
底层类(Lower-Level Classes)
Camelot 本身不记录 table 名称,只有单元格数据的描述。
不过它使用了 python 的 pandas DataFrame API,其中或许可以保存 table 名称。
结合使用 Camelot 和 Pandas 得到 table 名称。
Get the name of a pandas DataFrame
附加更新回答
来自
https://camelot-py.readthedocs.io/en/master/
# Interactive session quoted from the Camelot documentation. The lines that
# are not statements (e.g. "<TableList n=1>" and the "{...}" report) appear to
# be the interpreter's echo of the expression directly above them, so this
# snippet is meant to be read, not executed as a script.
import camelot
# Parse 'foo.pdf'; the echo below shows that a TableList with one table came back.
tables = camelot.read_pdf('foo.pdf')
tables
<TableList n=1>
# Export every table in the list at once.
tables.export('foo.csv', f='csv', compress=True) # json, excel, html
tables[0]
<Table shape=(7, 7)>
# Per-table report; note that it has no field for a table name or caption.
tables[0].parsing_report
{
'accuracy': 99.02,
'whitespace': 12.24,
'order': 1,
'page': 1
}
# Export a single table.
tables[0].to_csv('foo.csv') # to_json, to_excel, to_html
df_table = tables[0].df # get a pandas DataFrame!
# Added by the answer: label the DataFrame through an ordinary Python
# attribute. The string is supplied by hand; it is not read from the PDF.
df_table.name = 'name here'
# Adapted from a pandas example: attach a label to a DataFrame by setting an
# ordinary Python attribute on the object.
import numpy as np  # required by np.ones below; the original snippet omitted it
import pandas as pd

df = pd.DataFrame(data=np.ones([4, 4]))
df.name = 'Ones'
# print() is a function in Python 3; the statement form is a SyntaxError there.
print(df.name)
注意:添加的 'name' 属性不是 df 的一部分。序列化df时,添加的name属性丢失
补充回答,'name'属性其实叫'index'.
Getting values
>>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]],
... index=['cobra', 'viper', 'sidewinder'],
... columns=['max_speed', 'shield'])
>>> df
max_speed shield
cobra 1 2
viper 4 5
sidewinder 7 8
Single label. Note this returns the row as a Series.
>>> df.loc['viper']
max_speed 4
shield 5
Name: viper, dtype: int64
我找到了相关的解决方案。至少对我有用。
import os, PyPDF2, time, re, shutil
import pytesseract
from pdf2image import convert_from_path
import camelot
import datefinder
from difflib import SequenceMatcher
def similar(a, b):
    """Return the ``difflib.SequenceMatcher`` ratio (0.0 to 1.0) of *a* and *b*."""
    return SequenceMatcher(None, a, b).ratio()


similarityAmt = 0.6  # find with 60% similarity


def _strip_whitespace(value):
    """Return ``str(value)`` with every whitespace character removed."""
    return "".join(str(value).split())


def _header_text(dataframe):
    """Return the table's header cells joined into one whitespace-free string.

    When the first column label is numeric (for example default integer
    labels), the real header is assumed to be the first data row; otherwise
    the column labels themselves are used. Returns "" when no header exists.
    """
    labels = list(dataframe.keys())
    if not labels:
        return ""
    if str(labels[0]).isdigit():
        if len(dataframe) == 0:
            return ""
        cells = dataframe.iloc[0]
    else:
        cells = labels
    return "".join(_strip_whitespace(cell) for cell in cells)


def _caption_above(lines, row_index, max_lookback=10, min_length=4):
    """Return the caption text found just above ``lines[row_index]``, or None.

    Walks upwards at most *max_lookback* lines and stops at the first line
    longer than *min_length* characters (an arbitrary cut-off that skips
    blank lines). That line is returned with the line before it prepended,
    when such a line exists.
    """
    for offset in range(1, max_lookback + 1):
        idx = row_index - offset
        if idx < 0:
            # Top of the document reached. A negative index would silently
            # wrap around and pick up lines from the end of the text.
            break
        if len(lines[idx]) > min_length:
            preceding = lines[idx - 1] if idx > 0 else ""
            return preceding + lines[idx]
    return None


def find_table_name(dataframe, documentString):
    """Guess the caption of a table from the text of the document it came from.

    The table's header row is located in *documentString* by fuzzy matching
    (ratio above ``similarityAmt``), and the text immediately above the
    matching line is returned as the name.

    Args:
        dataframe: pandas DataFrame holding the extracted table.
        documentString: Text of the document, one text line per line.

    Returns:
        The nearest line above the header that is longer than four
        characters, prefixed (without a separator) by the line before it when
        there is one; "Unnamed" when no header line or caption is found.
    """
    # The header does not depend on the line being inspected, so build it once.
    header = _header_text(dataframe)
    if not header:
        return "Unnamed"
    lines = documentString.split("\n")
    for i, line in enumerate(lines):
        # A table row is horizontal, so compare line by line. Whitespace is
        # dropped on both sides because the header cells are joined without
        # separators.
        candidate = _strip_whitespace(line)
        if not candidate:
            continue
        if similar(candidate, header) > similarityAmt:
            caption = _caption_above(lines, i)
            if caption is not None:
                return caption
    return "Unnamed"
# Retrieve the text from the pdf: render every page to an image first.
# The second positional argument (500) is the rendering resolution passed to
# pdf2image.
pages = convert_from_path(pdf_path, 500) # pdf_path would be the path of the PDF which you extracted the table from
# OCR each page image and join the results, one trailing newline per page, so
# the entire PDF becomes one single string.
pdf_text = "".join(
    pytesseract.image_to_string(imgBlob, lang='eng') + "\n" for imgBlob in pages
)
# Get the name of the table using the table itself and pdf text
tableName = find_table_name(table.df, pdf_text) # A table you extracted with your code, which you want to find the name of
我正在尝试使用 python 中的 camelot 从 pdf 文件中提取 table 和 table 名称。虽然我知道如何使用 camelot 提取 tables(这非常简单),但我很难找到任何关于如何提取 table 名称的帮助。目的是提取这些信息,并向用户直观地展示各个 table 及其名称,以便用户从列表中选择相关的 table。
我试过提取 tables,然后也从 pdf 中提取文本。这两方面我都成功了,但在将 table 名称与对应的 table 关联起来这一点上却没有成功。
def tables_from_pdfs(filespath):
pdffiles = glob.glob(os.path.join(filespath, "*.pdf"))
print(pdffiles)
dictionary = {}
keys = []
for file in pdffiles:
print(file)
n = PyPDF2.PdfFileReader(open(file, 'rb')).getNumPages()
print(n)
tables_dict = {}
for i in range(n):
tables = camelot.read_pdf(file, pages = str(i))
tables_dict[i] = tables
head, tail = os.path.split(file)
tail = tail.replace(".pdf", "")
keys.append(tail)
dictionary[tail] = tables_dict
return dictionary, keys
预期结果是 table 和 pdf 文件中所述的 table 的名称。例如: Table 在 pdf 名称的第 x 页上:Table 1. Blah Blah blah '''Table'''
Tables 由 camelot API 中的 TableList 和 Table 类表示,相关文档见: https://camelot-py.readthedocs.io/en/master/api.html#camelot.core.TableList
摘自该网页:
底层类(Lower-Level Classes)
Camelot 本身不记录 table 名称,只有单元格数据的描述。 不过它使用了 python 的 pandas DataFrame API,其中或许可以保存 table 名称。
结合使用 Camelot 和 Pandas 得到 table 名称。
Get the name of a pandas DataFrame
附加更新回答
来自 https://camelot-py.readthedocs.io/en/master/
# Interactive session quoted from the Camelot documentation. The lines that
# are not statements (e.g. "<TableList n=1>" and the "{...}" report) appear to
# be the interpreter's echo of the expression directly above them, so this
# snippet is meant to be read, not executed as a script.
import camelot
# Parse 'foo.pdf'; the echo below shows that a TableList with one table came back.
tables = camelot.read_pdf('foo.pdf')
tables
<TableList n=1>
# Export every table in the list at once.
tables.export('foo.csv', f='csv', compress=True) # json, excel, html
tables[0]
<Table shape=(7, 7)>
# Per-table report; note that it has no field for a table name or caption.
tables[0].parsing_report
{
'accuracy': 99.02,
'whitespace': 12.24,
'order': 1,
'page': 1
}
# Export a single table.
tables[0].to_csv('foo.csv') # to_json, to_excel, to_html
df_table = tables[0].df # get a pandas DataFrame!
# Added by the answer: label the DataFrame through an ordinary Python
# attribute. The string is supplied by hand; it is not read from the PDF.
df_table.name = 'name here'
# Adapted from a pandas example: attach a label to a DataFrame by setting an
# ordinary Python attribute on the object.
import numpy as np  # required by np.ones below; the original snippet omitted it
import pandas as pd

df = pd.DataFrame(data=np.ones([4, 4]))
df.name = 'Ones'
# print() is a function in Python 3; the statement form is a SyntaxError there.
print(df.name)
注意:添加的 'name' 属性不是 df 的一部分。序列化df时,添加的name属性丢失
补充回答,'name'属性其实叫'index'.
Getting values
>>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]],
... index=['cobra', 'viper', 'sidewinder'],
... columns=['max_speed', 'shield'])
>>> df
max_speed shield
cobra 1 2
viper 4 5
sidewinder 7 8
Single label. Note this returns the row as a Series.
>>> df.loc['viper']
max_speed 4
shield 5
Name: viper, dtype: int64
我找到了相关的解决方案。至少对我有用。
import os, PyPDF2, time, re, shutil
import pytesseract
from pdf2image import convert_from_path
import camelot
import datefinder
from difflib import SequenceMatcher
def similar(a, b):
    """Return the ``difflib.SequenceMatcher`` ratio (0.0 to 1.0) of *a* and *b*."""
    return SequenceMatcher(None, a, b).ratio()


similarityAmt = 0.6  # find with 60% similarity


def _strip_whitespace(value):
    """Return ``str(value)`` with every whitespace character removed."""
    return "".join(str(value).split())


def _header_text(dataframe):
    """Return the table's header cells joined into one whitespace-free string.

    When the first column label is numeric (for example default integer
    labels), the real header is assumed to be the first data row; otherwise
    the column labels themselves are used. Returns "" when no header exists.
    """
    labels = list(dataframe.keys())
    if not labels:
        return ""
    if str(labels[0]).isdigit():
        if len(dataframe) == 0:
            return ""
        cells = dataframe.iloc[0]
    else:
        cells = labels
    return "".join(_strip_whitespace(cell) for cell in cells)


def _caption_above(lines, row_index, max_lookback=10, min_length=4):
    """Return the caption text found just above ``lines[row_index]``, or None.

    Walks upwards at most *max_lookback* lines and stops at the first line
    longer than *min_length* characters (an arbitrary cut-off that skips
    blank lines). That line is returned with the line before it prepended,
    when such a line exists.
    """
    for offset in range(1, max_lookback + 1):
        idx = row_index - offset
        if idx < 0:
            # Top of the document reached. A negative index would silently
            # wrap around and pick up lines from the end of the text.
            break
        if len(lines[idx]) > min_length:
            preceding = lines[idx - 1] if idx > 0 else ""
            return preceding + lines[idx]
    return None


def find_table_name(dataframe, documentString):
    """Guess the caption of a table from the text of the document it came from.

    The table's header row is located in *documentString* by fuzzy matching
    (ratio above ``similarityAmt``), and the text immediately above the
    matching line is returned as the name.

    Args:
        dataframe: pandas DataFrame holding the extracted table.
        documentString: Text of the document, one text line per line.

    Returns:
        The nearest line above the header that is longer than four
        characters, prefixed (without a separator) by the line before it when
        there is one; "Unnamed" when no header line or caption is found.
    """
    # The header does not depend on the line being inspected, so build it once.
    header = _header_text(dataframe)
    if not header:
        return "Unnamed"
    lines = documentString.split("\n")
    for i, line in enumerate(lines):
        # A table row is horizontal, so compare line by line. Whitespace is
        # dropped on both sides because the header cells are joined without
        # separators.
        candidate = _strip_whitespace(line)
        if not candidate:
            continue
        if similar(candidate, header) > similarityAmt:
            caption = _caption_above(lines, i)
            if caption is not None:
                return caption
    return "Unnamed"
# Retrieve the text from the pdf: render every page to an image first.
# The second positional argument (500) is the rendering resolution passed to
# pdf2image.
pages = convert_from_path(pdf_path, 500) # pdf_path would be the path of the PDF which you extracted the table from
# OCR each page image and join the results, one trailing newline per page, so
# the entire PDF becomes one single string.
pdf_text = "".join(
    pytesseract.image_to_string(imgBlob, lang='eng') + "\n" for imgBlob in pages
)
# Get the name of the table using the table itself and pdf text
tableName = find_table_name(table.df, pdf_text) # A table you extracted with your code, which you want to find the name of