使用 pdfplumber 从 pdf 文件中提取文本
Extract text from pdf file using pdfplumber
我想从 pdf 文件中提取文本,尝试过:
# Asker's original attempt, reproduced as posted with its block structure restored.
# The logic is intentionally left unchanged: the output and traceback quoted
# afterwards were produced by exactly this code.
directory = r'C:\Users\foo\folder'

for x in os.listdir(directory):
    print(x)
    # NOTE: removing '.pdf' means x no longer matches the name of the file on disk.
    x = x.replace('.pdf','')
    filename = os.fsdecode(x)
    print(x)

    # NOTE: for a name such as 'foo.pdf' this test is False, because the
    # extension was already stripped from x above.
    if filename.endswith('.pdf'):
        # x is a bare name without the directory part.
        with pdfplumber.open(x) as pdf1:
            page1 = pdf1.pages[0]
            text1 = page1.extract_text()
            print(text1)
并打印:
20170213091544343.pdf
20170213091544343
看到文件名变成了 20170213091544343(不带扩展名),我又补充了:
else:
with pdfplumber.open(x) as pdf1:
page1 = pdf1.pages[0]
text1 = page1.extract_text()
print(text1)
以便在文件名不带 .pdf 时也能读取文件,
结果得到了错误:
20170213091544343.pdf
20170213091544343
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
<ipython-input-34-e370b214f9ba> in <module>
16
17 else:
---> 18 with pdfplumber.open(x) as pdf1:
19 page1 = pdf1.pages[0]
20 text1 = page1.extract_text()
C:\Python38\lib\site-packages\pdfplumber\pdf.py in open(cls, path_or_fp, **kwargs)
56 def open(cls, path_or_fp, **kwargs):
57 if isinstance(path_or_fp, (str, pathlib.Path)):
---> 58 fp = open(path_or_fp, "rb")
59 inst = cls(fp, **kwargs)
60 inst.close = fp.close
FileNotFoundError: [Errno 2] No such file or directory: '20170213091544343'
os.listdir() 只返回文件名(filename),不包含目录路径。
你必须用 os.path.join() 把 directory 拼接到文件名前面:
for filename in os.listdir(directory):
    # os.listdir() yields bare names; join them with the directory to get
    # a path that can actually be opened.
    fullpath = os.path.join(directory, filename)
    #print(fullpath)
而且你必须保留扩展名 .pdf:
import os
import pdfplumber
directory = r'C:\Users\foo\folder'

# Print the extracted text of every page of each '.pdf' file listed in `directory`.
for filename in os.listdir(directory):
    # Keep the extension: it is part of the real file name on disk.
    if filename.endswith('.pdf'):
        # os.listdir() returns bare names, so build the full path before opening.
        fullpath = os.path.join(directory, filename)
        #print(fullpath)

        #all_text = ""

        with pdfplumber.open(fullpath) as pdf:
            for page in pdf.pages:
                text = page.extract_text()
                print(text)
                #all_text += text

        #print(all_text)
或者同时打印页码:
with pdfplumber.open(fullpath) as pdf:
    # enumerate(..., 1) numbers the pages starting from 1 instead of 0.
    for number, page in enumerate(pdf.pages, 1):
        print('--- page', number, '---')
        text = page.extract_text()
        print(text)
我想从 pdf 文件中提取文本,尝试过:
# Asker's original attempt, reproduced as posted with its block structure restored.
# The logic is intentionally left unchanged: the output and traceback quoted
# afterwards were produced by exactly this code.
directory = r'C:\Users\foo\folder'

for x in os.listdir(directory):
    print(x)
    # NOTE: removing '.pdf' means x no longer matches the name of the file on disk.
    x = x.replace('.pdf','')
    filename = os.fsdecode(x)
    print(x)

    # NOTE: for a name such as 'foo.pdf' this test is False, because the
    # extension was already stripped from x above.
    if filename.endswith('.pdf'):
        # x is a bare name without the directory part.
        with pdfplumber.open(x) as pdf1:
            page1 = pdf1.pages[0]
            text1 = page1.extract_text()
            print(text1)
并打印:
20170213091544343.pdf
20170213091544343
看到文件名变成了 20170213091544343(不带扩展名),我又补充了:
else:
with pdfplumber.open(x) as pdf1:
page1 = pdf1.pages[0]
text1 = page1.extract_text()
print(text1)
以便在文件名不带 .pdf 时也能读取文件,
结果得到了错误:
20170213091544343.pdf
20170213091544343
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
<ipython-input-34-e370b214f9ba> in <module>
16
17 else:
---> 18 with pdfplumber.open(x) as pdf1:
19 page1 = pdf1.pages[0]
20 text1 = page1.extract_text()
C:\Python38\lib\site-packages\pdfplumber\pdf.py in open(cls, path_or_fp, **kwargs)
56 def open(cls, path_or_fp, **kwargs):
57 if isinstance(path_or_fp, (str, pathlib.Path)):
---> 58 fp = open(path_or_fp, "rb")
59 inst = cls(fp, **kwargs)
60 inst.close = fp.close
FileNotFoundError: [Errno 2] No such file or directory: '20170213091544343'
os.listdir() 只返回文件名(filename),不包含目录路径。
你必须用 os.path.join() 把 directory 拼接到文件名前面:
for filename in os.listdir(directory):
    # os.listdir() yields bare names; join them with the directory to get
    # a path that can actually be opened.
    fullpath = os.path.join(directory, filename)
    #print(fullpath)
而且你必须保留扩展名 .pdf:
import os
import pdfplumber
directory = r'C:\Users\foo\folder'

# Print the extracted text of every page of each '.pdf' file listed in `directory`.
for filename in os.listdir(directory):
    # Keep the extension: it is part of the real file name on disk.
    if filename.endswith('.pdf'):
        # os.listdir() returns bare names, so build the full path before opening.
        fullpath = os.path.join(directory, filename)
        #print(fullpath)

        #all_text = ""

        with pdfplumber.open(fullpath) as pdf:
            for page in pdf.pages:
                text = page.extract_text()
                print(text)
                #all_text += text

        #print(all_text)
或者同时打印页码:
with pdfplumber.open(fullpath) as pdf:
    # enumerate(..., 1) numbers the pages starting from 1 instead of 0.
    for number, page in enumerate(pdf.pages, 1):
        print('--- page', number, '---')
        text = page.extract_text()
        print(text)