逐页阅读pdf
Read pdf page by page
我搜索了我的问题,但在两个可用问题中没有得到我的答案
Extract text per page with Python pdfMiner?
PDFMiner - Iterating through pages and converting them to text
基本上我想遍历每个页面,因为我只想选出包含特定文本的页面。
我用过 pyPdf。它适用于大约 90% 的 PDF,但有时无法从页面中提取信息。
我使用了下面的代码:
import re

import pyPdf

# Pattern that marks a page as interesting; compiled once, outside the loop.
PAGE_PATTERN = re.compile(r"to be held (at|on)")

matched_pages = []
# The file stays open for the whole loop because the reader pulls page data
# from this stream while pages are being extracted.
with open('filename.pdf', "rb") as pdf_file:
    pdf = pyPdf.PdfFileReader(pdf_file)
    num_of_pages = pdf.getNumPages()
    for p in range(num_of_pages):
        # Use the loop index: a hard-coded page number would re-read the same
        # page on every iteration and never look at the others.
        ex = pdf.getPage(p).extractText()
        if PAGE_PATTERN.search(ex.lower()):
            print('yes')
            print(ex + "\n")
            matched_pages.append(ex)

# Text of every matching page, each one followed by a newline.
extract = "".join(page_text + "\n" for page_text in matched_pages)
以上代码有效,但有时无法提取某些页面。
我也尝试过使用 pdfminer,但我找不到如何用它逐页遍历 PDF;pdfminer 返回的是整个 PDF 的全文。
我使用了下面的代码:
def convert_pdf_to_txt(path):
    """Return the text of every page of the PDF at ``path`` as one string.

    All pages are rendered through a single ``TextConverter`` into one
    in-memory buffer, so the result is the pages' text concatenated in
    document order.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The buffer contents after the last page has been processed.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    try:
        # ``open`` replaces the Python-2-only ``file`` builtin, and the
        # ``with`` block closes the handle even if a page fails to render.
        with open(path, 'rb') as fp:
            # An empty ``pagenos`` set with ``maxpages=0`` selects every page.
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
        # Read the buffer before the ``finally`` clause closes it.
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()
在上面的代码中,PDF 的文本来自下面这个 for 循环:
# Excerpt of the page loop from convert_pdf_to_txt: every processed page is
# written into ``retstr``, so ``text`` is the cumulative text read so far.
page_iter = PDFPage.get_pages(
    fp,
    pagenos,
    maxpages=maxpages,
    password=password,
    caching=caching,
    check_extractable=True,
)
for page in page_iter:
    interpreter.process_page(page)
    text = retstr.getvalue()
在这里,我怎样才能一次只处理一页?
pdfminer 的文档我看不懂,而且它还有很多不同的版本。
那么,有没有其他软件包可以解决我的问题?或者 pdfminer 本身能解决这个问题吗?
我知道回答你自己的问题不好,但我想我可能已经找到了这个问题的答案。
我认为这不是最好的方法,但它仍然对我有帮助。
我结合使用了 pyPdf 和 pdfminer,代码如下:
import pyPdf
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
# ``re`` and ``TextConverter`` are used below but were never imported.
import re

from pdfminer.converter import TextConverter

path = "filename.pdf"
# Marks a page as interesting; compiled once instead of on every page.
page_pattern = re.compile(r"to be held (at|on)")

# The resource manager and layout parameters do not depend on the page, so
# they are built once and shared by every per-page converter.
rsrcmgr = PDFResourceManager()
laparams = LAParams()

matched_pages = []
with open(path, 'rb') as fp:
    # One pass over the document: get_pages already yields the pages one at a
    # time, so pyPdf is not needed for a page count and the file is neither
    # opened twice nor re-scanned from the start for every page.
    for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                  caching=True, check_extractable=True):
        # A fresh buffer and converter per page keep each page's text separate.
        retstr = StringIO()
        device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
        try:
            PDFPageInterpreter(rsrcmgr, device).process_page(page)
            text = retstr.getvalue()
        finally:
            device.close()
            retstr.close()
        # Kept from the original: bytes outside ASCII are replaced rather than
        # decoded, so accented characters are lost in ``text``.
        text = text.decode("ascii", "replace")
        if page_pattern.search(text.lower()):
            print(text)
            matched_pages.append(text)

# Text of every matching page, each one followed by a newline.
extract = "".join(page_text + "\n" for page_text in matched_pages)
可能有更好的方法,但目前我发现这个很好。
由于 retstr 会累积已处理的每一页的文本,您可以考虑在代码中调用 retstr.truncate(0),在处理完每一页后清空缓冲区;否则每次打印出来的都是到目前为止读取的全部内容:
import pyPdf
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
# ``re`` and ``TextConverter`` are used below but were never imported.
import re

from pdfminer.converter import TextConverter

path = "filename.pdf"
# Marks a page as interesting; compiled once instead of on every page.
page_pattern = re.compile(r"to be held (at|on)")

# One buffer, converter and interpreter serve the whole document. The buffer
# is emptied after each page, which is what makes the truncate call matter:
# when a new buffer is created on every iteration, truncating it does nothing.
rsrcmgr = PDFResourceManager()
retstr = StringIO()
device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
interpreter = PDFPageInterpreter(rsrcmgr, device)

matched_pages = []
try:
    with open(path, 'rb') as fp:
        # Single pass: get_pages yields the pages in document order.
        for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)
            text = retstr.getvalue()
            # Reset the shared buffer for the next page. Rewind before
            # truncating so writing resumes at offset 0 even with buffer
            # types whose truncate() leaves the stream position untouched.
            retstr.seek(0)
            retstr.truncate(0)
            # Kept from the original: bytes outside ASCII are replaced rather
            # than decoded, so accented characters are lost in ``text``.
            text = text.decode("ascii", "replace")
            if page_pattern.search(text.lower()):
                print(text)
                matched_pages.append(text)
finally:
    device.close()
    retstr.close()

# Text of every matching page, each one followed by a newline.
extract = "".join(page_text + "\n" for page_text in matched_pages)
您可以参考以下链接中的做法,从 PDF 中逐页提取文本:
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer
# Walk the document one page layout at a time and print the text of every
# text container found on that page, in layout order.
for page_layout in extract_pages("test.pdf"):
    text_boxes = (item for item in page_layout if isinstance(item, LTTextContainer))
    for box in text_boxes:
        print(box.get_text())
我遇到了同样的问题,不过我的需求是:遍历许多文件,只在每个 PDF 的第一页中搜索,并把包含特定单词/表达式的 'partenaire' 提取到 Excel 文件中。我的第一个难题是如何遍历各个文件夹和其中的 PDF 文件。这是我目前的代码:
import glob
import os
import re
from io import StringIO

import pandas as pd
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage

# Expression to look for. Case is ignored (the original compared an upper-case
# pattern with lower-cased text, which can never match) and any whitespace is
# accepted between the words because extracted text may wrap the phrase.
EXPRESSION = re.compile(r"PLAN\s+D'EPARGNE\s+EN\s+ACTIONS", re.IGNORECASE)


def extract_page_texts(path, maxpages=0):
    """Yield the text of the PDF at ``path`` one page at a time.

    Args:
        path: Filesystem path of the PDF to read.
        maxpages: Stop after this many pages; ``0`` means no limit.

    Yields:
        The text of each page, in document order.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    with open(path, "rb") as fp:
        for page in PDFPage.get_pages(fp, maxpages=maxpages, caching=True,
                                      check_extractable=True):
            # A fresh buffer and converter per page keep the pages separate.
            retstr = StringIO()
            device = TextConverter(rsrcmgr, retstr, laparams=laparams)
            try:
                PDFPageInterpreter(rsrcmgr, device).process_page(page)
                yield retstr.getvalue()
            finally:
                device.close()


path_extraction_folder = "C:/Users/xxxxx/yyyyyy/pdf"
list_path = glob.glob(path_extraction_folder + "/**/*.pdf", recursive=True)

# Rename every PDF to "<counter>.pdf" inside its own folder and remember the
# new location: once renamed, the entries of ``list_path`` no longer exist.
renamed_paths = []
for cpt, path in enumerate(list_path):
    new_path = os.path.join(os.path.dirname(path), str(cpt) + ".pdf")
    os.rename(path, new_path)
    renamed_paths.append(new_path)

# Path components below the extraction folder, taken from the original paths:
# first the 'partenaire' folder, then the document-type folder, last the
# original file name.
relative_parts = [
    os.path.relpath(x, path_extraction_folder).split(os.sep) for x in list_path
]
partenaires = [parts[0] for parts in relative_parts]
type_doc = [parts[1] for parts in relative_parts]
nom_fichier = [parts[-1] for parts in relative_parts]

df_files = pd.DataFrame(
    {"partenaire": partenaires, "nom dossier": type_doc, "nom fichier": nom_fichier}
)
df_files.to_excel("./files-tree_.xlsx", index=False)
df_files = pd.read_excel("./files-tree_.xlsx")
# NOTE(review): presumably the identifiers are 14-digit codes whose leading
# zeros can be lost in the Excel round trip -- confirm.
df_files["partenaire"] = df_files["partenaire"].apply(lambda x: str(x).rjust(14, '0'))

# Loop over the 'partenaire' column together with the matching renamed file;
# the rows are assumed to come back from Excel in the order they were written.
matching_partenaires = []
matched_texts = []
for partenaire, pdf_path in zip(df_files["partenaire"], renamed_paths):
    # Only the first page of each PDF is searched.
    for text in extract_page_texts(pdf_path, maxpages=1):
        if EXPRESSION.search(text):
            print(text)
            matching_partenaires.append(partenaire)
            matched_texts.append(text)

# Text of every matching first page, each one followed by a newline.
extract = "".join(text + "\n" for text in matched_texts)
我搜索了我的问题,但在两个可用问题中没有得到我的答案
Extract text per page with Python pdfMiner?
PDFMiner - Iterating through pages and converting them to text
基本上我想遍历每个页面,因为我只想选出包含特定文本的页面。
我用过 pyPdf。它适用于大约 90% 的 PDF,但有时无法从页面中提取信息。
我使用了下面的代码:
import re

import pyPdf

# Pattern that marks a page as interesting; compiled once, outside the loop.
PAGE_PATTERN = re.compile(r"to be held (at|on)")

matched_pages = []
# The file stays open for the whole loop because the reader pulls page data
# from this stream while pages are being extracted.
with open('filename.pdf', "rb") as pdf_file:
    pdf = pyPdf.PdfFileReader(pdf_file)
    num_of_pages = pdf.getNumPages()
    for p in range(num_of_pages):
        # Use the loop index: a hard-coded page number would re-read the same
        # page on every iteration and never look at the others.
        ex = pdf.getPage(p).extractText()
        if PAGE_PATTERN.search(ex.lower()):
            print('yes')
            print(ex + "\n")
            matched_pages.append(ex)

# Text of every matching page, each one followed by a newline.
extract = "".join(page_text + "\n" for page_text in matched_pages)
以上代码有效,但有时无法提取某些页面。
我也尝试过使用 pdfminer,但我找不到如何用它逐页遍历 PDF;pdfminer 返回的是整个 PDF 的全文。
我使用了下面的代码:
def convert_pdf_to_txt(path):
    """Return the text of every page of the PDF at ``path`` as one string.

    All pages are rendered through a single ``TextConverter`` into one
    in-memory buffer, so the result is the pages' text concatenated in
    document order.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The buffer contents after the last page has been processed.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    try:
        # ``open`` replaces the Python-2-only ``file`` builtin, and the
        # ``with`` block closes the handle even if a page fails to render.
        with open(path, 'rb') as fp:
            # An empty ``pagenos`` set with ``maxpages=0`` selects every page.
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
        # Read the buffer before the ``finally`` clause closes it.
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()
在上面的代码中,PDF 的文本来自下面这个 for 循环:
# Excerpt of the page loop from convert_pdf_to_txt: every processed page is
# written into ``retstr``, so ``text`` is the cumulative text read so far.
page_iter = PDFPage.get_pages(
    fp,
    pagenos,
    maxpages=maxpages,
    password=password,
    caching=caching,
    check_extractable=True,
)
for page in page_iter:
    interpreter.process_page(page)
    text = retstr.getvalue()
在这里,我怎样才能一次只处理一页?
pdfminer 的文档我看不懂,而且它还有很多不同的版本。
那么,有没有其他软件包可以解决我的问题?或者 pdfminer 本身能解决这个问题吗?
我知道回答你自己的问题不好,但我想我可能已经找到了这个问题的答案。
我认为这不是最好的方法,但它仍然对我有帮助。
我结合使用了 pyPdf 和 pdfminer,代码如下:
import pyPdf
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
# ``re`` and ``TextConverter`` are used below but were never imported.
import re

from pdfminer.converter import TextConverter

path = "filename.pdf"
# Marks a page as interesting; compiled once instead of on every page.
page_pattern = re.compile(r"to be held (at|on)")

# The resource manager and layout parameters do not depend on the page, so
# they are built once and shared by every per-page converter.
rsrcmgr = PDFResourceManager()
laparams = LAParams()

matched_pages = []
with open(path, 'rb') as fp:
    # One pass over the document: get_pages already yields the pages one at a
    # time, so pyPdf is not needed for a page count and the file is neither
    # opened twice nor re-scanned from the start for every page.
    for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                  caching=True, check_extractable=True):
        # A fresh buffer and converter per page keep each page's text separate.
        retstr = StringIO()
        device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
        try:
            PDFPageInterpreter(rsrcmgr, device).process_page(page)
            text = retstr.getvalue()
        finally:
            device.close()
            retstr.close()
        # Kept from the original: bytes outside ASCII are replaced rather than
        # decoded, so accented characters are lost in ``text``.
        text = text.decode("ascii", "replace")
        if page_pattern.search(text.lower()):
            print(text)
            matched_pages.append(text)

# Text of every matching page, each one followed by a newline.
extract = "".join(page_text + "\n" for page_text in matched_pages)
可能有更好的方法,但目前我发现这个很好。
由于 retstr 会累积已处理的每一页的文本,您可以考虑在代码中调用 retstr.truncate(0),在处理完每一页后清空缓冲区;否则每次打印出来的都是到目前为止读取的全部内容:
import pyPdf
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
# ``re`` and ``TextConverter`` are used below but were never imported.
import re

from pdfminer.converter import TextConverter

path = "filename.pdf"
# Marks a page as interesting; compiled once instead of on every page.
page_pattern = re.compile(r"to be held (at|on)")

# One buffer, converter and interpreter serve the whole document. The buffer
# is emptied after each page, which is what makes the truncate call matter:
# when a new buffer is created on every iteration, truncating it does nothing.
rsrcmgr = PDFResourceManager()
retstr = StringIO()
device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
interpreter = PDFPageInterpreter(rsrcmgr, device)

matched_pages = []
try:
    with open(path, 'rb') as fp:
        # Single pass: get_pages yields the pages in document order.
        for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)
            text = retstr.getvalue()
            # Reset the shared buffer for the next page. Rewind before
            # truncating so writing resumes at offset 0 even with buffer
            # types whose truncate() leaves the stream position untouched.
            retstr.seek(0)
            retstr.truncate(0)
            # Kept from the original: bytes outside ASCII are replaced rather
            # than decoded, so accented characters are lost in ``text``.
            text = text.decode("ascii", "replace")
            if page_pattern.search(text.lower()):
                print(text)
                matched_pages.append(text)
finally:
    device.close()
    retstr.close()

# Text of every matching page, each one followed by a newline.
extract = "".join(page_text + "\n" for page_text in matched_pages)
您可以参考以下链接中的做法,从 PDF 中逐页提取文本:
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer
# Walk the document one page layout at a time and print the text of every
# text container found on that page, in layout order.
for page_layout in extract_pages("test.pdf"):
    text_boxes = (item for item in page_layout if isinstance(item, LTTextContainer))
    for box in text_boxes:
        print(box.get_text())
我遇到了同样的问题,不过我的需求是:遍历许多文件,只在每个 PDF 的第一页中搜索,并把包含特定单词/表达式的 'partenaire' 提取到 Excel 文件中。我的第一个难题是如何遍历各个文件夹和其中的 PDF 文件。这是我目前的代码:
import glob
import os
import re
from io import StringIO

import pandas as pd
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage

# Expression to look for. Case is ignored (the original compared an upper-case
# pattern with lower-cased text, which can never match) and any whitespace is
# accepted between the words because extracted text may wrap the phrase.
EXPRESSION = re.compile(r"PLAN\s+D'EPARGNE\s+EN\s+ACTIONS", re.IGNORECASE)


def extract_page_texts(path, maxpages=0):
    """Yield the text of the PDF at ``path`` one page at a time.

    Args:
        path: Filesystem path of the PDF to read.
        maxpages: Stop after this many pages; ``0`` means no limit.

    Yields:
        The text of each page, in document order.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    with open(path, "rb") as fp:
        for page in PDFPage.get_pages(fp, maxpages=maxpages, caching=True,
                                      check_extractable=True):
            # A fresh buffer and converter per page keep the pages separate.
            retstr = StringIO()
            device = TextConverter(rsrcmgr, retstr, laparams=laparams)
            try:
                PDFPageInterpreter(rsrcmgr, device).process_page(page)
                yield retstr.getvalue()
            finally:
                device.close()


path_extraction_folder = "C:/Users/xxxxx/yyyyyy/pdf"
list_path = glob.glob(path_extraction_folder + "/**/*.pdf", recursive=True)

# Rename every PDF to "<counter>.pdf" inside its own folder and remember the
# new location: once renamed, the entries of ``list_path`` no longer exist.
renamed_paths = []
for cpt, path in enumerate(list_path):
    new_path = os.path.join(os.path.dirname(path), str(cpt) + ".pdf")
    os.rename(path, new_path)
    renamed_paths.append(new_path)

# Path components below the extraction folder, taken from the original paths:
# first the 'partenaire' folder, then the document-type folder, last the
# original file name.
relative_parts = [
    os.path.relpath(x, path_extraction_folder).split(os.sep) for x in list_path
]
partenaires = [parts[0] for parts in relative_parts]
type_doc = [parts[1] for parts in relative_parts]
nom_fichier = [parts[-1] for parts in relative_parts]

df_files = pd.DataFrame(
    {"partenaire": partenaires, "nom dossier": type_doc, "nom fichier": nom_fichier}
)
df_files.to_excel("./files-tree_.xlsx", index=False)
df_files = pd.read_excel("./files-tree_.xlsx")
# NOTE(review): presumably the identifiers are 14-digit codes whose leading
# zeros can be lost in the Excel round trip -- confirm.
df_files["partenaire"] = df_files["partenaire"].apply(lambda x: str(x).rjust(14, '0'))

# Loop over the 'partenaire' column together with the matching renamed file;
# the rows are assumed to come back from Excel in the order they were written.
matching_partenaires = []
matched_texts = []
for partenaire, pdf_path in zip(df_files["partenaire"], renamed_paths):
    # Only the first page of each PDF is searched.
    for text in extract_page_texts(pdf_path, maxpages=1):
        if EXPRESSION.search(text):
            print(text)
            matching_partenaires.append(partenaire)
            matched_texts.append(text)

# Text of every matching first page, each one followed by a newline.
extract = "".join(text + "\n" for text in matched_texts)