从 PDF 文档中提取特定片段
Extracting specific segments from PDF document
我有几篇 pdf 格式的研究论文,我只想从论文中提取 introduction/background 等内容。另外,我只能使用python。有人可以帮忙吗?
几周前,我在这里得到了类似的帮助。使用 PDF 文件可能很容易,也可能非常困难,而且 PDF 文件种类繁多。话虽如此,您应该考虑将所有 PDF 文件转换为文本文件。试试下面的代码示例。
首先,将 PDF 转换为文本。
from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import os
import sys, getopt
#converts pdf, returns its text content as a string
def convert(fname, pages=None):
if not pages:
pagenums = set()
else:
pagenums = set(pages)
output = io.StringIO()
manager = PDFResourceManager()
converter = TextConverter(manager, output, laparams=LAParams())
interpreter = PDFPageInterpreter(manager, converter)
infile = open(fname, 'rb')
for page in PDFPage.get_pages(infile, pagenums):
interpreter.process_page(page)
infile.close()
converter.close()
text = output.getvalue()
output.close
return text
#converts all pdfs in directory pdfDir, saves all resulting txt files to txtdir
def convertMultiple(pdfDir, txtDir):
if pdfDir == "": pdfDir = os.getcwd() + "\" #if no pdfDir passed in
for pdf in os.listdir(pdfDir): #iterate through pdfs in pdf directory
fileExtension = pdf.split(".")[-1]
if fileExtension == "pdf":
pdfFilename = pdfDir + pdf
text = convert(pdfFilename) #get string of text content of pdf
textFilename = txtDir + pdf + ".txt"
textFile = open(textFilename, "w") #make text file
textFile.write(text) #write text to text file
# set paths accordingly:
pdfDir = "C:/your_path_here/PDF_in/"
txtDir = "C:/your_path_here/TEXT_out/"
convertMultiple(pdfDir, txtDir)
其次,查找开始标记 ("New York State Real Property Law") 和结束标记 ("common elements of the property.") 之间的所有文本。
# Loop through all TEXT files in a folder
# Pull out all text between two anchors: "New York State Real Property Law" & "common elements of the property."
import re
import os
myRegex=re.compile("New York State Real Property Law.*?common elements of the property\.",re.DOTALL)
for foldername,subfolders,files in os.walk(r"C:/your_path_here/text_files/"):
for file in files:
print(file)
object=open(os.path.join(foldername,file))
Text=object.read()
for subText in myRegex.findall(Text):
print(subText)
object.close()
也许您可以在不将 PDF 转换为文本文件的情况下完成所有工作,但我还没有找到任何方法。
我有几篇 pdf 格式的研究论文,我只想从论文中提取 introduction/background 等内容。另外,我只能使用python。有人可以帮忙吗?
几周前,我在这里得到了类似的帮助。使用 PDF 文件可能很容易,也可能非常困难,而且 PDF 文件种类繁多。话虽如此,您应该考虑将所有 PDF 文件转换为文本文件。试试下面的代码示例。
首先,将 PDF 转换为文本。
from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import os
import sys, getopt
#converts pdf, returns its text content as a string
def convert(fname, pages=None):
if not pages:
pagenums = set()
else:
pagenums = set(pages)
output = io.StringIO()
manager = PDFResourceManager()
converter = TextConverter(manager, output, laparams=LAParams())
interpreter = PDFPageInterpreter(manager, converter)
infile = open(fname, 'rb')
for page in PDFPage.get_pages(infile, pagenums):
interpreter.process_page(page)
infile.close()
converter.close()
text = output.getvalue()
output.close
return text
#converts all pdfs in directory pdfDir, saves all resulting txt files to txtdir
def convertMultiple(pdfDir, txtDir):
if pdfDir == "": pdfDir = os.getcwd() + "\" #if no pdfDir passed in
for pdf in os.listdir(pdfDir): #iterate through pdfs in pdf directory
fileExtension = pdf.split(".")[-1]
if fileExtension == "pdf":
pdfFilename = pdfDir + pdf
text = convert(pdfFilename) #get string of text content of pdf
textFilename = txtDir + pdf + ".txt"
textFile = open(textFilename, "w") #make text file
textFile.write(text) #write text to text file
# set paths accordingly:
pdfDir = "C:/your_path_here/PDF_in/"
txtDir = "C:/your_path_here/TEXT_out/"
convertMultiple(pdfDir, txtDir)
其次,查找开始标记 ("New York State Real Property Law") 和结束标记 ("common elements of the property.") 之间的所有文本。
# Loop through all TEXT files in a folder
# Pull out all text between two anchors: "New York State Real Property Law" & "common elements of the property."
import re
import os
myRegex=re.compile("New York State Real Property Law.*?common elements of the property\.",re.DOTALL)
for foldername,subfolders,files in os.walk(r"C:/your_path_here/text_files/"):
for file in files:
print(file)
object=open(os.path.join(foldername,file))
Text=object.read()
for subText in myRegex.findall(Text):
print(subText)
object.close()
也许您可以在不将 PDF 转换为文本文件的情况下完成所有工作,但我还没有找到任何方法。