使用 python-docx 获取 MS Word 段落的开始(和结束)页面
Get starting (and ending) page of an MS Word paragraph using python-docx
我正在自动创建 MS Word 文档。完成后,我需要能够将其另存为 PDF 并将一些 other/external PDF 页面插入到 Word 文档的 PDF 版本中。为此,我计划在各自页面的 Word 文档中留下标记(例如“[pdfGoesHere]”)。
为了插入/替换新的 PDF 页面,我需要知道这些标记位于哪些页面上。python-docx 是否有办法确定段落开始(和结束)所在的页码?我已经通读了 python-docx 的文档,但似乎没有任何相关内容。我知道可以遍历所有段落并找到我感兴趣的段落,但我找不到可靠的方法来获取该段落的页码。
是否有我忽略了的方法可以做到这一点?如果没有,对于如何实现插入 PDF 页面这一主要目标,还有其他建议吗?
简短的回答是否定的。页码是在渲染时确定的,并且由于可用字体等因素而依赖于设备。
这个答案有更多的细节:
感谢 @scanny 提供的反馈。由于在 python-docx 中没有办法做到这一点,而且我本来就要将文档转换为 PDF,所以我决定在将 Word 文档转换为 PDF 之后,使用 pdfminer 来获取页码。这段代码可能有点长,但它能完成任务:
import re
from cStringIO import StringIO
from xml.sax.saxutils import unescape

from pdfminer.converter import XMLConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
def xmlToLines(xml):
    """Join *xml* into a single string and split it on newline characters.

    *xml* may be a string or any iterable of strings; the pieces are
    concatenated without a separator before splitting. The split is on
    '\\n' only, so an input without a newline yields a one-element list
    and a trailing newline yields a trailing empty string.
    """
    text = ''.join(xml)
    return text.split('\n')
# Convert the PDF found at 'path' into a list of XML lines.
# 'path' is the full path to the PDF file to read from.
def convert_pdf_to_xml(path):
    """Run pdfminer's XMLConverter over every page of a PDF.

    Args:
        path: Path of the PDF file to read (opened in binary mode).

    Returns:
        The XML text produced while processing the pages, split on
        newlines by ``xmlToLines``. The text is captured before the
        converter is closed, so anything the converter writes on close
        is not included.

    Side effects:
        Prints a progress message naming the file being converted.

    NOTE(review): the output buffer is the ``StringIO`` imported at file
    level from ``cStringIO``, which is Python 2 only -- confirm the target
    runtime before porting; a Python 3 pdfminer build may need a bytes
    buffer here.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = XMLConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        # 'with' guarantees the PDF handle is released even if a page fails.
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            print('Converting following file from PDF to XML: \n - ' + str(path))
            # An empty page set with maxpages=0 is passed through unchanged
            # from the original call; every page yielded is processed.
            pages = PDFPage.get_pages(
                fp,
                set(),
                maxpages=0,
                password="",
                caching=True,
                check_extractable=True,
            )
            for page in pages:
                interpreter.process_page(page)
        # Read the buffer before closing the device and the buffer itself.
        text = retstr.getvalue()
    finally:
        # Close the converter first: it still holds a reference to retstr.
        device.close()
        retstr.close()
    return xmlToLines(text)
def _pages_with_field_in_xml_lines(lines, field):
    """Return the sorted, de-duplicated page ids whose text contains *field*.

    *lines* is a sequence of XML lines. Three kinds of line are recognised:

    * a line containing ``page id="N"`` sets the current page to ``N``;
    * a line containing ``<text font=...>X</text>`` appends the unescaped
      content ``X`` to the text collected so far;
    * a line that is exactly ``<text>`` is treated as the end of a text
      line: the collected text is searched for *field* and then reset.

    All other lines are ignored.
    """
    page_re = re.compile(r'page id="([0-9]+)"')
    char_re = re.compile(r'<text font=[^>]*>([^\r\n]*)</text>')
    # unescape() handles &amp;, &lt; and &gt; itself; quotes must be listed.
    extra_entities = {'&quot;': '"', '&apos;': "'", '&#x27;': "'"}

    pages_found = set()
    page = None
    pending = []

    for line in lines:
        page_match = page_re.search(line)
        if page_match:
            page = int(page_match.group(1))
        elif line == '<text>':
            # End of a text line: test what was collected, then start over.
            if page is not None and field in ''.join(pending):
                pages_found.add(page)
            pending = []
        else:
            char_match = char_re.search(line)
            if char_match:
                # Keep the whole content (not just its first character) so
                # multi-byte characters and entities compare correctly.
                pending.append(unescape(char_match.group(1), extra_entities))

    # Text still pending when the input ends has not been checked yet.
    if page is not None and field in ''.join(pending):
        pages_found.add(page)

    return sorted(pages_found)


# Returns a list of every page number where the field marker is found in the PDF.
def getPagesWithField(wordPdfPath, field):
    """Find the pages of a PDF on which the marker text *field* appears.

    Args:
        wordPdfPath: Path of the PDF file; it is converted to XML lines
            with ``convert_pdf_to_xml``.
        field: Marker text to look for, given as plain (unescaped) text.

    Returns:
        A sorted list of unique page numbers, taken from the ``page id``
        attributes of the XML, on which *field* was found within a single
        text line.
    """
    lines = convert_pdf_to_xml(wordPdfPath)
    return _pages_with_field_in_xml_lines(lines, field)
此后,可以使用 PyPDF2 轻松完成 PDF 页面的插入/合并。
我正在自动创建 MS Word 文档。完成后,我需要能够将其另存为 PDF 并将一些 other/external PDF 页面插入到 Word 文档的 PDF 版本中。为此,我计划在各自页面的 Word 文档中留下标记(例如“[pdfGoesHere]”)。
为了插入/替换新的 PDF 页面,我需要知道这些标记位于哪些页面上。python-docx 是否有办法确定段落开始(和结束)所在的页码?我已经通读了 python-docx 的文档,但似乎没有任何相关内容。我知道可以遍历所有段落并找到我感兴趣的段落,但我找不到可靠的方法来获取该段落的页码。
是否有我忽略了的方法可以做到这一点?如果没有,对于如何实现插入 PDF 页面这一主要目标,还有其他建议吗?
简短的回答是否定的。页码是在渲染时确定的,并且由于可用字体等因素而依赖于设备。
这个答案有更多的细节:
感谢 @scanny 提供的反馈。由于在 python-docx 中没有办法做到这一点,而且我本来就要将文档转换为 PDF,所以我决定在将 Word 文档转换为 PDF 之后,使用 pdfminer 来获取页码。这段代码可能有点长,但它能完成任务:
import re
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
def xmlToLines(xml):
    """Join *xml* into a single string and split it on newline characters.

    *xml* may be a string or any iterable of strings; the pieces are
    concatenated without a separator before splitting. The split is on
    '\\n' only, so an input without a newline yields a one-element list
    and a trailing newline yields a trailing empty string.
    """
    text = ''.join(xml)
    return text.split('\n')
# Convert the PDF found at 'path' into a list of XML lines.
# 'path' is the full path to the PDF file to read from.
def convert_pdf_to_xml(path):
    """Run pdfminer's XMLConverter over every page of a PDF.

    Args:
        path: Path of the PDF file to read (opened in binary mode).

    Returns:
        The XML text produced while processing the pages, split on
        newlines by ``xmlToLines``. The text is captured before the
        converter is closed, so anything the converter writes on close
        is not included.

    Side effects:
        Prints a progress message naming the file being converted.

    NOTE(review): the output buffer is the ``StringIO`` imported at file
    level from ``cStringIO``, which is Python 2 only -- confirm the target
    runtime before porting; a Python 3 pdfminer build may need a bytes
    buffer here.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = XMLConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        # 'with' guarantees the PDF handle is released even if a page fails.
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            print('Converting following file from PDF to XML: \n - ' + str(path))
            # An empty page set with maxpages=0 is passed through unchanged
            # from the original call; every page yielded is processed.
            pages = PDFPage.get_pages(
                fp,
                set(),
                maxpages=0,
                password="",
                caching=True,
                check_extractable=True,
            )
            for page in pages:
                interpreter.process_page(page)
        # Read the buffer before closing the device and the buffer itself.
        text = retstr.getvalue()
    finally:
        # Close the converter first: it still holds a reference to retstr.
        device.close()
        retstr.close()
    return xmlToLines(text)
def _pages_with_field_in_xml_lines(lines, field):
    """Return the sorted, de-duplicated page ids whose text contains *field*.

    *lines* is a sequence of XML lines. Three kinds of line are recognised:

    * a line containing ``page id="N"`` sets the current page to ``N``;
    * a line containing ``<text font=...>X</text>`` appends the unescaped
      content ``X`` to the text collected so far;
    * a line that is exactly ``<text>`` is treated as the end of a text
      line: the collected text is searched for *field* and then reset.

    All other lines are ignored.
    """
    page_re = re.compile(r'page id="([0-9]+)"')
    char_re = re.compile(r'<text font=[^>]*>([^\r\n]*)</text>')
    # unescape() handles &amp;, &lt; and &gt; itself; quotes must be listed.
    extra_entities = {'&quot;': '"', '&apos;': "'", '&#x27;': "'"}

    pages_found = set()
    page = None
    pending = []

    for line in lines:
        page_match = page_re.search(line)
        if page_match:
            page = int(page_match.group(1))
        elif line == '<text>':
            # End of a text line: test what was collected, then start over.
            if page is not None and field in ''.join(pending):
                pages_found.add(page)
            pending = []
        else:
            char_match = char_re.search(line)
            if char_match:
                # Keep the whole content (not just its first character) so
                # multi-byte characters and entities compare correctly.
                pending.append(unescape(char_match.group(1), extra_entities))

    # Text still pending when the input ends has not been checked yet.
    if page is not None and field in ''.join(pending):
        pages_found.add(page)

    return sorted(pages_found)


# Returns a list of every page number where the field marker is found in the PDF.
def getPagesWithField(wordPdfPath, field):
    """Find the pages of a PDF on which the marker text *field* appears.

    Args:
        wordPdfPath: Path of the PDF file; it is converted to XML lines
            with ``convert_pdf_to_xml``.
        field: Marker text to look for, given as plain (unescaped) text.

    Returns:
        A sorted list of unique page numbers, taken from the ``page id``
        attributes of the XML, on which *field* was found within a single
        text line.
    """
    lines = convert_pdf_to_xml(wordPdfPath)
    return _pages_with_field_in_xml_lines(lines, field)
此后,可以使用 PyPDF2 轻松完成 PDF 页面的插入/合并。