Python - 从 docx 文件中删除页眉和页脚
Python - Remove header and footer from docx file
我需要删除许多 docx 文件中的页眉和页脚。我目前正在尝试使用 python-docx 库,但目前它不支持 docx 文档中的页眉和页脚(正在进行中)。
在 Python 中有什么方法可以实现吗?
据我了解,docx 是一种基于 xml 的格式,但我不知道如何使用它。
P.S. 我曾想过用 lxml 或 BeautifulSoup 解析 xml 并替换其中的某些部分,但这种做法看起来不够干净。
更新:感谢 Shawn,这是一个很好的起点。我对脚本做了一些修改,下面是我的最终版本(它对我很有用,因为我需要编辑许多 .docx 文件)。我使用 BeautifulSoup,是因为标准 xml 解析器无法得到有效的 xml 树。此外,我的 docx 文档在 xml 中并没有真正的页眉和页脚,只是把页眉和页脚的图像放在了页面顶部。另外,为了提高速度,您可以使用 lxml 代替 BeautifulSoup。
import zipfile
import shutil as su
import os
import tempfile
from bs4 import BeautifulSoup
def get_xml_from_docx(docx_filename):
    """Read ``word/document.xml`` out of a docx archive and return its content."""
    archive = zipfile.ZipFile(docx_filename)
    try:
        # The main document part always lives at this fixed member name.
        return archive.read('word/document.xml')
    finally:
        archive.close()
def write_and_close_docx(self, edited_xml, output_filename):
    """Write a copy of a docx archive with ``word/document.xml`` replaced.

    The source archive is expanded into a temporary directory, the main
    document part is overwritten with ``str(edited_xml)``, and every member
    of the source archive is zipped into ``output_filename``.

    Args:
        self: Path (or binary file-like object) of the source docx.  The
            name is historical; this is a plain function, not a method.
        edited_xml: Object whose ``str()`` is the new document XML (for
            example a BeautifulSoup tree).
        output_filename: Path of the docx file to create.
    """
    tmp_dir = tempfile.mkdtemp()
    try:
        with zipfile.ZipFile(self) as zf:
            zf.extractall(tmp_dir)
            # Keep the member list (and its order) of the original archive.
            filenames = zf.namelist()

        # Write explicitly as UTF-8 instead of the platform default encoding,
        # which can mangle or reject non-ASCII text on some systems.
        document_path = os.path.join(tmp_dir, 'word/document.xml')
        with open(document_path, 'w', encoding='utf-8') as f:
            f.write(str(edited_xml))

        # The context manager closes the archive; closing is what writes the
        # zip's central directory, so an unclosed archive may be unreadable.
        # ZIP_DEFLATED compresses members (zipfile's default stores them raw).
        with zipfile.ZipFile(output_filename, 'w', zipfile.ZIP_DEFLATED) as docx:
            for filename in filenames:
                docx.write(os.path.join(tmp_dir, filename), filename)
    finally:
        # Clean up the temp dir even when one of the steps above failed.
        su.rmtree(tmp_dir)
if __name__ == '__main__':
    # Batch driver: for every .docx in `directory`, remove the elements that
    # wrap header/footer image shapes and save an "-edited" copy.
    directory = 'your_directory/'
    output_directory = 'edited/'
    # The output folder has to exist before the new archives can be created.
    os.makedirs(output_directory, exist_ok=True)

    for name in os.listdir(directory):
        if not name.endswith('.docx'):
            continue
        word_doc = os.path.join(directory, name)
        # splitext removes only the extension.  str.rstrip('.docx') strips any
        # trailing '.', 'd', 'o', 'c' or 'x' characters, so 'mydoc.docx'
        # would have been shortened to 'my'.
        stem = os.path.splitext(name)[0]
        new_word_doc = os.path.join(output_directory, stem + '-edited.docx')

        soup = BeautifulSoup(get_xml_from_docx(word_doc), 'xml')
        for shape in soup.find_all('shape'):
            # .get() returns None for shapes that carry no style attribute.
            style = shape.get('style') or ''
            if 'margin-left:0pt' in style:
                shape.parent.decompose()
        write_and_close_docx(word_doc, soup, new_word_doc)
所以,就是这样 :) 我知道,代码不干净,很抱歉。
好吧,我以前从来没有考虑过这个问题,不过我刚刚创建了一个带有页眉和页脚的 test.docx。有了这个 docx 之后,就可以将它解压(unzip),得到构成文档的各个 XML 文件。对于我的简单测试用例,解压结果如下:
word/
_rels footer1.xml styles.xml
document.xml footnotes.xml stylesWithEffects.xml
endnotes.xml header1.xml theme
fontTable.xml settings.xml webSettings.xml
打开 word/document.xml 就可以找到主要的问题区域。您可以看到其中包含与页眉和页脚相关的元素。在我的简单案例中,我得到了:
<w:headerReference w:type="default" r:id="rId7"/>
<w:footerReference w:type="default" r:id="rId8"/>
和
<w:pgMar w:top="1440" w:right="1800" w:bottom="1440" w:left="1800" w:header="720" w:footer="720" w:gutter="0"/>
整个文档实际上非常小,如下所示:
<w:document xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:mo="http://schemas.microsoft.com/office/mac/office/2008/main" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:mv="urn:schemas-microsoft-com:mac:vml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" mc:Ignorable="w14 wp14">
<w:body>
<w:p w:rsidR="009E6E8F" w:rsidRDefault="009E6E8F"/>
<w:p w:rsidR="00B53FFA" w:rsidRDefault="00B53FFA"/>
<w:p w:rsidR="00B53FFA" w:rsidRDefault="00B53FFA"/><w:p w:rsidR="00B53FFA" w:rsidRDefault="00B53FFA">
<w:r>
<w:t>MY BODY</w:t>
</w:r>
<w:bookmarkStart w:id="0" w:name="_GoBack"/>
<w:bookmarkEnd w:id="0"/>
</w:p>
<w:sectPr w:rsidR="00B53FFA" w:rsidSect="009E6E8F">
<w:headerReference w:type="default" r:id="rId7"/>
<w:footerReference w:type="default" r:id="rId8"/>
<w:pgSz w:w="12240" w:h="15840"/>
<w:pgMar w:top="1440" w:right="1800" w:bottom="1440" w:left="1800" w:header="720" w:footer="720" w:gutter="0"/>
因此,对于这种规模的文档,无论从功能还是性能上看,操作 XML 都不成问题。下面是一些代码,可以把您的文档读入 Python、解析为 xml 树,并重新保存为 docx。我现在得出门了,所以这不是完整的解决方案,但我认为它应该能帮您顺利起步。如果您仍然遇到问题,我稍后会回来看看您的情况。
import zipfile
import shutil as su
import os
import tempfile
import xml.etree.cElementTree
def get_word_xml(docx_filename):
    """Return the content of ``word/document.xml`` from a docx archive.

    Args:
        docx_filename: Path (or binary file-like object) of the docx file.

    Returns:
        The undecoded bytes of the main document part.

    Raises:
        KeyError: If the archive has no ``word/document.xml`` member.
        zipfile.BadZipFile: If the file is not a valid zip archive.
    """
    # A docx is a zip archive, i.e. binary data.  ZipFile opens the path in
    # binary mode itself; wrapping it in a text-mode open() ('rt') breaks on
    # Python 3.  The context manager also closes the archive.
    with zipfile.ZipFile(docx_filename) as archive:
        return archive.read('word/document.xml')
def write_and_close_docx(self, xml_content, output_filename):
    """Write a copy of a docx archive with ``word/document.xml`` replaced.

    The source archive is expanded into a temporary directory, the main
    document part is overwritten with ``xml_content``, and every member of
    the source archive is zipped into ``output_filename``.

    Args:
        self: Path (or binary file-like object) of the source docx, or an
            object exposing an already opened archive as ``self.zipfile``.
        xml_content: The new document XML.  Accepts an ElementTree-like
            object (has ``getroot`` and ``write``), an
            ``xml.etree.ElementTree.Element``, ``bytes``, or anything whose
            ``str()`` is the XML text.
        output_filename: Path of the docx file to create.
    """
    import xml.etree.ElementTree as ET

    tmp_dir = tempfile.mkdtemp()
    try:
        opened = getattr(self, 'zipfile', None)
        if opened is not None:
            # Caller handed over an object that owns an open archive.
            opened.extractall(tmp_dir)
            filenames = opened.namelist()
        else:
            # Plain path: open the archive ourselves and close it again.
            with zipfile.ZipFile(self) as source:
                source.extractall(tmp_dir)
                filenames = source.namelist()

        document_path = os.path.join(tmp_dir, 'word/document.xml')
        if hasattr(xml_content, 'getroot') and hasattr(xml_content, 'write'):
            # NOTE(review): the stdlib serializer invents namespace prefixes
            # (ns0, ns1, ...) unless they were registered beforehand with
            # ET.register_namespace -- confirm Word still opens the result.
            xml_content.write(document_path, encoding='UTF-8',
                              xml_declaration=True)
        elif isinstance(xml_content, ET.Element):
            ET.ElementTree(xml_content).write(document_path, encoding='UTF-8',
                                              xml_declaration=True)
        elif isinstance(xml_content, (bytes, bytearray)):
            with open(document_path, 'wb') as f:
                f.write(xml_content)
        else:
            with open(document_path, 'w', encoding='utf-8') as f:
                f.write(str(xml_content))

        # Re-create the archive with the same members, in the same order.
        with zipfile.ZipFile(output_filename, 'w') as docx:
            for filename in filenames:
                docx.write(os.path.join(tmp_dir, filename), filename)
    finally:
        # Clean up the temp dir even when one of the steps above failed.
        su.rmtree(tmp_dir)
def get_xml_tree(f):
    """Parse XML into an ``xml.etree.ElementTree.ElementTree``.

    Args:
        f: Either a filename / file object (as accepted by
            ``ElementTree.parse``) or the raw XML document as ``bytes``,
            such as the value returned by ``ZipFile.read``.

    Returns:
        The parsed ElementTree.
    """
    import io
    import xml.etree.ElementTree as ET

    # parse() treats a bytes argument as a *filename*.  Raw XML content is
    # recognised by its leading '<' (after an optional BOM / whitespace) and
    # wrapped in a file object so it gets parsed instead of opened.
    if isinstance(f, (bytes, bytearray)):
        if bytes(f).lstrip(b'\xef\xbb\xbf \t\r\n').startswith(b'<'):
            return ET.parse(io.BytesIO(f))
    return ET.parse(f)
# Example driver: read document.xml out of TEXT.docx, parse it, and write the
# (unmodified) tree back out as SLIM.docx.
# NOTE(review): these statements run at import time; there is no
# `if __name__ == '__main__':` guard around them.
word_doc = 'TEXT.docx'
new_word_doc = 'SLIM.docx'
doc = get_word_xml(word_doc)
tree = get_xml_tree(doc)
write_and_close_docx(word_doc, tree, new_word_doc)
我需要删除许多 docx 文件中的页眉和页脚。我目前正在尝试使用 python-docx 库,但目前它不支持 docx 文档中的页眉和页脚(正在进行中)。
在 Python 中有什么方法可以实现吗?
据我了解,docx 是一种基于 xml 的格式,但我不知道如何使用它。
P.S. 我曾想过用 lxml 或 BeautifulSoup 解析 xml 并替换其中的某些部分,但这种做法看起来不够干净。
更新:感谢 Shawn,这是一个很好的起点。我对脚本做了一些修改,下面是我的最终版本(它对我很有用,因为我需要编辑许多 .docx 文件)。我使用 BeautifulSoup,是因为标准 xml 解析器无法得到有效的 xml 树。此外,我的 docx 文档在 xml 中并没有真正的页眉和页脚,只是把页眉和页脚的图像放在了页面顶部。另外,为了提高速度,您可以使用 lxml 代替 BeautifulSoup。
import zipfile
import shutil as su
import os
import tempfile
from bs4 import BeautifulSoup
def get_xml_from_docx(docx_filename):
    """Read ``word/document.xml`` out of a docx archive and return its content."""
    archive = zipfile.ZipFile(docx_filename)
    try:
        # The main document part always lives at this fixed member name.
        return archive.read('word/document.xml')
    finally:
        archive.close()
def write_and_close_docx(self, edited_xml, output_filename):
    """Write a copy of a docx archive with ``word/document.xml`` replaced.

    The source archive is expanded into a temporary directory, the main
    document part is overwritten with ``str(edited_xml)``, and every member
    of the source archive is zipped into ``output_filename``.

    Args:
        self: Path (or binary file-like object) of the source docx.  The
            name is historical; this is a plain function, not a method.
        edited_xml: Object whose ``str()`` is the new document XML (for
            example a BeautifulSoup tree).
        output_filename: Path of the docx file to create.
    """
    tmp_dir = tempfile.mkdtemp()
    try:
        with zipfile.ZipFile(self) as zf:
            zf.extractall(tmp_dir)
            # Keep the member list (and its order) of the original archive.
            filenames = zf.namelist()

        # Write explicitly as UTF-8 instead of the platform default encoding,
        # which can mangle or reject non-ASCII text on some systems.
        document_path = os.path.join(tmp_dir, 'word/document.xml')
        with open(document_path, 'w', encoding='utf-8') as f:
            f.write(str(edited_xml))

        # The context manager closes the archive; closing is what writes the
        # zip's central directory, so an unclosed archive may be unreadable.
        # ZIP_DEFLATED compresses members (zipfile's default stores them raw).
        with zipfile.ZipFile(output_filename, 'w', zipfile.ZIP_DEFLATED) as docx:
            for filename in filenames:
                docx.write(os.path.join(tmp_dir, filename), filename)
    finally:
        # Clean up the temp dir even when one of the steps above failed.
        su.rmtree(tmp_dir)
if __name__ == '__main__':
    # Batch driver: for every .docx in `directory`, remove the elements that
    # wrap header/footer image shapes and save an "-edited" copy.
    directory = 'your_directory/'
    output_directory = 'edited/'
    # The output folder has to exist before the new archives can be created.
    os.makedirs(output_directory, exist_ok=True)

    for name in os.listdir(directory):
        if not name.endswith('.docx'):
            continue
        word_doc = os.path.join(directory, name)
        # splitext removes only the extension.  str.rstrip('.docx') strips any
        # trailing '.', 'd', 'o', 'c' or 'x' characters, so 'mydoc.docx'
        # would have been shortened to 'my'.
        stem = os.path.splitext(name)[0]
        new_word_doc = os.path.join(output_directory, stem + '-edited.docx')

        soup = BeautifulSoup(get_xml_from_docx(word_doc), 'xml')
        for shape in soup.find_all('shape'):
            # .get() returns None for shapes that carry no style attribute.
            style = shape.get('style') or ''
            if 'margin-left:0pt' in style:
                shape.parent.decompose()
        write_and_close_docx(word_doc, soup, new_word_doc)
所以,就是这样 :) 我知道,代码不干净,很抱歉。
好吧,我以前从来没有考虑过这个问题,不过我刚刚创建了一个带有页眉和页脚的 test.docx。有了这个 docx 之后,就可以将它解压(unzip),得到构成文档的各个 XML 文件。对于我的简单测试用例,解压结果如下:
word/
_rels footer1.xml styles.xml
document.xml footnotes.xml stylesWithEffects.xml
endnotes.xml header1.xml theme
fontTable.xml settings.xml webSettings.xml
打开 word/document.xml 就可以找到主要的问题区域。您可以看到其中包含与页眉和页脚相关的元素。在我的简单案例中,我得到了:
<w:headerReference w:type="default" r:id="rId7"/>
<w:footerReference w:type="default" r:id="rId8"/>
和
<w:pgMar w:top="1440" w:right="1800" w:bottom="1440" w:left="1800" w:header="720" w:footer="720" w:gutter="0"/>
整个文档实际上非常小,如下所示:
<w:document xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:mo="http://schemas.microsoft.com/office/mac/office/2008/main" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:mv="urn:schemas-microsoft-com:mac:vml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" mc:Ignorable="w14 wp14">
<w:body>
<w:p w:rsidR="009E6E8F" w:rsidRDefault="009E6E8F"/>
<w:p w:rsidR="00B53FFA" w:rsidRDefault="00B53FFA"/>
<w:p w:rsidR="00B53FFA" w:rsidRDefault="00B53FFA"/><w:p w:rsidR="00B53FFA" w:rsidRDefault="00B53FFA">
<w:r>
<w:t>MY BODY</w:t>
</w:r>
<w:bookmarkStart w:id="0" w:name="_GoBack"/>
<w:bookmarkEnd w:id="0"/>
</w:p>
<w:sectPr w:rsidR="00B53FFA" w:rsidSect="009E6E8F">
<w:headerReference w:type="default" r:id="rId7"/>
<w:footerReference w:type="default" r:id="rId8"/>
<w:pgSz w:w="12240" w:h="15840"/>
<w:pgMar w:top="1440" w:right="1800" w:bottom="1440" w:left="1800" w:header="720" w:footer="720" w:gutter="0"/>
因此,对于这种规模的文档,无论从功能还是性能上看,操作 XML 都不成问题。下面是一些代码,可以把您的文档读入 Python、解析为 xml 树,并重新保存为 docx。我现在得出门了,所以这不是完整的解决方案,但我认为它应该能帮您顺利起步。如果您仍然遇到问题,我稍后会回来看看您的情况。
import zipfile
import shutil as su
import os
import tempfile
import xml.etree.cElementTree
def get_word_xml(docx_filename):
    """Return the content of ``word/document.xml`` from a docx archive.

    Args:
        docx_filename: Path (or binary file-like object) of the docx file.

    Returns:
        The undecoded bytes of the main document part.

    Raises:
        KeyError: If the archive has no ``word/document.xml`` member.
        zipfile.BadZipFile: If the file is not a valid zip archive.
    """
    # A docx is a zip archive, i.e. binary data.  ZipFile opens the path in
    # binary mode itself; wrapping it in a text-mode open() ('rt') breaks on
    # Python 3.  The context manager also closes the archive.
    with zipfile.ZipFile(docx_filename) as archive:
        return archive.read('word/document.xml')
def write_and_close_docx(self, xml_content, output_filename):
    """Write a copy of a docx archive with ``word/document.xml`` replaced.

    The source archive is expanded into a temporary directory, the main
    document part is overwritten with ``xml_content``, and every member of
    the source archive is zipped into ``output_filename``.

    Args:
        self: Path (or binary file-like object) of the source docx, or an
            object exposing an already opened archive as ``self.zipfile``.
        xml_content: The new document XML.  Accepts an ElementTree-like
            object (has ``getroot`` and ``write``), an
            ``xml.etree.ElementTree.Element``, ``bytes``, or anything whose
            ``str()`` is the XML text.
        output_filename: Path of the docx file to create.
    """
    import xml.etree.ElementTree as ET

    tmp_dir = tempfile.mkdtemp()
    try:
        opened = getattr(self, 'zipfile', None)
        if opened is not None:
            # Caller handed over an object that owns an open archive.
            opened.extractall(tmp_dir)
            filenames = opened.namelist()
        else:
            # Plain path: open the archive ourselves and close it again.
            with zipfile.ZipFile(self) as source:
                source.extractall(tmp_dir)
                filenames = source.namelist()

        document_path = os.path.join(tmp_dir, 'word/document.xml')
        if hasattr(xml_content, 'getroot') and hasattr(xml_content, 'write'):
            # NOTE(review): the stdlib serializer invents namespace prefixes
            # (ns0, ns1, ...) unless they were registered beforehand with
            # ET.register_namespace -- confirm Word still opens the result.
            xml_content.write(document_path, encoding='UTF-8',
                              xml_declaration=True)
        elif isinstance(xml_content, ET.Element):
            ET.ElementTree(xml_content).write(document_path, encoding='UTF-8',
                                              xml_declaration=True)
        elif isinstance(xml_content, (bytes, bytearray)):
            with open(document_path, 'wb') as f:
                f.write(xml_content)
        else:
            with open(document_path, 'w', encoding='utf-8') as f:
                f.write(str(xml_content))

        # Re-create the archive with the same members, in the same order.
        with zipfile.ZipFile(output_filename, 'w') as docx:
            for filename in filenames:
                docx.write(os.path.join(tmp_dir, filename), filename)
    finally:
        # Clean up the temp dir even when one of the steps above failed.
        su.rmtree(tmp_dir)
def get_xml_tree(f):
    """Parse XML into an ``xml.etree.ElementTree.ElementTree``.

    Args:
        f: Either a filename / file object (as accepted by
            ``ElementTree.parse``) or the raw XML document as ``bytes``,
            such as the value returned by ``ZipFile.read``.

    Returns:
        The parsed ElementTree.
    """
    import io
    import xml.etree.ElementTree as ET

    # parse() treats a bytes argument as a *filename*.  Raw XML content is
    # recognised by its leading '<' (after an optional BOM / whitespace) and
    # wrapped in a file object so it gets parsed instead of opened.
    if isinstance(f, (bytes, bytearray)):
        if bytes(f).lstrip(b'\xef\xbb\xbf \t\r\n').startswith(b'<'):
            return ET.parse(io.BytesIO(f))
    return ET.parse(f)
# Example driver: read document.xml out of TEXT.docx, parse it, and write the
# (unmodified) tree back out as SLIM.docx.
# NOTE(review): these statements run at import time; there is no
# `if __name__ == '__main__':` guard around them.
word_doc = 'TEXT.docx'
new_word_doc = 'SLIM.docx'
doc = get_word_xml(word_doc)
tree = get_xml_tree(doc)
write_and_close_docx(word_doc, tree, new_word_doc)