Python docx 在保持样式的同时替换段落中的字符串

Python docx Replace string in paragraph while keeping style

我需要帮助替换 word 文档中的字符串,同时保持整个文档的格式。

我正在使用 python-docx。阅读文档后我发现,它是以整个段落为单位进行操作的,因此我会丢失粗体、斜体等格式。另外,要替换的文本本身是粗体的,我希望保持这种格式。我目前使用的代码如下:

from docx import Document
def replace_string2(filename):
    '''
    Replace a fixed string paragraph by paragraph and save the result.

    Every paragraph whose text contains 'Text to find and replace' gets its
    whole text reassigned with the replacement applied, after which the
    paragraph style captured beforehand is assigned back.  The document is
    written to 'test.docx', not back to ``filename``.

    :param filename: path of the docx file to open
    :return: always 1
    '''
    doc = Document(filename)
    for p in doc.paragraphs:
        if 'Text to find and replace' in p.text:
            # Call form of print: valid on Python 3 (the statement form is
            # a SyntaxError there) and still works on Python 2.
            print('SEARCH FOUND!!')
            text = p.text.replace('Text to find and replace', 'new text')
            # Only the paragraph-level style is captured and restored here.
            style = p.style
            p.text = text
            p.style = style
    # doc.save(filename)
    doc.save('test.docx')
    return 1

所以如果我实现它并想要类似的东西(包含要替换的字符串的段落会丢失其格式):

这是第 1 段,这是 粗体 中的文本。

这是第 2 段,我将替换旧文本

当前结果为:

这是第 1 段,这是 粗体 中的文本。

这是第 2 段,我将替换新文本

我发布了这个问题(尽管我在这里看到了几个类似的问题),因为(据我所知)其中没有一个解决了这个问题。有一个回答使用了 oodocx 库,我试过了,但没有成功。所以我找到了一个变通方法。

代码非常相似,但逻辑是:当我找到包含我要替换的字符串的段落时,使用 runs 添加另一个循环。 (这仅在我希望替换的字符串具有相同格式的情况下才有效)。

def replace_string(filename):
    '''
    Replace 'old text' with 'new text' run by run and save the result.

    Working on the individual runs of a paragraph (instead of assigning the
    paragraph text) leaves each run object in place.  The replacement only
    happens when 'old text' lies entirely inside a single run.  The text of
    every paragraph that contained the search string is printed, and the
    document is written to 'dest1.docx'.

    :param filename: path of the docx file to open
    :return: always 1
    '''
    doc = Document(filename)
    for p in doc.paragraphs:
        if 'old text' in p.text:
            # Loop over the runs (pieces of text sharing one style).
            for run in p.runs:
                if 'old text' in run.text:
                    run.text = run.text.replace('old text', 'new text')
            # Call form of print so the snippet also parses on Python 3.
            print(p.text)

    doc.save('dest1.docx')
    return 1

这就是我在替换文本时保留文本样式的方法。

根据 Alo 的回答,并考虑到搜索文本可能被拆分到多个 run 中这一事实,以下是我在模板 docx 文件中替换占位符文本的方法。它会检查文档的所有段落以及所有表格单元格的内容中是否包含占位符。

一旦在段落中找到搜索文本,它就会循环遍历各个 run,识别哪些 run 包含搜索文本的一部分,然后把替换文本插入到第一个 run 中,并从其余的 run 中删除搜索文本剩下的字符。

希望这对大家有所帮助。如果有人想改进它,这里是对应的 gist。

编辑:我后来发现 python-docx-template 允许在 docx 模板中使用 jinja2 风格的模板语法。这是它的文档链接。

def _replace_across_runs(runs, needle, replacement):
    '''
    Replace every occurrence of ``needle`` in the concatenated text of ``runs``.

    An occurrence may be split over several consecutive runs.  The
    replacement text is written into the run where the occurrence starts;
    the remaining characters of the occurrence are removed from the
    following runs.  Runs that hold no part of an occurrence are not
    assigned to.

    :param runs: sequence of objects with a readable/writable ``text``
    :param needle: text to search for
    :param replacement: text to insert instead
    '''
    if not needle:
        return
    texts = [run.text for run in runs]
    combined = ''.join(texts)

    # Start offsets of all non-overlapping occurrences, left to right.
    hits = []
    pos = combined.find(needle)
    while pos != -1:
        hits.append(pos)
        pos = combined.find(needle, pos + len(needle))
    if not hits:
        return

    # Absolute [start, end) span of each run inside the combined text.
    spans = []
    offset = 0
    for text in texts:
        spans.append((offset, offset + len(text)))
        offset += len(text)

    touched = set()
    # Right to left: an edit only changes text at or after its own start,
    # so offsets of occurrences further left remain valid.
    for hit in reversed(hits):
        hit_end = hit + len(needle)
        pending = replacement
        for idx, (run_start, run_end) in enumerate(spans):
            lo = max(hit, run_start)
            hi = min(hit_end, run_end)
            if lo >= hi:
                # This run holds no character of the occurrence.
                continue
            local_lo = lo - run_start
            local_hi = hi - run_start
            texts[idx] = texts[idx][:local_lo] + pending + texts[idx][local_hi:]
            # Only the first run of the occurrence receives the replacement.
            pending = ''
            touched.add(idx)

    for idx in sorted(touched):
        runs[idx].text = texts[idx]


def docx_replace(doc, data):
    '''
    Fill ``${name}`` placeholders in a document, run by run.

    All paragraphs of the document body and of every table cell are
    searched.  For each ``key`` in ``data`` the placeholder ``${key}`` is
    replaced by ``str(value)``; this is done on the runs of the paragraph,
    including placeholders that are split over several runs and
    placeholders occurring more than once in a paragraph.

    :param doc: document object exposing ``paragraphs`` and ``tables``
    :param data: mapping of placeholder name to replacement value
    :return: None
    '''
    paragraphs = list(doc.paragraphs)
    for t in doc.tables:
        for row in t.rows:
            for cell in row.cells:
                paragraphs.extend(cell.paragraphs)
    for p in paragraphs:
        for key, val in data.items():
            key_name = '${{{}}}'.format(key)  # placeholders have the form ${PlaceholderName}
            # The search is done on the joined run text, so positions always
            # map back onto the runs that are edited.
            _replace_across_runs(p.runs, key_name, str(val))

# usage

import docx  # the snippet refers to docx.Document, so the package itself is needed

# Open the template, substitute the ItemOne / ItemTwo entries, save a copy.
doc = docx.Document('path/to/template.docx')
docx_replace(doc, dict(ItemOne='replacement text', ItemTwo="Some replacement text\nand some more"))
doc.save('path/to/destination.docx')
from docx import Document

# Open the source document whose placeholders are to be filled in.
document = Document('old.docx')

# Placeholder -> replacement text.
dic = {
    '{{FULLNAME}}': 'First Last',
    '{{FIRST}}': 'First',
    '{{LAST}}': 'Last',
}

# Work run by run; a placeholder is only replaced when it lies entirely
# within a single run.
for paragraph in document.paragraphs:
    for run in paragraph.runs:
        updated = run.text
        for placeholder, replacement in dic.items():
            if placeholder in updated:
                updated = updated.replace(placeholder, replacement)
                run.text = updated

document.save('new.docx')

根据DOCX文档的架构:

  1. 正文文本:docx > 段落 > run
  2. 正文表格:docx > 表格 > 行 > 单元格 > 段落 > run
  3. 页眉文本:docx > 节(section)> 页眉 > 段落 > run
  4. 页眉表格:docx > 节(section)> 页眉 > 表格 > 行 > 单元格 > 段落 > run

页脚的结构与页眉相同。我们可以直接遍历段落来查找并替换关键词,但这样会导致文本格式被重置,所以只能遍历段落中的各个 run 并在其中进行替换。然而,关键词可能跨越多个 run,此时仅在单个 run 内查找就无法成功替换。

因此,我在这里提供一个思路:首先,以段落为单位,通过list标记段落中每个字符的位置;然后通过list标记运行中每个字符的位置;查找段落中的关键词,按对应关系以字符为单位删除替换。

'''
-*- coding: utf-8 -*-
@Time    : 2021/4/19 13:13
@Author  : ZCG
@Site    :
@File    : Batch DOCX document keyword replacement.py
@Software: PyCharm
'''

from docx import Document
import os


def get_docx_list(dir_path):
    '''
    Collect the docx files found below a directory.

    The directory tree is walked with os.walk, so files in sub-directories
    are included.  The number of files found is printed.

    :param dir_path: directory to search
    :return: List of docx file paths, each joined from the directory that
             contains the file and the file name
    '''
    file_list = []
    for roots, dirs, files in os.walk(dir_path):
        for file in files:
            # Keep docx documents only and skip names starting with "~"
            # (temporary files).
            if file.endswith(".docx") and not file.startswith("~"):
                # os.path.join uses the platform separator instead of a
                # hand-written backslash.
                file_list.append(os.path.join(roots, file))
    print("The directory found a total of {0} related files!".format(len(file_list)))
    return file_list

class ParagraphsKeyWordsReplace:
    '''
    Keyword replacement inside one paragraph, done on its runs.

    The methods are written to be called on the class with a paragraph
    object passed in place of ``self``; only its ``runs`` attribute (a
    sequence of objects with a readable/writable ``text``) is used.
    '''

    def __init__(self):
        self.text = None
        self.runs = None

    def p_replace(self, x, key, value):
        '''
        Replace every occurrence of ``key`` in the paragraph with ``value``.

        The paragraph text is not assigned directly; instead the characters
        are edited in the runs that hold them, so an occurrence may span
        several runs.

        :param x:       paragraph number (only used in the progress message)
        :param key:     keyword to replace
        :param value:   replacement text
        :return:
        '''
        if not key:
            # An empty keyword has nothing to replace.
            return
        # One {run index, char index} entry per character of the paragraph.
        paragraph_positions = [
            {"run": y, "char": z}
            for y, run in enumerate(self.runs)
            for z in range(len(run.text))
        ]
        # Search the concatenated run text so the match indices are in the
        # same coordinate system as paragraph_positions.
        full_text = ''.join(run.text for run in self.runs)
        # Start index of each occurrence, computed up front and without
        # overlaps; searching again after a replacement could loop forever
        # for pairs such as {"ab": "abc"}.
        key_indexs = []
        start = full_text.find(key)
        while start != -1:
            key_indexs.append(start)
            start = full_text.find(key, start + len(key))
        # Last occurrence first, so the recorded positions of earlier
        # occurrences are still correct when they are processed.
        for i, start_i in enumerate(reversed(key_indexs), start=1):
            end_i = start_i + len(key)
            key_maps = paragraph_positions[start_i:end_i]
            ParagraphsKeyWordsReplace.c_replace(self, key_maps, value)
            print(f"\tSuccessfully replaced segment {x+1}, object {i}:{key}===>{value}")

    def c_replace(self, key_maps, value):
        '''
        Remove the characters listed in ``key_maps`` and insert ``value``.

        The characters are processed from back to front: all but the first
        are deleted, and the first one is replaced by ``value``.  Going
        backwards keeps the remaining indices valid while the run text
        shrinks.

        :param key_maps: list of {"run": ..., "char": ...} entries, one per
                         character of the keyword occurrence
        :param value: replacement text
        :return:
        '''
        total = len(key_maps)
        for i, position in enumerate(reversed(key_maps), start=1):
            y, z = position["run"], position["char"]
            run = self.runs[y]
            text = run.text
            if i < total:
                # Drop exactly the character at index z.
                run.text = text[:z] + text[z + 1:]
            else:
                # First character of the occurrence: substitute by position,
                # so equal characters elsewhere in the run stay untouched.
                run.text = text[:z] + value + text[z + 1:]

class DocxKeyWordsReplace:
    '''
    Keyword replacement passes over the different parts of a document.

    Each public method is written to be called on the class with the
    document object passed in place of ``self``; only the ``paragraphs``,
    ``tables`` and ``sections`` attributes of that object are read.  Every
    pass handles one key of ``replace_dict`` at a time and hands each
    paragraph to ParagraphsKeyWordsReplace.p_replace.
    '''

    def __init__(self):
        self.paragraphs = None
        self.tables = None
        self.sections = None

    @staticmethod
    def _swap(numbered_paragraphs, key, value):
        # Apply one key/value pair to every (index, paragraph) pair given.
        for x, paragraph in numbered_paragraphs:
            ParagraphsKeyWordsReplace.p_replace(paragraph, x, key, value)

    @staticmethod
    def _cell_paragraphs(tables):
        # Lazily yield (index within cell, paragraph) for all cells of all tables.
        for table in tables:
            for row in table.rows:
                for cell in row.cells:
                    yield from enumerate(cell.paragraphs)

    def content(self, replace_dict):
        # Paragraphs of the document body.
        print("(1)Processing keywords in body text...")
        for old, new in replace_dict.items():
            DocxKeyWordsReplace._swap(enumerate(self.paragraphs), old, new)
        print("\tText keyword replacement completed!")

    def tables(self, replace_dict):
        # Paragraphs inside the cells of the body tables.
        print("(2)Processing keywords in table...")
        for old, new in replace_dict.items():
            DocxKeyWordsReplace._swap(
                DocxKeyWordsReplace._cell_paragraphs(self.tables), old, new)
        print("\tTable keyword replacement completed!")

    def header_content(self, replace_dict):
        # Paragraphs of every section header.
        print("(3)Processing keywords in header...")
        for old, new in replace_dict.items():
            for section in self.sections:
                DocxKeyWordsReplace._swap(
                    enumerate(section.header.paragraphs), old, new)
        print("\tContent header keyword replacement completed!")

    def header_tables(self, replace_dict):
        # Paragraphs inside the tables of every section header.
        print("(4)Processing keywords in header table...")
        for old, new in replace_dict.items():
            for section in self.sections:
                DocxKeyWordsReplace._swap(
                    DocxKeyWordsReplace._cell_paragraphs(section.header.tables),
                    old, new)
        print("\tHeader table keyword replacement completed!")

    def footer_content(self, replace_dict):
        # Paragraphs of every section footer.
        print("(6)Processing keywords in footer...")
        for old, new in replace_dict.items():
            for section in self.sections:
                DocxKeyWordsReplace._swap(
                    enumerate(section.footer.paragraphs), old, new)
        print("\tFooter keyword replacement completed!")

    def footer_tables(self, replace_dict):
        # Paragraphs inside the tables of every section footer.
        print("(7)Processing keywords in footer table...")
        for old, new in replace_dict.items():
            for section in self.sections:
                DocxKeyWordsReplace._swap(
                    DocxKeyWordsReplace._cell_paragraphs(section.footer.tables),
                    old, new)
        print("\tFooter table keyword replacement completed!")

def main():
    '''
    Run the keyword replacement over every docx file of a directory.

    How to use: edit the two values below.
    replace_dict : maps the text to be replaced (key) to the new text (value)
    file_dir     : directory holding the docx files; the file list comes
                   from get_docx_list
    Each file is saved back under its own path after all passes have run.
    '''
    # input section
    file_dir = r"E:\docxfiles"
    replace_dict = {
        "MG life technology (shenzhen) co., LTD": "Shenzhen YW medical technology co., LTD",
        "MG-": "YW-",
        "2017-": "2020-",
        "Z18": "Z20",
    }
    # call processing part
    for number, path in enumerate(get_docx_list(file_dir), start=1):
        print(f"{number}、file being processed:{path}")
        document = Document(path)
        # Body, tables, headers and footers, in this fixed order.
        passes = (
            DocxKeyWordsReplace.content,
            DocxKeyWordsReplace.tables,
            DocxKeyWordsReplace.header_content,
            DocxKeyWordsReplace.header_tables,
            DocxKeyWordsReplace.footer_content,
            DocxKeyWordsReplace.footer_tables,
        )
        for run_pass in passes:
            run_pass(document, replace_dict=replace_dict)
        document.save(path)
        print(f'"{path}"Document processing complete!\n')


if __name__ == "__main__":
    # Run the batch replacement only when this file is executed as a script.
    main()
    print("All complete processing!")

https://gist.github.com/heimoshuiyu/671a4dfbd13f7c279e85224a5b6726c0

这段代码使用了一个“滑动窗口”(代码中称为 shuttle),因此可以找到跨越多个 run 的关键字。

这与 MS Word 中“全部替换”的行为类似。
def shuttle_text(shuttle):
    """Return the ``text`` of every item in ``shuttle`` joined in order."""
    return ''.join(piece.text for piece in shuttle)

def docx_replace(doc, data):
    """Replace each key of ``data`` with its value in tables and body paragraphs.

    Table cells are handled by reassigning the whole cell text.  Body
    paragraphs are handled run by run with a window of consecutive runs
    (the "shuttle") so that a key split over several runs is still found.

    :param doc: document object exposing ``tables`` and ``paragraphs``
    :param data: mapping of search text to replacement text
    :return: None
    """
    for key in data:

        # NOTE(review): cells are replaced via cell.text as a whole, not run
        # by run - presumably this does not keep run-level formatting; confirm.
        for table in doc.tables:
            for row in table.rows:
                for cell in row.cells:
                    if key in cell.text:
                        cell.text = cell.text.replace(key, data[key])

        for p in doc.paragraphs:

            # The window covers p.runs[begin:end+1]; end grows by one run per
            # iteration.
            begin = 0
            for end in range(len(p.runs)):

                shuttle = p.runs[begin:end+1]

                full_text = shuttle_text(shuttle)
                if key in full_text:
                    # print('Replace:', key, '->', data[key])
                    # print([i.text for i in shuttle])

                    # find the begin
                    # Skip leading runs that end before the key starts; after
                    # the loop, index is the key's offset inside p.runs[begin].
                    index = full_text.index(key)
                    # print('full_text length', len(full_text), 'index:', index)
                    while index >= len(p.runs[begin].text):
                        index -= len(p.runs[begin].text)
                        begin += 1

                    shuttle = p.runs[begin:end+1]

                    # do replace
                    # print('before replace', [i.text for i in shuttle])
                    if key in shuttle[0].text:
                        # Key lies entirely in the first run of the window;
                        # str.replace swaps every occurrence in that run.
                        shuttle[0].text = shuttle[0].text.replace(key, data[key])
                    else:
                        # Key spans several runs: offsets are measured in the
                        # joined text of the (narrowed) window.
                        replace_begin_index = shuttle_text(shuttle).index(key)
                        replace_end_index = replace_begin_index + len(key)
                        # Offset inside the last run at which the key ends.
                        replace_end_index_in_last_run = replace_end_index - len(shuttle_text(shuttle[:-1]))
                        # First run keeps the text before the key and receives
                        # the replacement.
                        shuttle[0].text = shuttle[0].text[:replace_begin_index] + data[key]

                        # clear middle runs
                        for i in shuttle[1:-1]:
                            i.text = ''

                        # keep last run
                        # (only the part after the end of the key is kept)
                        shuttle[-1].text = shuttle[-1].text[replace_end_index_in_last_run:]

                    # print('after replace', [i.text for i in shuttle])

                    # set begin to next
                    # The next window starts at the run where this one ended.
                    begin = end

# usage

import docx  # the snippet refers to docx.Document, so the package itself is needed

# Open the template, replace the ItemOne / ItemTwo keys, save a copy.
doc = docx.Document('path/to/template.docx')
docx_replace(doc, dict(ItemOne='replacement text', ItemTwo="Some replacement text\nand some more"))
doc.save('path/to/destination.docx')