在 python 中突出显示更改的词

Question

我找到了一个比较 Excel 电子表格并突出显示更改的函数。我该如何修改这个脚本，使其只突出显示发生更改的单词？

下面你可以看到我想应用的逻辑。

数据帧 1

数据帧 2

数据帧 3

原始脚本在 GitHub 上，链接在此处。作为输出，我想将所有内容保存在同一个工作簿中，如下面的代码所示。唯一需要做的更改是以某种方式标记所有发生更改的单词，并将它们导出到 DataFrame 3（dfDiff）。

import pandas as pd
from pathlib import Path
import os


def excel_diff(path_OLD, path_NEW, index_col):
    """Diff two Excel sheets and write the result to a highlighted workbook.

    Both files are read with ``index_col`` as the row index and missing
    values replaced by 0. For rows present in both files, every shared column
    whose value differs is rendered as ``'old→new'`` in the diff sheet. Rows
    only in the new file are reported as new; rows only in the old file are
    appended to the diff and reported as dropped.

    The workbook is written to ``uploads/differences.xlsx`` next to this
    script and contains three sheets: ``differences``, the new data and the
    old data (the latter two named after the input file stems).

    Args:
        path_OLD: ``pathlib.Path`` of the old workbook (``.stem`` is used as
            a sheet name).
        path_NEW: ``pathlib.Path`` of the new workbook (``.stem`` is used as
            a sheet name).
        index_col: Column used as the row index in both workbooks.
    """
    df_OLD = pd.read_excel(path_OLD, index_col=index_col).fillna(0)
    df_NEW = pd.read_excel(path_NEW, index_col=index_col).fillna(0)

    # Perform Diff
    # Object dtype lets 'old→new' strings be stored in columns that were
    # numeric in the new sheet without an incompatible-dtype assignment.
    dfDiff = df_NEW.copy().astype(object)

    sharedCols = list(set(df_OLD.columns).intersection(df_NEW.columns))

    newRows = []
    for row in dfDiff.index:
        if row not in df_OLD.index:
            newRows.append(row)
            continue
        for col in sharedCols:
            value_OLD = df_OLD.loc[row, col]
            value_NEW = df_NEW.loc[row, col]
            if value_OLD == value_NEW:
                # dfDiff already holds the new value; nothing to mark.
                continue
            dfDiff.loc[row, col] = '{}→{}'.format(value_OLD, value_NEW)

    # Rows that exist only in the old sheet are appended in one concat
    # (DataFrame.append was removed in pandas 2.0).
    dropped_mask = ~df_OLD.index.isin(df_NEW.index)
    droppedRows = df_OLD.index[dropped_mask].tolist()
    if droppedRows:
        dfDiff = pd.concat([dfDiff, df_OLD.loc[dropped_mask]])

    dfDiff = dfDiff.sort_index().fillna('')
    print(dfDiff)
    print('\nNew Rows:     {}'.format(newRows))
    print('Dropped Rows: {}'.format(droppedRows))

    # Save output and format
    fname = os.path.dirname(os.path.abspath(__file__)) + '/uploads/differences.xlsx'

    # The context manager saves and closes the workbook on exit
    # (ExcelWriter.save() was removed in pandas 2.0).
    with pd.ExcelWriter(fname, engine='xlsxwriter') as writer:
        dfDiff.to_excel(writer, sheet_name='differences', index=True)
        df_NEW.to_excel(writer, sheet_name=path_NEW.stem, index=True)
        df_OLD.to_excel(writer, sheet_name=path_OLD.stem, index=True)

        # get xlsxwriter objects
        workbook = writer.book
        worksheet = writer.sheets['differences']
        worksheet.hide_gridlines(2)
        worksheet.set_default_row(15)

        # define formats
        grey_fmt = workbook.add_format({'font_color': '#E0E0E0'})
        highlight_fmt = workbook.add_format({'font_color': '#ff6666', 'bg_color': '#ffff00'})
        new_fmt = workbook.add_format({'font_color': '#32CD32', 'bold': True})

        # highlight changed cells (those containing the '→' marker)
        worksheet.conditional_format('A1:ZZ1000', {'type': 'text',
                                                   'criteria': 'containing',
                                                   'value': '→',
                                                   'format': highlight_fmt})

        # highlight new/dropped rows: match on the index LABEL of each diff
        # row, not on its position; worksheet row 0 is the header, so data
        # rows start at worksheet row 1.
        new_labels = set(newRows)
        dropped_labels = set(droppedRows)
        for sheet_row, label in enumerate(dfDiff.index, start=1):
            if label in new_labels:
                worksheet.set_row(sheet_row, 15, new_fmt)
            elif label in dropped_labels:
                worksheet.set_row(sheet_row, 15, grey_fmt)

    print('\nDone.')


def main():
    """Diff ``uploads/old_content.xlsx`` against ``uploads/new_content.xlsx``.

    Both paths are resolved relative to the directory containing this script.
    The first column of the new workbook is used as the index column for the
    comparison performed by ``excel_diff``.
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))
    path_OLD = Path(base_dir + '/uploads/old_content.xlsx')
    path_NEW = Path(base_dir + '/uploads/new_content.xlsx')

    # get index col from data
    df = pd.read_excel(path_NEW)
    index_col = df.columns[0]
    print('\nIndex column: {}\n'.format(index_col))
    excel_diff(path_OLD, path_NEW, index_col)


# Run the diff only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

Answer 1

我为你想要实现的功能写了一个更简单的版本。请注意，此代码要求两个数据帧具有相同的结构（行数和列数相同）；如果数据帧的结构不同，则需要添加更多逻辑。

import pandas as pd

# Create our test dataframes
df_old = pd.DataFrame({'ID': [1,2,3], 'upload_date': ['15/03/2021','16/03/2021','17/03/2021'],
                    'product_ASIN': ['1234567890','1234567891','1234567892'], 'Project_name': ['Name1','Name2','Name3'],
                    'Country': ['Poland','Poland','Poland'], 'Brand': ['Hugo Boss','Hugo Boss','Hugo Boss'],
                    'Category': ['Perfumes','Perfumes','Perfumes'], 'Title': ['title 1','title 2','title 3'],
                    'Description': ['Description 1','Description 2','Description 3']})

df_new = pd.DataFrame({'ID': [1,2,3], 'upload_date': ['15/03/2021','16/03/2021','17/03/2021'],
                    'product_ASIN': ['1234567890','1234567891','1234567892'], 'Project_name': ['Name1','Name2','Name3'],
                    'Country': ['Poland','Poland','Poland'], 'Brand': ['Hugo Boss','Hugo Boss','Hugo Boss'],
                    'Category': ['Perfumes','Perfumes','Perfumes'], 'Title': ['title 1','title changed','title 3'],
                    'Description': ['Description 1','Description 2','description']})

# The cell-by-cell comparison below is positional, so both frames must have
# the same number of rows and columns.
if df_old.shape != df_new.shape:
    raise ValueError('df_old and df_new must have the same shape, got {} and {}'
                     .format(df_old.shape, df_new.shape))

# Collect the [row, col] positions of every cell whose value differs between
# the old and the new dataframe.
differencesList = [
    [row, col]
    for row in range(df_old.shape[0])
    for col in range(df_old.shape[1])
    if df_old.iloc[row, col] != df_new.iloc[row, col]
]

# Pass the new df to xlsxwriter as this is the one we want to keep.
# The context manager saves and closes the workbook on exit
# (ExcelWriter.save() was removed in pandas 2.0).
with pd.ExcelWriter('df_diff.xlsx', engine='xlsxwriter') as writer:
    df_new.to_excel(writer, sheet_name='Sheet1', index=False)
    workbook = writer.book
    worksheet = writer.sheets['Sheet1']

    # Define a format object with red color
    cell_format_red = workbook.add_format({'font_color': 'red'})

    # to_excel has already written every value; only the changed cells need
    # to be rewritten with the red format. Worksheet row 0 holds the header,
    # hence the +1 offset.
    for row, col in differencesList:
        worksheet.write(row + 1, col, df_new.iloc[row, col], cell_format_red)

输出：

最后，如果您只想格式化字符串中已更改的部分（如您的屏幕截图所示），则需要使用富文本字符串（rich string），即 `worksheet.write_rich_string()`，参见 https://xlsxwriter.readthedocs.io/worksheet.html#worksheet-write-rich-string 。

在 python 中突出显示更改的词

highlighted changed words in python

python

pandas

xlsxwriter