Pandas: 如何根据 Z 分数值突出显示单元格值?
Pandas: How to highlight a cell value based on a Z-score value?
在我的df
下面,我想:
- 使用 z-scores
识别并标记 col_E
中的异常值
- 分别说明如何在两列或更多列中使用 z 分数来识别和标记异常值,例如
col_D
& col_E
数据集见下文
import numpy as np
import pandas as pd
from scipy import stats
# Sample data: one list of values per column.
records = {
    'col_A': ['P0', 'P1', 'P2', 'P4', 'P5'],
    'col_B': [1, 1, 1, 1, 1],
    'col_C': [1, 2, 3, 5, 9],
    'col_D': [120.05, 181.90, 10.34, 153.10, 311.17],
    'col_E': [110.21, 191.12, 190.21, 12.00, 245.09],
    'col_F': [100.22, 199.10, 191.13, 199.99, 255.19],
    'col_G': [140.29, 291.07, 390.22, 245.09, 4122.62],
}
# Build the DataFrame; the dict keys become the column labels.
df = pd.DataFrame(records)
# Display the result (notebook-style trailing expression).
df
期望:首先标记 col_D
中的所有异常值,然后标记 col_D
和 col_E
(注意:在我下面的图像中 10.34
和 12.00
随机突出显示)
Q1
尝试:
#Q1
# NOTE(review): this is the asker's non-working attempt, kept as posted. The
# block indentation was lost in this paste and must be restored before the
# snippet can run.
exclude_cols = ['col_A','col_B','col_C','col_D','col_F','col_G']
include_cols = ['col_E'] # desired column
def flag_outliers(s, exclude_cols):
# Intended to return CSS strings for `s`, skipping anything named in exclude_cols.
if s.name in exclude_cols:
print(s.name)
return ''
else:
# NOTE(review): overwrites the `s` argument with rows of the global `df`
# selected by a hard-coded df['col_E'] z-score; the result is not used below.
s=df[(np.abs(stats.zscore(df['col_E'])) > 3)] # not sure of this part of the code
# NOTE(review): `indexes` is not defined anywhere in the visible snippet, so
# this line would raise NameError as written.
return ['background-color: yellow' if v else '' for v in indexes]
# NOTE(review): axis=1 hands the function one row at a time, so `s.name` is a
# row label here rather than a column name - presumably axis=0 was intended.
df.style.apply(lambda s: flag_outliers(s, exclude_cols), axis=1, subset=include_cols)
#Q2
# NOTE(review): this is the asker's non-working attempt, kept as posted. The
# block indentation was lost in this paste and must be restored before the
# snippet can run.
exclude_cols = ['col_A','col_B','col_C','col_F','col_G']
include_cols = ['col_D','col_E'] # desired columns
def flag_outliers(s, exclude_cols):
# Intended to return CSS strings for `s`, skipping anything named in exclude_cols.
if s.name in exclude_cols:
print(s.name)
return ''
else:
# NOTE(review): the z-score is still computed on a hard-coded df['col_E'] even
# though include_cols also lists col_D; the reassigned `s` is not used below.
s=df[(np.abs(stats.zscore(df['col_E'])) > 3)] # not sure of this part of the code
# NOTE(review): `indexes` is not defined anywhere in the visible snippet, so
# this line would raise NameError as written.
return ['background-color: yellow' if v else '' for v in indexes]
# NOTE(review): axis=1 hands the function one row at a time, so `s.name` is a
# row label here rather than a column name - presumably axis=0 was intended.
df.style.apply(lambda s: flag_outliers(s, exclude_cols), axis=1, subset=include_cols)
谢谢!
基于此answer,只需将分数的条件传递给存储每个列索引的背景颜色的字典即可。
include_cols = ['col_D', 'col_E']


def color_outliers_yellow(row, include, color='yellow', z_score=1):
    """Return per-cell CSS that highlights the z-score outliers of one Series.

    Designed for ``Styler.apply``. When applied with ``axis=0`` the Series
    received as ``row`` is a whole column and ``row.name`` is its label.

    Args:
        row: Series to style.
        include: Labels to check; a Series whose ``name`` is not in this
            collection is returned unstyled.
        color: CSS colour used for the highlight.
        z_score: Cells whose absolute z-score exceeds this value are
            highlighted.

    Returns:
        dict mapping every label of ``row.index`` to a CSS string
        ('' means no styling).
    """
    if row.name not in include:
        return {label: '' for label in row.index}

    highlight = f'background-color: {color}'
    scores = stats.zscore(row.tolist())
    # Compare the magnitude: an outlier can sit far below the mean as well as
    # far above it, and a signed comparison would miss the low ones.
    return {
        label: highlight if abs(score) > z_score else ''
        for label, score in zip(row.index, scores)
    }
# Column-wise application (axis=0): each column is passed to
# color_outliers_yellow as a Series, with include_cols naming the ones to check.
df.style.apply(lambda x: color_outliers_yellow(x, include=include_cols), axis=0)
结果:
我假定以下含义来展示更广泛的用法。
- Q1代表计算单列
- Q2 代表对合并在一起的多个列进行计算。
如果 Q2 打算分别在多列上计算,那么您可以简单地在多列上循环您的 Q1 解决方案,这应该是微不足道的,所以我将在这里省略这种情况。
要点
- Q1 非常简单,因为可以通过列表推导式返回样式字符串列表。
- Q2 稍微复杂一些,因为 z 分数要在整个 DataFrame 子集上计算(即必须使用
axis=None
)。根据官方文档,在 DataFrame 上应用样式时,返回的对象也必须是与子集具有相同索引和列的 DataFrame,因此需要先重塑(reshape)数组,再重新构建 DataFrame。
单列 (Q1)
请注意,出于演示目的,阈值 z=3
已降低为 1.5
。原因是样本只有 5 个点时,|z| 的最大值为 √(n−1)=2(ddof=0),阈值 3 永远不会被触发。
# desired column
include_cols = ['col_E']
# additional control
# A cut-off of 3 can never trigger on this data: with five points |z| is at
# most sqrt(n - 1) = 2 when ddof=0 (and smaller still when ddof=1).
outlier_threshold = 1.5
ddof = 0  # degrees-of-freedom correction: 1 = sample, 0 = population


def flag_outliers(s: pd.Series) -> list:
    """Return one CSS string per value of ``s``, highlighting z-score outliers.

    Reads the module-level ``outlier_threshold`` and ``ddof`` settings.

    Args:
        s: Numeric column handed over by ``Styler.apply``.

    Returns:
        List as long as ``s``: ``'background-color: yellow'`` where the
        absolute z-score exceeds ``outlier_threshold``, ``''`` elsewhere.
    """
    z_scores = stats.zscore(np.asarray(s, dtype=float), ddof=ddof)
    outlier_mask = np.abs(z_scores) > outlier_threshold
    # Map the boolean mask to the CSS strings in one vectorised step.
    return np.where(outlier_mask, 'background-color: yellow', '').tolist()
# Style only the columns listed in include_cols; no axis is given, so
# Styler.apply's default (column-wise) is used.
df.style.apply(flag_outliers, subset=include_cols)
结果
多列合并(Q2,假定)
Q2
include_cols = ['col_D', 'col_E']  # desired columns
outlier_threshold = 1.5
ddof = 0


def flag_outliers(s: pd.DataFrame) -> pd.DataFrame:
    """Return a CSS frame highlighting outliers across all of ``s`` pooled.

    The z-scores are computed over every value of ``s`` together, not column
    by column. Reads the module-level ``outlier_threshold`` and ``ddof``.

    Args:
        s: Numeric sub-frame handed over by ``Styler.apply(..., axis=None)``.

    Returns:
        DataFrame with the same index and columns as ``s`` holding
        ``'background-color: yellow'`` where the absolute pooled z-score
        exceeds ``outlier_threshold`` and ``''`` elsewhere.
    """
    values = s.to_numpy(dtype=float)
    # Flatten so the mean and standard deviation come from all cells at once.
    z_scores = stats.zscore(values.ravel(), ddof=ddof)
    css = np.where(np.abs(z_scores) > outlier_threshold, 'background-color: yellow', '')
    # Styler needs a frame labelled exactly like the one it passed in.
    return pd.DataFrame(css.reshape(values.shape), index=s.index, columns=s.columns)
# axis=None passes the whole include_cols sub-frame to flag_outliers in a
# single call instead of one column or row at a time.
df.style.apply(flag_outliers, axis=None, subset=include_cols)
结果
在我的df
下面,我想:
- 使用 z-scores
识别并标记 col_E
中的异常值
- 分别说明如何在两列或更多列中使用 z 分数来识别和标记异常值,例如
col_D
& col_E
数据集见下文
import pandas as pd
from scipy import stats
# Sample data: one list of values per column.
records = {
    'col_A': ['P0', 'P1', 'P2', 'P4', 'P5'],
    'col_B': [1, 1, 1, 1, 1],
    'col_C': [1, 2, 3, 5, 9],
    'col_D': [120.05, 181.90, 10.34, 153.10, 311.17],
    'col_E': [110.21, 191.12, 190.21, 12.00, 245.09],
    'col_F': [100.22, 199.10, 191.13, 199.99, 255.19],
    'col_G': [140.29, 291.07, 390.22, 245.09, 4122.62],
}
# Build the DataFrame; the dict keys become the column labels.
df = pd.DataFrame(records)
# Display the result (notebook-style trailing expression).
df
期望:首先标记 col_D
中的所有异常值,然后标记 col_D
和 col_E
(注意:在我下面的图像中 10.34
和 12.00
随机突出显示)
Q1
尝试:
#Q1
# NOTE(review): this is the asker's non-working attempt, kept as posted. The
# block indentation was lost in this paste and must be restored before the
# snippet can run.
exclude_cols = ['col_A','col_B','col_C','col_D','col_F','col_G']
include_cols = ['col_E'] # desired column
def flag_outliers(s, exclude_cols):
# Intended to return CSS strings for `s`, skipping anything named in exclude_cols.
if s.name in exclude_cols:
print(s.name)
return ''
else:
# NOTE(review): overwrites the `s` argument with rows of the global `df`
# selected by a hard-coded df['col_E'] z-score; the result is not used below.
s=df[(np.abs(stats.zscore(df['col_E'])) > 3)] # not sure of this part of the code
# NOTE(review): `indexes` is not defined anywhere in the visible snippet, so
# this line would raise NameError as written.
return ['background-color: yellow' if v else '' for v in indexes]
# NOTE(review): axis=1 hands the function one row at a time, so `s.name` is a
# row label here rather than a column name - presumably axis=0 was intended.
df.style.apply(lambda s: flag_outliers(s, exclude_cols), axis=1, subset=include_cols)
#Q2
# NOTE(review): this is the asker's non-working attempt, kept as posted. The
# block indentation was lost in this paste and must be restored before the
# snippet can run.
exclude_cols = ['col_A','col_B','col_C','col_F','col_G']
include_cols = ['col_D','col_E'] # desired columns
def flag_outliers(s, exclude_cols):
# Intended to return CSS strings for `s`, skipping anything named in exclude_cols.
if s.name in exclude_cols:
print(s.name)
return ''
else:
# NOTE(review): the z-score is still computed on a hard-coded df['col_E'] even
# though include_cols also lists col_D; the reassigned `s` is not used below.
s=df[(np.abs(stats.zscore(df['col_E'])) > 3)] # not sure of this part of the code
# NOTE(review): `indexes` is not defined anywhere in the visible snippet, so
# this line would raise NameError as written.
return ['background-color: yellow' if v else '' for v in indexes]
# NOTE(review): axis=1 hands the function one row at a time, so `s.name` is a
# row label here rather than a column name - presumably axis=0 was intended.
df.style.apply(lambda s: flag_outliers(s, exclude_cols), axis=1, subset=include_cols)
谢谢!
基于此answer,只需将分数的条件传递给存储每个列索引的背景颜色的字典即可。
include_cols = ['col_D', 'col_E']


def color_outliers_yellow(row, include, color='yellow', z_score=1):
    """Return per-cell CSS that highlights the z-score outliers of one Series.

    Designed for ``Styler.apply``. When applied with ``axis=0`` the Series
    received as ``row`` is a whole column and ``row.name`` is its label.

    Args:
        row: Series to style.
        include: Labels to check; a Series whose ``name`` is not in this
            collection is returned unstyled.
        color: CSS colour used for the highlight.
        z_score: Cells whose absolute z-score exceeds this value are
            highlighted.

    Returns:
        dict mapping every label of ``row.index`` to a CSS string
        ('' means no styling).
    """
    if row.name not in include:
        return {label: '' for label in row.index}

    highlight = f'background-color: {color}'
    scores = stats.zscore(row.tolist())
    # Compare the magnitude: an outlier can sit far below the mean as well as
    # far above it, and a signed comparison would miss the low ones.
    return {
        label: highlight if abs(score) > z_score else ''
        for label, score in zip(row.index, scores)
    }
# Column-wise application (axis=0): each column is passed to
# color_outliers_yellow as a Series, with include_cols naming the ones to check.
df.style.apply(lambda x: color_outliers_yellow(x, include=include_cols), axis=0)
结果:
我假定以下含义来展示更广泛的用法。
- Q1代表计算单列
- Q2 代表对合并在一起的多个列进行计算。
如果 Q2 打算分别在多列上计算,那么您可以简单地在多列上循环您的 Q1 解决方案,这应该是微不足道的,所以我将在这里省略这种情况。
要点
- Q1 非常简单,因为可以通过列表推导式返回样式字符串列表。
- Q2 稍微复杂一些,因为 z 分数要在整个 DataFrame 子集上计算(即必须使用
axis=None
)。根据官方文档,在 DataFrame 上应用样式时,返回的对象也必须是与子集具有相同索引和列的 DataFrame,因此需要先重塑(reshape)数组,再重新构建 DataFrame。
单列 (Q1)
请注意,出于演示目的,阈值 z=3
已降低为 1.5
。原因是样本只有 5 个点时,|z| 的最大值为 √(n−1)=2(ddof=0),阈值 3 永远不会被触发。
# desired column
include_cols = ['col_E']
# additional control
# A cut-off of 3 can never trigger on this data: with five points |z| is at
# most sqrt(n - 1) = 2 when ddof=0 (and smaller still when ddof=1).
outlier_threshold = 1.5
ddof = 0  # degrees-of-freedom correction: 1 = sample, 0 = population


def flag_outliers(s: pd.Series) -> list:
    """Return one CSS string per value of ``s``, highlighting z-score outliers.

    Reads the module-level ``outlier_threshold`` and ``ddof`` settings.

    Args:
        s: Numeric column handed over by ``Styler.apply``.

    Returns:
        List as long as ``s``: ``'background-color: yellow'`` where the
        absolute z-score exceeds ``outlier_threshold``, ``''`` elsewhere.
    """
    z_scores = stats.zscore(np.asarray(s, dtype=float), ddof=ddof)
    outlier_mask = np.abs(z_scores) > outlier_threshold
    # Map the boolean mask to the CSS strings in one vectorised step.
    return np.where(outlier_mask, 'background-color: yellow', '').tolist()
# Style only the columns listed in include_cols; no axis is given, so
# Styler.apply's default (column-wise) is used.
df.style.apply(flag_outliers, subset=include_cols)
结果
多列合并(Q2,假定)
Q2
include_cols = ['col_D', 'col_E']  # desired columns
outlier_threshold = 1.5
ddof = 0


def flag_outliers(s: pd.DataFrame) -> pd.DataFrame:
    """Return a CSS frame highlighting outliers across all of ``s`` pooled.

    The z-scores are computed over every value of ``s`` together, not column
    by column. Reads the module-level ``outlier_threshold`` and ``ddof``.

    Args:
        s: Numeric sub-frame handed over by ``Styler.apply(..., axis=None)``.

    Returns:
        DataFrame with the same index and columns as ``s`` holding
        ``'background-color: yellow'`` where the absolute pooled z-score
        exceeds ``outlier_threshold`` and ``''`` elsewhere.
    """
    values = s.to_numpy(dtype=float)
    # Flatten so the mean and standard deviation come from all cells at once.
    z_scores = stats.zscore(values.ravel(), ddof=ddof)
    css = np.where(np.abs(z_scores) > outlier_threshold, 'background-color: yellow', '')
    # Styler needs a frame labelled exactly like the one it passed in.
    return pd.DataFrame(css.reshape(values.shape), index=s.index, columns=s.columns)
# axis=None passes the whole include_cols sub-frame to flag_outliers in a
# single call instead of one column or row at a time.
df.style.apply(flag_outliers, axis=None, subset=include_cols)
结果