Plotly 箱线图的 p 值显著性标注
Plotly box p-value significant annotation
我最近开始使用 Plotly 的箱线图(boxplot)来展示数据,并且很喜欢它。然而,我很难找到一种方法来对两组数据进行比较。使用 Plotly 时,有没有办法在图中标注两组数据之间的统计显著性比较?我想创建这样的图表:
其中 * 对应于 p 值 < 0.05,ns(不显著)对应于 p 值 > 0.05。
我发现使用 scipy.stats.ttest_ind()
和 stats.ttest_ind_from_stats()
可以轻松找到两个分布的 p 值。
我没有在网上找到任何相关的帖子,我认为这是一个相当有用的实现,所以任何帮助将不胜感激!
在 Plotly 中绝对没有针对这种特定内容的内置方法。
您需要做的是调用三次 fig.add_shape
方法,画出三条线段,自己拼出括号形状的标注:其中 x 值对应于您要比较的两个箱体,y 值对应于括号形状的一个相对较小的高度变化(y 坐标以图的 paper 坐标给出)。由于您希望括号标注位于图的上方,我们要使用大于 1 的 paper 坐标,例如 [1.02, 1.03]
的 y_range。
然后我们使用 fig.add_annotation
方法添加显著性符号的文本。更深入的解释可以在 Plotly 的 text and annotations 文档
中找到。
为了便于复用,我将整个过程封装在一个函数中,该函数接受由您要比较的两天组成的列表,以及括号标注在 paper 坐标中所占的 y 范围。
from scipy import stats
import plotly.express as px
import plotly.graph_objects as go
# Load the Plotly Express sample `tips` dataset.
tips = px.data.tips()

# The pairwise p-values can be inspected by hand with, for example:
#   stats.ttest_ind(tips[tips['day']=='Thur'].total_bill, tips[tips['day']=='Fri'].total_bill)
#   stats.ttest_ind(tips[tips['day']=='Thur'].total_bill, tips[tips['day']=='Sat'].total_bill)

# Draw one box of total_bill values per day, in a fixed day order.
fig = go.Figure()
for weekday in ('Thur', 'Fri', 'Sat', 'Sun'):
    bills = tips.loc[tips['day'] == weekday, 'total_bill']
    fig.add_trace(go.Box(y=bills, name=weekday, boxpoints='outliers'))
def add_pvalue_annotation(days, y_range, symbol=''):
    """
    Draw a significance bracket between two day boxes on the module-level ``fig``.

    Runs ``stats.ttest_ind`` on the ``total_bill`` values of the two days
    (taken from the module-level ``tips`` frame), draws a three-segment
    bracket spanning the two boxes and labels it with ``'*'`` when
    p < 0.05 or ``'ns'`` when p >= 0.05.

    arguments:
    days --- a list of two different days e.g. ['Thur','Sat']
    y_range --- a list of y_range in the form [y_min, y_max] in paper units
    symbol --- label that is kept only when the p-value is NaN (neither
               threshold comparison is true in that case)
    """
    first_day, second_day = days[0], days[1]
    y_low, y_high = y_range[0], y_range[1]

    pvalue = stats.ttest_ind(
        tips[tips['day'] == first_day].total_bill,
        tips[tips['day'] == second_day].total_bill)[1]
    if pvalue >= 0.05:
        symbol = 'ns'
    elif pvalue < 0.05:
        symbol = '*'

    # The bracket is three straight segments: left tick, top bar, right tick.
    bracket_segments = (
        (first_day, y_low, first_day, y_high),
        (first_day, y_high, second_day, y_high),
        (second_day, y_high, second_day, y_low),
    )
    for x0, y0, x1, y1 in bracket_segments:
        fig.add_shape(
            type="line",
            xref="x", yref="paper",
            x0=x0, y0=y0, x1=x1, y1=y1,
            line=dict(color="black", width=2),
        )

    # The label is placed with numeric x coordinates: each category maps to
    # 0, 1, 2... in order of first appearance. Derive that mapping from the
    # figure's traces instead of repeating the list of days by hand.
    bar_xcoord_map = {}
    for trace in fig.data:
        bar_xcoord_map.setdefault(trace.name, len(bar_xcoord_map))

    fig.add_annotation(dict(
        font=dict(color="black", size=14),
        x=(bar_xcoord_map[first_day] + bar_xcoord_map[second_day]) / 2,
        y=y_high * 1.03,
        showarrow=False,
        text=symbol,
        textangle=0,
        xref="x",
        yref="paper",
    ))
# Annotate two comparisons; the second bracket sits higher than the first.
for day_pair, bracket_span in (
    (['Thur', 'Sun'], [1.01, 1.02]),
    (['Thur', 'Sat'], [1.05, 1.06]),
):
    add_pvalue_annotation(day_pair, bracket_span)
fig.show()
如果有人觉得有用,我写了这个函数 add_p_value_annotation
。它会创建一个括号标注,并用星号表示两个箱线图之间 p 值的显著性水平。当图中包含子图时,它也应该有效。
def _significance_symbol(pvalue):
    '''Map a p-value to its bracket label; NaN (undefined test) gives 'n/a'.'''
    # NaN fails every ">=" comparison, so without this guard it would fall
    # through to the strongest label.
    if np.isnan(pvalue):
        return 'n/a'
    if pvalue >= 0.05:
        return 'ns'
    if pvalue >= 0.01:
        return '*'
    if pvalue >= 0.001:
        return '**'
    return '***'


def add_p_value_annotation(fig, array_columns, subplot=None, _format=None):
    ''' Adds notations giving the p-value between two box plot data (t-test two-sided comparison)

    For every pair in ``array_columns`` a Welch t-test
    (``stats.ttest_ind(..., equal_var=False)``) is run on the ``y`` values of
    the two traces, a three-segment bracket is drawn above the plot area and
    it is labelled 'ns' (p >= 0.05), '*' (p >= 0.01), '**' (p >= 0.001),
    '***' (smaller) or 'n/a' (p-value is NaN). Successive pairs are stacked
    upwards by ``interline``.

    Parameters:
    ----------
    fig: figure
        plotly boxplot figure
    array_columns: np.array
        array of which columns to compare
        e.g.: [[0,1], [1,2]] compares column 0 with 1 and 1 with 2
    subplot: None or int
        specifies if the figures has subplots and what subplot to add the notation to
    _format: None or dict
        format characteristics for the lines; any of the keys ``interline``,
        ``text_height`` and ``color`` may be given, missing keys fall back to
        interline=0.07, text_height=1.07, color='black'

    Returns:
    -------
    fig: figure
        figure with the added notation
    '''
    fmt = dict(interline=0.07, text_height=1.07, color='black')
    if _format:
        fmt.update(_format)

    # Subplot 1 uses the un-suffixed axis names ('x', 'y').
    subplot_str = '' if not subplot or subplot == 1 else str(subplot)
    xref = 'x' + subplot_str
    yref = 'y' + subplot_str + ' domain'

    # Trace values are read from fig.data rather than fig.to_dict().
    # NOTE(review): the dict export may hold arrays in a serialized form
    # depending on the Plotly version -- confirm.
    traces = fig.data

    # With subplots, column numbers count only the traces drawn on that
    # subplot's x axis; a trace without an explicit axis belongs to 'x'.
    if subplot:
        indices = [
            position for position, trace in enumerate(traces)
            if (trace.xaxis or 'x') == xref
        ]
    else:
        indices = None

    for index, column_pair in enumerate(array_columns):
        left, right = column_pair[0], column_pair[1]
        if indices is None:
            data_pair = (left, right)
        else:
            data_pair = (indices[left], indices[right])

        pvalue = stats.ttest_ind(
            traces[data_pair[0]].y,
            traces[data_pair[1]].y,
            equal_var=False,
        )[1]
        symbol = _significance_symbol(pvalue)

        # Bracket height for this pair, in fractions of the y-axis domain.
        y_low = 1.01 + index * fmt['interline']
        y_high = 1.02 + index * fmt['interline']

        # Left tick, top bar, right tick.
        bracket_segments = (
            (left, y_low, left, y_high),
            (left, y_high, right, y_high),
            (right, y_low, right, y_high),
        )
        for x0, y0, x1, y1 in bracket_segments:
            fig.add_shape(
                type="line",
                xref=xref, yref=yref,
                x0=x0, y0=y0, x1=x1, y1=y1,
                line=dict(color=fmt['color'], width=2),
            )

        # Centre the label over the bracket, slightly above its top bar.
        fig.add_annotation(dict(
            font=dict(color=fmt['color'], size=14),
            x=(left + right) / 2,
            y=y_high * fmt['text_height'],
            showarrow=False,
            text=symbol,
            textangle=0,
            xref=xref,
            yref=yref,
        ))
    return fig
如果我们现在创建一个图形并测试函数,我们应该得到以下输出。
from scipy import stats
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
# Load the Plotly Express sample `tips` dataset.
tips = px.data.tips()

# Draw one box of total_bill values per day, in a fixed day order.
fig = go.Figure()
for weekday in ('Thur', 'Fri', 'Sat', 'Sun'):
    bills = tips.loc[tips['day'] == weekday, 'total_bill']
    fig.add_trace(go.Box(y=bills, name=weekday, boxpoints='outliers'))

# Compare the first box (index 0) against each of the other three.
fig = add_p_value_annotation(fig, [[0, 1], [0, 2], [0, 3]])
fig.show()
我最近开始使用 Plotly 的箱线图(boxplot)来展示数据,并且很喜欢它。然而,我很难找到一种方法来对两组数据进行比较。使用 Plotly 时,有没有办法在图中标注两组数据之间的统计显著性比较?我想创建这样的图表:
其中 * 对应于 p 值 < 0.05,ns(不显著)对应于 p 值 > 0.05。
我发现使用 scipy.stats.ttest_ind()
和 stats.ttest_ind_from_stats()
可以轻松找到两个分布的 p 值。
我没有在网上找到任何相关的帖子,我认为这是一个相当有用的实现,所以任何帮助将不胜感激!
在 Plotly 中绝对没有针对这种特定内容的内置方法。
您需要做的是调用三次 fig.add_shape
方法,画出三条线段,自己拼出括号形状的标注:其中 x 值对应于您要比较的两个箱体,y 值对应于括号形状的一个相对较小的高度变化(y 坐标以图的 paper 坐标给出)。由于您希望括号标注位于图的上方,我们要使用大于 1 的 paper 坐标,例如 [1.02, 1.03]
的 y_range。
然后我们使用 fig.add_annotation
方法添加显著性符号的文本。更深入的解释可以在 Plotly 的 text and annotations 文档中找到。
为了便于复用,我将整个过程封装在一个函数中,该函数接受由您要比较的两天组成的列表,以及括号标注在 paper 坐标中所占的 y 范围。
from scipy import stats
import plotly.express as px
import plotly.graph_objects as go
# Load the Plotly Express sample `tips` dataset.
tips = px.data.tips()

# The pairwise p-values can be inspected by hand with, for example:
#   stats.ttest_ind(tips[tips['day']=='Thur'].total_bill, tips[tips['day']=='Fri'].total_bill)
#   stats.ttest_ind(tips[tips['day']=='Thur'].total_bill, tips[tips['day']=='Sat'].total_bill)

# Draw one box of total_bill values per day, in a fixed day order.
fig = go.Figure()
for weekday in ('Thur', 'Fri', 'Sat', 'Sun'):
    bills = tips.loc[tips['day'] == weekday, 'total_bill']
    fig.add_trace(go.Box(y=bills, name=weekday, boxpoints='outliers'))
def add_pvalue_annotation(days, y_range, symbol=''):
    """
    Draw a significance bracket between two day boxes on the module-level ``fig``.

    Runs ``stats.ttest_ind`` on the ``total_bill`` values of the two days
    (taken from the module-level ``tips`` frame), draws a three-segment
    bracket spanning the two boxes and labels it with ``'*'`` when
    p < 0.05 or ``'ns'`` when p >= 0.05.

    arguments:
    days --- a list of two different days e.g. ['Thur','Sat']
    y_range --- a list of y_range in the form [y_min, y_max] in paper units
    symbol --- label that is kept only when the p-value is NaN (neither
               threshold comparison is true in that case)
    """
    first_day, second_day = days[0], days[1]
    y_low, y_high = y_range[0], y_range[1]

    pvalue = stats.ttest_ind(
        tips[tips['day'] == first_day].total_bill,
        tips[tips['day'] == second_day].total_bill)[1]
    if pvalue >= 0.05:
        symbol = 'ns'
    elif pvalue < 0.05:
        symbol = '*'

    # The bracket is three straight segments: left tick, top bar, right tick.
    bracket_segments = (
        (first_day, y_low, first_day, y_high),
        (first_day, y_high, second_day, y_high),
        (second_day, y_high, second_day, y_low),
    )
    for x0, y0, x1, y1 in bracket_segments:
        fig.add_shape(
            type="line",
            xref="x", yref="paper",
            x0=x0, y0=y0, x1=x1, y1=y1,
            line=dict(color="black", width=2),
        )

    # The label is placed with numeric x coordinates: each category maps to
    # 0, 1, 2... in order of first appearance. Derive that mapping from the
    # figure's traces instead of repeating the list of days by hand.
    bar_xcoord_map = {}
    for trace in fig.data:
        bar_xcoord_map.setdefault(trace.name, len(bar_xcoord_map))

    fig.add_annotation(dict(
        font=dict(color="black", size=14),
        x=(bar_xcoord_map[first_day] + bar_xcoord_map[second_day]) / 2,
        y=y_high * 1.03,
        showarrow=False,
        text=symbol,
        textangle=0,
        xref="x",
        yref="paper",
    ))
# Annotate two comparisons; the second bracket sits higher than the first.
for day_pair, bracket_span in (
    (['Thur', 'Sun'], [1.01, 1.02]),
    (['Thur', 'Sat'], [1.05, 1.06]),
):
    add_pvalue_annotation(day_pair, bracket_span)
fig.show()
如果有人觉得有用,我写了这个函数 add_p_value_annotation
。它会创建一个括号标注,并用星号表示两个箱线图之间 p 值的显著性水平。当图中包含子图时,它也应该有效。
def _significance_symbol(pvalue):
    '''Map a p-value to its bracket label; NaN (undefined test) gives 'n/a'.'''
    # NaN fails every ">=" comparison, so without this guard it would fall
    # through to the strongest label.
    if np.isnan(pvalue):
        return 'n/a'
    if pvalue >= 0.05:
        return 'ns'
    if pvalue >= 0.01:
        return '*'
    if pvalue >= 0.001:
        return '**'
    return '***'


def add_p_value_annotation(fig, array_columns, subplot=None, _format=None):
    ''' Adds notations giving the p-value between two box plot data (t-test two-sided comparison)

    For every pair in ``array_columns`` a Welch t-test
    (``stats.ttest_ind(..., equal_var=False)``) is run on the ``y`` values of
    the two traces, a three-segment bracket is drawn above the plot area and
    it is labelled 'ns' (p >= 0.05), '*' (p >= 0.01), '**' (p >= 0.001),
    '***' (smaller) or 'n/a' (p-value is NaN). Successive pairs are stacked
    upwards by ``interline``.

    Parameters:
    ----------
    fig: figure
        plotly boxplot figure
    array_columns: np.array
        array of which columns to compare
        e.g.: [[0,1], [1,2]] compares column 0 with 1 and 1 with 2
    subplot: None or int
        specifies if the figures has subplots and what subplot to add the notation to
    _format: None or dict
        format characteristics for the lines; any of the keys ``interline``,
        ``text_height`` and ``color`` may be given, missing keys fall back to
        interline=0.07, text_height=1.07, color='black'

    Returns:
    -------
    fig: figure
        figure with the added notation
    '''
    fmt = dict(interline=0.07, text_height=1.07, color='black')
    if _format:
        fmt.update(_format)

    # Subplot 1 uses the un-suffixed axis names ('x', 'y').
    subplot_str = '' if not subplot or subplot == 1 else str(subplot)
    xref = 'x' + subplot_str
    yref = 'y' + subplot_str + ' domain'

    # Trace values are read from fig.data rather than fig.to_dict().
    # NOTE(review): the dict export may hold arrays in a serialized form
    # depending on the Plotly version -- confirm.
    traces = fig.data

    # With subplots, column numbers count only the traces drawn on that
    # subplot's x axis; a trace without an explicit axis belongs to 'x'.
    if subplot:
        indices = [
            position for position, trace in enumerate(traces)
            if (trace.xaxis or 'x') == xref
        ]
    else:
        indices = None

    for index, column_pair in enumerate(array_columns):
        left, right = column_pair[0], column_pair[1]
        if indices is None:
            data_pair = (left, right)
        else:
            data_pair = (indices[left], indices[right])

        pvalue = stats.ttest_ind(
            traces[data_pair[0]].y,
            traces[data_pair[1]].y,
            equal_var=False,
        )[1]
        symbol = _significance_symbol(pvalue)

        # Bracket height for this pair, in fractions of the y-axis domain.
        y_low = 1.01 + index * fmt['interline']
        y_high = 1.02 + index * fmt['interline']

        # Left tick, top bar, right tick.
        bracket_segments = (
            (left, y_low, left, y_high),
            (left, y_high, right, y_high),
            (right, y_low, right, y_high),
        )
        for x0, y0, x1, y1 in bracket_segments:
            fig.add_shape(
                type="line",
                xref=xref, yref=yref,
                x0=x0, y0=y0, x1=x1, y1=y1,
                line=dict(color=fmt['color'], width=2),
            )

        # Centre the label over the bracket, slightly above its top bar.
        fig.add_annotation(dict(
            font=dict(color=fmt['color'], size=14),
            x=(left + right) / 2,
            y=y_high * fmt['text_height'],
            showarrow=False,
            text=symbol,
            textangle=0,
            xref=xref,
            yref=yref,
        ))
    return fig
如果我们现在创建一个图形并测试函数,我们应该得到以下输出。
from scipy import stats
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
# Load the Plotly Express sample `tips` dataset.
tips = px.data.tips()

# Draw one box of total_bill values per day, in a fixed day order.
fig = go.Figure()
for weekday in ('Thur', 'Fri', 'Sat', 'Sun'):
    bills = tips.loc[tips['day'] == weekday, 'total_bill']
    fig.add_trace(go.Box(y=bills, name=weekday, boxpoints='outliers'))

# Compare the first box (index 0) against each of the other three.
fig = add_p_value_annotation(fig, [[0, 1], [0, 2], [0, 3]])
fig.show()