pandas - 多指标绘图
pandas - multi index plotting
我有一些数据,我使用以下代码操作了数据框:
import pandas as pd
import numpy as np
data = pd.DataFrame([[0,0,0,3,6,5,6,1],[1,1,1,3,4,5,2,0],[2,1,0,3,6,5,6,1],[3,0,0,2,9,4,2,1],[4,0,1,3,4,8,1,1],[5,1,1,3,3,5,9,1],[6,1,0,3,3,5,6,1],[7,0,1,3,4,8,9,1]], columns=["id", "sex", "split", "group0Low", "group0High", "group1Low", "group1High", "trim"])
data
#remove all where trim == 0
trimmed = data[(data.trim == 1)]
trimmed
#create df with columns to be split
columns = ['group0Low', 'group0High', 'group1Low', 'group1High']
to_split = trimmed[columns]
to_split
level_group = np.where(to_split.columns.str.contains('0'), 0, 1)
# output: array([0, 0, 1, 1])
level_low_high = np.where(to_split.columns.str.contains('Low'), 'low', 'high')
# output: array(['low', 'high', 'low', 'high'], dtype='<U4')
multi_level_columns = pd.MultiIndex.from_arrays([level_group, level_low_high], names=['group', 'val'])
to_split.columns = multi_level_columns
to_split.stack(level='group')
sex = trimmed['sex']
split = trimmed['split']
horizontalStack = pd.concat([sex, split, to_split], axis=1)
horizontalStack
finalData = horizontalStack.groupby(['split', 'sex', 'group'])
finalData.mean()
我的问题是,如何使用 ggplot 或 seaborn 绘制平均数据,以便对于每个 "split" 级别,我得到一个如下所示的图表:
在代码的底部,您可以看到我尝试拆分组因子以便分隔条形,但这导致了错误 (KeyError: 'group'),我认为与我使用多索引的方式有关
我会使用 seaborn 的因子图。
假设您有这样的数据:
import numpy as np
import pandas
import seaborn
seaborn.set(style='ticks')
np.random.seed(0)
groups = ('Group 1', 'Group 2')
sexes = ('Male', 'Female')
means = ('Low', 'High')
index = pandas.MultiIndex.from_product(
[groups, sexes, means],
names=['Group', 'Sex', 'Mean']
)
values = np.random.randint(low=20, high=100, size=len(index))
data = pandas.DataFrame(data={'val': values}, index=index).reset_index()
print(data)
Group Sex Mean val
0 Group 1 Male Low 64
1 Group 1 Male High 67
2 Group 1 Female Low 84
3 Group 1 Female High 87
4 Group 2 Male Low 87
5 Group 2 Male High 29
6 Group 2 Female Low 41
7 Group 2 Female High 56
然后您可以使用一个命令创建因子图 + 加上额外的一行以删除一些多余的(对于您的数据)x 标签:
fg = seaborn.factorplot(x='Group', y='val', hue='Mean',
col='Sex', data=data, kind='bar')
fg.set_xlabels('')
这给了我:
在 related question 中,我找到了@Stein 的替代解决方案,该解决方案将多索引级别编码为不同的标签。以下是您的示例的样子:
import pandas as pd
import matplotlib.pyplot as plt
from itertools import groupby
import numpy as np
%matplotlib inline
groups = ('Group 1', 'Group 2')
sexes = ('Male', 'Female')
means = ('Low', 'High')
index = pd.MultiIndex.from_product(
[groups, sexes, means],
names=['Group', 'Sex', 'Mean']
)
values = np.random.randint(low=20, high=100, size=len(index))
data = pd.DataFrame(data={'val': values}, index=index)
# unstack last level to plot two separate columns
data = data.unstack(level=-1)
def add_line(ax, xpos, ypos):
line = plt.Line2D([xpos, xpos], [ypos + .1, ypos],
transform=ax.transAxes, color='gray')
line.set_clip_on(False)
ax.add_line(line)
def label_len(my_index,level):
labels = my_index.get_level_values(level)
return [(k, sum(1 for i in g)) for k,g in groupby(labels)]
def label_group_bar_table(ax, df):
ypos = -.1
scale = 1./df.index.size
for level in range(df.index.nlevels)[::-1]:
pos = 0
for label, rpos in label_len(df.index,level):
lxpos = (pos + .5 * rpos)*scale
ax.text(lxpos, ypos, label, ha='center', transform=ax.transAxes)
add_line(ax, pos*scale, ypos)
pos += rpos
add_line(ax, pos*scale , ypos)
ypos -= .1
ax = data['val'].plot(kind='bar')
#Below 2 lines remove default labels
ax.set_xticklabels('')
ax.set_xlabel('')
label_group_bar_table(ax, data)
这给出:
我有一些数据,我使用以下代码操作了数据框:
import pandas as pd
import numpy as np
data = pd.DataFrame([[0,0,0,3,6,5,6,1],[1,1,1,3,4,5,2,0],[2,1,0,3,6,5,6,1],[3,0,0,2,9,4,2,1],[4,0,1,3,4,8,1,1],[5,1,1,3,3,5,9,1],[6,1,0,3,3,5,6,1],[7,0,1,3,4,8,9,1]], columns=["id", "sex", "split", "group0Low", "group0High", "group1Low", "group1High", "trim"])
data
#remove all where trim == 0
trimmed = data[(data.trim == 1)]
trimmed
#create df with columns to be split
columns = ['group0Low', 'group0High', 'group1Low', 'group1High']
to_split = trimmed[columns]
to_split
level_group = np.where(to_split.columns.str.contains('0'), 0, 1)
# output: array([0, 0, 1, 1])
level_low_high = np.where(to_split.columns.str.contains('Low'), 'low', 'high')
# output: array(['low', 'high', 'low', 'high'], dtype='<U4')
multi_level_columns = pd.MultiIndex.from_arrays([level_group, level_low_high], names=['group', 'val'])
to_split.columns = multi_level_columns
to_split.stack(level='group')
sex = trimmed['sex']
split = trimmed['split']
horizontalStack = pd.concat([sex, split, to_split], axis=1)
horizontalStack
finalData = horizontalStack.groupby(['split', 'sex', 'group'])
finalData.mean()
我的问题是,如何使用 ggplot 或 seaborn 绘制平均数据,以便对于每个 "split" 级别,我得到一个如下所示的图表:
在代码的底部,您可以看到我尝试拆分组因子以便分隔条形,但这导致了错误 (KeyError: 'group'),我认为与我使用多索引的方式有关
我会使用 seaborn 的因子图。
假设您有这样的数据:
import numpy as np
import pandas
import seaborn
seaborn.set(style='ticks')
np.random.seed(0)
groups = ('Group 1', 'Group 2')
sexes = ('Male', 'Female')
means = ('Low', 'High')
index = pandas.MultiIndex.from_product(
[groups, sexes, means],
names=['Group', 'Sex', 'Mean']
)
values = np.random.randint(low=20, high=100, size=len(index))
data = pandas.DataFrame(data={'val': values}, index=index).reset_index()
print(data)
Group Sex Mean val
0 Group 1 Male Low 64
1 Group 1 Male High 67
2 Group 1 Female Low 84
3 Group 1 Female High 87
4 Group 2 Male Low 87
5 Group 2 Male High 29
6 Group 2 Female Low 41
7 Group 2 Female High 56
然后您可以使用一个命令创建因子图 + 加上额外的一行以删除一些多余的(对于您的数据)x 标签:
fg = seaborn.factorplot(x='Group', y='val', hue='Mean',
col='Sex', data=data, kind='bar')
fg.set_xlabels('')
这给了我:
在 related question 中,我找到了@Stein 的替代解决方案,该解决方案将多索引级别编码为不同的标签。以下是您的示例的样子:
import pandas as pd
import matplotlib.pyplot as plt
from itertools import groupby
import numpy as np
%matplotlib inline
groups = ('Group 1', 'Group 2')
sexes = ('Male', 'Female')
means = ('Low', 'High')
index = pd.MultiIndex.from_product(
[groups, sexes, means],
names=['Group', 'Sex', 'Mean']
)
values = np.random.randint(low=20, high=100, size=len(index))
data = pd.DataFrame(data={'val': values}, index=index)
# unstack last level to plot two separate columns
data = data.unstack(level=-1)
def add_line(ax, xpos, ypos):
line = plt.Line2D([xpos, xpos], [ypos + .1, ypos],
transform=ax.transAxes, color='gray')
line.set_clip_on(False)
ax.add_line(line)
def label_len(my_index,level):
labels = my_index.get_level_values(level)
return [(k, sum(1 for i in g)) for k,g in groupby(labels)]
def label_group_bar_table(ax, df):
ypos = -.1
scale = 1./df.index.size
for level in range(df.index.nlevels)[::-1]:
pos = 0
for label, rpos in label_len(df.index,level):
lxpos = (pos + .5 * rpos)*scale
ax.text(lxpos, ypos, label, ha='center', transform=ax.transAxes)
add_line(ax, pos*scale, ypos)
pos += rpos
add_line(ax, pos*scale , ypos)
ypos -= .1
ax = data['val'].plot(kind='bar')
#Below 2 lines remove default labels
ax.set_xticklabels('')
ax.set_xlabel('')
label_group_bar_table(ax, data)
这给出: