使用 pandas 根据另一列查找列值合计
Find column value wise total against another column using pandas
我有一个如下所示的数据框
import numpy as np
import pandas as pd
from numpy.random import default_rng
# Seeded generator so the sample frame is reproducible.
rng = default_rng(100)

# Eight rows: three string columns, one integer column and a 0/1 target.
cf = pd.DataFrame(
    {
        'grade': rng.choice(list('ACD'), size=8),
        'dash': rng.choice(list('PQRS'), size=8),
        'dumeel': rng.choice(list('QWER'), size=8),
        # An integer pool n draws from np.arange(n), i.e. 0..1233 here.
        'dumma': rng.choice(1234, size=8),
        'target': rng.choice([0, 1], size=8),
    }
)
我想执行以下操作:
a) 针对 target 列,为每个分类列中的每个取值计算 total 和 %total。
我尝试了以下方法,但只得到了一半的结果。
# Sub-frame holding only the object-dtype columns; it is passed to melt()
# as value_vars (presumably melt uses its column labels).
cols = cf.select_dtypes('object')

# Unpivot those columns against `target`, then count the rows in each
# (variable, value) group.
(
    cf.melt(id_vars='target', value_vars=cols)
    .groupby(['variable', 'value'])
    .size()
    .reset_index(name='cnt of records')
)
如何在上述结果的基础上,利用 target 列的信息计算 target met 和 target not met?
我希望我的输出如下所示(请注意,我只显示了两列 grade
和 dash
作为示例)。代码应遵循所有字符串列的相同逻辑
先选出字符串列并用 melt 展平,然后 join 回 target 列。最后按 variable 和 value 列分组,并对每个分组应用函数字典。
# Named aggregations applied to the `target` values of each
# (variable, value) group; every key becomes an output column.
funcs = {
    'cnt of records': 'count',
    'target met': lambda x: sum(x),
    'target not met': lambda x: len(x) - sum(x),
    # Percentages are rendered as strings with two decimals and a '%' sign.
    'target met %': lambda x: f"{round(100 * sum(x) / len(x), 2):.2f}%",
    'target not met %': lambda x: f"{round(100 * (len(x) - sum(x)) / len(x), 2):.2f}%"
}

# The source frame is `cf` (the frame built in the question), not `df`.
out = (
    cf.select_dtypes('object')
    # ignore_index=False keeps the original row index on the melted rows,
    # so the join below can attach `target` by index.
    .melt(ignore_index=False)
    .join(cf['target'])
    .groupby(['variable', 'value'])['target']
    .agg(**funcs)
    .reset_index()
)
输出:
>>> out
variable value cnt of records target met target not met target met % target not met %
0 dash Q 2 0 2 0.00% 100.00%
1 dash R 2 2 0 100.00% 0.00%
2 dash S 4 2 2 50.00% 50.00%
3 dumeel E 3 2 1 66.67% 33.33%
4 dumeel Q 3 2 1 66.67% 33.33%
5 dumeel R 1 0 1 0.00% 100.00%
6 dumeel W 1 0 1 0.00% 100.00%
7 grade A 2 0 2 0.00% 100.00%
8 grade C 3 2 1 66.67% 33.33%
9 grade D 3 2 1 66.67% 33.33%
为此,您可以在 groupby 之后使用 agg:
# Sub-frame holding only the object-dtype columns; it is passed to melt()
# as value_vars (presumably melt uses its column labels).
cols = cf.select_dtypes('object')

df = (
    cf.melt(id_vars='target', value_vars=cols)
    .groupby(['variable', 'value'])['target']
    # l = number of rows in the group; s = sum of target in the group
    # (the count of target == 1 rows when target only holds 0/1).
    .agg(l='size', s='sum')
    .pipe(
        lambda g: g.assign(
            met_pct=g['s'] / g['l'] * 100,
            not_met_pct=100 - (g['s'] / g['l'] * 100),
            met=g['s'],
            not_met=g['l'] - g['s'],
        )
    )
    .reset_index()
    # The helper columns are only needed to derive the four result columns.
    .drop(columns=['l', 's'])
)
输出:
>>> df
variable value met_pct not_met_pct met not_met
0 dash P 100.000000 0.000000 1 0
1 dash Q 0.000000 100.000000 0 3
2 dash R 50.000000 50.000000 1 1
3 dash S 50.000000 50.000000 1 1
4 dumeel E 0.000000 100.000000 0 1
5 dumeel Q 100.000000 0.000000 1 0
6 dumeel R 50.000000 50.000000 2 2
7 dumeel W 0.000000 100.000000 0 2
8 grade A 0.000000 100.000000 0 1
9 grade C 50.000000 50.000000 2 2
10 grade D 33.333333 66.666667 1 2
我有一个如下所示的数据框
import numpy as np
import pandas as pd
from numpy.random import default_rng
# Seeded generator so the sample frame is reproducible.
rng = default_rng(100)

# Eight rows: three string columns, one integer column and a 0/1 target.
cf = pd.DataFrame(
    {
        'grade': rng.choice(list('ACD'), size=8),
        'dash': rng.choice(list('PQRS'), size=8),
        'dumeel': rng.choice(list('QWER'), size=8),
        # An integer pool n draws from np.arange(n), i.e. 0..1233 here.
        'dumma': rng.choice(1234, size=8),
        'target': rng.choice([0, 1], size=8),
    }
)
我想执行以下操作:
a) 针对 target 列,为每个分类列中的每个取值计算 total 和 %total。
我尝试了以下方法,但只得到了一半的结果。
# Sub-frame holding only the object-dtype columns; it is passed to melt()
# as value_vars (presumably melt uses its column labels).
cols = cf.select_dtypes('object')

# Unpivot those columns against `target`, then count the rows in each
# (variable, value) group.
(
    cf.melt(id_vars='target', value_vars=cols)
    .groupby(['variable', 'value'])
    .size()
    .reset_index(name='cnt of records')
)
如何在上述结果的基础上,利用 target 列的信息计算 target met 和 target not met?
我希望我的输出如下所示(请注意,我只显示了两列 grade
和 dash
作为示例)。代码应遵循所有字符串列的相同逻辑
先选出字符串列并用 melt 展平,然后 join 回 target 列。最后按 variable 和 value 列分组,并对每个分组应用函数字典。
# Named aggregations applied to the `target` values of each
# (variable, value) group; every key becomes an output column.
funcs = {
    'cnt of records': 'count',
    'target met': lambda x: sum(x),
    'target not met': lambda x: len(x) - sum(x),
    # Percentages are rendered as strings with two decimals and a '%' sign.
    'target met %': lambda x: f"{round(100 * sum(x) / len(x), 2):.2f}%",
    'target not met %': lambda x: f"{round(100 * (len(x) - sum(x)) / len(x), 2):.2f}%"
}

# The source frame is `cf` (the frame built in the question), not `df`.
out = (
    cf.select_dtypes('object')
    # ignore_index=False keeps the original row index on the melted rows,
    # so the join below can attach `target` by index.
    .melt(ignore_index=False)
    .join(cf['target'])
    .groupby(['variable', 'value'])['target']
    .agg(**funcs)
    .reset_index()
)
输出:
>>> out
variable value cnt of records target met target not met target met % target not met %
0 dash Q 2 0 2 0.00% 100.00%
1 dash R 2 2 0 100.00% 0.00%
2 dash S 4 2 2 50.00% 50.00%
3 dumeel E 3 2 1 66.67% 33.33%
4 dumeel Q 3 2 1 66.67% 33.33%
5 dumeel R 1 0 1 0.00% 100.00%
6 dumeel W 1 0 1 0.00% 100.00%
7 grade A 2 0 2 0.00% 100.00%
8 grade C 3 2 1 66.67% 33.33%
9 grade D 3 2 1 66.67% 33.33%
为此,您可以在 groupby 之后使用 agg:
# Sub-frame holding only the object-dtype columns; it is passed to melt()
# as value_vars (presumably melt uses its column labels).
cols = cf.select_dtypes('object')

df = (
    cf.melt(id_vars='target', value_vars=cols)
    .groupby(['variable', 'value'])['target']
    # l = number of rows in the group; s = sum of target in the group
    # (the count of target == 1 rows when target only holds 0/1).
    .agg(l='size', s='sum')
    .pipe(
        lambda g: g.assign(
            met_pct=g['s'] / g['l'] * 100,
            not_met_pct=100 - (g['s'] / g['l'] * 100),
            met=g['s'],
            not_met=g['l'] - g['s'],
        )
    )
    .reset_index()
    # The helper columns are only needed to derive the four result columns.
    .drop(columns=['l', 's'])
)
输出:
>>> df
variable value met_pct not_met_pct met not_met
0 dash P 100.000000 0.000000 1 0
1 dash Q 0.000000 100.000000 0 3
2 dash R 50.000000 50.000000 1 1
3 dash S 50.000000 50.000000 1 1
4 dumeel E 0.000000 100.000000 0 1
5 dumeel Q 100.000000 0.000000 1 0
6 dumeel R 50.000000 50.000000 2 2
7 dumeel W 0.000000 100.000000 0 2
8 grade A 0.000000 100.000000 0 1
9 grade C 50.000000 50.000000 2 2
10 grade D 33.333333 66.666667 1 2