使用 Python 的 pandas(melt)重塑多个变量
Reshape multiple variables with melt in Python (pandas)
我的调查数据有 5 个主要变量 f1_、f2_、f3_、f4_ 和 f5_,每个 f*_ 组变量最多有 10 个子组,例如:f1_1、f1_2、f1_3 ... 或 f2_1、f2_2、... f2_10。
我想用 pivot_longer 重塑数据框以便进行分析。我是 R 用户,在 R 中是按下面的方式做的;我想知道如何用 Python 的 pandas 得到相同的输出。
# Asker's original R pipeline: reshape the wide survey columns, drop incomplete
# rows, and build a 3-way cross-tab of check_id x f1_ x f2_ with totals.
df %>%
# Wide -> long. names_pattern "(.*)(.)" splits each column name into everything
# before the last character (-> "name", e.g. "f1_") and the last character
# (-> "check_id", e.g. "1").
# NOTE(review): only columns ending in 1-5 are selected and the suffix is taken
# as a single character, so a column such as f2_10 would not be handled.
pivot_longer(cols = all_of(ends_with(c("1","2","3", "4" ,"5"))), names_to = c("name", "check_id"), names_pattern = "(.*)(.)") %>%
# Long -> wide: one column per distinct value of `name`.
pivot_wider(names_from = name) %>%
# Unnest the result (presumably pivot_wider yields list-columns here because
# no id column identifies the rows -- TODO confirm).
unnest() %>%
# Keep only rows where both f1_ and f2_ are non-missing.
filter_at(.vars = vars(one_of(c("f1_", "f2_"))),~ !is.na(.)) %>%
# 3-way cross-tab: check_id by f1_, one table per value of f2_.
tabyl(check_id, f1_ ,f2_ ) %>%
# Append a "Total" row and a "Total" column.
adorn_totals(c("row", "col" ))
这是所需的输出:
$No
check_id Person 1 Person 2 Person 3 Person 4 Total
1 2 0 0 0 2
2 0 1 0 0 1
3 0 0 1 0 1
4 1 0 0 1 2
Total 3 1 1 1 6
$Yes
check_id Person 1 Person 2 Person 3 Person 4 Total
1 5 0 0 0 5
2 0 5 0 0 5
3 0 1 2 0 3
4 0 0 0 1 1
Total 5 6 2 1 14
Python 样本数据
f1_ 和 f2_ 各有 5 个子组
# Sample data: 9 rows, columns f1_1..f1_5 (person) and f2_1..f2_5 (Yes/No).
# Missing answers are stored as the literal string "NA", not as NaN.
df = pd.DataFrame(
{
"f1_1": ["Person 1","NA","Person 1","Person 1","Person 1","Person 1","NA","Person 1", "Person 1"],
"f1_2": ["Person 2","NA","Person 2","Person 2","Person 2","NA","NA","Person 2","Person 2"],
"f1_3": ["Person 3","NA","NA","Person 3","Person 2","NA","NA","Person 3","NA"],
"f1_4": ["Person 4","NA","NA","Person 4", "NA","NA","NA","Person 1","NA"],
"f1_5": ["NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA"],
"f2_1": ["Yes", "NA", "Yes", "No", "Yes", "No", "NA", "Yes", "Yes"],
"f2_2": ["Yes", "NA", "Yes", "No", "Yes", "NA", "NA", "Yes", "Yes"],
"f2_3": ["Yes", "NA", "NA", "No", "Yes", "NA", "NA", "Yes", "NA"],
"f2_4": ["Yes", "NA", "NA", "No", "NA", "NA", "NA", "No", "NA"],
"f2_5": ["NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA"]
}
)
R样本数据
# Sample data: 9 rows x 10 columns (f1_1..f1_5, f2_1..f2_5).
# Missing answers are real NA values.
df <- tibble::tribble(
~f1_1, ~f1_2, ~f1_3, ~f1_4, ~f1_5, ~f2_1, ~f2_2, ~f2_3, ~f2_4, ~f2_5,
"Person 1", "Person 2", "Person 3", "Person 4", NA, "Yes", "Yes", "Yes", "Yes", NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
"Person 1", "Person 2", NA, NA, NA, "Yes", "Yes", NA, NA, NA,
"Person 1", "Person 2", "Person 3", "Person 4", NA, "No", "No", "No", "No", NA,
"Person 1", "Person 2", "Person 2", NA, NA, "Yes", "Yes", "Yes", NA, NA,
"Person 1", NA, NA, NA, NA, "No", NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
"Person 1", "Person 2", "Person 3", "Person 1", NA, "Yes", "Yes", "Yes", "No", NA,
"Person 1", "Person 2", NA, NA, NA, "Yes", "Yes", NA, NA, NA
)
让我们试试:
# Turn flat names such as "f1_2" into two-level columns ("f1", "2").
df.columns = pd.MultiIndex.from_tuples(
    [tuple(label.split('_')) for label in df.columns]
)
(
    df.mask(df.eq('NA'))              # blank out the literal 'NA' placeholders
    .stack()                          # move the sub-group level into the row index
    .set_index('f1', append=True)     # index is now (row, sub-group, f1)
    .groupby(level=(1, 2))['f2']      # group by sub-group and f1
    .value_counts()                   # count Yes / No within each group
    .unstack('f1', fill_value=0)      # one column per f1 value
    .assign(total=lambda counts: counts.sum(axis=1))
)
输出:
f1 Person 1 Person 2 Person 3 Person 4 total
f2
1 No 2 0 0 0 2
Yes 5 0 0 0 5
2 No 0 1 0 0 1
Yes 0 5 0 0 5
3 No 0 0 1 0 1
Yes 0 1 2 0 3
4 No 1 0 0 1 2
Yes 0 0 0 1 1
您也可以使用 pyjanitor 的 pivot_longer 函数;目前需要从 GitHub 安装最新的开发版本:
# pivot_longer comes from the development version of pyjanitor:
# pip install git+https://github.com/ericmjl/pyjanitor.git
import janitor

# NOTE: splits the column names on "_", so `df` must still have the flat
# names (f1_1, f2_1, ...).
temp = (
    # ".value" keeps the part before "_" (f1, f2) as columns; the rest goes to "fs".
    df.pivot_longer(names_to=(".value", "fs"), names_sep="_")
    # Keep only rows with no literal "NA" placeholder in any column.
    .loc[lambda frame: frame.ne("NA").all(axis=1)]
    .groupby(["fs", "f2", "f1"])["f1"]
    .agg("count")
    .unstack(fill_value=0)
    .assign(total=lambda counts: counts.sum(axis=1))
    .swaplevel("f2", "fs")
)
temp
f1 Person 1 Person 2 Person 3 Person 4 total
f2 fs
No 1 2 0 0 0 2
Yes 1 5 0 0 0 5
No 2 0 1 0 0 1
Yes 2 0 5 0 0 5
No 3 0 0 1 0 1
Yes 3 0 1 2 0 3
No 4 1 0 0 1 2
Yes 4 0 0 0 1 1
然后您可以将其拆分为 Yes 和 No 两个数据框:
# Select the "Yes" rows and work on a copy of that selection.
yes_df = temp.loc(axis=0)["Yes", :].copy()
# Append a ("Yes", "Total") row holding the column sums.
yes_df.loc[("Yes", "Total"), :] = yes_df.sum(axis=0)
yes_df
f1 Person 1 Person 2 Person 3 Person 4 total
f2 fs
Yes 1 5.0 0.0 0.0 0.0 5.0
2 0.0 5.0 0.0 0.0 5.0
3 0.0 1.0 2.0 0.0 3.0
4 0.0 0.0 0.0 1.0 1.0
Total 5.0 6.0 2.0 1.0 14.0
# Select the "No" rows and work on a copy of that selection.
no_df = temp.loc(axis=0)["No", :].copy()
# Append a ("No", "Total") row holding the column sums.
no_df.loc[("No", "Total"), :] = no_df.sum(axis=0)
no_df
f1 Person 1 Person 2 Person 3 Person 4 total
f2 fs
No 1 2.0 0.0 0.0 0.0 2.0
2 0.0 1.0 0.0 0.0 1.0
3 0.0 0.0 1.0 0.0 1.0
4 1.0 0.0 0.0 1.0 2.0
Total 3.0 1.0 1.0 1.0 6.0
至于您的 R 代码,我建议对现有代码做如下调整:
# Reshape to long form and cross-tabulate in a single pipeline.
df %>%
  # ".value" keeps the part before "_" (f1, f2) as column names;
  # the part after "_" becomes check_id.
  pivot_longer(
    starts_with("f"),
    names_to = c(".value", "check_id"),
    names_sep = "_"
  ) %>%
  # Drop rows that contain any NA.
  drop_na() %>%
  # 3-way cross-tab, then append a Total row and column.
  tabyl(check_id, f1, f2) %>%
  adorn_totals(c("row", "col"))
$No
check_id Person 1 Person 2 Person 3 Person 4 Total
1 2 0 0 0 2
2 0 1 0 0 1
3 0 0 1 0 1
4 1 0 0 1 2
Total 3 1 1 1 6
$Yes
check_id Person 1 Person 2 Person 3 Person 4 Total
1 5 0 0 0 5
2 0 5 0 0 5
3 0 1 2 0 3
4 0 0 0 1 1
Total 5 6 2 1 14
我的调查数据有 5 个主要变量 f1_、f2_、f3_、f4_ 和 f5_,每个 f*_ 组变量最多有 10 个子组,例如:f1_1、f1_2、f1_3 ... 或 f2_1、f2_2、... f2_10。
我想用 pivot_longer 重塑数据框以便进行分析。我是 R 用户,在 R 中是按下面的方式做的;我想知道如何用 Python 的 pandas 得到相同的输出。
# Asker's original R pipeline: reshape the wide survey columns, drop incomplete
# rows, and build a 3-way cross-tab of check_id x f1_ x f2_ with totals.
df %>%
# Wide -> long. names_pattern "(.*)(.)" splits each column name into everything
# before the last character (-> "name", e.g. "f1_") and the last character
# (-> "check_id", e.g. "1").
# NOTE(review): only columns ending in 1-5 are selected and the suffix is taken
# as a single character, so a column such as f2_10 would not be handled.
pivot_longer(cols = all_of(ends_with(c("1","2","3", "4" ,"5"))), names_to = c("name", "check_id"), names_pattern = "(.*)(.)") %>%
# Long -> wide: one column per distinct value of `name`.
pivot_wider(names_from = name) %>%
# Unnest the result (presumably pivot_wider yields list-columns here because
# no id column identifies the rows -- TODO confirm).
unnest() %>%
# Keep only rows where both f1_ and f2_ are non-missing.
filter_at(.vars = vars(one_of(c("f1_", "f2_"))),~ !is.na(.)) %>%
# 3-way cross-tab: check_id by f1_, one table per value of f2_.
tabyl(check_id, f1_ ,f2_ ) %>%
# Append a "Total" row and a "Total" column.
adorn_totals(c("row", "col" ))
这是所需的输出:
$No
check_id Person 1 Person 2 Person 3 Person 4 Total
1 2 0 0 0 2
2 0 1 0 0 1
3 0 0 1 0 1
4 1 0 0 1 2
Total 3 1 1 1 6
$Yes
check_id Person 1 Person 2 Person 3 Person 4 Total
1 5 0 0 0 5
2 0 5 0 0 5
3 0 1 2 0 3
4 0 0 0 1 1
Total 5 6 2 1 14
Python 样本数据
f1_ 和 f2_ 各有 5 个子组
# Sample data: 9 rows, columns f1_1..f1_5 (person) and f2_1..f2_5 (Yes/No).
# Missing answers are stored as the literal string "NA", not as NaN.
df = pd.DataFrame(
{
"f1_1": ["Person 1","NA","Person 1","Person 1","Person 1","Person 1","NA","Person 1", "Person 1"],
"f1_2": ["Person 2","NA","Person 2","Person 2","Person 2","NA","NA","Person 2","Person 2"],
"f1_3": ["Person 3","NA","NA","Person 3","Person 2","NA","NA","Person 3","NA"],
"f1_4": ["Person 4","NA","NA","Person 4", "NA","NA","NA","Person 1","NA"],
"f1_5": ["NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA"],
"f2_1": ["Yes", "NA", "Yes", "No", "Yes", "No", "NA", "Yes", "Yes"],
"f2_2": ["Yes", "NA", "Yes", "No", "Yes", "NA", "NA", "Yes", "Yes"],
"f2_3": ["Yes", "NA", "NA", "No", "Yes", "NA", "NA", "Yes", "NA"],
"f2_4": ["Yes", "NA", "NA", "No", "NA", "NA", "NA", "No", "NA"],
"f2_5": ["NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA"]
}
)
R样本数据
# Sample data: 9 rows x 10 columns (f1_1..f1_5, f2_1..f2_5).
# Missing answers are real NA values.
df <- tibble::tribble(
~f1_1, ~f1_2, ~f1_3, ~f1_4, ~f1_5, ~f2_1, ~f2_2, ~f2_3, ~f2_4, ~f2_5,
"Person 1", "Person 2", "Person 3", "Person 4", NA, "Yes", "Yes", "Yes", "Yes", NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
"Person 1", "Person 2", NA, NA, NA, "Yes", "Yes", NA, NA, NA,
"Person 1", "Person 2", "Person 3", "Person 4", NA, "No", "No", "No", "No", NA,
"Person 1", "Person 2", "Person 2", NA, NA, "Yes", "Yes", "Yes", NA, NA,
"Person 1", NA, NA, NA, NA, "No", NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
"Person 1", "Person 2", "Person 3", "Person 1", NA, "Yes", "Yes", "Yes", "No", NA,
"Person 1", "Person 2", NA, NA, NA, "Yes", "Yes", NA, NA, NA
)
让我们试试:
# Turn flat names such as "f1_2" into two-level columns ("f1", "2").
df.columns = pd.MultiIndex.from_tuples(
    [tuple(label.split('_')) for label in df.columns]
)
(
    df.mask(df.eq('NA'))              # blank out the literal 'NA' placeholders
    .stack()                          # move the sub-group level into the row index
    .set_index('f1', append=True)     # index is now (row, sub-group, f1)
    .groupby(level=(1, 2))['f2']      # group by sub-group and f1
    .value_counts()                   # count Yes / No within each group
    .unstack('f1', fill_value=0)      # one column per f1 value
    .assign(total=lambda counts: counts.sum(axis=1))
)
输出:
f1 Person 1 Person 2 Person 3 Person 4 total
f2
1 No 2 0 0 0 2
Yes 5 0 0 0 5
2 No 0 1 0 0 1
Yes 0 5 0 0 5
3 No 0 0 1 0 1
Yes 0 1 2 0 3
4 No 1 0 0 1 2
Yes 0 0 0 1 1
您也可以使用 pyjanitor 的 pivot_longer 函数;目前需要从 GitHub 安装最新的开发版本:
# pivot_longer comes from the development version of pyjanitor:
# pip install git+https://github.com/ericmjl/pyjanitor.git
import janitor

# NOTE: splits the column names on "_", so `df` must still have the flat
# names (f1_1, f2_1, ...).
temp = (
    # ".value" keeps the part before "_" (f1, f2) as columns; the rest goes to "fs".
    df.pivot_longer(names_to=(".value", "fs"), names_sep="_")
    # Keep only rows with no literal "NA" placeholder in any column.
    .loc[lambda frame: frame.ne("NA").all(axis=1)]
    .groupby(["fs", "f2", "f1"])["f1"]
    .agg("count")
    .unstack(fill_value=0)
    .assign(total=lambda counts: counts.sum(axis=1))
    .swaplevel("f2", "fs")
)
temp
f1 Person 1 Person 2 Person 3 Person 4 total
f2 fs
No 1 2 0 0 0 2
Yes 1 5 0 0 0 5
No 2 0 1 0 0 1
Yes 2 0 5 0 0 5
No 3 0 0 1 0 1
Yes 3 0 1 2 0 3
No 4 1 0 0 1 2
Yes 4 0 0 0 1 1
然后您可以将其拆分为 Yes 和 No 两个数据框:
# Select the "Yes" rows and work on a copy of that selection.
yes_df = temp.loc(axis=0)["Yes", :].copy()
# Append a ("Yes", "Total") row holding the column sums.
yes_df.loc[("Yes", "Total"), :] = yes_df.sum(axis=0)
yes_df
f1 Person 1 Person 2 Person 3 Person 4 total
f2 fs
Yes 1 5.0 0.0 0.0 0.0 5.0
2 0.0 5.0 0.0 0.0 5.0
3 0.0 1.0 2.0 0.0 3.0
4 0.0 0.0 0.0 1.0 1.0
Total 5.0 6.0 2.0 1.0 14.0
# Select the "No" rows and work on a copy of that selection.
no_df = temp.loc(axis=0)["No", :].copy()
# Append a ("No", "Total") row holding the column sums.
no_df.loc[("No", "Total"), :] = no_df.sum(axis=0)
no_df
f1 Person 1 Person 2 Person 3 Person 4 total
f2 fs
No 1 2.0 0.0 0.0 0.0 2.0
2 0.0 1.0 0.0 0.0 1.0
3 0.0 0.0 1.0 0.0 1.0
4 1.0 0.0 0.0 1.0 2.0
Total 3.0 1.0 1.0 1.0 6.0
至于您的 R 代码,我建议对现有代码做如下调整:
# Reshape to long form and cross-tabulate in a single pipeline.
df %>%
  # ".value" keeps the part before "_" (f1, f2) as column names;
  # the part after "_" becomes check_id.
  pivot_longer(
    starts_with("f"),
    names_to = c(".value", "check_id"),
    names_sep = "_"
  ) %>%
  # Drop rows that contain any NA.
  drop_na() %>%
  # 3-way cross-tab, then append a Total row and column.
  tabyl(check_id, f1, f2) %>%
  adorn_totals(c("row", "col"))
$No
check_id Person 1 Person 2 Person 3 Person 4 Total
1 2 0 0 0 2
2 0 1 0 0 1
3 0 0 1 0 1
4 1 0 0 1 2
Total 3 1 1 1 6
$Yes
check_id Person 1 Person 2 Person 3 Person 4 Total
1 5 0 0 0 5
2 0 5 0 0 5
3 0 1 2 0 3
4 0 0 0 1 1
Total 5 6 2 1 14