使用 Python、Pandas 的 melt 重塑多个变量

Reshape multiples variables with melt with python, Pandas

我的调查数据有 5 个主要变量 f1_、f2_、f3_、f4_ 和 f5_,每个 f*_ 组变量最多有 10 个子组,例如:f1_1、f1_2、f1_3 ... 或 f2_1、f2_2、... f2_10。

我想用 pivot_longer 重塑我的数据框以便进行分析。我是 R 用户,在 R 中是按下面的方式做的;我想知道如何用 Python 和 pandas 得到相同的输出。

# Build a three-way cross-tab (check_id x f1_ x f2_) from the wide survey data.
df %>% 
  # Reshape data - to long. names_pattern "(.*)(.)" splits every selected
  # column name into everything but its last character ("name", e.g. "f1_")
  # and its last character ("check_id", e.g. "1").
  # NOTE(review): only columns ending in 1-5 are selected, and a two-digit
  # suffix such as "f1_10" would be split into "f1_1" / "0" -- confirm this is
  # acceptable before using it on the full survey.
  pivot_longer(cols = all_of(ends_with(c("1","2","3", "4" ,"5"))), names_to = c("name", "check_id"), names_pattern = "(.*)(.)") %>% 
  # Reshape data - to wide: one column per distinct value of `name`.
  pivot_wider(names_from = name) %>% 
  # Unnest data. Presumably needed because pivot_wider produces list-columns
  # when check_id alone does not identify a row -- TODO confirm.
  unnest() %>% 
  # Filter on the f1_ and f2_ columns using the predicate !is.na(.).
  # NOTE(review): presumably this keeps only rows where both columns are
  # non-NA -- confirm against the dplyr version in use.
  filter_at(.vars = vars(one_of(c("f1_", "f2_"))),~ !is.na(.)) %>%
  # Crosstab 3 way: check_id by f1_, one table per value of f2_.
  tabyl(check_id, f1_ ,f2_ ) %>% 
  # Add a total row and a total column to each table.
  adorn_totals(c("row", "col" ))

这是所需的输出:

$No
 check_id Person 1 Person 2 Person 3 Person 4 Total
        1        2        0        0        0     2
        2        0        1        0        0     1
        3        0        0        1        0     1
        4        1        0        0        1     2
    Total        3        1        1        1     6

$Yes
 check_id Person 1 Person 2 Person 3 Person 4 Total
        1        5        0        0        0     5
        2        0        5        0        0     5
        3        0        1        2        0     3
        4        0        0        0        1     1
    Total        5        6        2        1    14

Python 样本数据

f1_ 和 f2_ 各有 5 个子组

# Sample data: 9 rows x 10 columns, written one row per line.
# Missing answers are stored as the literal string "NA", not as NaN.
df = pd.DataFrame(
    data=[
        ["Person 1", "Person 2", "Person 3", "Person 4", "NA", "Yes", "Yes", "Yes", "Yes", "NA"],
        ["NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA"],
        ["Person 1", "Person 2", "NA", "NA", "NA", "Yes", "Yes", "NA", "NA", "NA"],
        ["Person 1", "Person 2", "Person 3", "Person 4", "NA", "No", "No", "No", "No", "NA"],
        ["Person 1", "Person 2", "Person 2", "NA", "NA", "Yes", "Yes", "Yes", "NA", "NA"],
        ["Person 1", "NA", "NA", "NA", "NA", "No", "NA", "NA", "NA", "NA"],
        ["NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA"],
        ["Person 1", "Person 2", "Person 3", "Person 1", "NA", "Yes", "Yes", "Yes", "No", "NA"],
        ["Person 1", "Person 2", "NA", "NA", "NA", "Yes", "Yes", "NA", "NA", "NA"],
    ],
    columns=[
        "f1_1", "f1_2", "f1_3", "f1_4", "f1_5",
        "f2_1", "f2_2", "f2_3", "f2_4", "f2_5",
    ],
)

R样本数据

# Sample data: 9 rows x 10 columns, written one column per entry.
# f1_5 and f2_5 contain only NA, so they are logical columns.
df <- tibble::tibble(
  f1_1 = c(
    "Person 1", NA, "Person 1", "Person 1", "Person 1",
    "Person 1", NA, "Person 1", "Person 1"
  ),
  f1_2 = c(
    "Person 2", NA, "Person 2", "Person 2", "Person 2",
    NA, NA, "Person 2", "Person 2"
  ),
  f1_3 = c(
    "Person 3", NA, NA, "Person 3", "Person 2",
    NA, NA, "Person 3", NA
  ),
  f1_4 = c(
    "Person 4", NA, NA, "Person 4", NA,
    NA, NA, "Person 1", NA
  ),
  f1_5 = rep(NA, 9L),
  f2_1 = c("Yes", NA, "Yes", "No", "Yes", "No", NA, "Yes", "Yes"),
  f2_2 = c("Yes", NA, "Yes", "No", "Yes", NA, NA, "Yes", "Yes"),
  f2_3 = c("Yes", NA, NA, "No", "Yes", NA, NA, "Yes", NA),
  f2_4 = c("Yes", NA, NA, "No", NA, NA, NA, "No", NA),
  f2_5 = rep(NA, 9L)
)

让我们试试:

# Turn flat labels such as "f1_1" into two-level labels ("f1", "1").
# Note: this overwrites df.columns in place.
df.columns = pd.MultiIndex.from_tuples(
    [tuple(parts) for parts in df.columns.str.split('_')]
)

# Blank out the "NA" placeholder strings, then move the inner column level
# (the check id) into the row index, leaving the columns f1 and f2.
long_df = df.mask(df.eq('NA')).stack()

# Count the f2 answers for every (check id, f1) combination.
answer_counts = (
    long_df.set_index('f1', append=True)
    .groupby(level=(1, 2))['f2']
    .value_counts()
)

# Spread f1 into columns (0 where a combination never occurs) and add a
# per-row total.
(answer_counts
  .unstack(['f1'], fill_value=0)
  .assign(total=lambda wide: wide.sum(axis=1))
)

输出:

  f1   Person 1  Person 2  Person 3  Person 4  total
  f2                                                
1 No          2         0         0         0      2
  Yes         5         0         0         0      5
2 No          0         1         0         0      1
  Yes         0         5         0         0      5
3 No          0         0         1         0      1
  Yes         0         1         2         0      3
4 No          1         0         0         1      2
  Yes         0         0         0         1      1

您也可以使用 pyjanitor 的 pivot_longer 函数;目前需要从 GitHub 安装最新的开发版本:

 # install latest dev version
# pip install git+https://github.com/ericmjl/pyjanitor.git

# Imported for its side effect: the code below calls df.pivot_longer, which
# plain pandas does not provide. The import must sit at column 0 -- an
# indented top-level import is an IndentationError when run as a script.
import janitor  # noqa: F401

temp = (
    # ".value" keeps the part before "_" (f1 / f2) as column names; the part
    # after "_" goes into the new "fs" column.
    df.pivot_longer(names_to=(".value", "fs"), names_sep="_")
    # Keep only rows in which no cell is the "NA" placeholder string.
    # axis is passed by keyword because recent pandas versions no longer
    # accept it positionally for DataFrame.all.
    .loc[lambda frame: frame.ne("NA").all(axis=1)]
    # Count rows per (fs, f2, f1) combination.
    .groupby(["fs", "f2", "f1"])
    .f1.agg("count")
    # Spread f1 into columns (0 where a combination never occurs) and add a
    # per-row total.
    .unstack(fill_value=0)
    .assign(total=lambda counts: counts.sum(axis=1))
    # Put f2 (Yes/No) first in the row index so it can be sliced on directly.
    .swaplevel("f2", "fs")
)

temp

f1      Person 1  Person 2  Person 3  Person 4  total
f2  fs                                               
No  1          2         0         0         0      2
Yes 1          5         0         0         0      5
No  2          0         1         0         0      1
Yes 2          0         5         0         0      5
No  3          0         0         1         0      1
Yes 3          0         1         2         0      3
No  4          1         0         0         1      2
Yes 4          0         0         0         1      1

然后您可以将其拆分为 Yes 和 No 两个数据框:

# Take the rows whose first index level is "Yes" as an independent copy, so
# that adding a row below does not touch `temp`.
yes_df = temp.loc(axis=0)["Yes", :].copy()
# Append a ("Yes", "Total") row holding the column sums.
yes_df.loc[("Yes", "Total"), :] = yes_df.sum()
yes_df

f1         Person 1  Person 2  Person 3  Person 4  total
f2  fs                                                  
Yes 1           5.0       0.0       0.0       0.0    5.0
    2           0.0       5.0       0.0       0.0    5.0
    3           0.0       1.0       2.0       0.0    3.0
    4           0.0       0.0       0.0       1.0    1.0
    Total       5.0       6.0       2.0       1.0   14.0




# Take the rows whose first index level is "No" as an independent copy, so
# that adding a row below does not touch `temp`.
no_df = temp.loc(axis=0)["No", :].copy()
# Append a ("No", "Total") row holding the column sums.
no_df.loc[("No", "Total"), :] = no_df.sum()
no_df

f1        Person 1  Person 2  Person 3  Person 4  total
f2 fs                                                  
No 1           2.0       0.0       0.0       0.0    2.0
   2           0.0       1.0       0.0       0.0    1.0
   3           0.0       0.0       1.0       0.0    1.0
   4           1.0       0.0       0.0       1.0    2.0
   Total       3.0       1.0       1.0       1.0    6.0

对于您在 R 中的代码,我可以建议对您现有的代码进行一些调整:

# Reshape to long with one f1 and one f2 column per check_id, drop incomplete
# rows, then build the three-way cross-tab with totals.
df %>%
  pivot_longer(
    cols = starts_with("f"),
    # ".value" keeps the part before "_" (f1 / f2) as column names; the part
    # after "_" becomes check_id.
    names_to = c(".value", "check_id"),
    names_sep = "_"
  ) %>%
  drop_na() %>%
  tabyl(check_id, f1, f2) %>%
  adorn_totals(where = c("row", "col"))

$No
 check_id Person 1 Person 2 Person 3 Person 4 Total
        1        2        0        0        0     2
        2        0        1        0        0     1
        3        0        0        1        0     1
        4        1        0        0        1     2
    Total        3        1        1        1     6

$Yes
 check_id Person 1 Person 2 Person 3 Person 4 Total
        1        5        0        0        0     5
        2        0        5        0        0     5
        3        0        1        2        0     3
        4        0        0        0        1     1
    Total        5        6        2        1    14