创建考虑特定实体的长格式

Create a long format considering specific enteties

我有一个数据集,其中一些变量引用特定实体。所有其他变量与这些实体没有关系。

这是一个例子:

library(dplyr)

d = samp %>%
  select(matches("Q2|Q8")) 

glimpse(d)

Observations: 10
Variables: 23
$ Q2    <dbl> 7, 6, 6, 9, 3, 3, 3, 3, 8, 5
$ Q8ar1 <dbl> 1, 1, 1, 1, 1, 1, 0, 0, 1, 1
$ Q8ar2 <dbl> 1, 0, 0, 0, 1, 1, 1, 0, 1, 1
$ Q8ar3 <dbl> 1, 1, 0, 1, 1, 1, 1, 1, 1, 1
$ Q8ar4 <dbl> 1, 1, 0, 1, 0, 1, 0, 1, 1, 0
$ Q8ar5 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
$ Q8br1 <dbl> 6, 6, 6, 6, 4, 2, NA, NA, 6, 5
$ Q8br2 <dbl> 6, NA, NA, NA, 1, 6, 6, NA, 6, 6
$ Q8br3 <dbl> 6, 6, NA, 6, 6, 3, 6, 6, 4, 4
$ Q8br4 <dbl> 6, 6, NA, 6, NA, 6, NA, 6, 6, NA
$ Q8cr1 <dbl> 5, 5, 5, 5, 1, 1, NA, NA, 1, 1
$ Q8cr2 <dbl> 5, NA, NA, NA, 1, 4, 2, NA, 2, 3
$ Q8cr3 <dbl> 5, 5, NA, 5, 2, 1, 1, 5, 1, 1
$ Q8cr4 <dbl> 5, 4, NA, 5, NA, 4, NA, 5, 2, NA
$ Q8dr1 <dbl> NA, NA, NA, NA, 4, 2, NA, NA, NA, 6
$ Q8dr2 <dbl> NA, NA, NA, NA, 1, NA, NA, NA, NA, NA
$ Q8dr3 <dbl> NA, NA, NA, NA, NA, 5, NA, NA, 6, 6
$ Q8dr4 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
$ Q8er1 <dbl> NA, NA, NA, NA, 1, 1, NA, NA, NA, NA
$ Q8er2 <dbl> NA, NA, NA, NA, 1, NA, NA, NA, NA, NA
$ Q8er3 <dbl> NA, NA, NA, NA, NA, 1, NA, NA, NA, NA
$ Q8er4 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
$ Q8f   <dbl> 5, 5, 5, 5, 2, 1, 3, 5, 3, 3

变量 Q2 不属于实体。 Q8a 到 Q8e 属于一个实体,Q8F 不属于。以1结尾的变量Q8a至Q8e属于实体1,以2结尾的变量属于实体2,依此类推。以 5 (Q8ar5) 结尾的那个不属于实体(例如 Q8ar5 应该像 Q2 或 Q8f 一样对待)。目标是将数据转换为考虑实体的长格式。现在,我是这样做的:

首先,我创建了实体变量,然后删除了变量的结尾。结果是 4 个数据集,它们在 "non_entity" 个变量中相同,但包含每个实体的变量。

E1 = d %>%
      mutate(E = "E1")%>%
      mutate(E = as.factor(E)) %>%
      select(-matches("^Q8a.*(2|3|4)$")) %>%
      select(-matches("^Q8b.*(2|3|4)$")) %>%
      select(-matches("^Q8c.*(2|3|4)$")) %>%
      select(-matches("^Q8d.*(2|3|4)$")) %>%
      select(-matches("^Q8e.*(2|3|4)$")) %>%
      rename_at(vars(matches("^Q8(a|b|c|d|e)")), ~str_remove(., "(1|2|3|4)$"))



E2 = d %>%
  mutate(E = "E2") %>%
  mutate(E = as.factor(E)) %>%
  select(-matches("^Q8a.*(1|3|4)$")) %>%
  select(-matches("^Q8b.*(1|3|4)$")) %>%
  select(-matches("^Q8c.*(1|3|4)$")) %>%
  select(-matches("^Q8d.*(1|3|4)$")) %>%
  select(-matches("^Q8e.*(1|3|4)$")) %>%
  rename_at(vars(matches("^Q8(a|b|c|d|e)")), ~str_remove(., "(1|2|3|4)$"))


E3 = d %>%
  mutate(E = "E3") %>%
  mutate(E = as.factor(E)) %>%
  select(-matches("^Q8a.*(1|2|4)$")) %>%
  select(-matches("^Q8b.*(1|2|4)$")) %>%
  select(-matches("^Q8c.*(1|2|4)$")) %>%
  select(-matches("^Q8d.*(1|2|4)$")) %>%
  select(-matches("^Q8e.*(1|2|4)$")) %>%
  rename_at(vars(matches("^Q8(a|b|c|d|e)")), ~str_remove(., "(1|2|3|4)$"))

E4 = d %>%
  mutate(E = "E") %>%
  mutate(E = as.factor(E)) %>%
  select(-matches("^Q8a.*(1|2|3)$")) %>%
  select(-matches("^Q8b.*(1|2|3)$")) %>%
  select(-matches("^Q8c.*(1|2|3)$")) %>%
  select(-matches("^Q8d.*(1|2|3)$")) %>%
  select(-matches("^Q8e.*(1|2|3)$")) %>%
  rename_at(vars(matches("^Q8(a|b|c|d|e)")), ~str_remove(., "(1|2|3|4)$"))

然后,我绑定这些数据框以创建长格式。

E_all = do.call("rbind", list(E1, E2, E3, E4))

结果是这样的,很好。

glimpse(E_all) 

Observations: 40
Variables: 9
$ Q2    <dbl> 7, 6, 6, 9, 3, 3, 3, 3, 8, 5, 7, 6, 6, 9, 3, 3, 3, 3, 8, 5, 7, 6, 6, 9, 3, 3, 3, 3, 8, 5, 7,…
$ Q8ar  <dbl> 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,…
$ Q8ar5 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ Q8br  <dbl> 6, 6, 6, 6, 4, 2, NA, NA, 6, 5, 6, NA, NA, NA, 1, 6, 6, NA, 6, 6, 6, 6, NA, 6, 6, 3, 6, 6, 4…
$ Q8cr  <dbl> 5, 5, 5, 5, 1, 1, NA, NA, 1, 1, 5, NA, NA, NA, 1, 4, 2, NA, 2, 3, 5, 5, NA, 5, 2, 1, 1, 5, 1…
$ Q8dr  <dbl> NA, NA, NA, NA, 4, 2, NA, NA, NA, 6, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ Q8er  <dbl> NA, NA, NA, NA, 1, 1, NA, NA, NA, NA, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ Q8f   <dbl> 5, 5, 5, 5, 2, 1, 3, 5, 3, 3, 5, 5, 5, 5, 2, 1, 3, 5, 3, 3, 5, 5, 5, 5, 2, 1, 3, 5, 3, 3, 5,…
$ E     <fct> Smava, Smava, Smava, Smava, Smava, Smava, Smava, Smava, Smava, Smava, Finanzcheck, Finanzche…

然而,这是一个非常简单的案例。可能有 20 个实体,还有更多的变量。有没有办法用更少的代码来做到这一点(例如通过收集功能)?感谢您的任何建议。

这里是一个小输出:

structure(list(record = structure(c(1227, 780, 480, 111, 1888, 
1602, 1800, 1322, 1474, 755), format.spss = "F7.0", display_width = 7L), 
    Q1 = c(1, 2, 1, 2, 2, 2, 2, 1, 2, 2), Q2 = c(7, 6, 6, 9, 
    3, 3, 3, 3, 8, 5), Q3 = c(3, 8, 4, 7, 7, 4, 7, 6, 7, 7), 
    ort = c(2, 2, 2, 2, 2, 2, 2, 2, 2, 2), Q13 = c(4, 6, 4, 3, 
    5, 5, 5, 4, 5, 6), Q5r1 = c(4, 4, 2, 4, 3, 4, 3, 4, 3, 2), 
    Q5r2 = c(4, 4, 4, 4, 2, 4, 3, 4, 3, 3), Q5r3 = c(4, 4, 4, 
    4, 3, 3, 3, 4, 3, 4), Q5r4 = c(4, 4, 4, 4, 1, 2, 3, 4, 3, 
    4), Q5r5 = c(4, 4, 4, 4, 2, 2, 3, 4, 4, 4), Q8ar1 = c(1, 
    1, 1, 1, 1, 1, 0, 0, 1, 1), Q8ar2 = c(1, 0, 0, 0, 1, 1, 1, 
    0, 1, 1), Q8ar3 = c(1, 1, 0, 1, 1, 1, 1, 1, 1, 1), Q8ar4 = c(1, 
    1, 0, 1, 0, 1, 0, 1, 1, 0), Q8ar5 = c(0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0), Q8br1 = c(6, 6, 6, 6, 4, 2, NA, NA, 6, 5), Q8br2 = c(6, 
    NA, NA, NA, 1, 6, 6, NA, 6, 6), Q8br3 = c(6, 6, NA, 6, 6, 
    3, 6, 6, 4, 4), Q8br4 = c(6, 6, NA, 6, NA, 6, NA, 6, 6, NA
    ), Q8cr1 = c(5, 5, 5, 5, 1, 1, NA, NA, 1, 1), Q8cr2 = c(5, 
    NA, NA, NA, 1, 4, 2, NA, 2, 3), Q8cr3 = c(5, 5, NA, 5, 2, 
    1, 1, 5, 1, 1), Q8cr4 = c(5, 4, NA, 5, NA, 4, NA, 5, 2, NA
    ), Q8dr1 = c(NA, NA, NA, NA, 4, 2, NA, NA, NA, 6), Q8dr2 = c(NA, 
    NA, NA, NA, 1, NA, NA, NA, NA, NA), Q8dr3 = c(NA, NA, NA, 
    NA, NA, 5, NA, NA, 6, 6), Q8dr4 = c(NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_), Q8er1 = c(NA, NA, NA, NA, 1, 1, NA, NA, NA, NA
    ), Q8er2 = c(NA, NA, NA, NA, 1, NA, NA, NA, NA, NA), Q8er3 = c(NA, 
    NA, NA, NA, NA, 1, NA, NA, NA, NA), Q8er4 = c(NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_), Q8f = c(5, 5, 5, 5, 2, 1, 3, 5, 3, 3
    ), Q9 = c(2, 1, 1, 2, 1, 1, 2, 1, 1, 1), Q10r1 = c(NA, 1, 
    1, NA, 0, 1, NA, 1, 0, 1), Q10r2 = c(NA, 0, 0, NA, 0, 0, 
    NA, 0, 0, 0), Q10r3 = c(NA, 0, 0, NA, 0, 1, NA, 0, 0, 0), 
    Q10r4 = c(NA, 0, 0, NA, 1, 1, NA, 0, 1, 0), Q10r5 = c(NA, 
    0, 0, NA, 0, 0, NA, 0, 0, 0), Q10r6 = c(NA, 0, 0, NA, 0, 
    0, NA, 0, 0, 0), Q10r7 = c(NA, 0, 0, NA, 0, 0, NA, 0, 0, 
    0), Q10r8 = c(NA, 0, 0, NA, 0, 0, NA, 0, 0, 0), Q10r9 = c(NA, 
    0, 0, NA, 0, 0, NA, 0, 0, 0), Q10r10 = c(NA, 0, 0, NA, 0, 
    1, NA, 0, 0, 0), Q10r11 = c(NA, 0, 0, NA, 0, 0, NA, 0, 0, 
    0), Q10r12 = c(NA, 0, 0, NA, 0, 0, NA, 0, 0, 0), Q10r13 = c(NA, 
    0, 0, NA, 0, 0, NA, 0, 0, 0), Q11 = c(2, 1, 2, 3, 1, 1, 3, 
    1, 3, 1), Q12 = c(3, 2, 3, 5, 3, 2, 2, 3, 2, 2), Q14 = c(1, 
    1, 1, 1, 1, 1, 1, 2, 1, 1), Q15 = c(1, 2, 2, 2, 2, 3, 2, 
    4, 2, 2)), class = "data.frame", row.names = c(NA, -10L))

我们可以在这里使用pivot_longer

library(dplyr)
library(tidyr)
library(data.table)
library(stringr)

out1 <- samp %>%
         select(matches("^(Q2|Q8)")) %>%
         mutate(rn = row_number()) %>%
         pivot_longer(cols = -c(rn, Q2, Q8f), names_to = c(".value", "Q8"), 
            names_sep = "(?<=[a-z])(?=[1-5]$)", values_drop_na = TRUE) %>%
    mutate(E = str_c('E', rowid(rn))) %>%
    arrange(E)