R dplyr嵌套虚拟编码

Question

我需要重新编码一组测试响应，以便用于另一个应用程序（一个名为 BLIMP 的程序，用于估算缺失值）。具体来说，我需要用虚拟编码（dummy coding，即 0/1 虚拟变量）来表示测试项目和子量表分配。

我在这里创建了一个数据框，它以嵌套格式保存两个人对 10 项测试的响应。这些数据是实际输入表的简化版本。

library(tidyverse)
# Toy data: two people (101, 102) answering the same 10 items; items 1-5 are
# assigned to subscale 1 and items 6-10 to subscale 2. Responses are random
# draws from 1:4, so they differ between runs unless a seed is set beforehand.
df <- tibble(
  person = rep(101:102, each = 10),
  item = as.factor(rep(1:10, 2)),
  response = sample(1:4, 20, replace = TRUE),
  scale = as.factor(rep(rep(1:2, each = 5), 2))
) %>% mutate(
  # Flag a row with 1 when the next row belongs to a different subscale, or
  # when there is no next row (lead() returns NA on the last row); every other
  # row gets NA.
  scale_last = case_when(
    as.integer(scale) != lead(as.integer(scale)) |
      is.na(lead(as.integer(scale))) ~ 1,
    TRUE ~ NA_real_
  )
)

df 的列包含：

person：人员的 ID 编号（每人 10 行）
item：每个人的测试项目（第 1–10 题）。请注意，这些项目是嵌套在每个人之内的。
response: 每项得分
scale：测试有两个分量表。项目 1-5 分配给子量表 1，项目 6-10 分配给子量表 2。
scale_last：此列中的代码1表示该项目是其分配的子量表中的最后一个项目。这个特性在下面变得很重要。

然后我使用 recipes 包为项目创建虚拟代码。

library(recipes)
# One-hot encode `item`: one_hot = TRUE keeps an indicator column for every
# level (item_X1 ... item_X10) instead of dropping a reference level.
dum <- df %>%
  recipe(~ .) %>%
  step_dummy(item, one_hot = TRUE) %>%
  prep(training = df) %>%
  bake(new_data = df)
# width = Inf prints every column instead of truncating to the console width.
print(dum, width = Inf)

#   person response scale scale_last item_X1 item_X2 item_X3 item_X4 item_X5 item_X6 item_X7
#    <int>    <int> <fct>      <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
# 1    101        2 1             NA       1       0       0       0       0       0       0
# 2    101        3 1             NA       0       1       0       0       0       0       0
# 3    101        3 1             NA       0       0       1       0       0       0       0
# 4    101        1 1             NA       0       0       0       1       0       0       0
# 5    101        1 1              1       0       0       0       0       1       0       0
# 6    101        1 2             NA       0       0       0       0       0       1       0
# 7    101        3 2             NA       0       0       0       0       0       0       1
# 8    101        4 2             NA       0       0       0       0       0       0       0
# 9    101        2 2             NA       0       0       0       0       0       0       0
#10    101        4 2              1       0       0       0       0       0       0       0
#11    102        2 1             NA       1       0       0       0       0       0       0
#12    102        1 1             NA       0       1       0       0       0       0       0
#13    102        2 1             NA       0       0       1       0       0       0       0
#14    102        3 1             NA       0       0       0       1       0       0       0
#15    102        2 1              1       0       0       0       0       1       0       0
#16    102        1 2             NA       0       0       0       0       0       1       0
#17    102        4 2             NA       0       0       0       0       0       0       1
#18    102        2 2             NA       0       0       0       0       0       0       0
#19    102        4 2             NA       0       0       0       0       0       0       0
#20    102        3 2              1       0       0       0       0       0       0       0
#   item_X8 item_X9 item_X10
#     <dbl>   <dbl>    <dbl>
# 1       0       0        0
# 2       0       0        0
# 3       0       0        0
# 4       0       0        0
# 5       0       0        0
# 6       0       0        0
# 7       0       0        0
# 8       1       0        0
# 9       0       1        0
#10       0       0        1
#11       0       0        0
#12       0       0        0
#13       0       0        0
#14       0       0        0
#15       0       0        0
#16       0       0        0
#17       0       0        0
#18       1       0        0
#19       0       1        0
#20       0       0        1

输出显示带有 item_ 前缀的列中表示的项目虚拟代码。对于下游处理，我需要进一步的重新编码。在每个子量表中，项目必须相对于子量表的最后一个项目进行虚拟编码。这是 scale_last 变量发挥作用的地方；此变量标识输出中需要重新编码的行。

例如，这些行中的第一行是第 5 行，即 101 号人员子量表 1 中最后一项（第 5 项）所在的行。在这一行中，item_X5 列的值需要从 1 重新编码为 0。在下一个需要重新编码的行（第 10 行）中，需要将 item_X10 的值从 1 重新编码为 0，依此类推。

我正在努力寻找 dplyr 动词的正确组合来完成此任务。让我感到困惑的是需要将特定行中的特定单元格隔离开来进行重新编码。

在此先感谢您的帮助！

Answer 1

我们可以使用 mutate_at 和 replace，将 scale_last == 1 的行中所有以 "item" 开头的列的值替换为 0（在 dplyr 1.0 及以上版本中，mutate_at 已被 mutate(across(...)) 取代，写法等价）：

library(dplyr)

# Zero out every one-hot item column on rows flagged as the last item of a
# subscale. across() is the current replacement for the superseded mutate_at().
# %in% treats the NA flags as FALSE, so only rows with scale_last == 1 change.
dum %>%
  mutate(across(starts_with("item"), ~ replace(.x, scale_last %in% 1, 0)))

# A tibble: 20 x 14
#   person response scale scale_last item_X1 item_X2 item_X3 item_X4 item_X5
#    <int>    <int> <fct>      <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
# 1    101        2 1             NA       1       0       0       0       0
# 2    101        3 1             NA       0       1       0       0       0
# 3    101        1 1             NA       0       0       1       0       0
# 4    101        1 1             NA       0       0       0       1       0
# 5    101        3 1              1       0       0       0       0       0
# 6    101        4 2             NA       0       0       0       0       0
# 7    101        4 2             NA       0       0       0       0       0
# 8    101        3 2             NA       0       0       0       0       0
# 9    101        2 2             NA       0       0       0       0       0
#10    101        4 2              1       0       0       0       0       0
#11    102        2 1             NA       1       0       0       0       0
#12    102        1 1             NA       0       1       0       0       0
#13    102        4 1             NA       0       0       1       0       0
#14    102        4 1             NA       0       0       0       1       0
#15    102        4 1              1       0       0       0       0       0
#16    102        3 2             NA       0       0       0       0       0
#17    102        4 2             NA       0       0       0       0       0
#18    102        1 2             NA       0       0       0       0       0
#19    102        4 2             NA       0       0       0       0       0
#20    102        4 2              1       0       0       0       0       0
# … with 5 more variables: item_X6 <dbl>, item_X7 <dbl>, item_X8 <dbl>,
#   item_X9 <dbl>, item_X10 <dbl>

在基础 R 中，我们可以使用 lapply

# Positions of the one-hot item columns (names beginning with "item").
cols <- which(startsWith(names(dum), "item"))
# Set each item column to 0 on the rows where scale_last equals 1; rows whose
# flag is NA are left untouched by the subassignment.
dum[cols] <- lapply(dum[cols], function(item_col) {
  item_col[dum$scale_last == 1] <- 0
  item_col
})

R dplyr嵌套虚拟编码

R dplyr nested dummy coding

r

dplyr

dummy-variable