如何在 R 中高效地创建数字编码的虚拟变量?

Create numerically encoded dummy variables efficiently in R?

我们如何将如下形式的数据

# Sample purchases: one row per (customer, item) pair.
df <- data.frame(
  customer_number = c(3, 3, 1, 1, 3),
  item = c("milkshake", "burger", "apple", "burger", "water"),
  stringsAsFactors = FALSE
)


#   customer_number      item
# 1               3 milkshake
# 2               3    burger
# 3               1     apple
# 4               1    burger
# 5               3     water

转换为数字编码的虚拟变量,如下所示


# Desired result: one row per customer, one 0/1 indicator column per item.
data.frame(
  customer_number = c(1, 3),
  item_milkshake = c(0, 1),
  item_burger = c(1, 1),
  item_apple = c(1, 0),
  item_water = c(0, 1)
)

#   customer_number item_milkshake item_burger item_apple item_water
# 1               1              0           1          1          0
# 2               3              1           1          0          1

我们可以创建一个值为 1 的虚拟列,并以宽格式获取数据。

library(dplyr)

# Build one 0/1 indicator column per item, with one row per customer.
df %>%
  # Drop fully duplicated rows first: a customer who bought the same item more
  # than once would otherwise hand pivot_wider() several values for one cell,
  # which yields list-columns (with a warning) instead of a single 0/1 value.
  distinct() %>%
  # Helper column holding the value written into every observed cell.
  mutate(n = 1) %>%
  arrange(customer_number) %>%
  tidyr::pivot_wider(
    names_from = item,
    values_from = n,
    # Customer/item combinations that never occur are filled with 0.
    values_fill = list(n = 0),
    names_prefix = "item_"
  )

# A tibble: 2 x 5
#  customer_number item_apple item_burger item_milkshake item_water
#            <dbl>      <dbl>       <dbl>          <dbl>      <dbl>
#1               1          1           1              0          0
#2               3          0           1              1          1

如果您想使用基本的 R 函数,这里有一个使用 table() 函数的简单解决方案:

# Create the dataset: one row per (customer, item) purchase.
df <- structure(
  list(
    customer_number = c(3, 3, 1, 1, 3),
    item = c("milkshake", "burger", "apple", "burger", "water")
  ),
  row.names = c(NA, -5L),
  class = "data.frame"
)

# Cross-tabulate customers (rows) against items (columns); each cell holds the
# number of rows in df with that customer/item combination.
res <- as.matrix(table(df$customer_number, df$item))
# Collapse counts to a 0/1 indicator so repeated purchases still encode as 1.
res[res > 0] <- 1
res

    apple burger milkshake water
  1     1      1         0     0
  3     0      1         1     1

您可以将 customer_number 作为单独的列添加到矩阵中:

# The row labels of res are the customer numbers; turn them back into numbers
# and prepend them as a regular column.
res <- cbind(customer_number = as.numeric(dimnames(res)[[1L]]), res)
res

  customer_number apple burger milkshake water
1               1     1      1         0     0
3               3     0      1         1     1

供将来的读者参考:可以查看 dummies 包中的 dummy.data.frame 函数(注意:其结果按原始行输出,并未按 customer_number 汇总):

# Store item as a factor; the call below is restricted to factor columns via
# dummy.classes = "factor".
df[["item"]] <- as.factor(df[["item"]])

# Expand the factor column into indicator columns named <column>_<level>.
# As the printed result shows, the output keeps one row per input row; it is
# not aggregated per customer_number.
# NOTE(review): confirm that the dummies package is still installable from CRAN.
df_dummy <- dummies::dummy.data.frame(
  df,
  dummy.classes = "factor",
  sep = "_",
  omit.constants = TRUE,
  all = TRUE
)

df_dummy

  customer_number item_apple item_burger item_milkshake item_water
1               3          0           0              1          0
2               3          0           1              0          0
3               1          1           0              0          0
4               1          0           1              0          0
5               3          0           0              0          1