在 R 中有效地创建数字编码的虚拟变量?
Create numerically encoded dummy variables efficiently in R?
我们如何转换如下形式的数据
# Sample purchases in long format: one row per (customer, item) purchase.
df <- data.frame(
  customer_number = c(3, 3, 1, 1, 3),
  item = c("milkshake", "burger", "apple", "burger", "water"),
  stringsAsFactors = FALSE # keep `item` as character on every R version
)
# customer_number item
# 1 3 milkshake
# 2 3 burger
# 3 1 apple
# 4 1 burger
# 5 3 water
使其变为数字编码的虚拟变量,如下所示
# Desired result: one row per customer and one 0/1 indicator column per item.
data.frame(customer_number=c(1,3),
item_milkshake=c(0,1),
item_burger=c(1,1),
item_apple=c(1,0),
item_water=c(0,1))
# customer_number item_milkshake item_burger item_apple item_water
# 1 1 0 1 1 0
# 2 3 1 1 0 1
我们可以创建一个值为 1 的虚拟列,并以宽格式获取数据。
# Reshape the long purchase table into one row per customer with a 0/1
# indicator column per item.
library(dplyr)
df %>%
  # Keep each customer/item pair once: a repeated purchase would otherwise make
  # pivot_wider() warn and return list-columns instead of 0/1 values.
  distinct(customer_number, item) %>%
  mutate(n = 1) %>%
  # Sort so the output rows come out in customer order.
  arrange(customer_number) %>%
  tidyr::pivot_wider(
    names_from = item,
    values_from = n,
    values_fill = list(n = 0), # items a customer never bought become 0
    names_prefix = "item_"
  )
# A tibble: 2 x 5
# customer_number item_apple item_burger item_milkshake item_water
# <dbl> <dbl> <dbl> <dbl> <dbl>
#1 1 1 1 0 0
#2 3 0 1 1 1
如果您想使用基本的 R 函数,这里有一个使用 table() 函数的简单解决方案:
# Create the dataset
df <- structure(
  list(
    customer_number = c(3, 3, 1, 1, 3),
    item = c("milkshake", "burger", "apple", "burger", "water")
  ),
  row.names = c(NA, -5L),
  class = "data.frame"
)
# Cross-tabulate customers (rows) against items (columns); cells hold counts.
res <- as.matrix(table(df$customer_number, df$item))
res[res > 0] <- 1 # dummy variable: collapse any positive count to 1
res
apple burger milkshake water
1 1 1 0 0
3 0 1 1 1
您可以将 customer_number 作为单独的列添加到矩阵中:
# Prepend the customer ids (taken from the row names) as a numeric column.
res <- cbind(
  customer_number = as.numeric(dimnames(res)[[1L]]),
  res
)
res
customer_number apple burger milkshake water
1 1 1 1 0 0
3 3 0 1 1 1
供将来的读者参考:可以查看 dummies 包中的 dummy.data.frame 函数:
# Convert `item` to a factor, then expand it with dummies::dummy.data.frame().
# The result keeps one row per input row (it is not aggregated per customer).
df[["item"]] <- as.factor(df[["item"]])
df_dummy <- dummies::dummy.data.frame(
  df,
  dummy.classes = "factor",
  sep = "_",
  omit.constants = TRUE,
  all = TRUE
)
df_dummy
customer_number item_apple item_burger item_milkshake item_water
1 3 0 0 1 0
2 3 0 1 0 0
3 1 1 0 0 0
4 1 0 1 0 0
5 3 0 0 0 1
我们如何转换如下形式的数据
# Sample purchases in long format: one row per (customer, item) purchase.
df <- data.frame(
  customer_number = c(3, 3, 1, 1, 3),
  item = c("milkshake", "burger", "apple", "burger", "water"),
  stringsAsFactors = FALSE # keep `item` as character on every R version
)
# customer_number item
# 1 3 milkshake
# 2 3 burger
# 3 1 apple
# 4 1 burger
# 5 3 water
使其变为数字编码的虚拟变量,如下所示
# Desired result: one row per customer and one 0/1 indicator column per item.
data.frame(customer_number=c(1,3),
item_milkshake=c(0,1),
item_burger=c(1,1),
item_apple=c(1,0),
item_water=c(0,1))
# customer_number item_milkshake item_burger item_apple item_water
# 1 1 0 1 1 0
# 2 3 1 1 0 1
我们可以创建一个值为 1 的虚拟列,并以宽格式获取数据。
# Reshape the long purchase table into one row per customer with a 0/1
# indicator column per item.
library(dplyr)
df %>%
  # Keep each customer/item pair once: a repeated purchase would otherwise make
  # pivot_wider() warn and return list-columns instead of 0/1 values.
  distinct(customer_number, item) %>%
  mutate(n = 1) %>%
  # Sort so the output rows come out in customer order.
  arrange(customer_number) %>%
  tidyr::pivot_wider(
    names_from = item,
    values_from = n,
    values_fill = list(n = 0), # items a customer never bought become 0
    names_prefix = "item_"
  )
# A tibble: 2 x 5
# customer_number item_apple item_burger item_milkshake item_water
# <dbl> <dbl> <dbl> <dbl> <dbl>
#1 1 1 1 0 0
#2 3 0 1 1 1
如果您想使用基本的 R 函数,这里有一个使用 table() 函数的简单解决方案:
# Create the dataset
df <- structure(
  list(
    customer_number = c(3, 3, 1, 1, 3),
    item = c("milkshake", "burger", "apple", "burger", "water")
  ),
  row.names = c(NA, -5L),
  class = "data.frame"
)
# Cross-tabulate customers (rows) against items (columns); cells hold counts.
res <- as.matrix(table(df$customer_number, df$item))
res[res > 0] <- 1 # dummy variable: collapse any positive count to 1
res
apple burger milkshake water
1 1 1 0 0
3 0 1 1 1
您可以将 customer_number 作为单独的列添加到矩阵中:
# Prepend the customer ids (taken from the row names) as a numeric column.
res <- cbind(
  customer_number = as.numeric(dimnames(res)[[1L]]),
  res
)
res
customer_number apple burger milkshake water
1 1 1 1 0 0
3 3 0 1 1 1
供将来的读者参考:可以查看 dummies 包中的 dummy.data.frame 函数:
# Convert `item` to a factor, then expand it with dummies::dummy.data.frame().
# The result keeps one row per input row (it is not aggregated per customer).
df[["item"]] <- as.factor(df[["item"]])
df_dummy <- dummies::dummy.data.frame(
  df,
  dummy.classes = "factor",
  sep = "_",
  omit.constants = TRUE,
  all = TRUE
)
df_dummy
customer_number item_apple item_burger item_milkshake item_water
1 3 0 0 1 0
2 3 0 1 0 0
3 1 1 0 0 0
4 1 0 1 0 0
5 3 0 0 0 1