使用 tidyverse 包为每个子类别创建虚拟变量
Dummy variable creation for each subcategory using tidyverse package
我有一个小问题(原文的引用链接已缺失):
library(tidyverse)
# Sample data for the question: one row per record, with the categorical
# columns UOM and AUC_TYPE that are to be expanded into dummy variables.
#
# The dates are written as quoted ISO-8601 strings and parsed afterwards.
# A bare 2017-03-07 is evaluated by R as the subtraction 2017 - 3 - 7
# (= 2007), which silently stores a number instead of a date.
input_data <- tribble(
  ~Subcat, ~Date,        ~COMM1, ~COMM2, ~UOM,    ~AUC_TYPE,
  #------|-------------|-------|-------|--------|------------------|
  1,       "2017-03-07", 40750,  41400,  "MT",    "English",
  1,       "2017-03-15", 40750,  40000,  "MT",    "English",
  2,       "2017-10-16", 41000,  40500,  "METER", "Yankee",
  2,       "2017-11-06", 41010,  40510,  "METER", "Yankee",
  2,       "2019-01-26", 50010,  50510,  "METER", "English",
  3,       "2017-03-07", 40750,  41400,  "MT",    "English",
  3,       "2018-05-26", 50010,  50510,  "MT",    "English",
  3,       "2019-01-21", 40750,  40200,  "MT",    "English",
  3,       "2019-01-21", 40750,  40200,  "MT",    "English",
  4,       "2017-11-08", 37500,  39000,  "LTR",   "Dynamic Sealbid",
  4,       "2017-11-08", 37500,  39000,  "LTR",   "Dynamic Sealbid"
) %>%
  # Convert the character dates into a proper Date column
  mutate(Date = as.Date(Date))
期望的输出
# Desired result: the UOM and AUC_TYPE columns replaced by one 0/1 indicator
# column per distinct value, all other columns unchanged.
#
# The dates are written as quoted ISO-8601 strings and parsed afterwards.
# A bare 2017-03-07 is evaluated by R as the subtraction 2017 - 3 - 7
# (= 2007), which silently stores a number instead of a date.
output_data <- tribble(
  ~Subcat, ~Date, ~COMM1, ~COMM2, ~UOM_MT, ~UOM_METER, ~UOM_LTR,
  ~AUC_TYPE_English, ~`AUC_TYPE_Dynamic Sealbid`, ~AUC_TYPE_Yankee,
  #--|------------|-----|-----|-|-|-|-|-|-|
  1, "2017-03-07", 40750, 41400, 1, 0, 0, 1, 0, 0,
  1, "2017-03-15", 40750, 40000, 1, 0, 0, 1, 0, 0,
  2, "2017-10-16", 41000, 40500, 0, 1, 0, 0, 0, 1,
  2, "2017-11-06", 41010, 40510, 0, 1, 0, 0, 0, 1,
  2, "2019-01-26", 50010, 50510, 0, 1, 0, 1, 0, 0,
  3, "2017-03-07", 40750, 41400, 1, 0, 0, 1, 0, 0,
  3, "2018-05-26", 50010, 50510, 1, 0, 0, 1, 0, 0,
  3, "2019-01-21", 40750, 40200, 1, 0, 0, 1, 0, 0,
  3, "2019-01-21", 40750, 40200, 1, 0, 0, 1, 0, 0,
  4, "2017-11-08", 37500, 39000, 0, 0, 1, 0, 1, 0,
  4, "2017-11-08", 37500, 39000, 0, 0, 1, 0, 1, 0
) %>%
  # Convert the character dates into a proper Date column
  mutate(Date = as.Date(Date))
你可以这样做:
library(dplyr)
library(tidyr)
# One-hot encode UOM and AUC_TYPE by reshaping to long format and back to wide.
input_data %>%
  # Give every record its own id so that identical rows stay separate
  # when the data is widened again
  mutate(row_id = row_number()) %>%
  # Stack the two categorical columns into name/value pairs
  pivot_longer(cols = c(UOM, AUC_TYPE)) %>%
  # Build the future column names, e.g. "UOM_MT" or "AUC_TYPE_English"
  unite(col, name, value) %>%
  # Mark each (record, category) pair that is present with a 1
  mutate(flag = 1) %>%
  # Spread the markers out to one column per category; absent pairs become 0
  pivot_wider(
    names_from = col,
    values_from = flag,
    values_fill = list(flag = 0)
  ) %>%
  # The helper id is no longer needed
  select(-row_id)
# A tibble: 11 x 10
# Subcat Date COMM1 COMM2 UOM_MT AUC_TYPE_English UOM_METER AUC_TYPE_Yankee UOM_LTR `AUC_TYPE_Dynamic Sealbid`
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 2007 40750 41400 1 1 0 0 0 0
# 2 1 1999 40750 40000 1 1 0 0 0 0
# 3 2 1991 41000 40500 0 0 1 1 0 0
# 4 2 2000 41010 40510 0 0 1 1 0 0
# 5 2 1992 50010 50510 0 1 1 0 0 0
# 6 3 2007 40750 41400 1 1 0 0 0 0
# 7 3 1987 50010 50510 1 1 0 0 0 0
# 8 3 1997 40750 40200 1 1 0 0 0 0
# 9 3 1997 40750 40200 1 1 0 0 0 0
#10 4 1998 37500 39000 0 0 0 0 1 1
#11 4 1998 37500 39000 0 0 0 0 1 1
该方法使用C()
和contrasts()
设置因子变量的对比矩阵,并调用model.matrix()
将这些因子变量转换为虚拟变量(dummy variables)。
请注意,如果因子变量有 k 个水平,model.matrix()
将默认创建 k-1 个虚拟变量。因此这里我调整了 C() 的 how.many 参数和 contrasts() 的 contrasts 参数,使每个水平都得到一个虚拟变量。
library(dplyr)
library(tibble)
# One-hot encode UOM and AUC_TYPE with model.matrix().
# The pipeline starts from `input_data`, the sample data defined in this file.
input_data %>%
  # Keep only the categorical columns. The trailing underscore in the new
  # names matters: model.matrix() names each indicator column by pasting the
  # variable name and the level together, giving e.g. "UOM_MT".
  select("UOM_" = UOM, "AUC_TYPE_" = AUC_TYPE) %>%
  mutate(across(everything(), as.factor)) %>%
  # Attach a full indicator contrast matrix (contrasts = FALSE) and ask for
  # as many contrast columns as there are levels, so that model.matrix()
  # emits k dummy columns per factor instead of its default k - 1.
  mutate(across(
    everything(),
    ~ C(.x, contrasts(.x, contrasts = FALSE), how.many = n_distinct(.x))
  )) %>%
  model.matrix(~ ., data = .) %>%
  as_tibble() %>%
  # The intercept column carries no category information
  select(-`(Intercept)`) %>%
  # Put the untouched columns back in front of the indicator columns
  bind_cols(select(input_data, -c(UOM, AUC_TYPE)), .)
# # A tibble: 11 x 10
# Subcat Date COMM1 COMM2 UOM_LTR UOM_METER UOM_MT `AUC_TYPE_Dynamic Sealbid` AUC_TYPE_English AUC_TYPE_Yankee
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 2007 40750 41400 0 0 1 0 1 0
# 2 1 1999 40750 40000 0 0 1 0 1 0
# 3 2 1991 41000 40500 0 1 0 0 0 1
# 4 2 2000 41010 40510 0 1 0 0 0 1
# 5 2 1992 50010 50510 0 1 0 0 1 0
# 6 3 2007 40750 41400 0 0 1 0 1 0
# 7 3 1987 50010 50510 0 0 1 0 1 0
# 8 3 1997 40750 40200 0 0 1 0 1 0
# 9 3 1997 40750 40200 0 0 1 0 1 0
# 10 4 1998 37500 39000 1 0 0 1 0 0
# 11 4 1998 37500 39000 1 0 0 1 0 0
我有一个小问题(原文的引用链接已缺失):
library(tidyverse)
# Sample data for the question: one row per record, with the categorical
# columns UOM and AUC_TYPE that are to be expanded into dummy variables.
#
# The dates are written as quoted ISO-8601 strings and parsed afterwards.
# A bare 2017-03-07 is evaluated by R as the subtraction 2017 - 3 - 7
# (= 2007), which silently stores a number instead of a date.
input_data <- tribble(
  ~Subcat, ~Date,        ~COMM1, ~COMM2, ~UOM,    ~AUC_TYPE,
  #------|-------------|-------|-------|--------|------------------|
  1,       "2017-03-07", 40750,  41400,  "MT",    "English",
  1,       "2017-03-15", 40750,  40000,  "MT",    "English",
  2,       "2017-10-16", 41000,  40500,  "METER", "Yankee",
  2,       "2017-11-06", 41010,  40510,  "METER", "Yankee",
  2,       "2019-01-26", 50010,  50510,  "METER", "English",
  3,       "2017-03-07", 40750,  41400,  "MT",    "English",
  3,       "2018-05-26", 50010,  50510,  "MT",    "English",
  3,       "2019-01-21", 40750,  40200,  "MT",    "English",
  3,       "2019-01-21", 40750,  40200,  "MT",    "English",
  4,       "2017-11-08", 37500,  39000,  "LTR",   "Dynamic Sealbid",
  4,       "2017-11-08", 37500,  39000,  "LTR",   "Dynamic Sealbid"
) %>%
  # Convert the character dates into a proper Date column
  mutate(Date = as.Date(Date))
期望的输出
# Desired result: the UOM and AUC_TYPE columns replaced by one 0/1 indicator
# column per distinct value, all other columns unchanged.
#
# The dates are written as quoted ISO-8601 strings and parsed afterwards.
# A bare 2017-03-07 is evaluated by R as the subtraction 2017 - 3 - 7
# (= 2007), which silently stores a number instead of a date.
output_data <- tribble(
  ~Subcat, ~Date, ~COMM1, ~COMM2, ~UOM_MT, ~UOM_METER, ~UOM_LTR,
  ~AUC_TYPE_English, ~`AUC_TYPE_Dynamic Sealbid`, ~AUC_TYPE_Yankee,
  #--|------------|-----|-----|-|-|-|-|-|-|
  1, "2017-03-07", 40750, 41400, 1, 0, 0, 1, 0, 0,
  1, "2017-03-15", 40750, 40000, 1, 0, 0, 1, 0, 0,
  2, "2017-10-16", 41000, 40500, 0, 1, 0, 0, 0, 1,
  2, "2017-11-06", 41010, 40510, 0, 1, 0, 0, 0, 1,
  2, "2019-01-26", 50010, 50510, 0, 1, 0, 1, 0, 0,
  3, "2017-03-07", 40750, 41400, 1, 0, 0, 1, 0, 0,
  3, "2018-05-26", 50010, 50510, 1, 0, 0, 1, 0, 0,
  3, "2019-01-21", 40750, 40200, 1, 0, 0, 1, 0, 0,
  3, "2019-01-21", 40750, 40200, 1, 0, 0, 1, 0, 0,
  4, "2017-11-08", 37500, 39000, 0, 0, 1, 0, 1, 0,
  4, "2017-11-08", 37500, 39000, 0, 0, 1, 0, 1, 0
) %>%
  # Convert the character dates into a proper Date column
  mutate(Date = as.Date(Date))
你可以这样做:
library(dplyr)
library(tidyr)
# One-hot encode UOM and AUC_TYPE by reshaping to long format and back to wide.
input_data %>%
  # Give every record its own id so that identical rows stay separate
  # when the data is widened again
  mutate(row_id = row_number()) %>%
  # Stack the two categorical columns into name/value pairs
  pivot_longer(cols = c(UOM, AUC_TYPE)) %>%
  # Build the future column names, e.g. "UOM_MT" or "AUC_TYPE_English"
  unite(col, name, value) %>%
  # Mark each (record, category) pair that is present with a 1
  mutate(flag = 1) %>%
  # Spread the markers out to one column per category; absent pairs become 0
  pivot_wider(
    names_from = col,
    values_from = flag,
    values_fill = list(flag = 0)
  ) %>%
  # The helper id is no longer needed
  select(-row_id)
# A tibble: 11 x 10
# Subcat Date COMM1 COMM2 UOM_MT AUC_TYPE_English UOM_METER AUC_TYPE_Yankee UOM_LTR `AUC_TYPE_Dynamic Sealbid`
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 2007 40750 41400 1 1 0 0 0 0
# 2 1 1999 40750 40000 1 1 0 0 0 0
# 3 2 1991 41000 40500 0 0 1 1 0 0
# 4 2 2000 41010 40510 0 0 1 1 0 0
# 5 2 1992 50010 50510 0 1 1 0 0 0
# 6 3 2007 40750 41400 1 1 0 0 0 0
# 7 3 1987 50010 50510 1 1 0 0 0 0
# 8 3 1997 40750 40200 1 1 0 0 0 0
# 9 3 1997 40750 40200 1 1 0 0 0 0
#10 4 1998 37500 39000 0 0 0 0 1 1
#11 4 1998 37500 39000 0 0 0 0 1 1
该方法使用C()
和contrasts()
设置因子变量的对比矩阵,并调用model.matrix()
将这些因子变量转换为虚拟变量(dummy variables)。
请注意,如果因子变量有 k 个水平,model.matrix()
将默认创建 k-1 个虚拟变量。因此这里我调整了 C() 的 how.many 参数和 contrasts() 的 contrasts 参数,使每个水平都得到一个虚拟变量。
library(dplyr)
library(tibble)
# One-hot encode UOM and AUC_TYPE with model.matrix().
# The pipeline starts from `input_data`, the sample data defined in this file.
input_data %>%
  # Keep only the categorical columns. The trailing underscore in the new
  # names matters: model.matrix() names each indicator column by pasting the
  # variable name and the level together, giving e.g. "UOM_MT".
  select("UOM_" = UOM, "AUC_TYPE_" = AUC_TYPE) %>%
  mutate(across(everything(), as.factor)) %>%
  # Attach a full indicator contrast matrix (contrasts = FALSE) and ask for
  # as many contrast columns as there are levels, so that model.matrix()
  # emits k dummy columns per factor instead of its default k - 1.
  mutate(across(
    everything(),
    ~ C(.x, contrasts(.x, contrasts = FALSE), how.many = n_distinct(.x))
  )) %>%
  model.matrix(~ ., data = .) %>%
  as_tibble() %>%
  # The intercept column carries no category information
  select(-`(Intercept)`) %>%
  # Put the untouched columns back in front of the indicator columns
  bind_cols(select(input_data, -c(UOM, AUC_TYPE)), .)
# # A tibble: 11 x 10
# Subcat Date COMM1 COMM2 UOM_LTR UOM_METER UOM_MT `AUC_TYPE_Dynamic Sealbid` AUC_TYPE_English AUC_TYPE_Yankee
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 2007 40750 41400 0 0 1 0 1 0
# 2 1 1999 40750 40000 0 0 1 0 1 0
# 3 2 1991 41000 40500 0 1 0 0 0 1
# 4 2 2000 41010 40510 0 1 0 0 0 1
# 5 2 1992 50010 50510 0 1 0 0 1 0
# 6 3 2007 40750 41400 0 0 1 0 1 0
# 7 3 1987 50010 50510 0 0 1 0 1 0
# 8 3 1997 40750 40200 0 0 1 0 1 0
# 9 3 1997 40750 40200 0 0 1 0 1 0
# 10 4 1998 37500 39000 1 0 0 1 0 0
# 11 4 1998 37500 39000 1 0 0 1 0 0