根据 R 中的列创建多个虚拟变量

Create multiple dummy variables based on column in R

我有一个数据框,如下所示,我想为流派列中可用的每个唯一字符串创建虚拟列

       tconst                   genres
 1: tt0000001        Documentary,Short
 2: tt0000002          Animation,Short
 3: tt0000003 Animation,Comedy,Romance
 4: tt0000004          Animation,Short
 5: tt0000005             Comedy,Short
 6: tt0000006                    Short
 7: tt0000007              Short,Sport
 8: tt0000008        Documentary,Short
 9: tt0000009            Romance,Short
10: tt0000010        Documentary,Short
11: tt0000011        Documentary,Short
12: tt0000012        Documentary,Short
13: tt0000013        Documentary,Short
14: tt0000014             Comedy,Short
15: tt0000015          Animation,Short

我尝试使用下面的代码,除了效率不高之外,它还返回了错误的输出。

# Collect the distinct genre names: split the `genres` strings into one row
# per genre, extract a single column as a vector, and de-duplicate it.
# NOTE(review): pull() is called without a column argument; presumably it
# picks `genres` because that is the last column -- confirm against dplyr docs.
uniqueGenre <- MovieRating_test %>% 
  separate_rows(genres) %>% 
  pull() %>%
  unique()

# For every (row, genre) pair, write 1 or 0 into the column named after the
# genre, depending on the membership test below.
for(i in 1:nrow(MovieRating_test)){
  for(j in uniqueGenre){
    # NOTE(review): strsplit() returns a list, so `%in%` compares `j` against
    # the list's elements (each a whole character vector) rather than against
    # the individual genre strings. This is the likely cause of the wrong
    # output; taking the first list element with [[1]] would test against the
    # genre vector itself.
    MovieRating_test[i,j] <- ifelse(j %in% strsplit(as.character(MovieRating_test[,"genres"][i]),","), 1, 0)
  }
}

数据集

# Reproducible sample data: 15 titles, each with a comma-separated string of
# genres. The `.internal.selfref = <pointer: ...>` entry that dput() prints
# for a data.table is not valid R syntax, so it is omitted here to let the
# snippet parse. If data.table complains about the missing self-reference,
# run data.table::setDT(MovieRating_test) once after creating the object.
MovieRating_test <- structure(
  list(
    tconst = c(
      "tt0000001", "tt0000002", "tt0000003", "tt0000004", "tt0000005",
      "tt0000006", "tt0000007", "tt0000008", "tt0000009", "tt0000010",
      "tt0000011", "tt0000012", "tt0000013", "tt0000014", "tt0000015"
    ),
    genres = c(
      "Documentary,Short", "Animation,Short", "Animation,Comedy,Romance",
      "Animation,Short", "Comedy,Short", "Short", "Short,Sport",
      "Documentary,Short", "Romance,Short", "Documentary,Short",
      "Documentary,Short", "Documentary,Short", "Documentary,Short",
      "Comedy,Short", "Animation,Short"
    )
  ),
  row.names = c(NA, -15L),
  class = c("data.table", "data.frame")
)

首选 data.table 解决方案,但欢迎任何解决方案。

我们可以使用 fastDummies 包中的 dummy_cols 函数

library(fastDummies)
# Split each `genres` string on commas and append one indicator column per
# resulting value to the table.
dummy_cols(
  MovieRating_test,
  select_columns = "genres",
  split = ","
)

-输出

       tconst                   genres genres_Animation genres_Comedy genres_Romance genres_Short genres_Documentary
       <char>                   <char>            <int>         <int>          <int>        <int>              <int>
 1: tt0000001        Documentary,Short                0             0              0            1                  1
 2: tt0000002          Animation,Short                1             0              0            1                  0
 3: tt0000003 Animation,Comedy,Romance                1             1              1            0                  0
 4: tt0000004          Animation,Short                1             0              0            1                  0
 5: tt0000005             Comedy,Short                0             1              0            1                  0
 6: tt0000006                    Short                0             0              0            1                  0
 7: tt0000007              Short,Sport                0             0              0            1                  0
 8: tt0000008        Documentary,Short                0             0              0            1                  1
 9: tt0000009            Romance,Short                0             0              1            1                  0
10: tt0000010        Documentary,Short                0             0              0            1                  1
11: tt0000011        Documentary,Short                0             0              0            1                  1
12: tt0000012        Documentary,Short                0             0              0            1                  1
13: tt0000013        Documentary,Short                0             0              0            1                  1
14: tt0000014             Comedy,Short                0             1              0            1                  0
15: tt0000015          Animation,Short                1             0              0            1                  0
    genres_Sport
           <int>
 1:            0
 2:            0
 3:            0
 4:            0
 5:            0
 6:            0
 7:            1
 8:            0
 9:            0
10:            0
11:            0
12:            0
13:            0
14:            0
15:            0

另一个选项是使用 qdapTools 包中的 mtabulate:

library(data.table)
library(qdapTools)
# Split every genres string on commas, tabulate the pieces per row with
# mtabulate(), and turn the counts into 0/1 flags: `> 0` yields a logical
# matrix and the unary `+` converts it to integers.
m1 <- +(mtabulate(strsplit(MovieRating_test[["genres"]], ",")) > 0)
# Add one column per genre to the table by reference.
MovieRating_test[, colnames(m1) := as.data.frame(m1)]

-输出

> MovieRating_test
       tconst                   genres Animation Comedy Documentary Romance Short Sport
       <char>                   <char>     <int>  <int>       <int>   <int> <int> <int>
 1: tt0000001        Documentary,Short         0      0           1       0     1     0
 2: tt0000002          Animation,Short         1      0           0       0     1     0
 3: tt0000003 Animation,Comedy,Romance         1      1           0       1     0     0
 4: tt0000004          Animation,Short         1      0           0       0     1     0
 5: tt0000005             Comedy,Short         0      1           0       0     1     0
 6: tt0000006                    Short         0      0           0       0     1     0
 7: tt0000007              Short,Sport         0      0           0       0     1     1
 8: tt0000008        Documentary,Short         0      0           1       0     1     0
 9: tt0000009            Romance,Short         0      0           0       1     1     0
10: tt0000010        Documentary,Short         0      0           1       0     1     0
11: tt0000011        Documentary,Short         0      0           1       0     1     0
12: tt0000012        Documentary,Short         0      0           1       0     1     0
13: tt0000013        Documentary,Short         0      0           1       0     1     0
14: tt0000014             Comedy,Short         0      1           0       0     1     0
15: tt0000015          Animation,Short         1      0           0       0     1     0