如何在 R 中为各个区间创建多列虚拟变量(dummy variables)

How to create multiple columns of dummies for intervals in R

# Build one 0/1 indicator column per 100-unit band of data$Distance.
# The bands are contiguous and half-open: (-Inf, 100), [100, 200), [200, 300),
# [300, 400), [400, 500). Each lower bound equals the previous band's upper
# bound, so values such as 100 or 200.5 are no longer left without a band
# (the earlier bounds 101, 201, ... skipped [100, 101), [200, 201), ...).
# Rows with Distance >= 500 keep 0 in every column.
data$Distance_100 <- 0
data$Distance_100[data$Distance < 100] <- 1

data$Distance_200 <- 0
data$Distance_200[data$Distance >= 100 & data$Distance < 200] <- 1

data$Distance_300 <- 0
data$Distance_300[data$Distance >= 200 & data$Distance < 300] <- 1

data$Distance_400 <- 0
data$Distance_400[data$Distance >= 300 & data$Distance < 400] <- 1

data$Distance_500 <- 0
data$Distance_500[data$Distance >= 400 & data$Distance < 500] <- 1

结果必须是多列。而下面这行代码只会创建一列:data$DistanceCut5 = cut(data$Distance, breaks=c(0,100,200,300,400,500))

cut() 只会生成单独的一列(一个因子);如果想为每个区间(因子水平)各创建一列,可以这样做:

例子

加载包

library(tidyverse)

代码

# Break points delimiting the five 100-unit bands: 0, 100, ..., 500
seq_0_500 <- seq(from = 0, to = 500, by = 100)

# Build the example table, then spread the band factor into indicator columns
tibble(
  # Distances 1, 2, ..., 500
  distance = 1:500,
  # Band membership as a factor with right-closed intervals:
  # `(0,100]` `(100,200]` `(200,300]` `(300,400]` `(400,500]`
  distance_cut = cut(
    distance,
    breaks = seq_0_500,
    labels = paste0("Distance_", seq_0_500[-1])
  ),
  # Constant marker that becomes the 1s of the wide table
  aux = 1
) %>%
  # One column per band; cells that received no marker are filled with 0
  pivot_wider(
    names_from = distance_cut,
    values_from = aux,
    values_fill = 0
  )

输出

# A tibble: 500 x 6
   distance Distance_100 Distance_200 Distance_300 Distance_400 Distance_500
      <int>        <dbl>        <dbl>        <dbl>        <dbl>        <dbl>
 1        1            1            0            0            0            0
 2        2            1            0            0            0            0
 3        3            1            0            0            0            0
 4        4            1            0            0            0            0
 5        5            1            0            0            0            0
 6        6            1            0            0            0            0
 7        7            1            0            0            0            0
 8        8            1            0            0            0            0
 9        9            1            0            0            0            0
10       10            1            0            0            0            0
# ... with 490 more rows

这是另一种方法。首先提供可重现的数据:

# Reproducible sample: 50 uniform draws between 0 and 500, rounded to integers
set.seed(42)
var <- round(runif(n = 50, min = 0, max = 500))
# Bin the values into right-closed 100-unit intervals (0,100], ..., (400,500]
dummy <- cut(var, breaks = seq(0, 500, by = 100))
# Number of observations per interval
table(dummy)
# dummy
#   (0,100] (100,200] (200,300] (300,400] (400,500] 
#         7         6         9        10        18 

现在为每个值创建列:

# Cross-tabulate the observation index against the interval factor: each row
# holds a single 1 in the column of the interval that observation belongs to
dumvar <- table(seq_along(dummy), dummy)
# Show the first and the last six rows
head(dumvar)
tail(dumvar)
#    dummy
#     (0,100] (100,200] (200,300] (300,400] (400,500]
#   1       0         0         0         0         1
#   2       0         0         0         0         1
#   3       0         1         0         0         0
#   4       0         0         0         0         1
#   5       0         0         0         1         0
#   6       0         0         1         0         0
#     dummy
#      (0,100] (100,200] (200,300] (300,400] (400,500]
#   45       0         0         1         0         0
#   46       0         0         0         0         1
#   47       0         0         0         0         1
#   48       0         0         0         1         0
#   49       0         0         0         0         1
#   50       0         0         0         1         0

如果要重命名列:

# Relabel the interval columns as Distance_100, Distance_200, ..., Distance_500
dimnames(dumvar)[["dummy"]] <- paste0("Distance_", 1:5 * 100)

这是另一个不错的方法:先用 cut() 将数据划分为区间,然后使用 model.matrix() 创建虚拟变量。

# Example data: 20 distances drawn uniformly between 0 and 500
data <- data.frame(Distance = runif(n = 20, min = 0, max = 500))
# Bin each distance into right-closed 100-unit intervals (0,100], ..., (400,500]
DistanceCut5 <- cut(data$Distance, breaks = seq(0, 500, by = 100))
# Expand the factor into one 0/1 column per level; the leading 0 drops the
# intercept so no column of 1s is produced
dummies <- model.matrix(~ 0 + DistanceCut5)
# Append the indicator columns to the original data
data <- cbind(data, dummies)

确保 DistanceCut5 中没有任何 NA。否则你的虚拟矩阵中的行会太少。