如何在 R 中为区间创建多列虚拟变量
How to create multiple columns of dummies for intervals in R
# Build one 0/1 indicator column per 100-unit band of data$Distance.
# The bands are contiguous and half-open, [lower, upper): each lower bound
# equals the previous upper bound, so values such as 100 or 100.5 fall into
# exactly one column. Lower bounds of 101, 201, ... would leave every value in
# [100, 101), [200, 201), ... without any indicator set.
# Values of 500 or more get 0 in every column.
data$Distance_100 <- 0
data$Distance_100[data$Distance < 100] <- 1
data$Distance_200 <- 0
data$Distance_200[data$Distance >= 100 & data$Distance < 200] <- 1
data$Distance_300 <- 0
data$Distance_300[data$Distance >= 200 & data$Distance < 300] <- 1
data$Distance_400 <- 0
data$Distance_400[data$Distance >= 300 & data$Distance < 400] <- 1
data$Distance_500 <- 0
data$Distance_500[data$Distance >= 400 & data$Distance < 500] <- 1
结果必须是多列,而下面这行代码只会创建一列:data$DistanceCut5 = cut(data$Distance, breaks=c(0,100,200,300,400,500))
cut() 只会创建一个单独的列;如果你想为每个区间(cut 的每个水平)各创建一列,可以这样做:
例子
所需的包
library(tidyverse)
代码
# Break points of the distance bands: 0, 100, ..., 500
seq_0_500 <- seq(0, 500, by = 100)
# Example data: one row per distance value from 1 to 500
tibble(distance = 1:500) %>%
  mutate(
    # Label each row with its band; cut() builds the right-closed intervals
    # (0,100], (100,200], ..., (400,500], each named after its upper bound
    distance_cut = cut(
      distance,
      breaks = seq_0_500,
      labels = paste0("Distance_", seq_0_500[-1])
    ),
    # Constant that becomes the 1 in the indicator column of the row's band
    aux = 1
  ) %>%
  # Spread the bands into one column each. values_fill = 0 writes 0 into the
  # bands a row does not belong to, so no separate NA clean-up pass over the
  # whole tibble is needed afterwards.
  pivot_wider(
    names_from = distance_cut,
    values_from = aux,
    values_fill = 0
  )
输出
# A tibble: 500 x 6
distance Distance_100 Distance_200 Distance_300 Distance_400 Distance_500
<int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 0 0 0 0
2 2 1 0 0 0 0
3 3 1 0 0 0 0
4 4 1 0 0 0 0
5 5 1 0 0 0 0
6 6 1 0 0 0 0
7 7 1 0 0 0 0
8 8 1 0 0 0 0
9 9 1 0 0 0 0
10 10 1 0 0 0 0
# ... with 490 more rows
这是另一种方法。首先提供可重现的数据:
# Reproducible example: 50 uniform draws on [0, 500], rounded to whole numbers
set.seed(42)
var <- round(runif(n = 50, min = 0, max = 500))
# Bin the values into five right-closed bands of width 100
dummy <- cut(var, breaks = seq(0, 500, by = 100))
# Number of observations per band
table(dummy)
# dummy
# (0,100] (100,200] (200,300] (300,400] (400,500]
# 7 6 9 10 18
现在为每个值创建列:
# Cross-tabulate the observation index against the interval factor: the result
# has one row per observation and one column per interval, with a single 1
# marking the interval the observation belongs to
dumvar <- table(seq_along(dummy), dummy)
# Show the first six and the last six rows
head(dumvar)
tail(dumvar)
# dummy
# (0,100] (100,200] (200,300] (300,400] (400,500]
# 1 0 0 0 0 1
# 2 0 0 0 0 1
# 3 0 1 0 0 0
# 4 0 0 0 0 1
# 5 0 0 0 1 0
# 6 0 0 1 0 0
# dummy
# (0,100] (100,200] (200,300] (300,400] (400,500]
# 45 0 0 1 0 0
# 46 0 0 0 0 1
# 47 0 0 0 0 1
# 48 0 0 0 1 0
# 49 0 0 0 0 1
# 50 0 0 0 1 0
如果要重命名列:
# Replace the interval labels with Distance_100, ..., Distance_500
dimnames(dumvar)[["dummy"]] <- paste0("Distance_", 1:5 * 100)
这是一个不错的方法:先用 cut() 对数据分段,然后使用 model.matrix() 创建虚拟变量。
# Example data: 20 uniform random distances between 0 and 500
data <- data.frame(Distance = runif(n = 20, min = 0, max = 500))
# Assign every distance to one of five right-closed bands of width 100
DistanceCut5 <- cut(data$Distance, breaks = seq(0, 500, by = 100))
# Expand the factor into one 0/1 column per band; dropping the intercept
# (- 1) keeps a column for every level instead of a leading column of 1s
dummies <- model.matrix(~ DistanceCut5 - 1)
# Append the indicator columns to the original data
data <- cbind(data, dummies)
请确保 DistanceCut5 中没有任何 NA。否则 model.matrix() 会丢弃含 NA 的行,得到的虚拟变量矩阵行数会少于 data 的行数,无法与 data 按列合并。
# Build one 0/1 indicator column per 100-unit band of data$Distance.
# The bands are contiguous and half-open, [lower, upper): each lower bound
# equals the previous upper bound, so values such as 100 or 100.5 fall into
# exactly one column. Lower bounds of 101, 201, ... would leave every value in
# [100, 101), [200, 201), ... without any indicator set.
# Values of 500 or more get 0 in every column.
data$Distance_100 <- 0
data$Distance_100[data$Distance < 100] <- 1
data$Distance_200 <- 0
data$Distance_200[data$Distance >= 100 & data$Distance < 200] <- 1
data$Distance_300 <- 0
data$Distance_300[data$Distance >= 200 & data$Distance < 300] <- 1
data$Distance_400 <- 0
data$Distance_400[data$Distance >= 300 & data$Distance < 400] <- 1
data$Distance_500 <- 0
data$Distance_500[data$Distance >= 400 & data$Distance < 500] <- 1
结果必须是多列,而下面这行代码只会创建一列:data$DistanceCut5 = cut(data$Distance, breaks=c(0,100,200,300,400,500))
cut() 只会创建一个单独的列;如果你想为每个区间(cut 的每个水平)各创建一列,可以这样做:
例子
所需的包
library(tidyverse)
代码
# Break points of the distance bands: 0, 100, ..., 500
seq_0_500 <- seq(0, 500, by = 100)
# Example data: one row per distance value from 1 to 500
tibble(distance = 1:500) %>%
  mutate(
    # Label each row with its band; cut() builds the right-closed intervals
    # (0,100], (100,200], ..., (400,500], each named after its upper bound
    distance_cut = cut(
      distance,
      breaks = seq_0_500,
      labels = paste0("Distance_", seq_0_500[-1])
    ),
    # Constant that becomes the 1 in the indicator column of the row's band
    aux = 1
  ) %>%
  # Spread the bands into one column each. values_fill = 0 writes 0 into the
  # bands a row does not belong to, so no separate NA clean-up pass over the
  # whole tibble is needed afterwards.
  pivot_wider(
    names_from = distance_cut,
    values_from = aux,
    values_fill = 0
  )
输出
# A tibble: 500 x 6
distance Distance_100 Distance_200 Distance_300 Distance_400 Distance_500
<int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 0 0 0 0
2 2 1 0 0 0 0
3 3 1 0 0 0 0
4 4 1 0 0 0 0
5 5 1 0 0 0 0
6 6 1 0 0 0 0
7 7 1 0 0 0 0
8 8 1 0 0 0 0
9 9 1 0 0 0 0
10 10 1 0 0 0 0
# ... with 490 more rows
这是另一种方法。首先提供可重现的数据:
# Reproducible example: 50 uniform draws on [0, 500], rounded to whole numbers
set.seed(42)
var <- round(runif(n = 50, min = 0, max = 500))
# Bin the values into five right-closed bands of width 100
dummy <- cut(var, breaks = seq(0, 500, by = 100))
# Number of observations per band
table(dummy)
# dummy
# (0,100] (100,200] (200,300] (300,400] (400,500]
# 7 6 9 10 18
现在为每个值创建列:
# Cross-tabulate the observation index against the interval factor: the result
# has one row per observation and one column per interval, with a single 1
# marking the interval the observation belongs to
dumvar <- table(seq_along(dummy), dummy)
# Show the first six and the last six rows
head(dumvar)
tail(dumvar)
# dummy
# (0,100] (100,200] (200,300] (300,400] (400,500]
# 1 0 0 0 0 1
# 2 0 0 0 0 1
# 3 0 1 0 0 0
# 4 0 0 0 0 1
# 5 0 0 0 1 0
# 6 0 0 1 0 0
# dummy
# (0,100] (100,200] (200,300] (300,400] (400,500]
# 45 0 0 1 0 0
# 46 0 0 0 0 1
# 47 0 0 0 0 1
# 48 0 0 0 1 0
# 49 0 0 0 0 1
# 50 0 0 0 1 0
如果要重命名列:
# Replace the interval labels with Distance_100, ..., Distance_500
dimnames(dumvar)[["dummy"]] <- paste0("Distance_", 1:5 * 100)
这是一个不错的方法:先用 cut() 对数据分段,然后使用 model.matrix() 创建虚拟变量。
# Example data: 20 uniform random distances between 0 and 500
data <- data.frame(Distance = runif(n = 20, min = 0, max = 500))
# Assign every distance to one of five right-closed bands of width 100
DistanceCut5 <- cut(data$Distance, breaks = seq(0, 500, by = 100))
# Expand the factor into one 0/1 column per band; dropping the intercept
# (- 1) keeps a column for every level instead of a leading column of 1s
dummies <- model.matrix(~ DistanceCut5 - 1)
# Append the indicator columns to the original data
data <- cbind(data, dummies)
请确保 DistanceCut5 中没有任何 NA。否则 model.matrix() 会丢弃含 NA 的行,得到的虚拟变量矩阵行数会少于 data 的行数,无法与 data 按列合并。