使用列中的值将 N 行空行插入到 R 数据框中
Insert N Blank Rows to R Data Frame Using Value in a Column
我有一个数据框:
# Example data: eight labelled rows; Var_2 holds the number of blank rows
# wanted after each row.
Var_1 <- LETTERS[1:8]
Var_2 <- c(0, 1, 0, 2, 1, 0, 0, 1)
DF <- data.frame(Var_1, Var_2)
print(DF)
Var_1 Var_2
1 A 0
2 B 1
3 C 0
4 D 2
5 E 1
6 F 0
7 G 0
8 H 1
我需要向数据框中插入 N 个用 NA 填充的空白行,其中 N 由 Var_2 中的值确定。这些新行应紧跟在 Var_2 值 >= 1 的行之后插入。所以我希望我的数据框看起来像这样:
print(DF)
Var_1 Var_2
1 A 0
2 B 1
3 <NA> NA
4 C 0
5 D 2
6 <NA> NA
7 <NA> NA
8 E 1
9 <NA> NA
10 F 0
11 G 0
12 H 1
13 <NA> NA
这个问题我实在无从下手,如能提供任何帮助,我将不胜感激。谢谢。
有趣的问题:
# Rows after which blank rows must be inserted (Var_2 > 0).
ind <- which(DF$Var_2 > 0)
ind
# [1] 2 4 5 8
# Each chunk ends at one of those rows; the final chunk always ends at the
# bottom row, even when that row's Var_2 is 0.
stops <- unique(c(ind, nrow(DF)))
# Each chunk starts right after the previous chunk's stop. Deriving starts
# from stops keeps both vectors the same length, so Map() never recycles
# starts when the last non-zero Var_2 is not on the bottom row.
starts <- c(1L, head(stops, n = -1) + 1L)
starts
# [1] 1 3 5 6
stops
# [1] 2 4 5 8
# For every chunk a:b, append Var_2[b] all-NA rows after its last row b
# (indexing a data frame with NA yields rows filled with NA; zero NAs yield
# zero extra rows).
DFaug_list <- Map(
  function(a, b) rbind(DF[a:b, ], DF[b, ][rep(NA, DF$Var_2[b]), ]),
  starts, stops)
我们现在得到了一个数据框列表:
str(DFaug_list)
# List of 4
# $ :'data.frame': 3 obs. of 2 variables:
# ..$ Var_1: Factor w/ 8 levels "A","B","C","D",..: 1 2 NA
# ..$ Var_2: int [1:3] 0 1 NA
# $ :'data.frame': 4 obs. of 2 variables:
# ..$ Var_1: Factor w/ 8 levels "A","B","C","D",..: 3 4 NA NA
# ..$ Var_2: int [1:4] 0 2 NA NA
# $ :'data.frame': 2 obs. of 2 variables:
# ..$ Var_1: Factor w/ 8 levels "A","B","C","D",..: 5 NA
# ..$ Var_2: int [1:2] 1 NA
# $ :'data.frame': 4 obs. of 2 variables:
# ..$ Var_1: Factor w/ 8 levels "A","B","C","D",..: 6 7 8 NA
# ..$ Var_2: int [1:4] 0 0 1 NA
我们只需把它们合并起来即可:可以使用 do.call 的技巧,也可以使用 data.table 或 dplyr 包中的函数:
# Stack all chunks into one data frame with base R. Every assignment to DFaug
# in this snippet overwrites the previous one, so the three calls are
# alternatives: run whichever one you prefer.
DFaug <- do.call(rbind.data.frame, DFaug_list)
DFaug
# Var_1 Var_2
# 1 A 0
# 2 B 1
# NA <NA> NA
# 3 C 0
# 4 D 2
# NA1 <NA> NA
# NA.1 <NA> NA
# 5 E 1
# NA2 <NA> NA
# 6 F 0
# 7 G 0
# 8 H 1
# NA3 <NA> NA
# Alternative using data.table.
# NOTE(review): rbindlist() presumably returns a data.table rather than a
# plain data.frame; confirm that suits downstream code.
DFaug <- data.table::rbindlist(DFaug_list)
# Alternative using dplyr.
DFaug <- dplyr::bind_rows(DFaug_list)
sequence 和取子集来救场。
将每一行按 Var_2 + 1 次复制:当值 > 0 时会生成 Var_2 个额外的行,当值 == 0 时则只保留该行本身。
然后可以对该序列使用 duplicated 得到的 TRUE/FALSE,只把额外的行替换为 NA。
# Repeat each row index Var_2 + 1 times: once for the original row plus
# Var_2 extra copies that will become the blank rows.
s <- rep(seq_len(nrow(DF)), times = DF$Var_2 + 1)
DFnew <- DF[s, ]
# Every occurrence of an index after its first one is an extra row.
DFnew[duplicated(s), ] <- NA
DFnew
# Var_1 Var_2
#1 A 0
#2 B 1
#2.1 <NA> NA
#3 C 0
#4 D 2
#4.1 <NA> NA
#4.2 <NA> NA
#5 E 1
#5.1 <NA> NA
#6 F 0
#7 G 0
#8 H 1
#8.1 <NA> NA
除了最庞大的数据集外,这在所有数据集上也应该非常快。
# 800K records: stack the 8-row example 1e5 times
DF <- DF[rep(1:8, times = 1e5), ]
system.time({
  s <- rep(seq_len(nrow(DF)), times = DF$Var_2 + 1)
  DFnew <- DF[s, ]
  DFnew[duplicated(s), ] <- NA
})
# user system elapsed
# 0.600 0.000 0.601
您可以将每一行重复 Var_2 + 1 次,并将 duplicated(重复)的行替换为 NA:
library(dplyr)
# Tag every row with its position, then repeat it Var_2 + 1 times
# (uncount() drops the helper weight column Var_3 afterwards).
DF1 <- DF %>%
  mutate(row = row_number(), Var_3 = Var_2 + 1) %>%
  tidyr::uncount(Var_3)
# Repeats of the same position are the extra rows: blank them out.
DF1[duplicated(DF1[["row"]]), ] <- NA
# The position tag was only needed to detect the repeats.
DF1[["row"]] <- NULL
# Var_1 Var_2
#1 A 0
#2 B 1
#2.1 <NA> NA
#3 C 0
#4 D 2
#4.1 <NA> NA
#4.2 <NA> NA
#5 E 1
#5.1 <NA> NA
#6 F 0
#7 G 0
#8 H 1
#8.1 <NA> NA
我有一个数据框:
# Example data: eight labelled rows; Var_2 holds the number of blank rows
# wanted after each row.
Var_1 <- LETTERS[1:8]
Var_2 <- c(0, 1, 0, 2, 1, 0, 0, 1)
DF <- data.frame(Var_1, Var_2)
print(DF)
Var_1 Var_2
1 A 0
2 B 1
3 C 0
4 D 2
5 E 1
6 F 0
7 G 0
8 H 1
我需要向数据框中插入 N 个用 NA 填充的空白行,其中 N 由 Var_2 中的值确定。这些新行应紧跟在 Var_2 值 >= 1 的行之后插入。所以我希望我的数据框看起来像这样:
print(DF)
Var_1 Var_2
1 A 0
2 B 1
3 <NA> NA
4 C 0
5 D 2
6 <NA> NA
7 <NA> NA
8 E 1
9 <NA> NA
10 F 0
11 G 0
12 H 1
13 <NA> NA
这个问题我实在无从下手,如能提供任何帮助,我将不胜感激。谢谢。
有趣的问题:
# Rows after which blank rows must be inserted (Var_2 > 0).
ind <- which(DF$Var_2 > 0)
ind
# [1] 2 4 5 8
# Each chunk ends at one of those rows; the final chunk always ends at the
# bottom row, even when that row's Var_2 is 0.
stops <- unique(c(ind, nrow(DF)))
# Each chunk starts right after the previous chunk's stop. Deriving starts
# from stops keeps both vectors the same length, so Map() never recycles
# starts when the last non-zero Var_2 is not on the bottom row.
starts <- c(1L, head(stops, n = -1) + 1L)
starts
# [1] 1 3 5 6
stops
# [1] 2 4 5 8
# For every chunk a:b, append Var_2[b] all-NA rows after its last row b
# (indexing a data frame with NA yields rows filled with NA; zero NAs yield
# zero extra rows).
DFaug_list <- Map(
  function(a, b) rbind(DF[a:b, ], DF[b, ][rep(NA, DF$Var_2[b]), ]),
  starts, stops)
我们现在得到了一个数据框列表:
str(DFaug_list)
# List of 4
# $ :'data.frame': 3 obs. of 2 variables:
# ..$ Var_1: Factor w/ 8 levels "A","B","C","D",..: 1 2 NA
# ..$ Var_2: int [1:3] 0 1 NA
# $ :'data.frame': 4 obs. of 2 variables:
# ..$ Var_1: Factor w/ 8 levels "A","B","C","D",..: 3 4 NA NA
# ..$ Var_2: int [1:4] 0 2 NA NA
# $ :'data.frame': 2 obs. of 2 variables:
# ..$ Var_1: Factor w/ 8 levels "A","B","C","D",..: 5 NA
# ..$ Var_2: int [1:2] 1 NA
# $ :'data.frame': 4 obs. of 2 variables:
# ..$ Var_1: Factor w/ 8 levels "A","B","C","D",..: 6 7 8 NA
# ..$ Var_2: int [1:4] 0 0 1 NA
我们只需把它们合并起来即可:可以使用 do.call 的技巧,也可以使用 data.table 或 dplyr 包中的函数:
# Stack all chunks into one data frame with base R. Every assignment to DFaug
# in this snippet overwrites the previous one, so the three calls are
# alternatives: run whichever one you prefer.
DFaug <- do.call(rbind.data.frame, DFaug_list)
DFaug
# Var_1 Var_2
# 1 A 0
# 2 B 1
# NA <NA> NA
# 3 C 0
# 4 D 2
# NA1 <NA> NA
# NA.1 <NA> NA
# 5 E 1
# NA2 <NA> NA
# 6 F 0
# 7 G 0
# 8 H 1
# NA3 <NA> NA
# Alternative using data.table.
# NOTE(review): rbindlist() presumably returns a data.table rather than a
# plain data.frame; confirm that suits downstream code.
DFaug <- data.table::rbindlist(DFaug_list)
# Alternative using dplyr.
DFaug <- dplyr::bind_rows(DFaug_list)
sequence 和取子集来救场。
将每一行按 Var_2 + 1 次复制:当值 > 0 时会生成 Var_2 个额外的行,当值 == 0 时则只保留该行本身。
然后可以对该序列使用 duplicated 得到的 TRUE/FALSE,只把额外的行替换为 NA。
# Repeat each row index Var_2 + 1 times: once for the original row plus
# Var_2 extra copies that will become the blank rows.
s <- rep(seq_len(nrow(DF)), times = DF$Var_2 + 1)
DFnew <- DF[s, ]
# Every occurrence of an index after its first one is an extra row.
DFnew[duplicated(s), ] <- NA
DFnew
# Var_1 Var_2
#1 A 0
#2 B 1
#2.1 <NA> NA
#3 C 0
#4 D 2
#4.1 <NA> NA
#4.2 <NA> NA
#5 E 1
#5.1 <NA> NA
#6 F 0
#7 G 0
#8 H 1
#8.1 <NA> NA
除了最庞大的数据集外,这在所有数据集上也应该非常快。
# 800K records: stack the 8-row example 1e5 times
DF <- DF[rep(1:8, times = 1e5), ]
system.time({
  s <- rep(seq_len(nrow(DF)), times = DF$Var_2 + 1)
  DFnew <- DF[s, ]
  DFnew[duplicated(s), ] <- NA
})
# user system elapsed
# 0.600 0.000 0.601
您可以将每一行重复 Var_2 + 1 次,并将 duplicated(重复)的行替换为 NA:
library(dplyr)
# Tag every row with its position, then repeat it Var_2 + 1 times
# (uncount() drops the helper weight column Var_3 afterwards).
DF1 <- DF %>%
  mutate(row = row_number(), Var_3 = Var_2 + 1) %>%
  tidyr::uncount(Var_3)
# Repeats of the same position are the extra rows: blank them out.
DF1[duplicated(DF1[["row"]]), ] <- NA
# The position tag was only needed to detect the repeats.
DF1[["row"]] <- NULL
# Var_1 Var_2
#1 A 0
#2 B 1
#2.1 <NA> NA
#3 C 0
#4 D 2
#4.1 <NA> NA
#4.2 <NA> NA
#5 E 1
#5.1 <NA> NA
#6 F 0
#7 G 0
#8 H 1
#8.1 <NA> NA