每2列加1个空列
Add 1 blank column every 2 columns
我有一个像这样的数据框“df”:
col1 col2 col3 col4 col5 col6
1 2 2 3 5 7
2 4 6 4 8 2
5 9 7 3 2 5
3 4 5 6 8 1
我想创建一个新的数据框“new_df”,其中每 2 列有 1 个空白列(称为“空”),如下所示:
empty col1 col2 empty col3 col4 empty col5 col6
NA 1 2 NA 2 3 NA 5 7
NA 2 4 NA 6 4 NA 8 2
NA 5 9 NA 7 3 NA 2 5
NA 3 4 NA 5 6 NA 8 1
这种方式如何添加空白列?
我试过使用:
n = length(df)
empty <- NA
for (i in seq(1,n-2,2))
{
new_df <- add_column(df, empty, .before=i)
}
但它只记住最后一步,给了我这个结果:
col1 col2 col3 col4 empty col5 col6
1 2 2 3 NA 5 7
2 4 6 4 NA 8 2
5 9 7 3 NA 2 5
3 4 5 6 NA 8 1
这是一个基本的 R 选项 -
我们可以将数据每 2 列拆分为数据框列表,并使用 Map
在每个数据框中添加一个新列 NA
。
split_data <- split.default(df,rep(seq_along(df), each = 2, length.out = ncol(df)))
do.call(cbind, Map(function(x, y)
cbind(setNames(data.frame(NA), paste0('empty', x)), y),
seq_along(split_data), split_data)) -> result
result
# empty1 col1 col2 empty2 col3 col4 empty3 col5 col6
#1 NA 1 2 NA 2 3 NA 5 7
#2 NA 2 4 NA 6 4 NA 8 2
#3 NA 5 9 NA 7 3 NA 2 5
#4 NA 3 4 NA 5 6 NA 8 1
在数据框中使用重复的列名不是一个好习惯,因此我将它们命名为 empty1
、empty2
等
数据
df <- structure(list(col1 = c(1L, 2L, 5L, 3L), col2 = c(2L, 4L, 9L,
4L), col3 = c(2L, 6L, 7L, 5L), col4 = c(3L, 4L, 3L, 6L), col5 = c(5L,
8L, 2L, 8L), col6 = c(7L, 2L, 5L, 1L)),
class = "data.frame", row.names = c(NA, -4L))
另一个基础 R 解决方案
tmp1=seq(1,ncol(df),3)
tmp2=!(1:ncol(df) %in% tmp1)
df2=data.frame(matrix(NA,nrow(df),ncol(df)+ncol(df)/2))
df2[tmp2]=df
colnames(df2)[tmp1]=paste0("empty",1:length(tmp1))
colnames(df2)[tmp2]=colnames(df)
empty1 col1 col2 empty2 col3 col4 empty3 col5 col6
1 NA 1 2 NA 2 3 NA 5 7
2 NA 2 4 NA 6 4 NA 8 2
3 NA 5 9 NA 7 3 NA 2 5
4 NA 3 4 NA 5 6 NA 8 1
使用 append()
.
for (i in 0:2*ncol(dat)/2) dat <- as.data.frame(append(dat, list(emp=NA), i))
dat
# emp col1 col2 emp.1 col3 col4 emp.2 col5 col6
# 1 NA 1 2 NA 2 3 NA 5 7
# 2 NA 2 4 NA 6 4 NA 8 2
# 3 NA 5 9 NA 7 3 NA 2 5
# 4 NA 3 4 NA 5 6 NA 8 1
数据:
dat <- structure(list(col1 = c(1L, 2L, 5L, 3L), col2 = c(2L, 4L, 9L,
4L), col3 = c(2L, 6L, 7L, 5L), col4 = c(3L, 4L, 3L, 6L), col5 = c(5L,
8L, 2L, 8L), col6 = c(7L, 2L, 5L, 1L)), class = "data.frame", row.names = c(NA,
-4L))
然后...
微基准测试
# Unit: microseconds
# expr min lq mean median uq max neval cld
# ronak() 969.707 990.9945 1001.4807 1012.282 1017.368 1022.453 3 d
# user() 349.937 358.0145 364.3877 366.092 371.613 377.134 3 a
# jay() 2098.003 2100.8540 2115.7640 2103.705 2124.644 2145.584 3 e
# groth1() 2164.896 2262.5745 2363.6133 2360.253 2462.972 2565.691 3 f
# groth2() 424.546 438.0185 455.0820 451.491 470.350 489.209 3 ab
# groth3() 722.551 728.0910 733.1910 733.631 738.511 743.391 3 c
# r.user() 612.432 619.6570 636.9573 626.882 649.220 671.558 3 bc
## and with the usual expanded data frame:
set.seed(42)
dat <- dat[sample(nrow(dat), 1e6, replace=T), ]
microbenchmark::microbenchmark(ronak(), user(), jay(), groth1(), groth2(), groth3(), r.user(), times=3L)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# ronak() 1375.139030 1456.858743 1564.509886 1538.578457 1659.19531 1779.81217 3 c
# user() 89.017416 200.845539 251.548652 312.673662 332.81427 352.95488 3 a
# jay() 7.655812 8.382333 9.941684 9.108855 11.08462 13.06039 3 a
# groth1() 501.263785 514.097103 621.755474 526.930421 682.00132 837.07222 3 b
# groth2() 143.438836 147.783741 189.033391 152.128645 211.83067 271.53269 3 a
# groth3() 1387.314877 1406.898863 1469.493158 1426.482849 1510.58230 1594.68175 3 c
# r.user() 1469.543881 1472.770464 1483.834022 1475.997046 1490.97909 1505.96114 3 c
代码:
ronak <- \() {
split_data <- split.default(dat,rep(seq_along(dat), each=2, length.out=ncol(dat)))
do.call(cbind, Map(function(x, y) cbind(setNames(data.frame(NA), paste0('empty', x)), y),
seq_along(split_data), split_data))
}
user <- \() {
tmp1=seq(1, 9,3);tmp2=!(1:9 %in% tmp1);dat2=data.frame(matrix(NA,nrow(dat),ncol(dat)+ncol(dat)/2))
dat2[tmp2]=dat;colnames(dat2)[tmp1]=paste0("empty",1:length(tmp1))
colnames(dat2)[tmp2]=colnames(dat);dat2
}
jay <- \() {for (i in 0:2*ncol(dat)/2) dat <- as.data.frame(append(dat, list(emp=NA), i));dat}
groth1 <- \() suppressMessages({
require(dplyr):require(purrr)
dat %>% split.default(as.numeric(gl(ncol(.), 2, ncol(.)))) %>% map(~ bind_cols(empty=NA, .)) %>%
bind_cols
})
groth2 <- \() {
ix <- cumsum(seq_along(dat) %% 2 + 1);dat2 <- replace(data.frame(matrix(NA, nrow(dat), max(ix))), ix, dat)
names(dat2) <- replace(rep("empty", ncol(dat2)), ix, names(dat));dat2
}
groth3 <- \() {
ix <- as.numeric(gl(ncol(dat), 2, ncol(dat))) # 1 1 2 2 3 3
do.call("cbind", Map(cbind, empty = NA, split.default(dat, ix)))
}
r.user <- \() do.call(cbind, lapply(seq(1, ncol(dat), by=2), function(i)
cbind(empty=rep(NA, nrow(dat)), dat[, seq(i, i+1)])))
!) dplyr/purrr 拆分数据框,DF,在每个组件之前绑定一个 NA 列,并将生成的组件重新绑定在一起。在多个列中使用与问题示例输出中相同的列名存在无法按名称识别列的问题,因此使用唯一名称。
library(dplyr)
library(purrr)
DF %>%
split.default(as.numeric(gl(ncol(.), 2, ncol(.)))) %>%
map(~ bind_cols(empty = NA, .)) %>%
bind_cols
给予:
empty...1 col1 col2 empty...4 col3 col4 empty...7 col5 col6
1 NA 1 2 NA 2 3 NA 5 7
2 NA 2 4 NA 6 4 NA 8 2
3 NA 5 9 NA 7 3 NA 2 5
4 NA 3 4 NA 5 6 NA 8 1
2) Base R 创建一个向量ix,它在结果数据框中给出原始数据框的索引,然后创建一个空结果并将DF及其名称复制到它。
ix <- cumsum(seq_along(DF) %% 2 + 1) # 2 3 5 6 8 9
DF2 <- replace(data.frame(matrix(NA, nrow(DF), max(ix))), ix, DF)
names(DF2) <- replace(rep("empty", ncol(DF2)), ix, names(DF))
DF2
给予:
empty col1 col2 empty col3 col4 empty col5 col6
1 NA 1 2 NA 2 3 NA 5 7
2 NA 2 4 NA 6 4 NA 8 2
3 NA 5 9 NA 7 3 NA 2 5
4 NA 3 4 NA 5 6 NA 8 1
3) Base R 这是另一个Base R的解决方案。它粗略地将 (1) 翻译成 Base R。它给出与 (2) 相同的结果。
ix <- as.numeric(gl(ncol(DF), 2, ncol(DF))) # 1 1 2 2 3 3
do.call("cbind", Map(cbind, empty = NA, split.default(DF, ix)))
4) eList eList包可以用于特别短的解决方案。
library(eList)
DF(for(i in seq(1, ncol(DF), 2)) list(empty = NA, DF[seq(i, len = 2)]))
给予:
empty col1 col2 empty.1 col3 col4 empty.2 col5 col6
1 NA 1 2 NA 2 3 NA 5 7
2 NA 2 4 NA 6 4 NA 8 2
3 NA 5 9 NA 7 3 NA 2 5
4 NA 3 4 NA 5 6 NA 8 1
备注
可重现形式的输入。
Lines <- "col1 col2 col3 col4 col5 col6
1 2 2 3 5 7
2 4 6 4 8 2
5 9 7 3 2 5
3 4 5 6 8 1"
DF <- read.table(text = Lines, header = TRUE)
基本的 R 解决方案是:
do.call(cbind, lapply(seq(1, ncol(df), by = 2), function(i) cbind(empty = rep(NA, nrow(df)), df[, seq(i, i+1)])))
# empty col1 col2 empty col3 col4 empty col5 col6
#1 NA 1 2 NA 2 3 NA 5 7
#2 NA 2 4 NA 6 4 NA 8 2
#3 NA 5 9 NA 7 3 NA 2 5
#4 NA 3 4 NA 5 6 NA 8 1
“整洁”的解决方案可以是:
library(tidyverse)
map_dfc(seq(from = 1, to = ncol(df), by = 2),
~df %>%
mutate(empty = NA) %>%
select(empty, .x, .x+1))
#New names:
#* empty -> empty...1
#* empty -> empty...4
#* empty -> empty...7
#empty...1 col1 col2 empty...4 col3 col4 empty...7 col5 col6
#1 NA 1 2 NA 2 3 NA 5 7
#2 NA 2 4 NA 6 4 NA 8 2
#3 NA 5 9 NA 7 3 NA 2 5
#4 NA 3 4 NA 5 6 NA 8 1
我有一个像这样的数据框“df”:
col1 col2 col3 col4 col5 col6
1 2 2 3 5 7
2 4 6 4 8 2
5 9 7 3 2 5
3 4 5 6 8 1
我想创建一个新的数据框“new_df”,其中每 2 列有 1 个空白列(称为“空”),如下所示:
empty col1 col2 empty col3 col4 empty col5 col6
NA 1 2 NA 2 3 NA 5 7
NA 2 4 NA 6 4 NA 8 2
NA 5 9 NA 7 3 NA 2 5
NA 3 4 NA 5 6 NA 8 1
这种方式如何添加空白列? 我试过使用:
n = length(df)
empty <- NA
for (i in seq(1,n-2,2))
{
new_df <- add_column(df, empty, .before=i)
}
但它只记住最后一步,给了我这个结果:
col1 col2 col3 col4 empty col5 col6
1 2 2 3 NA 5 7
2 4 6 4 NA 8 2
5 9 7 3 NA 2 5
3 4 5 6 NA 8 1
这是一个基本的 R 选项 -
我们可以将数据每 2 列拆分为数据框列表,并使用 Map
在每个数据框中添加一个新列 NA
。
split_data <- split.default(df,rep(seq_along(df), each = 2, length.out = ncol(df)))
do.call(cbind, Map(function(x, y)
cbind(setNames(data.frame(NA), paste0('empty', x)), y),
seq_along(split_data), split_data)) -> result
result
# empty1 col1 col2 empty2 col3 col4 empty3 col5 col6
#1 NA 1 2 NA 2 3 NA 5 7
#2 NA 2 4 NA 6 4 NA 8 2
#3 NA 5 9 NA 7 3 NA 2 5
#4 NA 3 4 NA 5 6 NA 8 1
在数据框中使用重复的列名不是一个好习惯,因此我将它们命名为 empty1
、empty2
等
数据
df <- structure(list(col1 = c(1L, 2L, 5L, 3L), col2 = c(2L, 4L, 9L,
4L), col3 = c(2L, 6L, 7L, 5L), col4 = c(3L, 4L, 3L, 6L), col5 = c(5L,
8L, 2L, 8L), col6 = c(7L, 2L, 5L, 1L)),
class = "data.frame", row.names = c(NA, -4L))
另一个基础 R 解决方案
tmp1=seq(1,ncol(df),3)
tmp2=!(1:ncol(df) %in% tmp1)
df2=data.frame(matrix(NA,nrow(df),ncol(df)+ncol(df)/2))
df2[tmp2]=df
colnames(df2)[tmp1]=paste0("empty",1:length(tmp1))
colnames(df2)[tmp2]=colnames(df)
empty1 col1 col2 empty2 col3 col4 empty3 col5 col6
1 NA 1 2 NA 2 3 NA 5 7
2 NA 2 4 NA 6 4 NA 8 2
3 NA 5 9 NA 7 3 NA 2 5
4 NA 3 4 NA 5 6 NA 8 1
使用 append()
.
for (i in 0:2*ncol(dat)/2) dat <- as.data.frame(append(dat, list(emp=NA), i))
dat
# emp col1 col2 emp.1 col3 col4 emp.2 col5 col6
# 1 NA 1 2 NA 2 3 NA 5 7
# 2 NA 2 4 NA 6 4 NA 8 2
# 3 NA 5 9 NA 7 3 NA 2 5
# 4 NA 3 4 NA 5 6 NA 8 1
数据:
dat <- structure(list(col1 = c(1L, 2L, 5L, 3L), col2 = c(2L, 4L, 9L,
4L), col3 = c(2L, 6L, 7L, 5L), col4 = c(3L, 4L, 3L, 6L), col5 = c(5L,
8L, 2L, 8L), col6 = c(7L, 2L, 5L, 1L)), class = "data.frame", row.names = c(NA,
-4L))
然后...
微基准测试
# Unit: microseconds
# expr min lq mean median uq max neval cld
# ronak() 969.707 990.9945 1001.4807 1012.282 1017.368 1022.453 3 d
# user() 349.937 358.0145 364.3877 366.092 371.613 377.134 3 a
# jay() 2098.003 2100.8540 2115.7640 2103.705 2124.644 2145.584 3 e
# groth1() 2164.896 2262.5745 2363.6133 2360.253 2462.972 2565.691 3 f
# groth2() 424.546 438.0185 455.0820 451.491 470.350 489.209 3 ab
# groth3() 722.551 728.0910 733.1910 733.631 738.511 743.391 3 c
# r.user() 612.432 619.6570 636.9573 626.882 649.220 671.558 3 bc
## and with the usual expanded data frame:
set.seed(42)
dat <- dat[sample(nrow(dat), 1e6, replace=T), ]
microbenchmark::microbenchmark(ronak(), user(), jay(), groth1(), groth2(), groth3(), r.user(), times=3L)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# ronak() 1375.139030 1456.858743 1564.509886 1538.578457 1659.19531 1779.81217 3 c
# user() 89.017416 200.845539 251.548652 312.673662 332.81427 352.95488 3 a
# jay() 7.655812 8.382333 9.941684 9.108855 11.08462 13.06039 3 a
# groth1() 501.263785 514.097103 621.755474 526.930421 682.00132 837.07222 3 b
# groth2() 143.438836 147.783741 189.033391 152.128645 211.83067 271.53269 3 a
# groth3() 1387.314877 1406.898863 1469.493158 1426.482849 1510.58230 1594.68175 3 c
# r.user() 1469.543881 1472.770464 1483.834022 1475.997046 1490.97909 1505.96114 3 c
代码:
ronak <- \() {
split_data <- split.default(dat,rep(seq_along(dat), each=2, length.out=ncol(dat)))
do.call(cbind, Map(function(x, y) cbind(setNames(data.frame(NA), paste0('empty', x)), y),
seq_along(split_data), split_data))
}
user <- \() {
tmp1=seq(1, 9,3);tmp2=!(1:9 %in% tmp1);dat2=data.frame(matrix(NA,nrow(dat),ncol(dat)+ncol(dat)/2))
dat2[tmp2]=dat;colnames(dat2)[tmp1]=paste0("empty",1:length(tmp1))
colnames(dat2)[tmp2]=colnames(dat);dat2
}
jay <- \() {for (i in 0:2*ncol(dat)/2) dat <- as.data.frame(append(dat, list(emp=NA), i));dat}
groth1 <- \() suppressMessages({
require(dplyr):require(purrr)
dat %>% split.default(as.numeric(gl(ncol(.), 2, ncol(.)))) %>% map(~ bind_cols(empty=NA, .)) %>%
bind_cols
})
groth2 <- \() {
ix <- cumsum(seq_along(dat) %% 2 + 1);dat2 <- replace(data.frame(matrix(NA, nrow(dat), max(ix))), ix, dat)
names(dat2) <- replace(rep("empty", ncol(dat2)), ix, names(dat));dat2
}
groth3 <- \() {
ix <- as.numeric(gl(ncol(dat), 2, ncol(dat))) # 1 1 2 2 3 3
do.call("cbind", Map(cbind, empty = NA, split.default(dat, ix)))
}
r.user <- \() do.call(cbind, lapply(seq(1, ncol(dat), by=2), function(i)
cbind(empty=rep(NA, nrow(dat)), dat[, seq(i, i+1)])))
!) dplyr/purrr 拆分数据框,DF,在每个组件之前绑定一个 NA 列,并将生成的组件重新绑定在一起。在多个列中使用与问题示例输出中相同的列名存在无法按名称识别列的问题,因此使用唯一名称。
library(dplyr)
library(purrr)
DF %>%
split.default(as.numeric(gl(ncol(.), 2, ncol(.)))) %>%
map(~ bind_cols(empty = NA, .)) %>%
bind_cols
给予:
empty...1 col1 col2 empty...4 col3 col4 empty...7 col5 col6
1 NA 1 2 NA 2 3 NA 5 7
2 NA 2 4 NA 6 4 NA 8 2
3 NA 5 9 NA 7 3 NA 2 5
4 NA 3 4 NA 5 6 NA 8 1
2) Base R 创建一个向量ix,它在结果数据框中给出原始数据框的索引,然后创建一个空结果并将DF及其名称复制到它。
ix <- cumsum(seq_along(DF) %% 2 + 1) # 2 3 5 6 8 9
DF2 <- replace(data.frame(matrix(NA, nrow(DF), max(ix))), ix, DF)
names(DF2) <- replace(rep("empty", ncol(DF2)), ix, names(DF))
DF2
给予:
empty col1 col2 empty col3 col4 empty col5 col6
1 NA 1 2 NA 2 3 NA 5 7
2 NA 2 4 NA 6 4 NA 8 2
3 NA 5 9 NA 7 3 NA 2 5
4 NA 3 4 NA 5 6 NA 8 1
3) Base R 这是另一个Base R的解决方案。它粗略地将 (1) 翻译成 Base R。它给出与 (2) 相同的结果。
ix <- as.numeric(gl(ncol(DF), 2, ncol(DF))) # 1 1 2 2 3 3
do.call("cbind", Map(cbind, empty = NA, split.default(DF, ix)))
4) eList eList包可以用于特别短的解决方案。
library(eList)
DF(for(i in seq(1, ncol(DF), 2)) list(empty = NA, DF[seq(i, len = 2)]))
给予:
empty col1 col2 empty.1 col3 col4 empty.2 col5 col6
1 NA 1 2 NA 2 3 NA 5 7
2 NA 2 4 NA 6 4 NA 8 2
3 NA 5 9 NA 7 3 NA 2 5
4 NA 3 4 NA 5 6 NA 8 1
备注
可重现形式的输入。
Lines <- "col1 col2 col3 col4 col5 col6
1 2 2 3 5 7
2 4 6 4 8 2
5 9 7 3 2 5
3 4 5 6 8 1"
DF <- read.table(text = Lines, header = TRUE)
基本的 R 解决方案是:
do.call(cbind, lapply(seq(1, ncol(df), by = 2), function(i) cbind(empty = rep(NA, nrow(df)), df[, seq(i, i+1)])))
# empty col1 col2 empty col3 col4 empty col5 col6
#1 NA 1 2 NA 2 3 NA 5 7
#2 NA 2 4 NA 6 4 NA 8 2
#3 NA 5 9 NA 7 3 NA 2 5
#4 NA 3 4 NA 5 6 NA 8 1
“整洁”的解决方案可以是:
library(tidyverse)
map_dfc(seq(from = 1, to = ncol(df), by = 2),
~df %>%
mutate(empty = NA) %>%
select(empty, .x, .x+1))
#New names:
#* empty -> empty...1
#* empty -> empty...4
#* empty -> empty...7
#empty...1 col1 col2 empty...4 col3 col4 empty...7 col5 col6
#1 NA 1 2 NA 2 3 NA 5 7
#2 NA 2 4 NA 6 4 NA 8 2
#3 NA 5 9 NA 7 3 NA 2 5
#4 NA 3 4 NA 5 6 NA 8 1