每2列加1个空列

Add 1 blank column every 2 columns

我有一个像这样的数据框“df”:

col1 col2 col3 col4 col5 col6
 1    2    2    3    5    7
 2    4    6    4    8    2
 5    9    7    3    2    5
 3    4    5    6    8    1

我想创建一个新的数据框“new_df”,其中每 2 列有 1 个空白列(称为“空”),如下所示:

empty col1 col2 empty col3 col4 empty col5 col6
 NA    1    2    NA    2    3    NA    5    7
 NA    2    4    NA    6    4    NA    8    2
 NA    5    9    NA    7    3    NA    2    5
 NA    3    4    NA    5    6    NA    8    1

这种方式如何添加空白列? 我试过使用:

n = length(df)
empty <- NA

for (i in seq(1,n-2,2))
  {
  new_df <- add_column(df, empty, .before=i)
  }

但它只记住最后一步,给了我这个结果:

col1 col2 col3 col4 empty col5 col6
 1    2     2    3   NA    5    7
 2    4     6    4   NA    8    2
 5    9     7    3   NA    2    5
 3    4     5    6   NA    8    1

这是一个基本的 R 选项 -

我们可以将数据每 2 列拆分为数据框列表,并使用 Map 在每个数据框中添加一个新列 NA

split_data <- split.default(df,rep(seq_along(df), each = 2, length.out = ncol(df)))

do.call(cbind, Map(function(x, y) 
  cbind(setNames(data.frame(NA), paste0('empty', x)), y), 
  seq_along(split_data), split_data)) -> result

result

#  empty1 col1 col2 empty2 col3 col4 empty3 col5 col6
#1     NA    1    2     NA    2    3     NA    5    7
#2     NA    2    4     NA    6    4     NA    8    2
#3     NA    5    9     NA    7    3     NA    2    5
#4     NA    3    4     NA    5    6     NA    8    1

在数据框中使用重复的列名不是一个好习惯,因此我将它们命名为 empty1empty2

数据

df <- structure(list(col1 = c(1L, 2L, 5L, 3L), col2 = c(2L, 4L, 9L, 
4L), col3 = c(2L, 6L, 7L, 5L), col4 = c(3L, 4L, 3L, 6L), col5 = c(5L, 
8L, 2L, 8L), col6 = c(7L, 2L, 5L, 1L)), 
class = "data.frame", row.names = c(NA, -4L))

另一个基础 R 解决方案

tmp1=seq(1,ncol(df),3)
tmp2=!(1:ncol(df) %in% tmp1)

df2=data.frame(matrix(NA,nrow(df),ncol(df)+ncol(df)/2))
df2[tmp2]=df

colnames(df2)[tmp1]=paste0("empty",1:length(tmp1))
colnames(df2)[tmp2]=colnames(df)

  empty1 col1 col2 empty2 col3 col4 empty3 col5 col6
1     NA    1    2     NA    2    3     NA    5    7
2     NA    2    4     NA    6    4     NA    8    2
3     NA    5    9     NA    7    3     NA    2    5
4     NA    3    4     NA    5    6     NA    8    1

使用 append().

for (i in 0:2*ncol(dat)/2) dat <- as.data.frame(append(dat, list(emp=NA), i))
dat
#   emp col1 col2 emp.1 col3 col4 emp.2 col5 col6
# 1  NA    1    2    NA    2    3    NA    5    7
# 2  NA    2    4    NA    6    4    NA    8    2
# 3  NA    5    9    NA    7    3    NA    2    5
# 4  NA    3    4    NA    5    6    NA    8    1

数据:

dat <- structure(list(col1 = c(1L, 2L, 5L, 3L), col2 = c(2L, 4L, 9L, 
4L), col3 = c(2L, 6L, 7L, 5L), col4 = c(3L, 4L, 3L, 6L), col5 = c(5L, 
8L, 2L, 8L), col6 = c(7L, 2L, 5L, 1L)), class = "data.frame", row.names = c(NA, 
-4L))

然后...

微基准测试

# Unit: microseconds
# expr      min        lq      mean   median       uq      max neval    cld
#  ronak()  969.707  990.9945 1001.4807 1012.282 1017.368 1022.453     3    d  
#   user()  349.937  358.0145  364.3877  366.092  371.613  377.134     3 a     
#    jay() 2098.003 2100.8540 2115.7640 2103.705 2124.644 2145.584     3     e 
# groth1() 2164.896 2262.5745 2363.6133 2360.253 2462.972 2565.691     3      f
# groth2()  424.546  438.0185  455.0820  451.491  470.350  489.209     3 ab    
# groth3()  722.551  728.0910  733.1910  733.631  738.511  743.391     3   c   
# r.user()  612.432  619.6570  636.9573  626.882  649.220  671.558     3  bc   


## and with the usual expanded data frame:
set.seed(42)
dat <- dat[sample(nrow(dat), 1e6, replace=T), ]
microbenchmark::microbenchmark(ronak(), user(), jay(), groth1(), groth2(), groth3(), r.user(), times=3L)
# Unit: milliseconds
# expr         min          lq        mean      median         uq        max neval cld
#  ronak() 1375.139030 1456.858743 1564.509886 1538.578457 1659.19531 1779.81217     3   c
#   user()   89.017416  200.845539  251.548652  312.673662  332.81427  352.95488     3 a  
#    jay()    7.655812    8.382333    9.941684    9.108855   11.08462   13.06039     3 a  
# groth1()  501.263785  514.097103  621.755474  526.930421  682.00132  837.07222     3  b 
# groth2()  143.438836  147.783741  189.033391  152.128645  211.83067  271.53269     3 a  
# groth3() 1387.314877 1406.898863 1469.493158 1426.482849 1510.58230 1594.68175     3   c
# r.user() 1469.543881 1472.770464 1483.834022 1475.997046 1490.97909 1505.96114     3   c

代码:

ronak <- \() {
  split_data <- split.default(dat,rep(seq_along(dat), each=2, length.out=ncol(dat)))
  do.call(cbind, Map(function(x, y) cbind(setNames(data.frame(NA), paste0('empty', x)), y), 
                     seq_along(split_data), split_data))
}

user <- \() {
  tmp1=seq(1, 9,3);tmp2=!(1:9 %in% tmp1);dat2=data.frame(matrix(NA,nrow(dat),ncol(dat)+ncol(dat)/2))
  dat2[tmp2]=dat;colnames(dat2)[tmp1]=paste0("empty",1:length(tmp1))
  colnames(dat2)[tmp2]=colnames(dat);dat2
}

jay <- \() {for (i in 0:2*ncol(dat)/2) dat <- as.data.frame(append(dat, list(emp=NA), i));dat}

groth1 <- \() suppressMessages({
  require(dplyr):require(purrr)
  dat %>% split.default(as.numeric(gl(ncol(.), 2, ncol(.)))) %>% map(~ bind_cols(empty=NA, .)) %>%
    bind_cols
})

groth2 <- \() {
  ix <- cumsum(seq_along(dat) %% 2 + 1);dat2 <- replace(data.frame(matrix(NA, nrow(dat), max(ix))), ix, dat)
  names(dat2) <- replace(rep("empty", ncol(dat2)), ix, names(dat));dat2
}

groth3 <- \() {
  ix <- as.numeric(gl(ncol(dat), 2, ncol(dat)))  # 1 1 2 2 3 3
  do.call("cbind", Map(cbind, empty = NA, split.default(dat, ix)))
}

r.user <- \() do.call(cbind, lapply(seq(1, ncol(dat), by=2), function(i)
  cbind(empty=rep(NA, nrow(dat)), dat[, seq(i, i+1)])))

!) dplyr/purrr 拆分数据框,DF,在每个组件之前绑定一个 NA 列,并将生成的组件重新绑定在一起。在多个列中使用与问题示例输出中相同的列名存在无法按名称识别列的问题,因此使用唯一名称。

library(dplyr)
library(purrr)

DF %>%
  split.default(as.numeric(gl(ncol(.), 2, ncol(.)))) %>%
  map(~ bind_cols(empty = NA, .)) %>%
  bind_cols

给予:

  empty...1 col1 col2 empty...4 col3 col4 empty...7 col5 col6
1        NA    1    2        NA    2    3        NA    5    7
2        NA    2    4        NA    6    4        NA    8    2
3        NA    5    9        NA    7    3        NA    2    5
4        NA    3    4        NA    5    6        NA    8    1

2) Base R 创建一个向量ix,它在结果数据框中给出原始数据框的索引,然后创建一个空结果并将DF及其名称复制到它。

ix <- cumsum(seq_along(DF) %% 2 + 1)    # 2 3 5 6 8 9
DF2 <- replace(data.frame(matrix(NA, nrow(DF), max(ix))), ix, DF)
names(DF2) <- replace(rep("empty", ncol(DF2)), ix, names(DF))

DF2

给予:

  empty col1 col2 empty col3 col4 empty col5 col6
1    NA    1    2    NA    2    3    NA    5    7
2    NA    2    4    NA    6    4    NA    8    2
3    NA    5    9    NA    7    3    NA    2    5
4    NA    3    4    NA    5    6    NA    8    1

3) Base R 这是另一个Base R的解决方案。它粗略地将 (1) 翻译成 Base R。它给出与 (2) 相同的结果。

ix <- as.numeric(gl(ncol(DF), 2, ncol(DF)))  # 1 1 2 2 3 3
do.call("cbind", Map(cbind, empty = NA, split.default(DF, ix)))

4) eList eList包可以用于特别短的解决方案。

library(eList)

DF(for(i in seq(1, ncol(DF), 2)) list(empty = NA, DF[seq(i, len = 2)]))

给予:

  empty col1 col2 empty.1 col3 col4 empty.2 col5 col6
1    NA    1    2      NA    2    3      NA    5    7
2    NA    2    4      NA    6    4      NA    8    2
3    NA    5    9      NA    7    3      NA    2    5
4    NA    3    4      NA    5    6      NA    8    1

备注

可重现形式的输入。

Lines <- "col1 col2 col3 col4 col5 col6
 1    2    2    3    5    7
 2    4    6    4    8    2
 5    9    7    3    2    5
 3    4    5    6    8    1"
DF <- read.table(text = Lines, header = TRUE)

基本的 R 解决方案是:

do.call(cbind, lapply(seq(1, ncol(df), by = 2), function(i) cbind(empty = rep(NA, nrow(df)), df[, seq(i, i+1)])))

#  empty col1 col2 empty col3 col4 empty col5 col6
#1    NA    1    2    NA    2    3    NA    5    7
#2    NA    2    4    NA    6    4    NA    8    2
#3    NA    5    9    NA    7    3    NA    2    5
#4    NA    3    4    NA    5    6    NA    8    1

“整洁”的解决方案可以是:

library(tidyverse)

map_dfc(seq(from = 1, to = ncol(df), by = 2),
        ~df %>%
          mutate(empty = NA) %>%
          select(empty, .x, .x+1))

#New names:
#* empty -> empty...1
#* empty -> empty...4
#* empty -> empty...7
#empty...1 col1 col2 empty...4 col3 col4 empty...7 col5 col6
#1        NA    1    2        NA    2    3        NA    5    7
#2        NA    2    4        NA    6    4        NA    8    2
#3        NA    5    9        NA    7    3        NA    2    5
#4        NA    3    4        NA    5    6        NA    8    1