使用 r 将多列拆分为多列
Split multiple columns into multiple columns using r
我有一个包含 20 列和 300 行的 txt 文件。下面给出了我的数据样本。
id sub A1 A2 B1 B2 C1
96 AAA 01:01:01:01/01:01:01:02N 29:02:01 08:01:01/08:19N 44:03:01/44:03:03/44:03:04 07:01:01/07:01:02
97 AAA 03:01:01:01/03:01:01:02N 30:08:01 09:02:01/08:19N 44:03:01/44:03:03/44:03:04 07:01:01/07:01:02
98 AAA 01:01:01:01/01:01:01:02N/01:22N 29:02:01 08:01:01/08:19N 44:03:01/44:03:03/44:03:04 07:09:01/07:01:02
99 AAA 03:01:01:01 30:08:01 09:02:01/08:19N 44:03:01/44:03:03/44:03:04 07:08:01/07:01:02
我需要使用 r 用分隔符“/”分隔列 (A1、A2、B1....)。
输出将是:
id sub A1_1 A1_2 A2 B1_1 B1_2 B2_1 B2_2 ..
96 AAA 01:01:01:01 01:01:01:02N 29:02:01 08:01:01 08:19N 44:03:01 44:03:03 44:03:04 ...
我可以找到将一列拆分为多列的函数。但我找不到实现此目的的解决方案。
我支持@Sotos 的建议,编写一个可重现的示例很重要,这样就可以只关注手头的问题。
我想出了这个假数据来尝试回答你的问题:
> df <- data.frame(
+ id = c(1:5),
+ sub = sample(c("GBR", "BRA"), size = 5, replace = T),
+ HLA_A = paste0(rep("01:01", 5), "/", rep("01:02N")),
+ HLA_B = paste0(rep("01:03", 5), "/", "01:42N", "/", "32:20"),
+ HLA_C = paste0(rep("01:03", 5)), stringsAsFactors = F)
>
>
> df
id sub HLA_A HLA_B HLA_C
1 1 GBR 01:01/01:02N 01:03/01:42N/32:20 01:03
2 2 BRA 01:01/01:02N 01:03/01:42N/32:20 01:03
3 3 GBR 01:01/01:02N 01:03/01:42N/32:20 01:03
4 4 GBR 01:01/01:02N 01:03/01:42N/32:20 01:03
5 5 BRA 01:01/01:02N 01:03/01:42N/32:20 01:03
您可以使用 strsplit()
按给定字符拆分列(在本例中为 "/"
)。使用 do.call(rbind, .)
以列格式绑定列表。对您希望定位的列重复此过程,并将它们全部与 id
和 sub
列绑定。这是解决方案:
不使用任何依赖项:
> col.ind <- grep(x = names(df), pattern = "HLA", value = T, ignore.case = T) # your target columns
>
> # lapply to loop the column split process, output is a list, so you need to columb-bind the resulting objects
>
> cols.list <- lapply(seq_along(col.ind), function(x){
+
+ p1 <- do.call(rbind, strsplit(df[[col.ind[[x]]]], split = "/")) # split col by "/"
+
+ p2 <- data.frame(p1, stringsAsFactors = F) # make it into a data.frame
+
+ i <- ncol(p2) # this is an index placeholder that will enable you to rename the recently split columns in a sequential manner
+
+ colnames(p2) <- paste0(col.ind[[x]], c(1:i)) # rename columns
+
+ return(p2) # return the object of interest
+ }
+ )
>
>
> new.df <- cbind(df[1:2], do.call(cbind, cols.list)) # do.call once again to bind the lapply object and column-bind those with the first two columns of your initial data.frame
> new.df
id sub HLA_A1 HLA_A2 HLA_B1 HLA_B2 HLA_B3 HLA_C1
1 1 GBR 01:01 01:02N 01:03 01:42N 32:20 01:03
2 2 BRA 01:01 01:02N 01:03 01:42N 32:20 01:03
3 3 GBR 01:01 01:02N 01:03 01:42N 32:20 01:03
4 4 GBR 01:01 01:02N 01:03 01:42N 32:20 01:03
5 5 BRA 01:01 01:02N 01:03 01:42N 32:20 01:03
这是一个tidyverse
解决方案。
library(tidyverse)
df %>%
gather(key, value, -c(1:2)) %>%
separate_rows(value, sep = "/") %>%
group_by(key, id) %>%
mutate(key2 = paste0(key, "_", seq_along(key))) %>%
ungroup() %>%
select(-key) %>%
spread(key2, value)
# A tibble: 4 x 13
# id sub A1_1 A1_2 A1_3 A2_1 B1_1 B1_2 B2_1 B2_2 B2_3 C1_1 C1_2
#* <fct> <fct> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#1 96 AAA 01:01:01:01 01:01:01:02N <NA> 29:02:01 08:01:01 08:19N 44:03:01 44:03:03 44:03:04 07:01:01 07:01:02
#2 97 AAA 03:01:01:01 03:01:01:02N <NA> 30:08:01 09:02:01 08:19N 44:03:01 44:03:03 44:03:04 07:01:01 07:01:02
#3 98 AAA 01:01:01:01 01:01:01:02N 01:22N 29:02:01 08:01:01 08:19N 44:03:01 44:03:03 44:03:04 07:09:01 07:01:02
#4 99 AAA 03:01:01:01 <NA> <NA> 30:08:01 09:02:01 08:19N 44:03:01 44:03:03 44:03:04 07:08:01 07:01:02
在 gather
列除第一列和第二列 (-c(1:2)
) 之外的所有列之后,我使用 tidyr::separate_rows
将新创建的列 value
中的值分隔为"/"
。在创建新列 key2
后,即扩展名为 _1:number of separators
的列 key
,我取消选择了列 key
和 spread
列 key2
by value
.
数据
df <- structure(list(id = structure(1:4, .Label = c("96", "97",
"98", "99"), class = "factor"), sub = structure(c(1L,
1L, 1L, 1L), .Label = "AAA", class = "factor"), A_A1 = structure(c(1L,
4L, 2L, 3L), .Label = c("01:01:01:01/01:01:01:02N", "01:01:01:01/01:01:01:02N/01:22N",
"03:01:01:01", "03:01:01:01/03:01:01:02N"), class = "factor"),
A_A2 = structure(c(1L, 2L, 1L, 2L), .Label = c("29:02:01",
"30:08:01"), class = "factor"), B_B1 = structure(c(1L,
2L, 1L, 2L), .Label = c("08:01:01/08:19N", "09:02:01/08:19N"
), class = "factor"), B_B2 = structure(c(1L, 1L, 1L, 1L
), .Label = "44:03:01/44:03:03/44:03:04", class = "factor"),
C1 = structure(c(1L, 1L, 3L, 2L), .Label = c("07:01:01/07:01:02",
"07:08:01/07:01:02", "07:09:01/07:01:02"), class = "factor")), .Names = c("id",
"sub", "A_A1", "A_A2", "B_B1", "B_B2", "C_C1"), class = "data.frame", row.names = c(NA,
-4L))
我建议使用 reshape2 解决方案来解决不知道零件数量的问题:
> dput(pz1)
structure(list(id = c("HG00096", "HG00097", "HG00098", "HG00099"
), sub = c("GBR", "GBR", "GBR", "GBR"), HLA_A1 = c("01:01:01:01/01:01:01:02N",
"03:01:01:01/03:01:01:02N", "01:01:01:01/01:01:01:02N/01:22N",
"03:01:01:01"), HLA_A2 = c("29:02:01", "30:08:01", "29:02:01",
"30:08:01"), HLA_B1 = c("08:01:01/08:19N", "09:02:01/08:19N",
"08:01:01/08:19N", "09:02:01/08:19N"), HLA_B2 = c("44:03:01/44:03:03/44:03:04",
"44:03:01/44:03:03/44:03:04", "44:03:01/44:03:03/44:03:04", "44:03:01/44:03:03/44:03:04"
), HLA_C1 = c("07:01:01/07:01:02", "07:01:01/07:01:02", "07:09:01/07:01:02",
"07:08:01/07:01:02")), .Names = c("id", "sub", "HLA_A1", "HLA_A2",
"HLA_B1", "HLA_B2", "HLA_C1"), row.names = c(NA, -4L), class = "data.frame")
add this function:
library("reshape2", lib.loc="~/R/win-library/3.3")
getIt <- function(df,col) {
x <- max(sapply(strsplit(df[,col],split="/"),length)) ### get the max parts for column
q <- colsplit(string = df[,col],pattern="/",names = paste0(names(df)[col],"_",LETTERS[1:x]))
return(q) }
有了这个功能后,你可以轻松做到:
> getIt(pz1,3)
HLA_A1_A HLA_A1_B HLA_A1_C
1 01:01:01:01 01:01:01:02N
2 03:01:01:01 03:01:01:02N
3 01:01:01:01 01:01:01:02N 01:22N
4 03:01:01:01
和一个简单的 cbind 与原始数据框(有或没有原始列):
> cbind(pz1[,1:2],getIt(pz1,3),getIt(pz1,4),getIt(pz1,5),getIt(pz1,6))
id sub HLA_A1_A HLA_A1_B HLA_A1_C HLA_A2_A HLA_B1_A HLA_B1_B HLA_B2_A HLA_B2_B HLA_B2_C
1 HG00096 GBR 01:01:01:01 01:01:01:02N 29:02:01 08:01:01 08:19N 44:03:01 44:03:03 44:03:04
2 HG00097 GBR 03:01:01:01 03:01:01:02N 30:08:01 09:02:01 08:19N 44:03:01 44:03:03 44:03:04
3 HG00098 GBR 01:01:01:01 01:01:01:02N 01:22N 29:02:01 08:01:01 08:19N 44:03:01 44:03:03 44:03:04
4 HG00099 GBR 03:01:01:01 30:08:01 09:02:01 08:19N 44:03:01 44:03:03 44:03:04
我会采用如下方法:
library(data.table)
setDT(df) # convert to a data.table
# identify the columns you want to split
cols <- grep("^HLA", names(df), value = TRUE)
# loop through them and split them
# assign them back to the data.table, by reference
for (i in cols) {
temp <- tstrsplit(df[[i]], "/")
set(df, j = sprintf("%s_%d", i, seq_along(temp)), value = temp)
set(df, j = i, value = NULL)
}
结果如下:
df[]
# id sub HLA_A1_1 HLA_A1_2 HLA_A1_3 HLA_A2_1 HLA_B1_1 HLA_B1_2 HLA_B2_1 HLA_B2_2 HLA_B2_3 HLA_C1_1 HLA_C1_2
# 1: HG00096 GBR 01:01:01:01 01:01:01:02N NA 29:02:01 08:01:01 08:19N 44:03:01 44:03:03 44:03:04 07:01:01 07:01:02
# 2: HG00097 GBR 03:01:01:01 03:01:01:02N NA 30:08:01 09:02:01 08:19N 44:03:01 44:03:03 44:03:04 07:01:01 07:01:02
# 3: HG00098 GBR 01:01:01:01 01:01:01:02N 01:22N 29:02:01 08:01:01 08:19N 44:03:01 44:03:03 44:03:04 07:09:01 07:01:02
# 4: HG00099 GBR 03:01:01:01 NA NA 30:08:01 09:02:01 08:19N 44:03:01 44:03:03 44:03:04 07:08:01 07:01:02
除了比接受的答案更容易扩展(事情并不是真正的硬编码)之外,这至少是那种方法的两倍,而且 很多 比 "tidyverse" 方法更快,后者效率很低,因为它首先使数据变长,然后再返回到宽格式。
基准
要了解性能差异,请尝试以下操作:
测试函数
myfun <- function(df) {
cols <- grep("^HLA", names(df), value = TRUE)
for (i in cols) {
temp <- tstrsplit(df[[i]], "/")
set(df, j = sprintf("%s_%d", i, seq_along(temp)), value = temp)
set(df, j = i, value = NULL)
}
df[]
}
tidyfun <- function(df) {
df %>%
gather(key, value, -c(1:2)) %>%
separate_rows(value, sep = "/") %>%
group_by(key, id) %>%
mutate(key2 = paste0(key, "_", seq_along(key))) %>%
ungroup() %>%
select(-key) %>%
spread(key2, value)
}
getIt <- function(df,col) {
x <- max(sapply(strsplit(as.character(df[,col]),split="/"),length))
q <- colsplit(string = as.character(df[,col]),pattern="/",
names = paste0(names(df)[col],"_",LETTERS[1:x]))
return(q)
}
reshape2fun <- function(dfdf) {
cbind(dfdf[,1:2], getIt(dfdf,3), getIt(dfdf,4), getIt(dfdf,5), getIt(dfdf,6))
}
4 行....
library(microbenchmark)
dfdf <- as.data.frame(df)
microbenchmark(myfun(copy(df)), reshape2fun(dfdf), tidyfun(df))
# Unit: microseconds
# expr min lq mean median uq max neval
# myfun(copy(df)) 241.55 272.5965 625.7634 359.148 380.0395 28632.94 100
# reshape2fun(dfdf) 5076.24 5368.3835 5841.8784 5539.577 5639.8765 34176.13 100
# tidyfun(df) 37864.68 39435.1915 41152.5916 39801.499 40489.7055 70019.04 100
10,000 行....
biggerdf <- rbindlist(replicate(2500, df, FALSE)) # nrow = 10,000
dfdf <- as.data.frame(biggerdf)
microbenchmark(myfun(copy(biggerdf)), reshape2fun(dfdf), tidyfun(biggerdf), times = 10)
# Unit: milliseconds
# expr min lq mean median uq max neval
# myfun(copy(biggerdf)) 50.87452 52.0059 54.59288 53.03503 53.79347 68.69892 10
# reshape2fun(dfdf) 120.90291 124.3893 137.54154 126.06213 157.50532 159.15069 10
# tidyfun(biggerdf) 1312.75422 1350.6651 1394.93082 1358.21612 1373.86793 1732.86521 10
1,000,000 行....
BIGGERdf <- rbindlist(replicate(100, biggerdf, FALSE)) # nrow = 1,000,000
dfdf <- as.data.frame(BIGGERdf)
system.time(tidyfun(BIGGERdf)) # > 2 minutes!
# user system elapsed
# 141.373 1.048 142.403
microbenchmark(myfun(copy(BIGGERdf)), reshape2fun(dfdf), times = 5)
# Unit: seconds
# expr min lq mean median uq max neval
# myfun(copy(BIGGERdf)) 5.180048 5.574677 6.026515 5.764467 6.498967 7.114415 5
# reshape2fun(dfdf) 8.858202 9.095027 9.629969 9.264896 10.192161 10.739560 5
我有一个包含 20 列和 300 行的 txt 文件。下面给出了我的数据样本。
id sub A1 A2 B1 B2 C1
96 AAA 01:01:01:01/01:01:01:02N 29:02:01 08:01:01/08:19N 44:03:01/44:03:03/44:03:04 07:01:01/07:01:02
97 AAA 03:01:01:01/03:01:01:02N 30:08:01 09:02:01/08:19N 44:03:01/44:03:03/44:03:04 07:01:01/07:01:02
98 AAA 01:01:01:01/01:01:01:02N/01:22N 29:02:01 08:01:01/08:19N 44:03:01/44:03:03/44:03:04 07:09:01/07:01:02
99 AAA 03:01:01:01 30:08:01 09:02:01/08:19N 44:03:01/44:03:03/44:03:04 07:08:01/07:01:02
我需要使用 r 用分隔符“/”分隔列 (A1、A2、B1....)。 输出将是:
id sub A1_1 A1_2 A2 B1_1 B1_2 B2_1 B2_2 ..
96 AAA 01:01:01:01 01:01:01:02N 29:02:01 08:01:01 08:19N 44:03:01 44:03:03 44:03:04 ...
我可以找到将一列拆分为多列的函数。但我找不到实现此目的的解决方案。
我支持@Sotos 的建议,编写一个可重现的示例很重要,这样就可以只关注手头的问题。
我想出了这个假数据来尝试回答你的问题:
> df <- data.frame(
+ id = c(1:5),
+ sub = sample(c("GBR", "BRA"), size = 5, replace = T),
+ HLA_A = paste0(rep("01:01", 5), "/", rep("01:02N")),
+ HLA_B = paste0(rep("01:03", 5), "/", "01:42N", "/", "32:20"),
+ HLA_C = paste0(rep("01:03", 5)), stringsAsFactors = F)
>
>
> df
id sub HLA_A HLA_B HLA_C
1 1 GBR 01:01/01:02N 01:03/01:42N/32:20 01:03
2 2 BRA 01:01/01:02N 01:03/01:42N/32:20 01:03
3 3 GBR 01:01/01:02N 01:03/01:42N/32:20 01:03
4 4 GBR 01:01/01:02N 01:03/01:42N/32:20 01:03
5 5 BRA 01:01/01:02N 01:03/01:42N/32:20 01:03
您可以使用 strsplit()
按给定字符拆分列(在本例中为 "/"
)。使用 do.call(rbind, .)
以列格式绑定列表。对您希望定位的列重复此过程,并将它们全部与 id
和 sub
列绑定。这是解决方案:
不使用任何依赖项:
> col.ind <- grep(x = names(df), pattern = "HLA", value = T, ignore.case = T) # your target columns
>
> # lapply to loop the column split process, output is a list, so you need to columb-bind the resulting objects
>
> cols.list <- lapply(seq_along(col.ind), function(x){
+
+ p1 <- do.call(rbind, strsplit(df[[col.ind[[x]]]], split = "/")) # split col by "/"
+
+ p2 <- data.frame(p1, stringsAsFactors = F) # make it into a data.frame
+
+ i <- ncol(p2) # this is an index placeholder that will enable you to rename the recently split columns in a sequential manner
+
+ colnames(p2) <- paste0(col.ind[[x]], c(1:i)) # rename columns
+
+ return(p2) # return the object of interest
+ }
+ )
>
>
> new.df <- cbind(df[1:2], do.call(cbind, cols.list)) # do.call once again to bind the lapply object and column-bind those with the first two columns of your initial data.frame
> new.df
id sub HLA_A1 HLA_A2 HLA_B1 HLA_B2 HLA_B3 HLA_C1
1 1 GBR 01:01 01:02N 01:03 01:42N 32:20 01:03
2 2 BRA 01:01 01:02N 01:03 01:42N 32:20 01:03
3 3 GBR 01:01 01:02N 01:03 01:42N 32:20 01:03
4 4 GBR 01:01 01:02N 01:03 01:42N 32:20 01:03
5 5 BRA 01:01 01:02N 01:03 01:42N 32:20 01:03
这是一个tidyverse
解决方案。
library(tidyverse)
df %>%
gather(key, value, -c(1:2)) %>%
separate_rows(value, sep = "/") %>%
group_by(key, id) %>%
mutate(key2 = paste0(key, "_", seq_along(key))) %>%
ungroup() %>%
select(-key) %>%
spread(key2, value)
# A tibble: 4 x 13
# id sub A1_1 A1_2 A1_3 A2_1 B1_1 B1_2 B2_1 B2_2 B2_3 C1_1 C1_2
#* <fct> <fct> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#1 96 AAA 01:01:01:01 01:01:01:02N <NA> 29:02:01 08:01:01 08:19N 44:03:01 44:03:03 44:03:04 07:01:01 07:01:02
#2 97 AAA 03:01:01:01 03:01:01:02N <NA> 30:08:01 09:02:01 08:19N 44:03:01 44:03:03 44:03:04 07:01:01 07:01:02
#3 98 AAA 01:01:01:01 01:01:01:02N 01:22N 29:02:01 08:01:01 08:19N 44:03:01 44:03:03 44:03:04 07:09:01 07:01:02
#4 99 AAA 03:01:01:01 <NA> <NA> 30:08:01 09:02:01 08:19N 44:03:01 44:03:03 44:03:04 07:08:01 07:01:02
在 gather
列除第一列和第二列 (-c(1:2)
) 之外的所有列之后,我使用 tidyr::separate_rows
将新创建的列 value
中的值分隔为"/"
。在创建新列 key2
后,即扩展名为 _1:number of separators
的列 key
,我取消选择了列 key
和 spread
列 key2
by value
.
数据
df <- structure(list(id = structure(1:4, .Label = c("96", "97",
"98", "99"), class = "factor"), sub = structure(c(1L,
1L, 1L, 1L), .Label = "AAA", class = "factor"), A_A1 = structure(c(1L,
4L, 2L, 3L), .Label = c("01:01:01:01/01:01:01:02N", "01:01:01:01/01:01:01:02N/01:22N",
"03:01:01:01", "03:01:01:01/03:01:01:02N"), class = "factor"),
A_A2 = structure(c(1L, 2L, 1L, 2L), .Label = c("29:02:01",
"30:08:01"), class = "factor"), B_B1 = structure(c(1L,
2L, 1L, 2L), .Label = c("08:01:01/08:19N", "09:02:01/08:19N"
), class = "factor"), B_B2 = structure(c(1L, 1L, 1L, 1L
), .Label = "44:03:01/44:03:03/44:03:04", class = "factor"),
C1 = structure(c(1L, 1L, 3L, 2L), .Label = c("07:01:01/07:01:02",
"07:08:01/07:01:02", "07:09:01/07:01:02"), class = "factor")), .Names = c("id",
"sub", "A_A1", "A_A2", "B_B1", "B_B2", "C_C1"), class = "data.frame", row.names = c(NA,
-4L))
我建议使用 reshape2 解决方案来解决不知道零件数量的问题:
> dput(pz1)
structure(list(id = c("HG00096", "HG00097", "HG00098", "HG00099"
), sub = c("GBR", "GBR", "GBR", "GBR"), HLA_A1 = c("01:01:01:01/01:01:01:02N",
"03:01:01:01/03:01:01:02N", "01:01:01:01/01:01:01:02N/01:22N",
"03:01:01:01"), HLA_A2 = c("29:02:01", "30:08:01", "29:02:01",
"30:08:01"), HLA_B1 = c("08:01:01/08:19N", "09:02:01/08:19N",
"08:01:01/08:19N", "09:02:01/08:19N"), HLA_B2 = c("44:03:01/44:03:03/44:03:04",
"44:03:01/44:03:03/44:03:04", "44:03:01/44:03:03/44:03:04", "44:03:01/44:03:03/44:03:04"
), HLA_C1 = c("07:01:01/07:01:02", "07:01:01/07:01:02", "07:09:01/07:01:02",
"07:08:01/07:01:02")), .Names = c("id", "sub", "HLA_A1", "HLA_A2",
"HLA_B1", "HLA_B2", "HLA_C1"), row.names = c(NA, -4L), class = "data.frame")
add this function:
library("reshape2", lib.loc="~/R/win-library/3.3")
getIt <- function(df,col) {
x <- max(sapply(strsplit(df[,col],split="/"),length)) ### get the max parts for column
q <- colsplit(string = df[,col],pattern="/",names = paste0(names(df)[col],"_",LETTERS[1:x]))
return(q) }
有了这个功能后,你可以轻松做到:
> getIt(pz1,3)
HLA_A1_A HLA_A1_B HLA_A1_C
1 01:01:01:01 01:01:01:02N
2 03:01:01:01 03:01:01:02N
3 01:01:01:01 01:01:01:02N 01:22N
4 03:01:01:01
和一个简单的 cbind 与原始数据框(有或没有原始列):
> cbind(pz1[,1:2],getIt(pz1,3),getIt(pz1,4),getIt(pz1,5),getIt(pz1,6))
id sub HLA_A1_A HLA_A1_B HLA_A1_C HLA_A2_A HLA_B1_A HLA_B1_B HLA_B2_A HLA_B2_B HLA_B2_C
1 HG00096 GBR 01:01:01:01 01:01:01:02N 29:02:01 08:01:01 08:19N 44:03:01 44:03:03 44:03:04
2 HG00097 GBR 03:01:01:01 03:01:01:02N 30:08:01 09:02:01 08:19N 44:03:01 44:03:03 44:03:04
3 HG00098 GBR 01:01:01:01 01:01:01:02N 01:22N 29:02:01 08:01:01 08:19N 44:03:01 44:03:03 44:03:04
4 HG00099 GBR 03:01:01:01 30:08:01 09:02:01 08:19N 44:03:01 44:03:03 44:03:04
我会采用如下方法:
library(data.table)
setDT(df) # convert to a data.table
# identify the columns you want to split
cols <- grep("^HLA", names(df), value = TRUE)
# loop through them and split them
# assign them back to the data.table, by reference
for (i in cols) {
temp <- tstrsplit(df[[i]], "/")
set(df, j = sprintf("%s_%d", i, seq_along(temp)), value = temp)
set(df, j = i, value = NULL)
}
结果如下:
df[]
# id sub HLA_A1_1 HLA_A1_2 HLA_A1_3 HLA_A2_1 HLA_B1_1 HLA_B1_2 HLA_B2_1 HLA_B2_2 HLA_B2_3 HLA_C1_1 HLA_C1_2
# 1: HG00096 GBR 01:01:01:01 01:01:01:02N NA 29:02:01 08:01:01 08:19N 44:03:01 44:03:03 44:03:04 07:01:01 07:01:02
# 2: HG00097 GBR 03:01:01:01 03:01:01:02N NA 30:08:01 09:02:01 08:19N 44:03:01 44:03:03 44:03:04 07:01:01 07:01:02
# 3: HG00098 GBR 01:01:01:01 01:01:01:02N 01:22N 29:02:01 08:01:01 08:19N 44:03:01 44:03:03 44:03:04 07:09:01 07:01:02
# 4: HG00099 GBR 03:01:01:01 NA NA 30:08:01 09:02:01 08:19N 44:03:01 44:03:03 44:03:04 07:08:01 07:01:02
除了比接受的答案更容易扩展(事情并不是真正的硬编码)之外,这至少是那种方法的两倍,而且 很多 比 "tidyverse" 方法更快,后者效率很低,因为它首先使数据变长,然后再返回到宽格式。
基准
要了解性能差异,请尝试以下操作:
测试函数
myfun <- function(df) {
cols <- grep("^HLA", names(df), value = TRUE)
for (i in cols) {
temp <- tstrsplit(df[[i]], "/")
set(df, j = sprintf("%s_%d", i, seq_along(temp)), value = temp)
set(df, j = i, value = NULL)
}
df[]
}
tidyfun <- function(df) {
df %>%
gather(key, value, -c(1:2)) %>%
separate_rows(value, sep = "/") %>%
group_by(key, id) %>%
mutate(key2 = paste0(key, "_", seq_along(key))) %>%
ungroup() %>%
select(-key) %>%
spread(key2, value)
}
getIt <- function(df,col) {
x <- max(sapply(strsplit(as.character(df[,col]),split="/"),length))
q <- colsplit(string = as.character(df[,col]),pattern="/",
names = paste0(names(df)[col],"_",LETTERS[1:x]))
return(q)
}
reshape2fun <- function(dfdf) {
cbind(dfdf[,1:2], getIt(dfdf,3), getIt(dfdf,4), getIt(dfdf,5), getIt(dfdf,6))
}
4 行....
library(microbenchmark)
dfdf <- as.data.frame(df)
microbenchmark(myfun(copy(df)), reshape2fun(dfdf), tidyfun(df))
# Unit: microseconds
# expr min lq mean median uq max neval
# myfun(copy(df)) 241.55 272.5965 625.7634 359.148 380.0395 28632.94 100
# reshape2fun(dfdf) 5076.24 5368.3835 5841.8784 5539.577 5639.8765 34176.13 100
# tidyfun(df) 37864.68 39435.1915 41152.5916 39801.499 40489.7055 70019.04 100
10,000 行....
biggerdf <- rbindlist(replicate(2500, df, FALSE)) # nrow = 10,000
dfdf <- as.data.frame(biggerdf)
microbenchmark(myfun(copy(biggerdf)), reshape2fun(dfdf), tidyfun(biggerdf), times = 10)
# Unit: milliseconds
# expr min lq mean median uq max neval
# myfun(copy(biggerdf)) 50.87452 52.0059 54.59288 53.03503 53.79347 68.69892 10
# reshape2fun(dfdf) 120.90291 124.3893 137.54154 126.06213 157.50532 159.15069 10
# tidyfun(biggerdf) 1312.75422 1350.6651 1394.93082 1358.21612 1373.86793 1732.86521 10
1,000,000 行....
BIGGERdf <- rbindlist(replicate(100, biggerdf, FALSE)) # nrow = 1,000,000
dfdf <- as.data.frame(BIGGERdf)
system.time(tidyfun(BIGGERdf)) # > 2 minutes!
# user system elapsed
# 141.373 1.048 142.403
microbenchmark(myfun(copy(BIGGERdf)), reshape2fun(dfdf), times = 5)
# Unit: seconds
# expr min lq mean median uq max neval
# myfun(copy(BIGGERdf)) 5.180048 5.574677 6.026515 5.764467 6.498967 7.114415 5
# reshape2fun(dfdf) 8.858202 9.095027 9.629969 9.264896 10.192161 10.739560 5