拆分数据框中一行内的列字符串元素

Split column string elements within a row inside a dataframe

我有一个这样的矩阵 (1000 x 2830):

        9178    3574    3547
160     B_B     B_B      A_A
301     B_B     A_B      A_B
303     B_B     B_B      A_A
311     A_B     A_B      A_A
312     B_B     A_B      A_A
314     B_B     A_B      A_A

我想获得以下内容(复制列名并拆分每列的每个元素):

      9178   9178   3574   3574   3547   3547
160     B      B      B      B      A      A
301     B      B      A      B      A      B
303     B      B      B      B      A      A
311     A      B      A      B      A      A
312     B      B      A      B      A      A
314     B      B      A      B      A      A

我尝试使用 strsplit 但我收到了错误消息,因为这是一个矩阵,而不是一个字符串。您能否提供一些解决此问题的想法?

这是一个选项,使用 dplyr(对于 bind_cols)和 tidyr(对于 separate_)以及基础 R 中的 lapply。它假设您的数据是 data.frame(即您可能需要先将其转换为 data.frame):

library(dplyr)
library(tidyr)

# For every column of df, split its "A_B"-style strings on "_" into two new
# columns named <col>_1 and <col>_2, then bind the per-column results side by
# side. separate() with all_of() replaces the deprecated separate_() verb.
lapply(names(df), function(x) {
  separate(df[x], col = all_of(x), into = paste0(x, "_", 1:2), sep = "_")
}) %>%
  bind_cols()
#  X9178_1 X9178_2 X3574_1 X3574_2 X3547_1 X3547_2
#1       B       B       B       B       A       A
#2       B       B       A       B       A       B
#3       B       B       B       B       A       A
#4       A       B       A       B       A       A
#5       B       B       A       B       A       A
#6       B       B       A       B       A       A

你可以这样做,虽然看起来有点 "twisted"(yourmat 是你的矩阵):

# Split every cell of yourmat on "_" (cells are visited column by column).
# vapply() insists on exactly two pieces per cell, giving a 2 x N matrix.
halves <- vapply(as.vector(yourmat),
                 function(x) strsplit(x, "_")[[1]],
                 character(2), USE.NAMES = FALSE)

# Name of the source column for each cell, in the same column-major order.
src_col <- rep(colnames(yourmat), each = nrow(yourmat))

# One row per cell; row names are <column name><row index>.
inter <- data.frame(t(halves),
                    row.names = paste0(src_col, seq_len(nrow(yourmat))),
                    stringsAsFactors = FALSE)

# Group the rows by their source column (in the original column order) and
# bind the groups side by side. Grouping on src_col directly, rather than on
# the first four characters of the row names, works for column names of any
# length.
res <- do.call("cbind",
               split(inter, factor(src_col, levels = colnames(yourmat))))
res
#       9178.X1 9178.X2 3574.X1 3574.X2 3547.X1 3547.X2
# 91781       B       B       B       B       A       A
# 91782       B       B       A       B       A       B
# 91783       B       B       B       B       A       A
# 91784       A       B       A       B       A       A
# 91785       B       B       A       B       A       A
# 91786       B       B       A       B       A       A

编辑
如果你想让 res 的 row.names 与 yourmat 的一样,你可以这样做:

# Carry the matrix's row labels over to the result.
rownames(res) <- rownames(yourmat)

注意:如果 yourmat 是 data.frame 而不是 matrix,则第一行中的 as.vector 函数需要改成 unlist。

我有偏见,但我建议使用我的 "splitstackshape" 包中的 cSplit。由于您的输入中似乎有 rownames,因此请使用 as.data.table(., keep.rownames = TRUE):

library(splitstackshape)
# keep.rownames = TRUE stores the row names in an "rn" column; every original
# column is then split on "_".
cSplit(
  as.data.table(mydf, keep.rownames = TRUE),
  splitCols = names(mydf),
  sep = "_"
)
#     rn X9178_1 X9178_2 X3574_1 X3574_2 X3547_1 X3547_2
# 1: 160       B       B       B       B       A       A
# 2: 301       B       B       A       B       A       B
# 3: 303       B       B       B       B       A       A
# 4: 311       A       B       A       B       A       A
# 5: 312       B       B       A       B       A       A
# 6: 314       B       B       A       B       A       A

比 cSplit 可读性差一些(但目前可能更快)的做法是使用 "stringi" 中的 stri_split_fixed,像这样:

library(stringi)
# Split each column on "_" into a character matrix, bind those matrices side
# by side, and label the result: original row names, each column name twice.
`dimnames<-`(
  do.call(
    cbind,
    lapply(mydf, function(col) stri_split_fixed(col, "_", simplify = TRUE))
  ),
  list(rownames(mydf), rep(colnames(mydf), each = 2))
)
#     X9178 X9178 X3574 X3574 X3547 X3547
# 160 "B"   "B"   "B"   "B"   "A"   "A"  
# 301 "B"   "B"   "A"   "B"   "A"   "B"  
# 303 "B"   "B"   "B"   "B"   "A"   "A"  
# 311 "A"   "B"   "A"   "B"   "A"   "A"  
# 312 "B"   "B"   "A"   "B"   "A"   "A"  
# 314 "B"   "B"   "A"   "B"   "A"   "A" 

如果速度很重要,我建议检查 "iotools" package,尤其是 mstrsplit 函数。该方法类似于 "stringi" 方法:

library(iotools)
# Split each column on "_" into a two-column character matrix, bind those
# matrices side by side, and label the result: original row names, each
# column name twice.
`dimnames<-`(
  do.call(
    cbind,
    lapply(mydf, function(col) {
      mstrsplit(col, sep = "_", ncol = 2, type = "character")
    })
  ),
  list(rownames(mydf), rep(colnames(mydf), each = 2))
)

如果您在从 matrix 转换为 data.frame 时忘记使用 stringsAsFactors = FALSE,您可能需要在其中添加一个 lapply(mydf, as.character),但它应该仍然有效,甚至对 stri_split 方法也是如此。

不使用数据框的base R解决方案:

# split: strsplit() visits the cells column by column, so z alternates
# left piece, right piece, left piece, ... for every cell of m
z <- unlist(strsplit(m, "_"))
is_left <- c(TRUE, FALSE)
# the first ncol(m) columns hold the left pieces, the remaining ones the right
M <- matrix(c(z[is_left], z[!is_left]), nrow = nrow(m))

# properly order columns: interleave left/right so that the two pieces of each
# original column sit next to each other; drop = FALSE keeps M a matrix even
# when m has a single row
i <- seq_len(ncol(M))
M <- M[, order(c(i[is_left], i[!is_left])), drop = FALSE]

# set dimnames
rownames(M) <- rownames(m)
colnames(M) <- rep(colnames(m), each = 2)

#     9178 9178 3574 3574 3547 3547
# 160 "B"  "B"  "B"  "B"  "A"  "A" 
# 301 "B"  "B"  "A"  "B"  "A"  "B" 
# 303 "B"  "B"  "B"  "B"  "A"  "A" 
# 311 "A"  "B"  "A"  "B"  "A"  "A" 
# 312 "B"  "B"  "A"  "B"  "A"  "A" 
# 314 "B"  "B"  "A"  "B"  "A"  "A" 

[更新] 这是对所提议解决方案的小型基准研究(我没有包括 cSplit 解决方案,因为它太慢了):

设置:

# Benchmark inputs: a 1000 x 2830 character matrix filled with "A_B", and the
# same data as a data.frame with character (not factor) columns.
m <- matrix("A_B", nrow = 1000, ncol = 2830)
d <- as.data.frame(m, stringsAsFactors = FALSE)

##### 
# Split every "A_B"-style cell of a character matrix on "_" using base R.
#
# m: character matrix whose cells each contain exactly one "_".
# Returns a character matrix with nrow(m) rows and 2 * ncol(m) columns; the
# two pieces of each original column are adjacent, row names are copied from
# m and every column name of m appears twice.
f.mtrx <- function(m) {
  # strsplit() visits the cells column by column, so z alternates
  # left piece, right piece, left piece, ...
  z <- unlist(strsplit(m, "_"))
  is_left <- c(TRUE, FALSE)
  # First ncol(m) columns: left pieces; remaining columns: right pieces.
  M <- matrix(c(z[is_left], z[!is_left]), nrow = nrow(m))

  # properly order columns: interleave left/right pieces. drop = FALSE keeps
  # the result a matrix when m has a single row.
  i <- seq_len(ncol(M))
  M <- M[, order(c(i[is_left], i[!is_left])), drop = FALSE]

  # set dimnames
  rownames(M) <- rownames(m)
  colnames(M) <- rep(colnames(m), each = 2)
  M
}

library(stringi)
# Same as the base-R matrix approach, but splitting with
# stringi::stri_split_fixed() instead of strsplit().
#
# m: character matrix whose cells each contain exactly one "_".
# Returns a character matrix with nrow(m) rows and 2 * ncol(m) columns; the
# two pieces of each original column are adjacent, row names are copied from
# m and every column name of m appears twice.
f.mtrx2 <- function(m) {
  # Cells are visited column by column, so z alternates
  # left piece, right piece, left piece, ...
  z <- unlist(stri_split_fixed(m, "_"))
  is_left <- c(TRUE, FALSE)
  # First ncol(m) columns: left pieces; remaining columns: right pieces.
  M <- matrix(c(z[is_left], z[!is_left]), nrow = nrow(m))

  # properly order columns: interleave left/right pieces. drop = FALSE keeps
  # the result a matrix when m has a single row.
  i <- seq_len(ncol(M))
  M <- M[, order(c(i[is_left], i[!is_left])), drop = FALSE]

  # set dimnames
  rownames(M) <- rownames(m)
  colnames(M) <- rep(colnames(m), each = 2)
  M
}

#####
library(splitstackshape)
# Benchmark wrapper around splitstackshape::cSplit(): convert mydf to a
# data.table (row names kept as a column) and split every original column
# on "_".
f.cSplit <- function(mydf) {
  dt <- as.data.table(mydf, keep.rownames = TRUE)
  cSplit(dt, splitCols = names(mydf), sep = "_")
}

#####
library(stringi)
# Benchmark wrapper for the stringi approach: split each column of mydf on
# "_" into a character matrix, bind the matrices column-wise and label the
# result with mydf's row names and each column name repeated twice.
f.stringi <- function(mydf) {
  pieces <- lapply(mydf, function(col) {
    stri_split_fixed(col, "_", simplify = TRUE)
  })
  wide <- do.call(cbind, pieces)
  dimnames(wide) <- list(rownames(mydf), rep(colnames(mydf), each = 2))
  wide
}

#####
library(dplyr)
library(tidyr)

# Benchmark wrapper for the dplyr/tidyr approach: split every column of df on
# "_" into <col>_1 and <col>_2, then bind the per-column results side by side.
# separate() with all_of() replaces the deprecated separate_() verb.
f.dplyr <- function(df) {
  lapply(names(df), function(x) {
    separate(df[x], col = all_of(x), into = paste0(x, "_", 1:2), sep = "_")
  }) %>%
    bind_cols()
}

#####
library(iotools)
# Benchmark wrapper for the iotools approach: split each column of mydf on
# "_" into a two-column character matrix, bind the matrices column-wise and
# label the result with mydf's row names and each column name repeated twice.
f.mstrsplit <- function(mydf) {
  pieces <- lapply(mydf, function(col) {
    mstrsplit(col, sep = "_", ncol = 2, type = "character")
  })
  wide <- do.call(cbind, pieces)
  dimnames(wide) <- list(rownames(mydf), rep(colnames(mydf), each = 2))
  wide
}



#####
library(rbenchmark)

# Time each approach over 10 replications: the matrix-based functions get the
# matrix m, the data.frame-based ones get d.
benchmark(
  f.mtrx(m),
  f.mtrx2(m),
  f.dplyr(d),
  f.stringi(d),
  f.mstrsplit(d),
  replications = 10
)

结果:

      test replications elapsed relative user.self sys.self user.child sys.child
3     f.dplyr(d)           10  27.722   10.162    27.360    0.269          0         0
5 f.mstrsplit(d)           10   2.728    1.000     2.607    0.098          0         0
1      f.mtrx(m)           10  37.943   13.909    34.885    0.799          0         0
2     f.mtrx2(m)           10  15.176    5.563    13.936    0.802          0         0
4   f.stringi(d)           10   8.107    2.972     7.815    0.247          0         0

在更新的基准测试中,获胜者是 f.mstrsplit