拆分数据框中一行内的列字符串元素
Split column string elements within a row inside a dataframe
我有一个这样的矩阵 (1000 x 2830):
9178 3574 3547
160 B_B B_B A_A
301 B_B A_B A_B
303 B_B B_B A_A
311 A_B A_B A_A
312 B_B A_B A_A
314 B_B A_B A_A
我想获得以下内容(复制列名并拆分每列的每个元素):
9178 9178 3574 3574 3547 3547
160 B B B B A A
301 B B A B A B
303 B B B B A A
311 A B A B A A
312 B B A B A A
314 B B A B A A
我尝试使用 strsplit
但我收到了错误消息,因为这是一个矩阵,而不是一个字符串。您能否提供一些解决此问题的想法?
这是一个选项,使用 dplyr
(对于 bind_cols
)和 tidyr
(对于 separate_
)以及基础 R 中的 lapply
。它假设您的数据是 data.frame(即您可能需要先将其转换为 data.frame):
library(dplyr)
library(tidyr)
# For every column of `df`, split its "<a>_<b>" strings on "_" into two new
# columns named <col>_1 and <col>_2, then bind the per-column results side by
# side. separate() with all_of() replaces the deprecated separate_().
lapply(names(df), function(x) {
  separate(df[x], col = all_of(x), into = paste0(x, "_", 1:2), sep = "_")
}) %>%
  bind_cols()
# X9178_1 X9178_2 X3574_1 X3574_2 X3547_1 X3547_2
#1 B B B B A A
#2 B B A B A B
#3 B B B B A A
#4 A B A B A A
#5 B B A B A A
#6 B B A B A A
你可以做的事情,虽然看起来有点 "twisted"(yourmat
是你的矩阵)...:
# Source-column label of every cell of `yourmat`, in column-major order (the
# order in which as.vector() walks the matrix).
col_of <- rep(colnames(yourmat), each = nrow(yourmat))
# Split each cell on "_"; every cell is expected to yield exactly two pieces.
pieces <- vapply(
  as.vector(yourmat),
  function(x) strsplit(x, "_")[[1]],
  character(2),
  USE.NAMES = FALSE
)
# One row per cell, two columns (X1, X2); row names are <column><row index>.
inter <- data.frame(
  t(pieces),
  row.names = paste0(col_of, seq_len(nrow(yourmat))),
  stringsAsFactors = FALSE
)
# Group the rows by their source column using the labels directly (instead of
# the first 4 characters of the row names, which only worked for 4-character
# column names), then bind the groups side by side.
res <- do.call(
  "cbind",
  split(inter, factor(col_of, levels = colnames(yourmat)))
)
res
# 9178.X1 9178.X2 3574.X1 3574.X2 3547.X1 3547.X2
# 91781 B B B B A A
# 91782 B B A B A B
# 91783 B B B B A A
# 91784 A B A B A A
# 91785 B B A B A A
# 91786 B B A B A A
编辑
如果你想让res
的row.names
和yourmat
一样,你可以这样做:
# Replace the row names of res with those of the original matrix.
row.names(res)<-row.names(yourmat)
注意: 如果 yourmat
是 data.frame
而不是 matrix
,则第一行中的 as.vector
函数需要改成 unlist
。
我有偏见,但我建议使用我的 "splitstackshape" 包中的 cSplit
。由于您的输入中似乎有 rownames
,因此请使用 as.data.table(., keep.rownames = TRUE)
:
library(splitstackshape)
# Convert mydf to a data.table, keeping its row names (they appear as the "rn"
# column in the output), then split every original column on "_".
cSplit(as.data.table(mydf, keep.rownames = TRUE), names(mydf), "_")
# rn X9178_1 X9178_2 X3574_1 X3574_2 X3547_1 X3547_2
# 1: 160 B B B B A A
# 2: 301 B B A B A B
# 3: 303 B B B B A A
# 4: 311 A B A B A A
# 5: 312 B B A B A A
# 6: 314 B B A B A A
比 cSplit
更不清晰(但目前可能更快)是使用 "stringi" 中的 stri_split_fixed
,像这样:
library(stringi)
# Split each column of mydf on "_" with stri_split_fixed(simplify = TRUE),
# cbind the per-column results, and set the dimnames in one call: the original
# row names, and each original column name repeated twice.
`dimnames<-`(do.call(cbind,
lapply(mydf, stri_split_fixed, "_", simplify = TRUE)),
list(rownames(mydf), rep(colnames(mydf), each = 2)))
# X9178 X9178 X3574 X3574 X3547 X3547
# 160 "B" "B" "B" "B" "A" "A"
# 301 "B" "B" "A" "B" "A" "B"
# 303 "B" "B" "B" "B" "A" "A"
# 311 "A" "B" "A" "B" "A" "A"
# 312 "B" "B" "A" "B" "A" "A"
# 314 "B" "B" "A" "B" "A" "A"
如果速度很重要,我建议检查 "iotools" package,尤其是 mstrsplit
函数。该方法类似于 "stringi" 方法:
library(iotools)
# Same shape as the stringi version: split each column of mydf on "_" with
# mstrsplit (two character columns per input column), cbind the results, and
# set the dimnames to the original row names and each column name repeated twice.
`dimnames<-`(do.call(cbind,
lapply(mydf, mstrsplit, "_", ncol = 2, type = "character")),
list(rownames(mydf), rep(colnames(mydf), each = 2)))
如果您在从 matrix
转换为 data.frame
时忘记使用 stringsAsFactors = FALSE
,您可能需要在其中添加一个 lapply(mydf, as.character)
,但它应该仍然有效甚至 stri_split
方法。
不使用数据框的base R解决方案:
# split: z alternates first piece, second piece for each cell of m, walking m
# in column-major order (each cell is expected to look like "<a>_<b>")
z <- unlist(strsplit(m, "_"))
# first ncol(m) columns hold the first pieces, the remaining columns the second
M <- matrix(c(z[c(TRUE, FALSE)], z[c(FALSE, TRUE)]), nrow = nrow(m))
# properly order columns: interleave so both halves of a source column are
# adjacent; drop = FALSE keeps M a matrix even when m has a single row
i <- seq_len(ncol(M))
M <- M[, order(c(i[c(TRUE, FALSE)], i[c(FALSE, TRUE)])), drop = FALSE]
# set dimnames
rownames(M) <- rownames(m)
colnames(M) <- rep(colnames(m), each = 2)
# 9178 9178 3574 3574 3547 3547
# 160 "B" "B" "B" "B" "A" "A"
# 301 "B" "B" "A" "B" "A" "B"
# 303 "B" "B" "B" "B" "A" "A"
# 311 "A" "B" "A" "B" "A" "A"
# 312 "B" "B" "A" "B" "A" "A"
# 314 "B" "B" "A" "B" "A" "A"
[更新]
这是对所提议解决方案的小型基准研究(我没有包括 cSplit
解决方案,因为它太慢了):
设置:
# Benchmark input: a 1000 x 2830 character matrix whose every cell is "A_B",
# and a data.frame copy of it (strings kept as character, not factors).
m <- matrix('A_B',nrow=1000,ncol=2830)
d <- as.data.frame(m, stringsAsFactors = FALSE)
#####
# Split every "<a>_<b>" cell of a character matrix into two adjacent columns.
#
# m: character matrix; each cell is expected to contain exactly one "_".
# Returns a character matrix with nrow(m) rows and 2 * ncol(m) columns; the
# row names are those of m and each column name of m appears twice.
f.mtrx <- function(m) {
  n_col <- ncol(m)
  # z alternates first piece, second piece, walking m in column-major order.
  z <- unlist(strsplit(m, "_"))
  # First n_col columns: first pieces; last n_col columns: second pieces.
  M <- matrix(c(z[c(TRUE, FALSE)], z[c(FALSE, TRUE)]), nrow = nrow(m))
  # Interleave (1, n_col + 1, 2, n_col + 2, ...) so both halves of a source
  # column sit next to each other. drop = FALSE keeps the result a matrix
  # when m has a single row.
  interleaved <- as.vector(rbind(seq_len(n_col), n_col + seq_len(n_col)))
  M <- M[, interleaved, drop = FALSE]
  rownames(M) <- rownames(m)
  colnames(M) <- rep(colnames(m), each = 2)
  M
}
library(stringi)
# Same as the base-R matrix approach, but splitting with
# stringi::stri_split_fixed instead of strsplit.
#
# m: character matrix; each cell is expected to contain exactly one "_".
# Returns a character matrix with nrow(m) rows and 2 * ncol(m) columns; the
# row names are those of m and each column name of m appears twice.
f.mtrx2 <- function(m) {
  n_col <- ncol(m)
  # z alternates first piece, second piece, walking m in column-major order.
  z <- unlist(stri_split_fixed(m, "_"))
  # First n_col columns: first pieces; last n_col columns: second pieces.
  M <- matrix(c(z[c(TRUE, FALSE)], z[c(FALSE, TRUE)]), nrow = nrow(m))
  # Interleave so both halves of a source column sit next to each other.
  # drop = FALSE keeps the result a matrix when m has a single row.
  interleaved <- as.vector(rbind(seq_len(n_col), n_col + seq_len(n_col)))
  M <- M[, interleaved, drop = FALSE]
  rownames(M) <- rownames(m)
  colnames(M) <- rep(colnames(m), each = 2)
  M
}
#####
library(splitstackshape)
# Benchmark wrapper for the splitstackshape approach: convert mydf to a
# data.table that keeps the row names, then split every column on "_".
f.cSplit <- function(mydf) {
  dt <- as.data.table(mydf, keep.rownames = TRUE)
  cSplit(dt, names(mydf), "_")
}
#####
library(stringi)
# Benchmark wrapper for the stringi approach: split each column of mydf on
# "_" (simplify = TRUE), bind the pieces column-wise, and label the result
# with the original row names and each column name repeated twice.
f.stringi <- function(mydf) {
  pieces <- lapply(mydf, stri_split_fixed, "_", simplify = TRUE)
  wide <- do.call(cbind, pieces)
  dimnames(wide) <- list(rownames(mydf), rep(colnames(mydf), each = 2))
  wide
}
#####
library(dplyr)
library(tidyr)
# Benchmark wrapper for the dplyr/tidyr approach.
#
# df: data.frame whose columns hold "<a>_<b>" strings.
# Returns the column-bound result of splitting every column on "_" into two
# columns named <col>_1 and <col>_2.
f.dplyr <- function(df) {
  # separate() with all_of() replaces the deprecated separate_().
  split_one <- function(x) {
    separate(df[x], col = all_of(x), into = paste0(x, "_", 1:2), sep = "_")
  }
  lapply(names(df), split_one) %>%
    bind_cols()
}
#####
library(iotools)
# Benchmark wrapper for the iotools approach: split each column of mydf on
# "_" with mstrsplit (two character columns each), bind the pieces
# column-wise, and label the result with the original row names and each
# column name repeated twice.
f.mstrsplit <- function(mydf) {
  pieces <- lapply(mydf, mstrsplit, "_", ncol = 2, type = "character")
  wide <- do.call(cbind, pieces)
  dimnames(wide) <- list(rownames(mydf), rep(colnames(mydf), each = 2))
  wide
}
#####
library(rbenchmark)
# Time each candidate over 10 replications: the matrix-based functions get the
# matrix m, the data.frame-based ones get the data.frame d.
benchmark(f.mtrx(m), f.mtrx2(m), f.dplyr(d), f.stringi(d), f.mstrsplit(d), replications = 10)
结果:
test replications elapsed relative user.self sys.self user.child sys.child
3 f.dplyr(d) 10 27.722 10.162 27.360 0.269 0 0
5 f.mstrsplit(d) 10 2.728 1.000 2.607 0.098 0 0
1 f.mtrx(m) 10 37.943 13.909 34.885 0.799 0 0
2 f.mtrx2(m) 10 15.176 5.563 13.936 0.802 0 0
4 f.stringi(d) 10 8.107 2.972 7.815 0.247 0 0
在更新的基准测试中,获胜者是 f.mstrsplit
。
我有一个这样的矩阵 (1000 x 2830):
9178 3574 3547
160 B_B B_B A_A
301 B_B A_B A_B
303 B_B B_B A_A
311 A_B A_B A_A
312 B_B A_B A_A
314 B_B A_B A_A
我想获得以下内容(复制列名并拆分每列的每个元素):
9178 9178 3574 3574 3547 3547
160 B B B B A A
301 B B A B A B
303 B B B B A A
311 A B A B A A
312 B B A B A A
314 B B A B A A
我尝试使用 strsplit
但我收到了错误消息,因为这是一个矩阵,而不是一个字符串。您能否提供一些解决此问题的想法?
这是一个选项,使用 dplyr
(对于 bind_cols
)和 tidyr
(对于 separate_
)以及基础 R 中的 lapply
。它假设您的数据是 data.frame(即您可能需要先将其转换为 data.frame):
library(dplyr)
library(tidyr)
# For every column of `df`, split its "<a>_<b>" strings on "_" into two new
# columns named <col>_1 and <col>_2, then bind the per-column results side by
# side. separate() with all_of() replaces the deprecated separate_().
lapply(names(df), function(x) {
  separate(df[x], col = all_of(x), into = paste0(x, "_", 1:2), sep = "_")
}) %>%
  bind_cols()
# X9178_1 X9178_2 X3574_1 X3574_2 X3547_1 X3547_2
#1 B B B B A A
#2 B B A B A B
#3 B B B B A A
#4 A B A B A A
#5 B B A B A A
#6 B B A B A A
你可以做的事情,虽然看起来有点 "twisted"(yourmat
是你的矩阵)...:
# Source-column label of every cell of `yourmat`, in column-major order (the
# order in which as.vector() walks the matrix).
col_of <- rep(colnames(yourmat), each = nrow(yourmat))
# Split each cell on "_"; every cell is expected to yield exactly two pieces.
pieces <- vapply(
  as.vector(yourmat),
  function(x) strsplit(x, "_")[[1]],
  character(2),
  USE.NAMES = FALSE
)
# One row per cell, two columns (X1, X2); row names are <column><row index>.
inter <- data.frame(
  t(pieces),
  row.names = paste0(col_of, seq_len(nrow(yourmat))),
  stringsAsFactors = FALSE
)
# Group the rows by their source column using the labels directly (instead of
# the first 4 characters of the row names, which only worked for 4-character
# column names), then bind the groups side by side.
res <- do.call(
  "cbind",
  split(inter, factor(col_of, levels = colnames(yourmat)))
)
res
# 9178.X1 9178.X2 3574.X1 3574.X2 3547.X1 3547.X2
# 91781 B B B B A A
# 91782 B B A B A B
# 91783 B B B B A A
# 91784 A B A B A A
# 91785 B B A B A A
# 91786 B B A B A A
编辑
如果你想让res
的row.names
和yourmat
一样,你可以这样做:
# Replace the row names of res with those of the original matrix.
row.names(res)<-row.names(yourmat)
注意: 如果 yourmat
是 data.frame
而不是 matrix
,则第一行中的 as.vector
函数需要改成 unlist
。
我有偏见,但我建议使用我的 "splitstackshape" 包中的 cSplit
。由于您的输入中似乎有 rownames
,因此请使用 as.data.table(., keep.rownames = TRUE)
:
library(splitstackshape)
# Convert mydf to a data.table, keeping its row names (they appear as the "rn"
# column in the output), then split every original column on "_".
cSplit(as.data.table(mydf, keep.rownames = TRUE), names(mydf), "_")
# rn X9178_1 X9178_2 X3574_1 X3574_2 X3547_1 X3547_2
# 1: 160 B B B B A A
# 2: 301 B B A B A B
# 3: 303 B B B B A A
# 4: 311 A B A B A A
# 5: 312 B B A B A A
# 6: 314 B B A B A A
比 cSplit
更不清晰(但目前可能更快)是使用 "stringi" 中的 stri_split_fixed
,像这样:
library(stringi)
# Split each column of mydf on "_" with stri_split_fixed(simplify = TRUE),
# cbind the per-column results, and set the dimnames in one call: the original
# row names, and each original column name repeated twice.
`dimnames<-`(do.call(cbind,
lapply(mydf, stri_split_fixed, "_", simplify = TRUE)),
list(rownames(mydf), rep(colnames(mydf), each = 2)))
# X9178 X9178 X3574 X3574 X3547 X3547
# 160 "B" "B" "B" "B" "A" "A"
# 301 "B" "B" "A" "B" "A" "B"
# 303 "B" "B" "B" "B" "A" "A"
# 311 "A" "B" "A" "B" "A" "A"
# 312 "B" "B" "A" "B" "A" "A"
# 314 "B" "B" "A" "B" "A" "A"
如果速度很重要,我建议检查 "iotools" package,尤其是 mstrsplit
函数。该方法类似于 "stringi" 方法:
library(iotools)
# Same shape as the stringi version: split each column of mydf on "_" with
# mstrsplit (two character columns per input column), cbind the results, and
# set the dimnames to the original row names and each column name repeated twice.
`dimnames<-`(do.call(cbind,
lapply(mydf, mstrsplit, "_", ncol = 2, type = "character")),
list(rownames(mydf), rep(colnames(mydf), each = 2)))
如果您在从 matrix
转换为 data.frame
时忘记使用 stringsAsFactors = FALSE
,您可能需要在其中添加一个 lapply(mydf, as.character)
,但它应该仍然有效甚至 stri_split
方法。
不使用数据框的base R解决方案:
# split: z alternates first piece, second piece for each cell of m, walking m
# in column-major order (each cell is expected to look like "<a>_<b>")
z <- unlist(strsplit(m, "_"))
# first ncol(m) columns hold the first pieces, the remaining columns the second
M <- matrix(c(z[c(TRUE, FALSE)], z[c(FALSE, TRUE)]), nrow = nrow(m))
# properly order columns: interleave so both halves of a source column are
# adjacent; drop = FALSE keeps M a matrix even when m has a single row
i <- seq_len(ncol(M))
M <- M[, order(c(i[c(TRUE, FALSE)], i[c(FALSE, TRUE)])), drop = FALSE]
# set dimnames
rownames(M) <- rownames(m)
colnames(M) <- rep(colnames(m), each = 2)
# 9178 9178 3574 3574 3547 3547
# 160 "B" "B" "B" "B" "A" "A"
# 301 "B" "B" "A" "B" "A" "B"
# 303 "B" "B" "B" "B" "A" "A"
# 311 "A" "B" "A" "B" "A" "A"
# 312 "B" "B" "A" "B" "A" "A"
# 314 "B" "B" "A" "B" "A" "A"
[更新]
这是对所提议解决方案的小型基准研究(我没有包括 cSplit
解决方案,因为它太慢了):
设置:
# Benchmark input: a 1000 x 2830 character matrix whose every cell is "A_B",
# and a data.frame copy of it (strings kept as character, not factors).
m <- matrix('A_B',nrow=1000,ncol=2830)
d <- as.data.frame(m, stringsAsFactors = FALSE)
#####
# Split every "<a>_<b>" cell of a character matrix into two adjacent columns.
#
# m: character matrix; each cell is expected to contain exactly one "_".
# Returns a character matrix with nrow(m) rows and 2 * ncol(m) columns; the
# row names are those of m and each column name of m appears twice.
f.mtrx <- function(m) {
  n_col <- ncol(m)
  # z alternates first piece, second piece, walking m in column-major order.
  z <- unlist(strsplit(m, "_"))
  # First n_col columns: first pieces; last n_col columns: second pieces.
  M <- matrix(c(z[c(TRUE, FALSE)], z[c(FALSE, TRUE)]), nrow = nrow(m))
  # Interleave (1, n_col + 1, 2, n_col + 2, ...) so both halves of a source
  # column sit next to each other. drop = FALSE keeps the result a matrix
  # when m has a single row.
  interleaved <- as.vector(rbind(seq_len(n_col), n_col + seq_len(n_col)))
  M <- M[, interleaved, drop = FALSE]
  rownames(M) <- rownames(m)
  colnames(M) <- rep(colnames(m), each = 2)
  M
}
library(stringi)
# Same as the base-R matrix approach, but splitting with
# stringi::stri_split_fixed instead of strsplit.
#
# m: character matrix; each cell is expected to contain exactly one "_".
# Returns a character matrix with nrow(m) rows and 2 * ncol(m) columns; the
# row names are those of m and each column name of m appears twice.
f.mtrx2 <- function(m) {
  n_col <- ncol(m)
  # z alternates first piece, second piece, walking m in column-major order.
  z <- unlist(stri_split_fixed(m, "_"))
  # First n_col columns: first pieces; last n_col columns: second pieces.
  M <- matrix(c(z[c(TRUE, FALSE)], z[c(FALSE, TRUE)]), nrow = nrow(m))
  # Interleave so both halves of a source column sit next to each other.
  # drop = FALSE keeps the result a matrix when m has a single row.
  interleaved <- as.vector(rbind(seq_len(n_col), n_col + seq_len(n_col)))
  M <- M[, interleaved, drop = FALSE]
  rownames(M) <- rownames(m)
  colnames(M) <- rep(colnames(m), each = 2)
  M
}
#####
library(splitstackshape)
# Benchmark wrapper for the splitstackshape approach: convert mydf to a
# data.table that keeps the row names, then split every column on "_".
f.cSplit <- function(mydf) {
  dt <- as.data.table(mydf, keep.rownames = TRUE)
  cSplit(dt, names(mydf), "_")
}
#####
library(stringi)
# Benchmark wrapper for the stringi approach: split each column of mydf on
# "_" (simplify = TRUE), bind the pieces column-wise, and label the result
# with the original row names and each column name repeated twice.
f.stringi <- function(mydf) {
  pieces <- lapply(mydf, stri_split_fixed, "_", simplify = TRUE)
  wide <- do.call(cbind, pieces)
  dimnames(wide) <- list(rownames(mydf), rep(colnames(mydf), each = 2))
  wide
}
#####
library(dplyr)
library(tidyr)
# Benchmark wrapper for the dplyr/tidyr approach.
#
# df: data.frame whose columns hold "<a>_<b>" strings.
# Returns the column-bound result of splitting every column on "_" into two
# columns named <col>_1 and <col>_2.
f.dplyr <- function(df) {
  # separate() with all_of() replaces the deprecated separate_().
  split_one <- function(x) {
    separate(df[x], col = all_of(x), into = paste0(x, "_", 1:2), sep = "_")
  }
  lapply(names(df), split_one) %>%
    bind_cols()
}
#####
library(iotools)
# Benchmark wrapper for the iotools approach: split each column of mydf on
# "_" with mstrsplit (two character columns each), bind the pieces
# column-wise, and label the result with the original row names and each
# column name repeated twice.
f.mstrsplit <- function(mydf) {
  pieces <- lapply(mydf, mstrsplit, "_", ncol = 2, type = "character")
  wide <- do.call(cbind, pieces)
  dimnames(wide) <- list(rownames(mydf), rep(colnames(mydf), each = 2))
  wide
}
#####
library(rbenchmark)
# Time each candidate over 10 replications: the matrix-based functions get the
# matrix m, the data.frame-based ones get the data.frame d.
benchmark(f.mtrx(m), f.mtrx2(m), f.dplyr(d), f.stringi(d), f.mstrsplit(d), replications = 10)
结果:
test replications elapsed relative user.self sys.self user.child sys.child
3 f.dplyr(d) 10 27.722 10.162 27.360 0.269 0 0
5 f.mstrsplit(d) 10 2.728 1.000 2.607 0.098 0 0
1 f.mtrx(m) 10 37.943 13.909 34.885 0.799 0 0
2 f.mtrx2(m) 10 15.176 5.563 13.936 0.802 0 0
4 f.stringi(d) 10 8.107 2.972 7.815 0.247 0 0
在更新的基准测试中,获胜者是 f.mstrsplit
。