转置的最佳方式 data.table
best way to transpose data.table
[更新:data.table
包中现在有一个原生 transpose()
函数]
我经常需要转置一个data.table
,每次都需要好几行代码,我想知道是否有比我更好的解决方案。
如果我们抽样table
library(data.table)
mydata <- data.table(col0=c("row1","row2","row3"),
col1=c(11,21,31),
col2=c(12,22,32),
col3=c(13,23,33))
mydata
# col0 col1 col2 col3
# row1 11 12 13
# row2 21 22 23
# row3 31 32 33
只要用t()
转置,就会转置成character
类型的矩阵,而对这种矩阵应用data.table
会丢失row.names
:
t(mydata)
# [,1] [,2] [,3]
# col0 "row1" "row2" "row3"
# col1 "11" "21" "31"
# col2 "12" "22" "32"
# col3 "13" "23" "33"
data.table(t(mydata))
# V1 V2 V3
# row1 row2 row3
# 11 21 31
# 12 22 32
# 13 23 33
所以我不得不为此编写一个函数:
tdt <- function(inpdt){
transposed <- t(inpdt[,-1,with=F]);
colnames(transposed) <- inpdt[[1]];
transposed <- data.table(transposed, keep.rownames=T);
setnames(transposed, 1, names(inpdt)[1]);
return(transposed);
}
tdt(mydata)
# col0 row1 row2 row3
# col1 11 21 31
# col2 12 22 32
# col3 13 23 33
有什么我可以在这里优化或以 "nicer" 方式进行的吗?
为什么不只是 melt
和 dcast
data.table
?
require(data.table)
dcast(melt(mydata, id.vars = "col0"), variable ~ col0)
# variable row1 row2 row3
# 1: col1 11 21 31
# 2: col2 12 22 32
# 3: col3 13 23 33
这里有一个仅使用data.table
的替代解决方案,更接近于使用t
进行转置的原始想法。
mydata[, data.table(t(.SD), keep.rownames=TRUE), .SDcols=-"col0"]
## rn V1 V2 V3
## 1: col1 11 21 31
## 2: col2 12 22 32
## 3: col3 13 23 33
如果保留行名很重要,可以使用 setnames
。诚然,这变得有点笨拙,可能重铸解决方案更可取。
setnames(mydata[, data.table(t(.SD), keep.rownames=TRUE), .SDcols=-"col0"],
mydata[, c('rn', col0)])[]
## rn row1 row2 row3
## 1: col1 11 21 31
## 2: col2 12 22 32
## 3: col3 13 23 33
current docs 显示了内置的 transpose
方法。
具体可以这样做:
transpose(mydata, keep.names = "col", make.names = "col0")
## col row1 row2 row3
## 1: col1 11 21 31
## 2: col2 12 22 32
## 3: col3 13 23 33
这是一个使用包装器整理 data.table transpose
函数输出的解决方案。
对于非常大的数据集,这似乎比 dcast/melt 方法更有效(我在 8000 行 x 29000 列数据集上测试了它,下面的函数在大约 3 分钟内运行,但 dcast/melt 坠毁 R):
# Function to clean up output of data.table transpose:
transposedt <- function(dt, varlabel) {
require(data.table)
dtrows = names(dt)
dtcols = as.list(c(dt[,1]))
dtt = transpose(dt)
dtt[, eval(varlabel) := dtrows]
setnames(dtt, old = names(dtt), new = c(dtcols[[1]], eval(varlabel)))
dtt = dtt[-1,]
setcolorder(dtt, c(eval(varlabel), names(dtt)[1:(ncol(dtt) - 1)]))
return(dtt)
}
# Some dummy data
mydt <- data.table(col0 = c(paste0("row", seq_along(1:100))),
col01 = c(sample(seq_along(1:100), 100)),
col02 = c(sample(seq_along(1:100), 100)),
col03 = c(sample(seq_along(1:100), 100)),
col04 = c(sample(seq_along(1:100), 100)),
col05 = c(sample(seq_along(1:100), 100)),
col06 = c(sample(seq_along(1:100), 100)),
col07 = c(sample(seq_along(1:100), 100)),
col08 = c(sample(seq_along(1:100), 100)),
col09 = c(sample(seq_along(1:100), 100)),
col10 = c(sample(seq_along(1:100), 100)))
# Apply the function:
mydtt <- transposedt(mydt, "myvariables")
# View the results:
> mydtt[,1:10]
myvariables row1 row2 row3 row4 row5 row6 row7 row8 row9
1: col01 58 53 14 96 51 30 26 15 68
2: col02 6 72 46 62 69 9 63 32 78
3: col03 21 36 94 41 54 74 82 64 15
4: col04 68 41 66 30 31 78 51 67 26
5: col05 49 30 52 78 73 71 5 66 44
6: col06 89 35 79 67 6 88 62 97 73
7: col07 66 15 27 29 58 40 35 82 57
8: col08 55 47 83 30 23 65 48 56 87
9: col09 41 10 21 33 55 81 94 25 34
10: col10 35 17 41 44 21 66 69 61 46
同样有用的是,列(前行)以其原始顺序出现,您可以将变量列命名为有意义的名称。
下面我提供的tdt函数应该会更快
tdt <- function(DT, transpose.col, ...) {
# The transpose function is efficient, but lacks the keeping of row and colnames
new.row.names <- colnames(DT)
new.row.names <- new.row.names[!new.row.names %in% transpose.col]
new.col.names <- DT[, transpose.col, with = F]
DT <- DT[, !colnames(DT) %in% transpose.col, with = F]
DT <- transpose(DT, ...)
colnames(DT) <- unlist(new.col.names)
DT$var <- new.row.names
# change order of DT after transposing
setcolorder(DT, c("var", setdiff(names(DT), "var")))
colnames(DT)[1] <- transpose.col
return(DT)
}
library(microbenchmark); library(microbenchmarkCore)
DT <- data.table(x=1:1000, y=paste("name", 1:1000, sep = "_"), z = paste("test", 1:1000, sep = "."))
rbind(microbenchmark(tdt(DT, "y")),
microbenchmark(dcast(melt(DT, id.vars = "y"), variable ~ y)),
microbenchmark(DT[, data.table(t(.SD), keep.rownames=TRUE), .SDcols=-"y"]))
Unit: milliseconds
expr min lq mean median uq max neval cld
tdt(DT, "y") 3.463842 3.719341 4.308158 3.911599 4.576477 20.406940 100 a
dcast(melt(DT, id.vars = "y"), variable ~ y) 5.146119 5.496761 5.826647 5.580796 5.870584 9.536541 100 a
DT[, data.table(t(.SD), keep.rownames = TRUE), .SDcols = -"y"] 29.975567 34.554989 40.807036 36.724430 39.102396 104.242218 100 b
d <- tdt(DT, "y")
d[1:2, 1:11]
y name_1 name_2 name_3 name_4 name_5 name_6 name_7 name_8 name_9 name_10
1: x 1 2 3 4 5 6 7 8 9 10
2: z test.1 test.2 test.3 test.4 test.5 test.6 test.7 test.8 test.9 test.10
DT[1:10, 1:3]
x y z
1: 1 name_1 test.1
2: 2 name_2 test.2
3: 3 name_3 test.3
4: 4 name_4 test.4
5: 5 name_5 test.5
6: 6 name_6 test.6
7: 7 name_7 test.7
8: 8 name_8 test.8
9: 9 name_9 test.9
10: 10 name_10 test.10
class(d)
[1] "data.table" "data.frame"
df <- as.data.frame(t(mydata))
是我尝试过的 df
是 data.frame
并且 mydata
上的列名现在是 df
上的行名
[更新:data.table
包中现在有一个原生 transpose()
函数]
我经常需要转置一个data.table
,每次都需要好几行代码,我想知道是否有比我更好的解决方案。
如果我们抽样table
library(data.table)
mydata <- data.table(col0=c("row1","row2","row3"),
col1=c(11,21,31),
col2=c(12,22,32),
col3=c(13,23,33))
mydata
# col0 col1 col2 col3
# row1 11 12 13
# row2 21 22 23
# row3 31 32 33
只要用t()
转置,就会转置成character
类型的矩阵,而对这种矩阵应用data.table
会丢失row.names
:
t(mydata)
# [,1] [,2] [,3]
# col0 "row1" "row2" "row3"
# col1 "11" "21" "31"
# col2 "12" "22" "32"
# col3 "13" "23" "33"
data.table(t(mydata))
# V1 V2 V3
# row1 row2 row3
# 11 21 31
# 12 22 32
# 13 23 33
所以我不得不为此编写一个函数:
tdt <- function(inpdt){
transposed <- t(inpdt[,-1,with=F]);
colnames(transposed) <- inpdt[[1]];
transposed <- data.table(transposed, keep.rownames=T);
setnames(transposed, 1, names(inpdt)[1]);
return(transposed);
}
tdt(mydata)
# col0 row1 row2 row3
# col1 11 21 31
# col2 12 22 32
# col3 13 23 33
有什么我可以在这里优化或以 "nicer" 方式进行的吗?
为什么不只是 melt
和 dcast
data.table
?
require(data.table)
dcast(melt(mydata, id.vars = "col0"), variable ~ col0)
# variable row1 row2 row3
# 1: col1 11 21 31
# 2: col2 12 22 32
# 3: col3 13 23 33
这里有一个仅使用data.table
的替代解决方案,更接近于使用t
进行转置的原始想法。
mydata[, data.table(t(.SD), keep.rownames=TRUE), .SDcols=-"col0"]
## rn V1 V2 V3
## 1: col1 11 21 31
## 2: col2 12 22 32
## 3: col3 13 23 33
如果保留行名很重要,可以使用 setnames
。诚然,这变得有点笨拙,可能重铸解决方案更可取。
setnames(mydata[, data.table(t(.SD), keep.rownames=TRUE), .SDcols=-"col0"],
mydata[, c('rn', col0)])[]
## rn row1 row2 row3
## 1: col1 11 21 31
## 2: col2 12 22 32
## 3: col3 13 23 33
current docs 显示了内置的 transpose
方法。
具体可以这样做:
transpose(mydata, keep.names = "col", make.names = "col0")
## col row1 row2 row3
## 1: col1 11 21 31
## 2: col2 12 22 32
## 3: col3 13 23 33
这是一个使用包装器整理 data.table transpose
函数输出的解决方案。
对于非常大的数据集,这似乎比 dcast/melt 方法更有效(我在 8000 行 x 29000 列数据集上测试了它,下面的函数在大约 3 分钟内运行,但 dcast/melt 坠毁 R):
# Function to clean up output of data.table transpose:
transposedt <- function(dt, varlabel) {
require(data.table)
dtrows = names(dt)
dtcols = as.list(c(dt[,1]))
dtt = transpose(dt)
dtt[, eval(varlabel) := dtrows]
setnames(dtt, old = names(dtt), new = c(dtcols[[1]], eval(varlabel)))
dtt = dtt[-1,]
setcolorder(dtt, c(eval(varlabel), names(dtt)[1:(ncol(dtt) - 1)]))
return(dtt)
}
# Some dummy data
mydt <- data.table(col0 = c(paste0("row", seq_along(1:100))),
col01 = c(sample(seq_along(1:100), 100)),
col02 = c(sample(seq_along(1:100), 100)),
col03 = c(sample(seq_along(1:100), 100)),
col04 = c(sample(seq_along(1:100), 100)),
col05 = c(sample(seq_along(1:100), 100)),
col06 = c(sample(seq_along(1:100), 100)),
col07 = c(sample(seq_along(1:100), 100)),
col08 = c(sample(seq_along(1:100), 100)),
col09 = c(sample(seq_along(1:100), 100)),
col10 = c(sample(seq_along(1:100), 100)))
# Apply the function:
mydtt <- transposedt(mydt, "myvariables")
# View the results:
> mydtt[,1:10]
myvariables row1 row2 row3 row4 row5 row6 row7 row8 row9
1: col01 58 53 14 96 51 30 26 15 68
2: col02 6 72 46 62 69 9 63 32 78
3: col03 21 36 94 41 54 74 82 64 15
4: col04 68 41 66 30 31 78 51 67 26
5: col05 49 30 52 78 73 71 5 66 44
6: col06 89 35 79 67 6 88 62 97 73
7: col07 66 15 27 29 58 40 35 82 57
8: col08 55 47 83 30 23 65 48 56 87
9: col09 41 10 21 33 55 81 94 25 34
10: col10 35 17 41 44 21 66 69 61 46
同样有用的是,列(前行)以其原始顺序出现,您可以将变量列命名为有意义的名称。
下面我提供的tdt函数应该会更快
tdt <- function(DT, transpose.col, ...) {
# The transpose function is efficient, but lacks the keeping of row and colnames
new.row.names <- colnames(DT)
new.row.names <- new.row.names[!new.row.names %in% transpose.col]
new.col.names <- DT[, transpose.col, with = F]
DT <- DT[, !colnames(DT) %in% transpose.col, with = F]
DT <- transpose(DT, ...)
colnames(DT) <- unlist(new.col.names)
DT$var <- new.row.names
# change order of DT after transposing
setcolorder(DT, c("var", setdiff(names(DT), "var")))
colnames(DT)[1] <- transpose.col
return(DT)
}
library(microbenchmark); library(microbenchmarkCore)
DT <- data.table(x=1:1000, y=paste("name", 1:1000, sep = "_"), z = paste("test", 1:1000, sep = "."))
rbind(microbenchmark(tdt(DT, "y")),
microbenchmark(dcast(melt(DT, id.vars = "y"), variable ~ y)),
microbenchmark(DT[, data.table(t(.SD), keep.rownames=TRUE), .SDcols=-"y"]))
Unit: milliseconds
expr min lq mean median uq max neval cld
tdt(DT, "y") 3.463842 3.719341 4.308158 3.911599 4.576477 20.406940 100 a
dcast(melt(DT, id.vars = "y"), variable ~ y) 5.146119 5.496761 5.826647 5.580796 5.870584 9.536541 100 a
DT[, data.table(t(.SD), keep.rownames = TRUE), .SDcols = -"y"] 29.975567 34.554989 40.807036 36.724430 39.102396 104.242218 100 b
d <- tdt(DT, "y")
d[1:2, 1:11]
y name_1 name_2 name_3 name_4 name_5 name_6 name_7 name_8 name_9 name_10
1: x 1 2 3 4 5 6 7 8 9 10
2: z test.1 test.2 test.3 test.4 test.5 test.6 test.7 test.8 test.9 test.10
DT[1:10, 1:3]
x y z
1: 1 name_1 test.1
2: 2 name_2 test.2
3: 3 name_3 test.3
4: 4 name_4 test.4
5: 5 name_5 test.5
6: 6 name_6 test.6
7: 7 name_7 test.7
8: 8 name_8 test.8
9: 9 name_9 test.9
10: 10 name_10 test.10
class(d)
[1] "data.table" "data.frame"
df <- as.data.frame(t(mydata))
是我尝试过的 df
是 data.frame
并且 mydata
上的列名现在是 df