如何将多个(相似的)列合并为多行并减少内存使用?
How to merge multiple (similar) columns into many rows and reduce memory usage?
我的数据
包含 90730 个观测值、639 个变量。下面是一个(较小的)可重现示例:
# Reproducible four-row sample in dput() form. Columns Match..Y hold one
# athlete ("AA"); the ".1"-suffixed copies hold a second athlete ("BB")
# recorded over the same Sample values 1:4.
structure(list(Match = c(1L, 1L, 1L, 1L), Standard = c("E",
"E", "E", "E"), Athlete = c("AA", "AA", "AA",
"AA"), Team = c("ONE", "ONE", "ONE", "ONE"), Quarter = c("1_1",
"1_1", "1_1", "1_1"), Position = c("Back", "Back", "Back", "Back"), Sample = 1:4, X = c(4.9244, 4.9242, 4.924, 4.9239), Y = c(-13.3858, -13.3866,
-13.3873, -13.388), Match.1 = c(1L, 1L, 1L, 1L), Standard.1 = c("E",
"E", "E", "E"), Athlete.1 = c("BB", "BB", "BB",
"BB"), Team.1 = c("ONE", "ONE", "ONE", "ONE"), Quarter.1 = c("1_1",
"1_1", "1_1", "1_1"), Position.1 = c("Forward", "Forward", "Forward", "Forward"), Sample.1 = 1:4, X.1 = c(-12.3725, -12.3566, -12.3398,
-12.322), Y.1 = c(-15.9311, -15.926, -15.9205, -15.9146)), .Names = c("Match",
"Standard", "Athlete", "Team", "Quarter", "Position", "Sample",
"X", "Y", "Match.1", "Standard.1", "Athlete.1", "Team.1", "Quarter.1",
"Position.1", "Sample.1", "X.1", "Y.1"), row.names = c(NA, 4L
), class = "data.frame")
我的理想输出
我希望将多个(相似的)列合并到仅包含 9 列的 data.frame (c("Match", "Standard", "Athlete", "Team", "Quarter", "Position", "Sample", "X", "Y"))
我尝试了什么
我尝试使用 reshape2 包,通过下面的代码进行 melt(宽表转长表):
# Wide-to-long attempt with reshape2::melt(). Only the nine unsuffixed columns
# are declared as identifiers, so every other column is treated as a measured
# variable and stacked into a single variable/value pair.
# NOTE(review): with 639 columns that is 630 measured columns x 90730 rows
# (about 57 million rows), which likely explains the allocation failure.
mdata <- melt(
  df,
  id.vars = c(
    "Match", "Standard", "Athlete", "Team", "Quarter",
    "Position", "Sample", "X", "Y"
  )
)
返回错误 Error: cannot allocate vector of size 218.0 Mb
我还尝试通过以下代码使用 grep:
# Stack each family of like-named columns (e.g. Match, Match.1, ...) into one
# long column. `use.names = FALSE` stops unlist() from building a character
# name for every element; those names were discarded anyway when the row
# names were reset, and they are costly on a memory-constrained session.
M1Compile <- data.frame(
  Match = unlist(df[grep('^Match', names(df))], use.names = FALSE),
  Standard = unlist(df[grep('^Standard', names(df))], use.names = FALSE),
  Athlete = unlist(df[grep('^Athlete', names(df))], use.names = FALSE),
  Team = unlist(df[grep('^Team', names(df))], use.names = FALSE),
  Quarter = unlist(df[grep('^Quarter', names(df))], use.names = FALSE),
  Position = unlist(df[grep('^Position', names(df))], use.names = FALSE),
  Sample = unlist(df[grep('^Sample', names(df))], use.names = FALSE),
  X = unlist(df[grep('^X', names(df))], use.names = FALSE),
  Y = unlist(df[grep('^Y', names(df))], use.names = FALSE),
  stringsAsFactors = FALSE
)
# Kept from the original attempt; ensures plain sequential row names.
row.names(M1Compile) <- NULL
但是,我收到错误 Error: cannot allocate vector of size 8.0 Mb
有什么建议吗?我已将 df 保存为 .RData 文件并重新打开 R,但没有成功。我还运行了 gc()
并关闭了所有其他后台程序以释放内存,同样没有成功。
我的问题
是否有更好的方法来构建我的数据/减少内存并获得我的理想输出,如上所述?
会话信息
R version 3.2.2 (2015-08-14)
Platform: i386-w64-mingw32/i386 (32-bit)
Running under: Windows 7 (build 7601) Service Pack 1
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] reshape2_1.4.1
loaded via a namespace (and not attached):
[1] magrittr_1.5 plyr_1.8.3 tools_3.2.2 Rcpp_0.12.2 stringi_1.0-1
stringr_1.0.0
对于您的示例数据
# Rebuild the four-row sample from the question: two nine-column blocks side
# by side, the second block carrying a ".1" suffix.
dd <- structure(
  list(
    Match = c(1L, 1L, 1L, 1L),
    Standard = c("E", "E", "E", "E"),
    Athlete = c("AA", "AA", "AA", "AA"),
    Team = c("ONE", "ONE", "ONE", "ONE"),
    Quarter = c("1_1", "1_1", "1_1", "1_1"),
    Position = c("Back", "Back", "Back", "Back"),
    Sample = 1:4,
    X = c(4.9244, 4.9242, 4.924, 4.9239),
    Y = c(-13.3858, -13.3866, -13.3873, -13.388),
    Match.1 = c(1L, 1L, 1L, 1L),
    Standard.1 = c("E", "E", "E", "E"),
    Athlete.1 = c("BB", "BB", "BB", "BB"),
    Team.1 = c("ONE", "ONE", "ONE", "ONE"),
    Quarter.1 = c("1_1", "1_1", "1_1", "1_1"),
    Position.1 = c("Forward", "Forward", "Forward", "Forward"),
    Sample.1 = 1:4,
    X.1 = c(-12.3725, -12.3566, -12.3398, -12.322),
    Y.1 = c(-15.9311, -15.926, -15.9205, -15.9146)
  ),
  .Names = c(
    "Match", "Standard", "Athlete", "Team", "Quarter", "Position", "Sample",
    "X", "Y", "Match.1", "Standard.1", "Athlete.1", "Team.1", "Quarter.1",
    "Position.1", "Sample.1", "X.1", "Y.1"
  ),
  row.names = c(NA, 4L),
  class = "data.frame"
)

# Recycle the four sample rows up to 90,000 rows so the timing below is
# measured on a realistic row count.
dd <- dd[rep_len(seq.int(nrow(dd)), 90000), ]

system.time({
  # `varying` is a list of nine index pairs, c(1, 10), c(2, 11), ..., c(9, 18):
  # column i is stacked on top of its ".1" counterpart at column i + 9.
  tmp <- reshape(
    dd,
    direction = "long",
    varying = as.list(data.frame(t(matrix(1:18, ncol = 2))))
  )
})
# user system elapsed
# 0.144 0.014 0.158
dim(tmp)
# [1] 180000 11
head(tmp)
# time Match Standard Athlete Team Quarter Position Sample X Y id
# 1.1 1 1 E AA ONE 1_1 Back 1 4.9244 -13.3858 1
# 2.1 1 1 E AA ONE 1_1 Back 2 4.9242 -13.3866 2
# 3.1 1 1 E AA ONE 1_1 Back 3 4.9240 -13.3873 3
# 4.1 1 1 E AA ONE 1_1 Back 4 4.9239 -13.3880 4
# 5.1 1 1 E AA ONE 1_1 Back 1 4.9244 -13.3858 5
# 6.1 1 1 E AA ONE 1_1 Back 2 4.9242 -13.3866 6
对于 90,000 x 600 的数据,您可能想要使用更高效的方法
# Loaded outside the timed region so package attach time is not measured.
library(data.table)

# Repeat the 18 sample columns ten times (18 x 10 = 180 columns) to mimic a
# wide table made of many repeated nine-column groups.
dd <- dd[, rep(1:18, 10)]
dim(dd)
# [1] 90000 180
system.time({
  setDT(dd)
  # One pattern per output column. Each is anchored at the start of the name,
  # like the question's own grep('^Match', ...) calls, so a stem can only
  # match columns that begin with it.
  dd <- melt(dd, id.vars = NULL,
             measure.vars = patterns(paste0("^", names(dd)[1:9])),
             value.name = names(dd)[1:9])
})
# user system elapsed
# 0.070 0.031 0.101
dim(dd)
# [1] 1800000 10
# variable Match Standard Athlete Team Quarter Position Sample X Y
# 1: 1 1 E AA ONE 1_1 Back 1 4.9244 -13.3858
# 2: 1 1 E AA ONE 1_1 Back 2 4.9242 -13.3866
# 3: 1 1 E AA ONE 1_1 Back 3 4.9240 -13.3873
# 4: 1 1 E AA ONE 1_1 Back 4 4.9239 -13.3880
# 5: 1 1 E AA ONE 1_1 Back 1 4.9244 -13.3858
# ---
# 1799996: 20 1 E BB ONE 1_1 Forward 4 -12.3220 -15.9146
# 1799997: 20 1 E BB ONE 1_1 Forward 1 -12.3725 -15.9311
# 1799998: 20 1 E BB ONE 1_1 Forward 2 -12.3566 -15.9260
# 1799999: 20 1 E BB ONE 1_1 Forward 3 -12.3398 -15.9205
# 1800000: 20 1 E BB ONE 1_1 Forward 4 -12.3220 -15.9146
我的数据
包含 90730 个观测值、639 个变量。下面是一个(较小的)可重现示例:
# Reproducible four-row sample in dput() form. Columns Match..Y hold one
# athlete ("AA"); the ".1"-suffixed copies hold a second athlete ("BB")
# recorded over the same Sample values 1:4.
structure(list(Match = c(1L, 1L, 1L, 1L), Standard = c("E",
"E", "E", "E"), Athlete = c("AA", "AA", "AA",
"AA"), Team = c("ONE", "ONE", "ONE", "ONE"), Quarter = c("1_1",
"1_1", "1_1", "1_1"), Position = c("Back", "Back", "Back", "Back"), Sample = 1:4, X = c(4.9244, 4.9242, 4.924, 4.9239), Y = c(-13.3858, -13.3866,
-13.3873, -13.388), Match.1 = c(1L, 1L, 1L, 1L), Standard.1 = c("E",
"E", "E", "E"), Athlete.1 = c("BB", "BB", "BB",
"BB"), Team.1 = c("ONE", "ONE", "ONE", "ONE"), Quarter.1 = c("1_1",
"1_1", "1_1", "1_1"), Position.1 = c("Forward", "Forward", "Forward", "Forward"), Sample.1 = 1:4, X.1 = c(-12.3725, -12.3566, -12.3398,
-12.322), Y.1 = c(-15.9311, -15.926, -15.9205, -15.9146)), .Names = c("Match",
"Standard", "Athlete", "Team", "Quarter", "Position", "Sample",
"X", "Y", "Match.1", "Standard.1", "Athlete.1", "Team.1", "Quarter.1",
"Position.1", "Sample.1", "X.1", "Y.1"), row.names = c(NA, 4L
), class = "data.frame")
我的理想输出
我希望将多个(相似的)列合并到仅包含 9 列的 data.frame (c("Match", "Standard", "Athlete", "Team", "Quarter", "Position", "Sample", "X", "Y"))
我尝试了什么
我尝试使用 reshape2 包,通过下面的代码进行 melt(宽表转长表):
# Wide-to-long attempt with reshape2::melt(). Only the nine unsuffixed columns
# are declared as identifiers, so every other column is treated as a measured
# variable and stacked into a single variable/value pair.
# NOTE(review): with 639 columns that is 630 measured columns x 90730 rows
# (about 57 million rows), which likely explains the allocation failure.
mdata <- melt(
  df,
  id.vars = c(
    "Match", "Standard", "Athlete", "Team", "Quarter",
    "Position", "Sample", "X", "Y"
  )
)
返回错误 Error: cannot allocate vector of size 218.0 Mb
我还尝试通过以下代码使用 grep:
# Stack each family of like-named columns (e.g. Match, Match.1, ...) into one
# long column. `use.names = FALSE` stops unlist() from building a character
# name for every element; those names were discarded anyway when the row
# names were reset, and they are costly on a memory-constrained session.
M1Compile <- data.frame(
  Match = unlist(df[grep('^Match', names(df))], use.names = FALSE),
  Standard = unlist(df[grep('^Standard', names(df))], use.names = FALSE),
  Athlete = unlist(df[grep('^Athlete', names(df))], use.names = FALSE),
  Team = unlist(df[grep('^Team', names(df))], use.names = FALSE),
  Quarter = unlist(df[grep('^Quarter', names(df))], use.names = FALSE),
  Position = unlist(df[grep('^Position', names(df))], use.names = FALSE),
  Sample = unlist(df[grep('^Sample', names(df))], use.names = FALSE),
  X = unlist(df[grep('^X', names(df))], use.names = FALSE),
  Y = unlist(df[grep('^Y', names(df))], use.names = FALSE),
  stringsAsFactors = FALSE
)
# Kept from the original attempt; ensures plain sequential row names.
row.names(M1Compile) <- NULL
但是,我收到错误 Error: cannot allocate vector of size 8.0 Mb
有什么建议吗?我已将 df 保存为 .RData 文件并重新打开 R,但没有成功。我还运行了 gc()
并关闭了所有其他后台程序以释放内存,同样没有成功。
我的问题
是否有更好的方法来构建我的数据/减少内存并获得我的理想输出,如上所述?
会话信息
R version 3.2.2 (2015-08-14)
Platform: i386-w64-mingw32/i386 (32-bit)
Running under: Windows 7 (build 7601) Service Pack 1
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] reshape2_1.4.1
loaded via a namespace (and not attached):
[1] magrittr_1.5 plyr_1.8.3 tools_3.2.2 Rcpp_0.12.2 stringi_1.0-1
stringr_1.0.0
对于您的示例数据
# Rebuild the four-row sample from the question: two nine-column blocks side
# by side, the second block carrying a ".1" suffix.
dd <- structure(
  list(
    Match = c(1L, 1L, 1L, 1L),
    Standard = c("E", "E", "E", "E"),
    Athlete = c("AA", "AA", "AA", "AA"),
    Team = c("ONE", "ONE", "ONE", "ONE"),
    Quarter = c("1_1", "1_1", "1_1", "1_1"),
    Position = c("Back", "Back", "Back", "Back"),
    Sample = 1:4,
    X = c(4.9244, 4.9242, 4.924, 4.9239),
    Y = c(-13.3858, -13.3866, -13.3873, -13.388),
    Match.1 = c(1L, 1L, 1L, 1L),
    Standard.1 = c("E", "E", "E", "E"),
    Athlete.1 = c("BB", "BB", "BB", "BB"),
    Team.1 = c("ONE", "ONE", "ONE", "ONE"),
    Quarter.1 = c("1_1", "1_1", "1_1", "1_1"),
    Position.1 = c("Forward", "Forward", "Forward", "Forward"),
    Sample.1 = 1:4,
    X.1 = c(-12.3725, -12.3566, -12.3398, -12.322),
    Y.1 = c(-15.9311, -15.926, -15.9205, -15.9146)
  ),
  .Names = c(
    "Match", "Standard", "Athlete", "Team", "Quarter", "Position", "Sample",
    "X", "Y", "Match.1", "Standard.1", "Athlete.1", "Team.1", "Quarter.1",
    "Position.1", "Sample.1", "X.1", "Y.1"
  ),
  row.names = c(NA, 4L),
  class = "data.frame"
)

# Recycle the four sample rows up to 90,000 rows so the timing below is
# measured on a realistic row count.
dd <- dd[rep_len(seq.int(nrow(dd)), 90000), ]

system.time({
  # `varying` is a list of nine index pairs, c(1, 10), c(2, 11), ..., c(9, 18):
  # column i is stacked on top of its ".1" counterpart at column i + 9.
  tmp <- reshape(
    dd,
    direction = "long",
    varying = as.list(data.frame(t(matrix(1:18, ncol = 2))))
  )
})
# user system elapsed
# 0.144 0.014 0.158
dim(tmp)
# [1] 180000 11
head(tmp)
# time Match Standard Athlete Team Quarter Position Sample X Y id
# 1.1 1 1 E AA ONE 1_1 Back 1 4.9244 -13.3858 1
# 2.1 1 1 E AA ONE 1_1 Back 2 4.9242 -13.3866 2
# 3.1 1 1 E AA ONE 1_1 Back 3 4.9240 -13.3873 3
# 4.1 1 1 E AA ONE 1_1 Back 4 4.9239 -13.3880 4
# 5.1 1 1 E AA ONE 1_1 Back 1 4.9244 -13.3858 5
# 6.1 1 1 E AA ONE 1_1 Back 2 4.9242 -13.3866 6
对于 90,000 x 600 的数据,您可能想要使用更高效的方法
# Loaded outside the timed region so package attach time is not measured.
library(data.table)

# Repeat the 18 sample columns ten times (18 x 10 = 180 columns) to mimic a
# wide table made of many repeated nine-column groups.
dd <- dd[, rep(1:18, 10)]
dim(dd)
# [1] 90000 180
system.time({
  setDT(dd)
  # One pattern per output column. Each is anchored at the start of the name,
  # like the question's own grep('^Match', ...) calls, so a stem can only
  # match columns that begin with it.
  dd <- melt(dd, id.vars = NULL,
             measure.vars = patterns(paste0("^", names(dd)[1:9])),
             value.name = names(dd)[1:9])
})
# user system elapsed
# 0.070 0.031 0.101
dim(dd)
# [1] 1800000 10
# variable Match Standard Athlete Team Quarter Position Sample X Y
# 1: 1 1 E AA ONE 1_1 Back 1 4.9244 -13.3858
# 2: 1 1 E AA ONE 1_1 Back 2 4.9242 -13.3866
# 3: 1 1 E AA ONE 1_1 Back 3 4.9240 -13.3873
# 4: 1 1 E AA ONE 1_1 Back 4 4.9239 -13.3880
# 5: 1 1 E AA ONE 1_1 Back 1 4.9244 -13.3858
# ---
# 1799996: 20 1 E BB ONE 1_1 Forward 4 -12.3220 -15.9146
# 1799997: 20 1 E BB ONE 1_1 Forward 1 -12.3725 -15.9311
# 1799998: 20 1 E BB ONE 1_1 Forward 2 -12.3566 -15.9260
# 1799999: 20 1 E BB ONE 1_1 Forward 3 -12.3398 -15.9205
# 1800000: 20 1 E BB ONE 1_1 Forward 4 -12.3220 -15.9146