加速或替换 for 循环:分组重塑(reshape)和行绑定
Speed up or replace for loop:Group reshape and row bind
大家晚上好,我想将相同id的行合并为一行,增加一列,这是我的部分数据。
# Example data in long format: one row per (crsp_fundno, seq) pair.
# Values are laid out one fund per line to make the grouping visible.
sample <- structure(
  list(
    crsp_fundno = c(
      18021, 18021, 18021, 18021,
      22436, 22436, 22436, 22436, 22436, 22436,
      49805, 49805, 49805,
      55603, 55603,
      93362
    ),
    seq = c(
      1L, 2L, 3L, 4L,
      1L, 2L, 3L, 4L, 5L, 6L,
      1L, 2L, 3L,
      1L, 2L,
      1L
    ),
    # Dates are stored as day counts carrying the "Date" class.
    begdt = structure(
      c(
        13513, 14298, 15027, 16149,
        12417, 13969, 14910, 14918, 15042, 15644,
        14782, 14910, 15544,
        15505, 15531,
        17571
      ),
      class = "Date"
    ),
    enddt = structure(
      c(
        14297, 15026, 16148, 17621,
        13968, 14909, 14917, 15041, 15643, 17621,
        14909, 15543, 17621,
        15530, 17621,
        17621
      ),
      class = "Date"
    ),
    crsp_obj_cd = c(
      "EDYG", "EDYG", "EDYG", "EDYG",
      "EDYG", "EDYG", "EDYG", "EDYG", "EDYG", "EDYG",
      "EF", "EF", "EF",
      "EDYB", "EDYB",
      "M"
    ),
    lipper_class = c(
      "MLGE", "MCCE", "MCVE", "MLCE",
      "MLVE", "MLVE", "MLCE", "MLVE", "MLCE", "MLVE",
      "IMLC", "IMLG", "IMLC",
      "MTAM", "MTAC",
      "MATJ"
    ),
    lipper_obj_cd = c(
      "G", "G", "G", "G",
      "G", "G", "G", "G", "G", "G",
      "IF", "IF", "IF",
      "GI", "GI",
      "I"
    ),
    lipper_asset_cd = c(
      "EQ", "EQ", "EQ", "EQ",
      "EQ", "EQ", "EQ", "EQ", "EQ", "EQ",
      "EQ", "EQ", "EQ",
      "EQ", "EQ",
      "EQ"
    )
  ),
  class = "data.frame",
  row.names = c(NA, -16L)
)
我试图将具有相同 ID 的行合并在一行中,这是我的代码。
library(plyr)

# Widen each fund separately: every distinct crsp_fundno becomes a one-row
# data frame whose columns are suffixed with the seq value (e.g. begdt.1).
dn <- unique(sample$crsp_fundno)
temp <- vector("list", length(dn)) # preallocate rather than grow the list
for (i in seq_along(dn)) { # seq_along() does nothing when dn is empty
  part <- sample[sample$crsp_fundno %in% dn[i], ]
  temp[[i]] <- reshape(
    part,
    idvar = "crsp_fundno",
    timevar = "seq",
    direction = "wide"
  )
}

# rbind.fill() accepts a list of data frames, so bind them all in one call.
# Binding pairwise in a loop re-copies the accumulated result every
# iteration, and starting that loop at 3 fails when there are fewer than
# three funds.
# NOTE: `sum` masks base::sum(); the name is kept because later snippets
# refer to it.
sum <- rbind.fill(temp)
代码有效,但在我的整个数据集上太慢了(94000 条观测几乎需要 2 小时)。
我认为我不应该在大数据集中过度依赖for循环。
有人知道我该如何改进代码或我的逻辑吗?
感谢您的帮助。
因此,使用 `reshape` 的思路是正确的,但实现方式并不理想。该函数本身已经过优化,可以直接在 `long` 和 `wide` 格式之间进行转换,无需任何 `for` 循环。
只需调用一次,节省时间:
library(reshape2)
# reshape() is stats::reshape from base R (reshape2 is not required for this
# call). One call widens every fund at once, so no per-fund loop is needed.
sum <- stats::reshape(
  sample,
  idvar = "crsp_fundno",
  timevar = "seq",
  direction = "wide"
)
正如您所预感的,`reshape` 能够在两种格式之间顺利切换。
在您的例子中:
- `crsp_fundno` 是 long 格式中用来标识同一组多条记录的变量(对应 `idvar`);
- `seq` 是 long 格式中用来区分同一组内各条记录的变量(对应 `timevar`)。
就速度和内存效率而言,我强烈推荐 `data.table`。
# setDT() and the multi-column dcast() method both come from data.table,
# so attach it explicitly.
library(data.table)

setDT(sample) # in place, no assignment needed

# Spread every column except the id and the sequence number. Selecting by
# name keeps this correct if columns are added or reordered; for the example
# data it yields the same six columns as names(sample)[3:8].
value_cols <- setdiff(names(sample), c("crsp_fundno", "seq"))
sum3 <- dcast(
  sample,
  crsp_fundno ~ seq,
  value.var = value_cols
)
下面是 OP 的 `for` 循环、另一个答案中建议的 `reshape` 方式,以及本答案中建议的 `data.table` 方式的基准测试比较:
Unit: milliseconds
expr min lq mean median uq max neval cld
for loop 23.735154 24.190626 25.948536 24.722330 26.176343 42.764253 100 c
reshape 6.448800 6.742147 7.196820 6.850390 7.379401 9.932432 100 b
data.table 1.928812 2.143367 2.362979 2.255964 2.447935 5.847116 100 a
大家晚上好,我想将相同id的行合并为一行,增加一列,这是我的部分数据。
# Example data in long format: one row per (crsp_fundno, seq) pair.
# Values are laid out one fund per line to make the grouping visible.
sample <- structure(
  list(
    crsp_fundno = c(
      18021, 18021, 18021, 18021,
      22436, 22436, 22436, 22436, 22436, 22436,
      49805, 49805, 49805,
      55603, 55603,
      93362
    ),
    seq = c(
      1L, 2L, 3L, 4L,
      1L, 2L, 3L, 4L, 5L, 6L,
      1L, 2L, 3L,
      1L, 2L,
      1L
    ),
    # Dates are stored as day counts carrying the "Date" class.
    begdt = structure(
      c(
        13513, 14298, 15027, 16149,
        12417, 13969, 14910, 14918, 15042, 15644,
        14782, 14910, 15544,
        15505, 15531,
        17571
      ),
      class = "Date"
    ),
    enddt = structure(
      c(
        14297, 15026, 16148, 17621,
        13968, 14909, 14917, 15041, 15643, 17621,
        14909, 15543, 17621,
        15530, 17621,
        17621
      ),
      class = "Date"
    ),
    crsp_obj_cd = c(
      "EDYG", "EDYG", "EDYG", "EDYG",
      "EDYG", "EDYG", "EDYG", "EDYG", "EDYG", "EDYG",
      "EF", "EF", "EF",
      "EDYB", "EDYB",
      "M"
    ),
    lipper_class = c(
      "MLGE", "MCCE", "MCVE", "MLCE",
      "MLVE", "MLVE", "MLCE", "MLVE", "MLCE", "MLVE",
      "IMLC", "IMLG", "IMLC",
      "MTAM", "MTAC",
      "MATJ"
    ),
    lipper_obj_cd = c(
      "G", "G", "G", "G",
      "G", "G", "G", "G", "G", "G",
      "IF", "IF", "IF",
      "GI", "GI",
      "I"
    ),
    lipper_asset_cd = c(
      "EQ", "EQ", "EQ", "EQ",
      "EQ", "EQ", "EQ", "EQ", "EQ", "EQ",
      "EQ", "EQ", "EQ",
      "EQ", "EQ",
      "EQ"
    )
  ),
  class = "data.frame",
  row.names = c(NA, -16L)
)
我试图将具有相同 ID 的行合并在一行中,这是我的代码。
library(plyr)

# Widen each fund separately: every distinct crsp_fundno becomes a one-row
# data frame whose columns are suffixed with the seq value (e.g. begdt.1).
dn <- unique(sample$crsp_fundno)
temp <- vector("list", length(dn)) # preallocate rather than grow the list
for (i in seq_along(dn)) { # seq_along() does nothing when dn is empty
  part <- sample[sample$crsp_fundno %in% dn[i], ]
  temp[[i]] <- reshape(
    part,
    idvar = "crsp_fundno",
    timevar = "seq",
    direction = "wide"
  )
}

# rbind.fill() accepts a list of data frames, so bind them all in one call.
# Binding pairwise in a loop re-copies the accumulated result every
# iteration, and starting that loop at 3 fails when there are fewer than
# three funds.
# NOTE: `sum` masks base::sum(); the name is kept because later snippets
# refer to it.
sum <- rbind.fill(temp)
代码有效,但在我的整个数据集上太慢了(94000 条观测几乎需要 2 小时)。
我认为我不应该在大数据集中过度依赖for循环。
有人知道我该如何改进代码或我的逻辑吗?
感谢您的帮助。
因此,使用 `reshape` 的思路是正确的,但实现方式并不理想。该函数本身已经过优化,可以直接在 `long` 和 `wide` 格式之间进行转换,无需任何 `for` 循环。
只需调用一次,节省时间:
library(reshape2)
# reshape() is stats::reshape from base R (reshape2 is not required for this
# call). One call widens every fund at once, so no per-fund loop is needed.
sum <- stats::reshape(
  sample,
  idvar = "crsp_fundno",
  timevar = "seq",
  direction = "wide"
)
正如您所预感的,`reshape` 能够在两种格式之间顺利切换。
在您的例子中:
- `crsp_fundno` 是 long 格式中用来标识同一组多条记录的变量(对应 `idvar`);
- `seq` 是 long 格式中用来区分同一组内各条记录的变量(对应 `timevar`)。
就速度和内存效率而言,我强烈推荐 `data.table`。
# setDT() and the multi-column dcast() method both come from data.table,
# so attach it explicitly.
library(data.table)

setDT(sample) # in place, no assignment needed

# Spread every column except the id and the sequence number. Selecting by
# name keeps this correct if columns are added or reordered; for the example
# data it yields the same six columns as names(sample)[3:8].
value_cols <- setdiff(names(sample), c("crsp_fundno", "seq"))
sum3 <- dcast(
  sample,
  crsp_fundno ~ seq,
  value.var = value_cols
)
下面是 OP 的 `for` 循环、另一个答案中建议的 `reshape` 方式,以及本答案中建议的 `data.table` 方式的基准测试比较:
Unit: milliseconds
expr min lq mean median uq max neval cld
for loop 23.735154 24.190626 25.948536 24.722330 26.176343 42.764253 100 c
reshape 6.448800 6.742147 7.196820 6.850390 7.379401 9.932432 100 b
data.table 1.928812 2.143367 2.362979 2.255964 2.447935 5.847116 100 a