按 ID 和时间在 R 中进行简单查找

simple lookup in R by ID and Time

我有一个如下所示的数据集:

# Example data: 4 ids observed at 5 half-year time points (20 rows in total).
# The seed is fixed so the random `value` column is reproducible.
set.seed(1234)
DT <- data.table(
  id    = rep(c("a", "b", "c", "d"), times = 5),
  year  = rep(seq(from = 2010.5, to = 2012.5, by = 0.5), each = 4),
  value = rnorm(n = 20, mean = 10, sd = 1)
)
DT
     id   year     value
 1:  a 2010.5  8.792934
 2:  b 2010.5 10.277429
 3:  c 2010.5 11.084441
 4:  d 2010.5  7.654302
 5:  a 2011.0 10.429125
 6:  b 2011.0 10.506056
 7:  c 2011.0  9.425260
 8:  d 2011.0  9.453368
 9:  a 2011.5  9.435548
10:  b 2011.5  9.109962
11:  c 2011.5  9.522807
12:  d 2011.5  9.001614
13:  a 2012.0  9.223746
14:  b 2012.0 10.064459
15:  c 2012.0 10.959494
16:  d 2012.0  9.889715
17:  a 2012.5  9.488990
18:  b 2012.5  9.088805
19:  c 2012.5  9.162828
20:  d 2012.5 12.415835

我想为每个 ID 添加 3 个非常相似的列:value_previous_6m、value_previous_y 和 value_next_y。例如,第 10 行应该如下所示:

id   year    value value_previous_6m value_previous_y value_next_y
b  2011.5  9.109962    10.50606         10.27743        9.088805

我想避免使用 plyr 函数,因为整个数据集非常大。

非常感谢, 蒂姆

编辑:我知道可以用合并函数来完成:

# Rebuild the example table so this snippet can be run on its own.
set.seed(1234)
DT <- data.table(id = c(rep(c("a", "b", "c", "d"), 5)),
                 year = rep(seq(from = 2010.5, to = 2012.5, by = .5), each = 4),
                 value = rnorm(20, 10, 1))

# Lookup by self-merge: copy the table, shift the copy's `year`, rename its
# `value` column, then left-join it back onto DT by (id, year).
DT6mp <- copy(DT)
DT12mp <- copy(DT)

# NOTE(review): subtracting the offset pairs each DT row with the observation
# taken 0.5 year LATER (a row at 2011.0 is relabelled 2010.5). If the value
# from 6 months EARLIER is intended, the offset should be added instead.
# Left unchanged here -- confirm the intent.
DT6mp[, year := year - .5]
setnames(DT6mp, "value", "value6mp")
setkey(DT6mp, id, year)
setkey(DT, id, year)
# TRUE/FALSE instead of T/F: T and F are ordinary variables and can be
# reassigned. The join columns are spelled out rather than taken from the keys.
DT <- merge(DT, DT6mp, by = c("id", "year"),
            all.x = TRUE, all.y = FALSE, allow.cartesian = TRUE)

# Same procedure with a one-year offset (same review note applies).
DT12mp[, year := year - 1]
setnames(DT12mp, "value", "value12mp")
setkey(DT12mp, id, year)
setkey(DT, id, year)
DT <- merge(DT, DT12mp, by = c("id", "year"),
            all.x = TRUE, all.y = FALSE, allow.cartesian = TRUE)
DT

不过我觉得应该有更好的方法。

你可以使用这种方法(为了更清楚,我特意没有把 3 列的添加合并成一步):

# Within each id, find the row whose `year` equals the current year plus or
# minus the offset and take its `value`; match() yields NA where no such row
# exists, so the new column is NA there.
DT[, value_previous_6m := value[match(year - 0.5, year)], by = id]
DT[, value_previous_y := value[match(year - 1, year)], by = id]
DT[, value_next_y := value[match(year + 1, year)], by = id]
# Print the updated table.
DT[]

#    id   year     value value_previous_6m value_previous_y value_next_y
# 1:  a 2010.5  8.792934                NA               NA     9.435548
# 2:  b 2010.5 10.277429                NA               NA     9.109962
# 3:  c 2010.5 11.084441                NA               NA     9.522807
# 4:  d 2010.5  7.654302                NA               NA     9.001614
# 5:  a 2011.0 10.429125          8.792934               NA     9.223746
# 6:  b 2011.0 10.506056         10.277429               NA    10.064459
# 7:  c 2011.0  9.425260         11.084441               NA    10.959494
# 8:  d 2011.0  9.453368          7.654302               NA     9.889715
# 9:  a 2011.5  9.435548         10.429125         8.792934     9.488990
#10:  b 2011.5  9.109962         10.506056        10.277429     9.088805
#11:  c 2011.5  9.522807          9.425260        11.084441     9.162828
#12:  d 2011.5  9.001614          9.453368         7.654302    12.415835
#13:  a 2012.0  9.223746          9.435548        10.429125           NA
#14:  b 2012.0 10.064459          9.109962        10.506056           NA
#15:  c 2012.0 10.959494          9.522807         9.425260           NA
#16:  d 2012.0  9.889715          9.001614         9.453368           NA
#17:  a 2012.5  9.488990          9.223746         9.435548           NA
#18:  b 2012.5  9.088805         10.064459         9.109962           NA
#19:  c 2012.5  9.162828         10.959494         9.522807           NA
#20:  d 2012.5 12.415835          9.889715         9.001614           NA

您可以使用 data.table 开发版本中的 shift 函数,它的 type 参数有 "lag" 和 "lead" 两个选项。

library(data.table) # needs a version that provides shift() (v >= 1.9.5)
#library(devtools)
#install_github("Rdatatable/data.table", build_vignettes = FALSE)

# Add the three lookup columns in one grouped assignment. Rows are shifted
# within each id, so this relies on every id having its rows in ascending
# `year` order with no missing half-year steps.
#
# The right-hand side is built as an explicit list with one element per new
# column. shift() returns a plain vector when given a vector and a single `n`,
# so concatenating it with c() onto the list from shift(value, 1:2) would split
# it into one list element per row and break the three-column assignment.
DT[, c(paste0("val_previous_", c("6m", "y")), "val_next_y") := list(
  shift(value, 1L, type = "lag"),
  shift(value, 2L, type = "lag"),
  shift(value, 2L, type = "lead")
), by = id]
 #   id   year     value val_previous_6m val_previous_y val_next_y
 #1:  a 2010.5  8.792934              NA             NA   9.435548
 #2:  b 2010.5 10.277429              NA             NA   9.109962
 #3:  c 2010.5 11.084441              NA             NA   9.522807
 #4:  d 2010.5  7.654302              NA             NA   9.001614
 #5:  a 2011.0 10.429125        8.792934             NA   9.223746
 #6:  b 2011.0 10.506056       10.277429             NA  10.064459
 #7:  c 2011.0  9.425260       11.084441             NA  10.959494
 #8:  d 2011.0  9.453368        7.654302             NA   9.889715
 #9:  a 2011.5  9.435548       10.429125       8.792934   9.488990
 #10:  b 2011.5  9.109962       10.506056      10.277429   9.088805
 #11:  c 2011.5  9.522807        9.425260      11.084441   9.162828
 #12:  d 2011.5  9.001614        9.453368       7.654302  12.415835
 #13:  a 2012.0  9.223746        9.435548      10.429125         NA
 #14:  b 2012.0 10.064459        9.109962      10.506056         NA
 #15:  c 2012.0 10.959494        9.522807       9.425260         NA
 #16:  d 2012.0  9.889715        9.001614       9.453368         NA
 #17:  a 2012.5  9.488990        9.223746       9.435548         NA
 #18:  b 2012.5  9.088805       10.064459       9.109962         NA
 #19:  c 2012.5  9.162828       10.959494       9.522807         NA
 #20:  d 2012.5 12.415835        9.889715       9.001614         NA

下面特意写成更长的版本,以免出错。

# One grouped assignment per new column. Rows are shifted within each id:
# one row back, two rows back, and two rows forward respectively.
DT[, value_previous_6m := shift(value, n = 1, type = "lag"), by = id]
DT[, value_previous_y := shift(value, n = 2, type = "lag"), by = id]
DT[, value_next_y := shift(value, n = 2, type = "lead"), by = id]

这里有一个dplyr+tidyr的解决方案,速度更快。

library(dplyr)
library(tidyr)

# Same layout as the data.table example: 4 ids x 5 half-year time points.
# tibble() replaces the deprecated data_frame() constructor.
df <- tibble(
  id = rep(c("a", "b", "c", "d"), times = 5),
  year = rep(seq(from = 2010.5, to = 2012.5, by = 0.5), each = 4),
  value = rnorm(20, 10, 1)
)

df %>%
  # lag()/lead() work on row position, so sort by year within id explicitly
  # instead of relying on the order in which the rows were created.
  arrange(id, year) %>%
  group_by(id) %>%
  mutate(
    value_previous_6m = lag(value, n = 1),
    value_previous_y = lag(value, n = 2),
    value_next_y = lead(value, n = 2)
  ) %>%
  ungroup() %>%
  # Restore the original presentation order (by time point, then id).
  arrange(year)

运行时间:0.016 秒

Colonel Beauvel 的 data.table 解决方案在同一台机器上花费了 0.023 秒。

编辑:Khashaa 的 data.table 解决方案比我提供的 dplyr 解决方案更快,只用了 0.007 秒。