R data.table 按条件列表或行索引
R data.table by list of conditionals or row indices
我有一个包含距离的 data.table。我想按 "id" 变量以及累积的距离阈值(例如 Dist<1、Dist<2 等)在 data.table 中运行各种操作。
我知道如何按 id 和距离分组运行操作(by=list(id,Dist)),但我真正想要的是类似 by=list(id,c(Dist<=1,Dist<=2,Dist<=3,Dist<=4,Dist<=5)) 的分组变量。下面是我的数据结构和目标的示例。
# Load library
library(data.table)

# Create reproducible example data: ids 1..10 repeated 5 times, a normally
# distributed value V1, and an integer distance Dist sampled from 1..5.
set.seed(123L)
dt <- data.table(
  id = factor(rep(1:10, 5)),
  V1 = rnorm(50, 5, 5),
  Dist = sample(1:5, 50, replace = TRUE) # TRUE, not T: T can be reassigned
)

# Calculate mean of V1 by id and distance (wrong results: groups on the exact
# Dist value instead of on the cumulative thresholds Dist <= k)
dt2 <- dt[, .(MeanV1 = mean(V1)), by = list(id, Dist)]

# Calculate mean of V1 by id and conditional distance
# (right results, wrong method: one filtered aggregation per threshold)
dt2.1 <- dt[Dist <= 1, .(MeanV1 = mean(V1)), by = id]
dt2.2 <- dt[Dist <= 2, .(MeanV1 = mean(V1)), by = id]
dt2.3 <- dt[Dist <= 3, .(MeanV1 = mean(V1)), by = id]
dt2.4 <- dt[Dist <= 4, .(MeanV1 = mean(V1)), by = id]
dt2.5 <- dt[Dist <= 5, .(MeanV1 = mean(V1)), by = id]
dt2 <- rbind(dt2.1, dt2.2, dt2.3, dt2.4, dt2.5)

# Ideal methods if either were valid.
# NOTE: neither statement below is valid data.table syntax; they only
# illustrate the kind of grouping that is wanted.
# Syntax 1
dt2 <- dt[
  , .(MeanV1 = mean(V1)),
  by = list(id, c(Dist <= 1, Dist <= 2, Dist <= 3, Dist <= 4, Dist <= 5))
]
# Syntax 2
rowindices <- list(
  dt$Dist <= 1, dt$Dist <= 2, dt$Dist <= 3, dt$Dist <= 4, dt$Dist <= 5
)
dt2 <- dt[, .(MeanV1 = mean(V1)), by = list(id, rowindices)]
提前致谢。
Frank 在评论中给出的答案可以实现您想要的效果。下面是解释:
data.table 支持“非等值连接”(non-equi join),这正是第一个 data.table 调用所做的事情。
首先,我们创建一个包含待处理阈值的表:
> thresholds <- data.table(dist_threshold=1:5)
> thresholds
dist_threshold
1: 1
2: 2
3: 3
4: 4
5: 5
接下来,我们用阈值表对原始表执行非等值连接:这会生成一个新表,其中对每个阈值都包含所有 Dist 低于该阈值的行,并且 Dist 列的值被替换为对应的阈值:
> passes_threshold <- dt[thresholds, on=.(Dist < dist_threshold), # non-equi join
+ allow.cartesian=TRUE, # Fixes error, see details in ?data.table
+ nomatch=0 # Do not include thresholds which no row satisfies (i.e. Dist < 1)
+ ]
> passes_threshold
# Here the Dist column now means "Dist < dist_threshold".
# There will be 5 rows where Dist < 2, 19 where Dist < 3,
# 30 where Dist < 4, and 40 Where Dist < 5
id V1 Dist
1: 8 8.521825 2
2: 5 2.002523 2
3: 6 8.698732 2
4: 9 -1.701028 2
5: 2 6.114119 2
---
90: 6 -1.392776 5
91: 10 9.033493 5
92: 1 9.565713 5
93: 5 4.579124 5
94: 7 1.498690 5
我们现在可以把连接结果与 j 和 by 参数中的汇总操作结合起来,计算每个阈值下 V1 的平均值:
> passes_threshold[,.(mean_Dist_by_threshold=mean(V1)), by=.(threshold=Dist)]
threshold mean_Dist_by_threshold
1: 2 4.727234
2: 3 4.615258
3: 4 4.202856
4: 5 4.559240
作为对 Scott 回答的补充,他的解法可以更简洁地写成:
# .(1:5) builds the threshold table on the fly; its unnamed column is
# auto-named V1, so `Dist < V1` in `on` compares dt's Dist with the threshold
# (not with dt's own V1 column). The argument is spelled out as
# allow.cartesian instead of relying on partial matching of `allow`.
dt[.(1:5), on = .(Dist < V1), allow.cartesian = TRUE, nomatch = 0][
  , .(mean_Dist_by_threshold = mean(V1)), by = .(threshold = Dist)]
在这里,.(1:5) 即时创建了 thresholds,并且两个 data.table 表达式被链接在一起。
或者,可以在连接期间使用 by = .EACHI 完成聚合:
# Aggregate during the join itself: with by = .EACHI, j is evaluated once for
# each row of i, i.e. once per threshold.
dt[
  .(1:5),
  .(mean_Dist_by_threshold = mean(V1)),
  by = .EACHI,
  on = .(Dist < V1),
  nomatch = 0
][
  ,
  # Rename the join column so the result has a `threshold` column.
  setnames(.SD, "Dist", "threshold")
]
调用 setnames() 只是为了方便返回与 Scott 的回答相同的结果。
基准代码
library(data.table)

# Create data
nr <- 5e2L # number of rows in the benchmark table
set.seed(123L) # to make the data reproducible
dt <- data.table(
  id = factor(rep(1:10, nr / 10)),
  V1 = rnorm(nr, 5, 5),
  Dist = sample(1:5, nr, replace = TRUE) # TRUE, not T: T can be reassigned
)
str(dt)

# Compare three equivalent ways of computing mean(V1) per distance threshold.
microbenchmark::microbenchmark(
  # Explicit threshold table, join first, aggregate afterwards.
  scott = {
    thresholds <- data.table(dist_threshold = 1:5)
    passes_threshold <-
      dt[thresholds, on = .(Dist < dist_threshold), # non-equi join
        allow.cartesian = TRUE, # Fixes error, see details in ?data.table
        nomatch = 0 # Do not include thresholds which no row satisfies (i.e. Dist < 1)
      ]
    passes_threshold[, .(mean_Dist_by_threshold = mean(V1)), by = .(threshold = Dist)]
  },
  # Same as above, but with the threshold table created inline and the two
  # calls chained. allow.cartesian is spelled out rather than partially
  # matched via `allow`.
  uwe1 = {
    dt[.(1:5), on = .(Dist < V1), allow.cartesian = TRUE, nomatch = 0][
      , .(mean_Dist_by_threshold = mean(V1)), by = .(threshold = Dist)]
  },
  # Aggregate during the join with by = .EACHI, then rename the join column.
  uwe2 = {
    dt[.(1:5), on = .(Dist < V1), nomatch = 0,
      .(mean_Dist_by_threshold = mean(V1)), by = .EACHI][
      , setnames(.SD, "Dist", "threshold")]
  },
  times = 100L
)
基准测试结果
在 500 行的情况下,3 个变体之间只有细微差别:链式写法略微领先于 Scott 的写法,而 by = .EACHI 的写法落后。
Unit: milliseconds
expr min lq mean median uq max neval cld
scott 1.460058 1.506854 1.618048 1.526019 1.726257 4.768493 100 a
uwe1 1.302760 1.327686 1.487237 1.338926 1.372498 12.733933 100 a
uwe2 1.827756 1.864777 1.944920 1.888349 2.020097 2.233269 100 b
在 50000 行的情况下,链式写法仍然略微领先于 Scott 的写法,但 by = .EACHI 的表现优于其他两种。
Unit: milliseconds
expr min lq mean median uq max neval cld
scott 3.692545 3.811466 4.016152 3.826423 3.853489 10.336598 100 b
uwe1 3.560786 3.632999 3.936583 3.642526 3.657992 13.579008 100 b
uwe2 2.503508 2.545722 2.577735 2.566869 2.602586 2.798692 100 a
对于 5 M 行,这变得更加明显:
Unit: milliseconds
expr min lq mean median uq max neval cld
scott 641.9945 675.3749 743.0761 708.7552 793.6170 878.4787 3 b
uwe1 587.1724 587.5557 589.1360 587.9391 590.1178 592.2965 3 b
uwe2 130.9358 134.6688 157.1860 138.4019 170.3110 202.2202 3 a
速度差异的一个可能解释是中间结果 passes_threshold 的规模非常庞大(超过 10 M 行),这也是需要 allow.cartesian = TRUE 的原因。
我有一个包含距离的 data.table。我想按 "id" 变量以及累积的距离阈值(例如 Dist<1、Dist<2 等)在 data.table 中运行各种操作。
我知道如何按 id 和距离分组运行操作(by=list(id,Dist)),但我真正想要的是类似 by=list(id,c(Dist<=1,Dist<=2,Dist<=3,Dist<=4,Dist<=5)) 的分组变量。下面是我的数据结构和目标的示例。
# Load library
library(data.table)

# Create reproducible example data: ids 1..10 repeated 5 times, a normally
# distributed value V1, and an integer distance Dist sampled from 1..5.
set.seed(123L)
dt <- data.table(
  id = factor(rep(1:10, 5)),
  V1 = rnorm(50, 5, 5),
  Dist = sample(1:5, 50, replace = TRUE) # TRUE, not T: T can be reassigned
)

# Calculate mean of V1 by id and distance (wrong results: groups on the exact
# Dist value instead of on the cumulative thresholds Dist <= k)
dt2 <- dt[, .(MeanV1 = mean(V1)), by = list(id, Dist)]

# Calculate mean of V1 by id and conditional distance
# (right results, wrong method: one filtered aggregation per threshold)
dt2.1 <- dt[Dist <= 1, .(MeanV1 = mean(V1)), by = id]
dt2.2 <- dt[Dist <= 2, .(MeanV1 = mean(V1)), by = id]
dt2.3 <- dt[Dist <= 3, .(MeanV1 = mean(V1)), by = id]
dt2.4 <- dt[Dist <= 4, .(MeanV1 = mean(V1)), by = id]
dt2.5 <- dt[Dist <= 5, .(MeanV1 = mean(V1)), by = id]
dt2 <- rbind(dt2.1, dt2.2, dt2.3, dt2.4, dt2.5)

# Ideal methods if either were valid.
# NOTE: neither statement below is valid data.table syntax; they only
# illustrate the kind of grouping that is wanted.
# Syntax 1
dt2 <- dt[
  , .(MeanV1 = mean(V1)),
  by = list(id, c(Dist <= 1, Dist <= 2, Dist <= 3, Dist <= 4, Dist <= 5))
]
# Syntax 2
rowindices <- list(
  dt$Dist <= 1, dt$Dist <= 2, dt$Dist <= 3, dt$Dist <= 4, dt$Dist <= 5
)
dt2 <- dt[, .(MeanV1 = mean(V1)), by = list(id, rowindices)]
提前致谢。
Frank 在评论中给出的答案可以实现您想要的效果。下面是解释:
data.table 支持“非等值连接”(non-equi join),这正是第一个 data.table 调用所做的事情。
首先,我们创建一个包含待处理阈值的表:
> thresholds <- data.table(dist_threshold=1:5)
> thresholds
dist_threshold
1: 1
2: 2
3: 3
4: 4
5: 5
接下来,我们用阈值表对原始表执行非等值连接:这会生成一个新表,其中对每个阈值都包含所有 Dist 低于该阈值的行,并且 Dist 列的值被替换为对应的阈值:
> passes_threshold <- dt[thresholds, on=.(Dist < dist_threshold), # non-equi join
+ allow.cartesian=TRUE, # Fixes error, see details in ?data.table
+ nomatch=0 # Do not include thresholds which no row satisfies (i.e. Dist < 1)
+ ]
> passes_threshold
# Here the Dist column now means "Dist < dist_threshold".
# There will be 5 rows where Dist < 2, 19 where Dist < 3,
# 30 where Dist < 4, and 40 Where Dist < 5
id V1 Dist
1: 8 8.521825 2
2: 5 2.002523 2
3: 6 8.698732 2
4: 9 -1.701028 2
5: 2 6.114119 2
---
90: 6 -1.392776 5
91: 10 9.033493 5
92: 1 9.565713 5
93: 5 4.579124 5
94: 7 1.498690 5
我们现在可以把连接结果与 j 和 by 参数中的汇总操作结合起来,计算每个阈值下 V1 的平均值:
> passes_threshold[,.(mean_Dist_by_threshold=mean(V1)), by=.(threshold=Dist)]
threshold mean_Dist_by_threshold
1: 2 4.727234
2: 3 4.615258
3: 4 4.202856
4: 5 4.559240
作为对 Scott 回答的补充,他的解法可以更简洁地写成:
# .(1:5) builds the threshold table on the fly; its unnamed column is
# auto-named V1, so `Dist < V1` in `on` compares dt's Dist with the threshold
# (not with dt's own V1 column). The argument is spelled out as
# allow.cartesian instead of relying on partial matching of `allow`.
dt[.(1:5), on = .(Dist < V1), allow.cartesian = TRUE, nomatch = 0][
  , .(mean_Dist_by_threshold = mean(V1)), by = .(threshold = Dist)]
在这里,.(1:5) 即时创建了 thresholds,并且两个 data.table 表达式被链接在一起。
或者,可以在连接期间使用 by = .EACHI 完成聚合:
# Aggregate during the join itself: with by = .EACHI, j is evaluated once for
# each row of i, i.e. once per threshold.
dt[
  .(1:5),
  .(mean_Dist_by_threshold = mean(V1)),
  by = .EACHI,
  on = .(Dist < V1),
  nomatch = 0
][
  ,
  # Rename the join column so the result has a `threshold` column.
  setnames(.SD, "Dist", "threshold")
]
调用 setnames() 只是为了方便返回与 Scott 的回答相同的结果。
基准代码
library(data.table)

# Create data
nr <- 5e2L # number of rows in the benchmark table
set.seed(123L) # to make the data reproducible
dt <- data.table(
  id = factor(rep(1:10, nr / 10)),
  V1 = rnorm(nr, 5, 5),
  Dist = sample(1:5, nr, replace = TRUE) # TRUE, not T: T can be reassigned
)
str(dt)

# Compare three equivalent ways of computing mean(V1) per distance threshold.
microbenchmark::microbenchmark(
  # Explicit threshold table, join first, aggregate afterwards.
  scott = {
    thresholds <- data.table(dist_threshold = 1:5)
    passes_threshold <-
      dt[thresholds, on = .(Dist < dist_threshold), # non-equi join
        allow.cartesian = TRUE, # Fixes error, see details in ?data.table
        nomatch = 0 # Do not include thresholds which no row satisfies (i.e. Dist < 1)
      ]
    passes_threshold[, .(mean_Dist_by_threshold = mean(V1)), by = .(threshold = Dist)]
  },
  # Same as above, but with the threshold table created inline and the two
  # calls chained. allow.cartesian is spelled out rather than partially
  # matched via `allow`.
  uwe1 = {
    dt[.(1:5), on = .(Dist < V1), allow.cartesian = TRUE, nomatch = 0][
      , .(mean_Dist_by_threshold = mean(V1)), by = .(threshold = Dist)]
  },
  # Aggregate during the join with by = .EACHI, then rename the join column.
  uwe2 = {
    dt[.(1:5), on = .(Dist < V1), nomatch = 0,
      .(mean_Dist_by_threshold = mean(V1)), by = .EACHI][
      , setnames(.SD, "Dist", "threshold")]
  },
  times = 100L
)
基准测试结果
在 500 行的情况下,3 个变体之间只有细微差别:链式写法略微领先于 Scott 的写法,而 by = .EACHI 的写法落后。
Unit: milliseconds
expr min lq mean median uq max neval cld
scott 1.460058 1.506854 1.618048 1.526019 1.726257 4.768493 100 a
uwe1 1.302760 1.327686 1.487237 1.338926 1.372498 12.733933 100 a
uwe2 1.827756 1.864777 1.944920 1.888349 2.020097 2.233269 100 b
在 50000 行的情况下,链式写法仍然略微领先于 Scott 的写法,但 by = .EACHI 的表现优于其他两种。
Unit: milliseconds
expr min lq mean median uq max neval cld
scott 3.692545 3.811466 4.016152 3.826423 3.853489 10.336598 100 b
uwe1 3.560786 3.632999 3.936583 3.642526 3.657992 13.579008 100 b
uwe2 2.503508 2.545722 2.577735 2.566869 2.602586 2.798692 100 a
对于 5 M 行,这变得更加明显:
Unit: milliseconds
expr min lq mean median uq max neval cld
scott 641.9945 675.3749 743.0761 708.7552 793.6170 878.4787 3 b
uwe1 587.1724 587.5557 589.1360 587.9391 590.1178 592.2965 3 b
uwe2 130.9358 134.6688 157.1860 138.4019 170.3110 202.2202 3 a
速度差异的一个可能解释是中间结果 passes_threshold 的规模非常庞大(超过 10 M 行),这也是需要 allow.cartesian = TRUE 的原因。