通过更好的矢量化减少迭代时间
Decreasing iteration time through better vectorization
我正在尝试大幅减少代码的计算时间。
我有2个data.frames:
1.NewCus_withGeo
head(newCus)
U_ID U_LAT U_LONG
31902101 47.57080 -52.69588
26428781 45.50141 -73.55854
26428801 45.40768 -73.95121
28130841 45.44952 -73.82955
26428961 45.54130 -73.47751
26428981 45.35496 -75.67068
2.localD
head(localD)
ID D_ID D_LAT D_LONG
1533582_23821 1533582 45.40462 -75.62618
1533582_97592 1533582 45.21759 -75.71293
1533582_23811 1533582 46.83400 -71.28574
4126692_23811 4126692 46.79560 -71.27572
4134402_23811 4134402 46.74568 -71.44606
4135162_23811 4135162 46.82987 -71.23739
4138752_23841 4138752 46.34229 -72.54294
我想为每个 U_ID 找出欧氏距离最小的 5 个 D_ID，但前提是该距离小于 0.3。
结果应为另一个 data.frame：对每个 U_ID，列出距离足够近的（最多）5 个最近的 D_ID，以及该 U_ID 与各 D_ID 之间的距离。
看起来像这样:
head(result_df)
U_ID D_ID dist
26428781 4244252 0.0008656102
26428781 4088692 0.0055139426
26428781 4177752 0.0060150366
26428781 4182572 0.0067220964
26428781 4191862 0.0076759495
26428801 4135212 0.0455715423
26428801 4216202 0.0726350403
我想出了这个代码来做到这一点:
编辑感谢 Rui Barradas 改进了函数的性能
EDIT 2 为了进一步提高函数的性能,我们可以使用 data.table 来缩短子集化时间
library(data.table)
newCus = data.table(newCus)
distance <- function(x, y) {
  # Find, for one customer, the (at most) five nearest rows of `y` in range.
  #
  # Args:
  #   x: a single U_ID value; it is looked up in the global data.table
  #      `newCus` (columns U_ID, U_LAT, U_LONG).
  #   y: table with columns D_ID, D_LAT and D_LONG.
  #
  # Returns:
  #   A data.frame with columns U_ID, D_ID and dist (Euclidean distance),
  #   ordered by increasing dist, containing only rows with dist <= 0.3 and
  #   at most five of them; NULL when no row of `y` is that close.

  # Subsetting newCus to get only one U_ID (data.table syntax)
  tb <- newCus[U_ID == x]
  # Squared distances; the square root is taken only for the rows we keep
  sq_dist <- (tb$U_LAT - y$D_LAT)^2 + (tb$U_LONG - y$D_LONG)^2
  # Every returned row must be inside the radius, not just the closest one.
  # which() also discards NA distances instead of failing inside if().
  in_range <- which(sq_dist <= 0.3^2)
  if (length(in_range) == 0L) {
    return(NULL)
  }
  # Closest first, capped at five. head() never pads the result with NA rows,
  # unlike order(...)[1:5] when fewer than five candidates exist.
  nearest <- head(in_range[order(sq_dist[in_range])], 5L)
  # Build the full table first so the kept rows retain their original row labels
  NewCus_dist <- data.frame(U_ID = tb$U_ID, D_ID = y$D_ID, dist = sq_dist)
  NewCus_dist <- NewCus_dist[nearest, ]
  NewCus_dist$dist <- sqrt(NewCus_dist$dist)
  NewCus_dist
}
然后借助 do.call 和 lapply 对每个 U_ID 调用此函数，并把结果合并成一个 data.frame:
# Run distance() once per customer ID and stack the per-customer results
result_df <- do.call("rbind", lapply(newCus$U_ID, distance, y = localD))
是否有更好更快的编码方式?
下面的代码并没有向量化，但已经带来了显著的速度提升。
它基本上就是你的函数，只是做了一些改进，主要是去掉了不必要的操作。
distance2 <- function(x, y) {
  # Find, for one customer, the (at most) five nearest rows of `y` in range.
  #
  # Args:
  #   x: a single U_ID value; it is looked up in the global table `newCus`
  #      (columns U_ID, U_LAT, U_LONG).
  #   y: table with columns D_ID, D_LAT and D_LONG.
  #
  # Returns:
  #   A data.frame with columns U_ID, D_ID and dist (Euclidean distance),
  #   ordered by increasing dist, containing only rows with dist <= 0.3 and
  #   at most five of them; NULL when no row of `y` is that close.

  # Subsetting newCus to get only one U_ID
  tb <- newCus[newCus$U_ID == x, ]
  # Squared distances; the square root is taken only for the rows we keep
  sq_dist <- (tb$U_LAT - y$D_LAT)^2 + (tb$U_LONG - y$D_LONG)^2
  # Every returned row must be inside the radius, not just the closest one.
  # which() also discards NA distances instead of failing inside if().
  in_range <- which(sq_dist <= 0.3^2)
  if (length(in_range) == 0L) {
    return(NULL)
  }
  # Closest first, capped at five. head() never pads the result with NA rows,
  # unlike order(...)[1:5] when fewer than five candidates exist.
  nearest <- head(in_range[order(sq_dist[in_range])], 5L)
  # Build the full table first so the kept rows retain their original row labels
  NewCus_dist <- data.frame(U_ID = tb$U_ID, D_ID = y$D_ID, dist = sq_dist)
  NewCus_dist <- NewCus_dist[nearest, ]
  NewCus_dist$dist <- sqrt(NewCus_dist$dist)
  NewCus_dist
}
res1 <- do.call("rbind", lapply(newCus$U_ID, distance, localD))
res2 <- do.call("rbind", lapply(newCus$U_ID, distance2, localD))
all.equal(res1, res2)
[1] TRUE
现在比较 distance 和 distance2 的运行时间。为此，我构造了更大的输入数据框。
# Build larger benchmark inputs: each table followed by 100 more copies of itself
localD2 <- localD
newCus2 <- newCus
for (i in seq_len(100)) {
  localD2 <- rbind(localD2, localD)
  newCus2 <- rbind(newCus2, newCus)
}
system.time({
res1 <- do.call("rbind", lapply(newCus2$U_ID, distance, localD2))
})
user system elapsed
1.7 0.0 1.7
system.time({
res2 <- do.call("rbind", lapply(newCus2$U_ID, distance2, localD2))
})
user system elapsed
0.17 0.00 0.17
十倍的速度增益,一个数量级。到此为止。也许有人会找到更好的矢量化解决方案。
require(Rcpp)
require(data.table)
# Compile m(): for every row of x, collect the (at most) five rows of y that
# lie closer than 0.3 in Euclidean distance, nearest first.
#   names / names2: IDs belonging to the rows of x / y
#   x, y:           numeric matrices; columns 1 and 2 hold the coordinates
# Returns a list of three parallel vectors: U_ID, D_ID and dist.
cppFunction(
"List m(std::vector<std::string> & names, NumericMatrix & x,
        std::vector<std::string> & names2, NumericMatrix & y){
  const unsigned int nx = x.nrow();
  const unsigned int ny = y.nrow();
  const std::size_t max_hits = 5;
  std::vector<std::string> c1;
  std::vector<std::string> c2;
  // std::vector grows in amortised constant time, whereas push_back on an
  // Rcpp vector copies the whole vector on every call
  std::vector<double> c3;
  // (distance, row of y) candidates for the current row of x
  std::vector< std::pair<double, unsigned int> > hits;
  for (unsigned int i = 0; i < nx; ++i) {
    hits.clear();
    for (unsigned int j = 0; j < ny; ++j) {
      const double d1 = x(i, 0) - y(j, 0);
      const double d2 = x(i, 1) - y(j, 1);
      const double d = sqrt(d1*d1 + d2*d2);
      if (d < 0.3) {
        hits.push_back(std::make_pair(d, j));
      }
    }
    // keep the nearest candidates, not merely the first ones encountered
    const std::size_t keep = std::min(max_hits, hits.size());
    std::partial_sort(hits.begin(), hits.begin() + keep, hits.end());
    for (std::size_t k = 0; k < keep; ++k) {
      c1.push_back(names[i]);
      c2.push_back(names2[hits[k].second]);
      c3.push_back(hits[k].first);
    }
  }
  return List::create(
    _[\"U_ID\"] = c1,
    _[\"D_ID\"] = c2,
    _[\"dist\"] = c3);
}",
includes = c("#include <algorithm>", "#include <utility>"))
cpp <- function(x, y) {
  # Prepare the inputs for the compiled m() routine and run it.
  #   x: table whose first column is the ID; the remaining columns are passed
  #      on as the coordinate matrix
  #   y: table whose second column is the ID; columns 1 and 2 are dropped and
  #      the rest is passed on as the coordinate matrix
  # Returns the list produced by m(), converted to a data.table.
  x_ids <- as.character(x[[1]])
  y_ids <- as.character(y[[2]])
  x_coords <- as.matrix(x[, -1])
  y_coords <- as.matrix(y[, -(1:2)])
  as.data.table(m(x_ids, x_coords, y_ids, y_coords))
}
cpp(newCus, localD)
# U_ID D_ID dist
# 1: 26428981 1533582 0.06668107
# 2: 26428981 1533582 0.14372049
# Previous function:
r <- function(x, y) {
  # Apply distance2() to every U_ID of x and row-bind the per-ID results
  per_id <- lapply(x$U_ID, function(id) distance2(id, y))
  do.call("rbind", per_id)
}
r(newCus, localD)
# U_ID D_ID dist
# 1 26428981 1533582 0.06668107
# 2 26428981 1533582 0.14372049
# 7 26428981 4138752 3.27987470
# 5 26428981 4134402 4.44764165
# 4 26428981 4126692 4.62505319
我认为 distance2 函数中的距离计算不正确，应该还有 sqrt。还是我错了？
基准测试
require(rbenchmark)
benchmark(r(newCus, localD),
cpp(newCus, localD),
replications = 1000,
columns = c("test", "replications", "elapsed", "relative"))
# test replications elapsed relative
# 2 cpp(newCus, localD) 1000 0.91 1.000
# 1 r(newCus, localD) 1000 2.35 2.582
# with bigger data:
benchmark(r(newCus2, localD2),
cpp(newCus2, localD2),
replications = 3,
columns = c("test", "replications", "elapsed", "relative"))
# test replications elapsed relative
# 2 cpp(newCus2, localD2) 3 0.86 1.000
# 1 r(newCus2, localD2) 3 9.11 10.593
比以前的解决方案快大约 10 倍。
更新
更正函数:
# Compile m(): return every (row of x, row of y) pair whose Euclidean distance
# is below 0.3. Picking the five nearest per ID is left to the R caller.
#   names / names2: IDs belonging to the rows of x / y
#   x, y:           numeric matrices; columns 1 and 2 hold the coordinates
# Returns a list of three parallel vectors: U_ID, D_ID and dist.
cppFunction(
"List m(std::vector<std::string> & names, NumericMatrix & x,
        std::vector<std::string> & names2, NumericMatrix & y){
  const unsigned int nx = x.nrow();
  const unsigned int ny = y.nrow();
  std::vector<std::string> c1;
  std::vector<std::string> c2;
  // std::vector grows in amortised constant time, whereas push_back on an
  // Rcpp vector copies the whole vector on every call
  std::vector<double> c3;
  for (unsigned int i = 0; i < nx; ++i) {
    for (unsigned int j = 0; j < ny; ++j) {
      const double d1 = x(i, 0) - y(j, 0);
      const double d2 = x(i, 1) - y(j, 1);
      const double d = sqrt(d1*d1 + d2*d2);
      if (d < 0.3) {
        c1.push_back(names[i]);
        c2.push_back(names2[j]);
        c3.push_back(d);
      }
    }
  }
  return List::create(
    _[\"U_ID\"] = c1,
    _[\"D_ID\"] = c2,
    _[\"dist\"] = c3);
}")
cpp <- function(x, y) {
  # Prepare the inputs for the compiled m() routine, run it, and keep the
  # five smallest distances per U_ID.
  #   x: table whose first column is the ID; the remaining columns are passed
  #      on as the coordinate matrix
  #   y: table whose second column is the ID; columns 1 and 2 are dropped and
  #      the rest is passed on as the coordinate matrix
  x_ids <- as.character(x[[1]])
  y_ids <- as.character(y[[2]])
  x_coords <- as.matrix(x[, -1])
  y_coords <- as.matrix(y[, -(1:2)])
  hits <- as.data.table(m(x_ids, x_coords, y_ids, y_coords))
  # Keying sorts by U_ID and then by dist, so each group starts with its
  # nearest matches
  setkey(hits, U_ID, dist)
  hits[, head(.SD, 5), by = U_ID][]
}
现在你应该得到你想要的结果。
我正在尝试大幅减少代码的计算时间。
我有2个data.frames:
1.NewCus_withGeo
head(newCus)
U_ID U_LAT U_LONG
31902101 47.57080 -52.69588
26428781 45.50141 -73.55854
26428801 45.40768 -73.95121
28130841 45.44952 -73.82955
26428961 45.54130 -73.47751
26428981 45.35496 -75.67068
2.localD
head(localD)
ID D_ID D_LAT D_LONG
1533582_23821 1533582 45.40462 -75.62618
1533582_97592 1533582 45.21759 -75.71293
1533582_23811 1533582 46.83400 -71.28574
4126692_23811 4126692 46.79560 -71.27572
4134402_23811 4134402 46.74568 -71.44606
4135162_23811 4135162 46.82987 -71.23739
4138752_23841 4138752 46.34229 -72.54294
我想为每个 U_ID 找出欧氏距离最小的 5 个 D_ID，但前提是该距离小于 0.3。
结果应为另一个 data.frame：对每个 U_ID，列出距离足够近的（最多）5 个最近的 D_ID，以及该 U_ID 与各 D_ID 之间的距离。
看起来像这样:
head(result_df)
U_ID D_ID dist
26428781 4244252 0.0008656102
26428781 4088692 0.0055139426
26428781 4177752 0.0060150366
26428781 4182572 0.0067220964
26428781 4191862 0.0076759495
26428801 4135212 0.0455715423
26428801 4216202 0.0726350403
我想出了这个代码来做到这一点:
编辑感谢 Rui Barradas 改进了函数的性能
EDIT 2 为了进一步提高函数的性能,我们可以使用 data.table 来缩短子集化时间
library(data.table)
newCus = data.table(newCus)
distance <- function(x, y) {
  # Find, for one customer, the (at most) five nearest rows of `y` in range.
  #
  # Args:
  #   x: a single U_ID value; it is looked up in the global data.table
  #      `newCus` (columns U_ID, U_LAT, U_LONG).
  #   y: table with columns D_ID, D_LAT and D_LONG.
  #
  # Returns:
  #   A data.frame with columns U_ID, D_ID and dist (Euclidean distance),
  #   ordered by increasing dist, containing only rows with dist <= 0.3 and
  #   at most five of them; NULL when no row of `y` is that close.

  # Subsetting newCus to get only one U_ID (data.table syntax)
  tb <- newCus[U_ID == x]
  # Squared distances; the square root is taken only for the rows we keep
  sq_dist <- (tb$U_LAT - y$D_LAT)^2 + (tb$U_LONG - y$D_LONG)^2
  # Every returned row must be inside the radius, not just the closest one.
  # which() also discards NA distances instead of failing inside if().
  in_range <- which(sq_dist <= 0.3^2)
  if (length(in_range) == 0L) {
    return(NULL)
  }
  # Closest first, capped at five. head() never pads the result with NA rows,
  # unlike order(...)[1:5] when fewer than five candidates exist.
  nearest <- head(in_range[order(sq_dist[in_range])], 5L)
  # Build the full table first so the kept rows retain their original row labels
  NewCus_dist <- data.frame(U_ID = tb$U_ID, D_ID = y$D_ID, dist = sq_dist)
  NewCus_dist <- NewCus_dist[nearest, ]
  NewCus_dist$dist <- sqrt(NewCus_dist$dist)
  NewCus_dist
}
然后借助 do.call 和 lapply 对每个 U_ID 调用此函数，并把结果合并成一个 data.frame:
# Run distance() once per customer ID and stack the per-customer results
result_df <- do.call("rbind", lapply(newCus$U_ID, distance, y = localD))
是否有更好更快的编码方式?
下面的代码并没有向量化，但已经带来了显著的速度提升。
它基本上就是你的函数，只是做了一些改进，主要是去掉了不必要的操作。
distance2 <- function(x, y) {
  # Find, for one customer, the (at most) five nearest rows of `y` in range.
  #
  # Args:
  #   x: a single U_ID value; it is looked up in the global table `newCus`
  #      (columns U_ID, U_LAT, U_LONG).
  #   y: table with columns D_ID, D_LAT and D_LONG.
  #
  # Returns:
  #   A data.frame with columns U_ID, D_ID and dist (Euclidean distance),
  #   ordered by increasing dist, containing only rows with dist <= 0.3 and
  #   at most five of them; NULL when no row of `y` is that close.

  # Subsetting newCus to get only one U_ID
  tb <- newCus[newCus$U_ID == x, ]
  # Squared distances; the square root is taken only for the rows we keep
  sq_dist <- (tb$U_LAT - y$D_LAT)^2 + (tb$U_LONG - y$D_LONG)^2
  # Every returned row must be inside the radius, not just the closest one.
  # which() also discards NA distances instead of failing inside if().
  in_range <- which(sq_dist <= 0.3^2)
  if (length(in_range) == 0L) {
    return(NULL)
  }
  # Closest first, capped at five. head() never pads the result with NA rows,
  # unlike order(...)[1:5] when fewer than five candidates exist.
  nearest <- head(in_range[order(sq_dist[in_range])], 5L)
  # Build the full table first so the kept rows retain their original row labels
  NewCus_dist <- data.frame(U_ID = tb$U_ID, D_ID = y$D_ID, dist = sq_dist)
  NewCus_dist <- NewCus_dist[nearest, ]
  NewCus_dist$dist <- sqrt(NewCus_dist$dist)
  NewCus_dist
}
res1 <- do.call("rbind", lapply(newCus$U_ID, distance, localD))
res2 <- do.call("rbind", lapply(newCus$U_ID, distance2, localD))
all.equal(res1, res2)
[1] TRUE
现在比较 distance 和 distance2 的运行时间。为此，我构造了更大的输入数据框。
# Build larger benchmark inputs: each table followed by 100 more copies of itself
localD2 <- localD
newCus2 <- newCus
for (i in seq_len(100)) {
  localD2 <- rbind(localD2, localD)
  newCus2 <- rbind(newCus2, newCus)
}
system.time({
res1 <- do.call("rbind", lapply(newCus2$U_ID, distance, localD2))
})
user system elapsed
1.7 0.0 1.7
system.time({
res2 <- do.call("rbind", lapply(newCus2$U_ID, distance2, localD2))
})
user system elapsed
0.17 0.00 0.17
十倍的速度增益,一个数量级。到此为止。也许有人会找到更好的矢量化解决方案。
require(Rcpp)
require(data.table)
# Compile m(): for every row of x, collect the (at most) five rows of y that
# lie closer than 0.3 in Euclidean distance, nearest first.
#   names / names2: IDs belonging to the rows of x / y
#   x, y:           numeric matrices; columns 1 and 2 hold the coordinates
# Returns a list of three parallel vectors: U_ID, D_ID and dist.
cppFunction(
"List m(std::vector<std::string> & names, NumericMatrix & x,
        std::vector<std::string> & names2, NumericMatrix & y){
  const unsigned int nx = x.nrow();
  const unsigned int ny = y.nrow();
  const std::size_t max_hits = 5;
  std::vector<std::string> c1;
  std::vector<std::string> c2;
  // std::vector grows in amortised constant time, whereas push_back on an
  // Rcpp vector copies the whole vector on every call
  std::vector<double> c3;
  // (distance, row of y) candidates for the current row of x
  std::vector< std::pair<double, unsigned int> > hits;
  for (unsigned int i = 0; i < nx; ++i) {
    hits.clear();
    for (unsigned int j = 0; j < ny; ++j) {
      const double d1 = x(i, 0) - y(j, 0);
      const double d2 = x(i, 1) - y(j, 1);
      const double d = sqrt(d1*d1 + d2*d2);
      if (d < 0.3) {
        hits.push_back(std::make_pair(d, j));
      }
    }
    // keep the nearest candidates, not merely the first ones encountered
    const std::size_t keep = std::min(max_hits, hits.size());
    std::partial_sort(hits.begin(), hits.begin() + keep, hits.end());
    for (std::size_t k = 0; k < keep; ++k) {
      c1.push_back(names[i]);
      c2.push_back(names2[hits[k].second]);
      c3.push_back(hits[k].first);
    }
  }
  return List::create(
    _[\"U_ID\"] = c1,
    _[\"D_ID\"] = c2,
    _[\"dist\"] = c3);
}",
includes = c("#include <algorithm>", "#include <utility>"))
cpp <- function(x, y) {
  # Prepare the inputs for the compiled m() routine and run it.
  #   x: table whose first column is the ID; the remaining columns are passed
  #      on as the coordinate matrix
  #   y: table whose second column is the ID; columns 1 and 2 are dropped and
  #      the rest is passed on as the coordinate matrix
  # Returns the list produced by m(), converted to a data.table.
  x_ids <- as.character(x[[1]])
  y_ids <- as.character(y[[2]])
  x_coords <- as.matrix(x[, -1])
  y_coords <- as.matrix(y[, -(1:2)])
  as.data.table(m(x_ids, x_coords, y_ids, y_coords))
}
cpp(newCus, localD)
# U_ID D_ID dist
# 1: 26428981 1533582 0.06668107
# 2: 26428981 1533582 0.14372049
# Previous function:
r <- function(x, y) {
  # Apply distance2() to every U_ID of x and row-bind the per-ID results
  per_id <- lapply(x$U_ID, function(id) distance2(id, y))
  do.call("rbind", per_id)
}
r(newCus, localD)
# U_ID D_ID dist
# 1 26428981 1533582 0.06668107
# 2 26428981 1533582 0.14372049
# 7 26428981 4138752 3.27987470
# 5 26428981 4134402 4.44764165
# 4 26428981 4126692 4.62505319
我认为 distance2 函数中的距离计算不正确，应该还有 sqrt。还是我错了？
基准测试
require(rbenchmark)
benchmark(r(newCus, localD),
cpp(newCus, localD),
replications = 1000,
columns = c("test", "replications", "elapsed", "relative"))
# test replications elapsed relative
# 2 cpp(newCus, localD) 1000 0.91 1.000
# 1 r(newCus, localD) 1000 2.35 2.582
# with bigger data:
benchmark(r(newCus2, localD2),
cpp(newCus2, localD2),
replications = 3,
columns = c("test", "replications", "elapsed", "relative"))
# test replications elapsed relative
# 2 cpp(newCus2, localD2) 3 0.86 1.000
# 1 r(newCus2, localD2) 3 9.11 10.593
比以前的解决方案快大约 10 倍。
更新
更正函数:
# Compile m(): return every (row of x, row of y) pair whose Euclidean distance
# is below 0.3. Picking the five nearest per ID is left to the R caller.
#   names / names2: IDs belonging to the rows of x / y
#   x, y:           numeric matrices; columns 1 and 2 hold the coordinates
# Returns a list of three parallel vectors: U_ID, D_ID and dist.
cppFunction(
"List m(std::vector<std::string> & names, NumericMatrix & x,
        std::vector<std::string> & names2, NumericMatrix & y){
  const unsigned int nx = x.nrow();
  const unsigned int ny = y.nrow();
  std::vector<std::string> c1;
  std::vector<std::string> c2;
  // std::vector grows in amortised constant time, whereas push_back on an
  // Rcpp vector copies the whole vector on every call
  std::vector<double> c3;
  for (unsigned int i = 0; i < nx; ++i) {
    for (unsigned int j = 0; j < ny; ++j) {
      const double d1 = x(i, 0) - y(j, 0);
      const double d2 = x(i, 1) - y(j, 1);
      const double d = sqrt(d1*d1 + d2*d2);
      if (d < 0.3) {
        c1.push_back(names[i]);
        c2.push_back(names2[j]);
        c3.push_back(d);
      }
    }
  }
  return List::create(
    _[\"U_ID\"] = c1,
    _[\"D_ID\"] = c2,
    _[\"dist\"] = c3);
}")
cpp <- function(x, y) {
  # Prepare the inputs for the compiled m() routine, run it, and keep the
  # five smallest distances per U_ID.
  #   x: table whose first column is the ID; the remaining columns are passed
  #      on as the coordinate matrix
  #   y: table whose second column is the ID; columns 1 and 2 are dropped and
  #      the rest is passed on as the coordinate matrix
  x_ids <- as.character(x[[1]])
  y_ids <- as.character(y[[2]])
  x_coords <- as.matrix(x[, -1])
  y_coords <- as.matrix(y[, -(1:2)])
  hits <- as.data.table(m(x_ids, x_coords, y_ids, y_coords))
  # Keying sorts by U_ID and then by dist, so each group starts with its
  # nearest matches
  setkey(hits, U_ID, dist)
  hits[, head(.SD, 5), by = U_ID][]
}
现在你应该得到你想要的结果。