通过更好的矢量化减少迭代时间

Decreasing iteration time through better vectorization

我正在尝试大幅减少代码的计算时间。

我有2个data.frames:

1.NewCus_withGeo

head(newCus)
      U_ID    U_LAT    U_LONG
  31902101 47.57080 -52.69588
  26428781 45.50141 -73.55854
  26428801 45.40768 -73.95121
  28130841 45.44952 -73.82955
  26428961 45.54130 -73.47751
  26428981 45.35496 -75.67068

2.localD

head(localD)
           ID    D_ID    D_LAT    D_LONG
1533582_23821 1533582 45.40462 -75.62618
1533582_97592 1533582 45.21759 -75.71293
1533582_23811 1533582 46.83400 -71.28574
4126692_23811 4126692 46.79560 -71.27572
4134402_23811 4134402 46.74568 -71.44606
4135162_23811 4135162 46.82987 -71.23739
4138752_23841 4138752 46.34229 -72.54294

我想为每个 U_ID 获得最小欧氏距离的 5 D_ID,但前提是该距离小于 0.3。

结果我想要另一个 data.frame 包含每个 U_ID 足够近 D_ID 5 个最近的 D_IDU_ID 之间的距离和D_ID 看起来像这样:

head(result_df)
         U_ID    D_ID         dist
     26428781 4244252 0.0008656102
     26428781 4088692 0.0055139426
     26428781 4177752 0.0060150366
     26428781 4182572 0.0067220964
     26428781 4191862 0.0076759495
     26428801 4135212 0.0455715423
     26428801 4216202 0.0726350403

我想出了这个代码来做到这一点:

编辑感谢 Rui Barradas 改进了函数的性能

EDIT 2 为了进一步提高函数的性能,我们可以使用 data.table 来缩短子集化时间

library(data.table)
newCus = data.table(newCus)

distance <- function(x,y){
 # Subsetting newCus to get only one U_ID
 tb <- newCus[U_ID == x]

 # distance calculations
 dist = (tb$U_LAT - y$D_LAT)^2 + (tb$U_LONG - y$D_LONG)^2

 if(min(dist) <= 0.3^2){

   # Putting the data.frame together
   NewCus_dist <- data.frame(U_ID = tb$U_ID, D_ID = y$D_ID, dist = dist)
   # Keeping top 5 D_ID
   NewCus_dist <- NewCus_dist[order(dist)[1:5], ]
   NewCus_dist$dist <- sqrt(NewCus_dist$dist)

 } else {
   NewCus_dist <- NULL
 }
 NewCus_dist
}

然后在 do.call 和 lapply 函数中使用此函数迭代 U_ID 并在 data.frame

中得到结果
 result_df = do.call("rbind", lapply(newCus$U_ID, distance, localD))

是否有更好更快的编码方式?

后面的代码没有向量化,但已经有了显着的速度提升。

现在,这基本上就是你的功能,有一些改进,主要是为了摆脱无用的东西。

distance2 <- function(x,y){
   # Subsetting newCus to get only one U_ID
   tb <- newCus[newCus$U_ID == x, ]

   # distance calculations
   dist = (tb$U_LAT - y$D_LAT)^2 + (tb$U_LONG - y$D_LONG)^2

   if(min(dist) <= 0.3^2){

       # Putting the data.frame together
       NewCus_dist <- data.frame(U_ID = tb$U_ID, D_ID = y$D_ID, dist = dist)
       # Keeping top 5 D_ID
       NewCus_dist <- NewCus_dist[order(dist)[1:5], ]
       NewCus_dist$dist <- sqrt(NewCus_dist$dist)

   } else {
       NewCus_dist <- NULL
   }
   NewCus_dist
}

res1 <- do.call("rbind", lapply(newCus$U_ID, distance, localD))
res2 <- do.call("rbind", lapply(newCus$U_ID, distance2, localD))
all.equal(res1, res2)
[1] TRUE

现在 distancedistance2 的时间安排。为此,我制作了更大的输入数据框。

newCus2 <- newCus
localD2 <- localD
for(i in 1:1e2){
    newCus2 <- rbind(newCus2, newCus)
    localD2 <- rbind(localD2, localD)
}

system.time({
res1 <- do.call("rbind", lapply(newCus2$U_ID, distance, localD2))
})
   user  system elapsed 
    1.7     0.0     1.7
system.time({
res2 <- do.call("rbind", lapply(newCus2$U_ID, distance2, localD2))
})
   user  system elapsed 
   0.17    0.00    0.17

十倍的速度增益,一个数量级。到此为止。也许有人会找到更好的矢量化解决方案。

require(Rcpp)
require(data.table)

cppFunction(
  "List m(std::vector<std::string> & names, NumericMatrix & x,
  std::vector<std::string> & names2, NumericMatrix & y){

    unsigned int nx = x.nrow();
    unsigned int ny = y.nrow();
    std::vector<std::string> c1;
    std::vector<std::string> c2;
    NumericVector c3;
    double d; double d1; double d2;

    for (unsigned int i=0; i<nx; ++i) {
      int sk = 0;
      for (unsigned int j=0; j<ny; ++j)  {
        d1 = x(i, 0) - y(j, 0);
        d2 = x(i, 1) - y(j, 1);
        d = sqrt(d1*d1 + d2*d2);

        if ((d < 0.3) & (sk < 5)) {
          c1.push_back(names[i]);
          c2.push_back(names2[j]);
          c3.push_back(d);
          ++sk;
        }
      }
    }
    return List::create( 
      _[\"U_ID\"]  = c1, 
      _[\"D_ID\"]  = c2, 
      _[\"dist\"] = c3);
  }")

cpp <- function(x, y) { # function which prepears data and executes Cpp function
  n1 <- as.character(x[[1]])
  c1 <- as.matrix(x[,-1])
  n2 <- as.character(y[[2]])
  c2 <- as.matrix(y[, -(1:2)])
  r <- m(n1, c1, n2, c2)
  as.data.table(r)
}

cpp(newCus, localD)
# U_ID    D_ID       dist
# 1: 26428981 1533582 0.06668107
# 2: 26428981 1533582 0.14372049

# Prevous function:
r <- function(x, y) do.call("rbind", lapply(x$U_ID, distance2, y))
r(newCus, localD)
# U_ID    D_ID       dist
# 1 26428981 1533582 0.06668107
# 2 26428981 1533582 0.14372049
# 7 26428981 4138752 3.27987470
# 5 26428981 4134402 4.44764165
# 4 26428981 4126692 4.62505319

我认为distance2函数中距离计算不正确,应该还有sqrt。还是我错了?

基准测试

require(rbenchmark)
benchmark(r(newCus, localD),
          cpp(newCus, localD),
          replications = 1000,
          columns = c("test", "replications", "elapsed", "relative"))
# test replications elapsed relative
# 2 cpp(newCus, localD)         1000    0.91    1.000
# 1   r(newCus, localD)         1000    2.35    2.582

# with bigger data:
benchmark(r(newCus2, localD2),
          cpp(newCus2, localD2),
          replications = 3,
          columns = c("test", "replications", "elapsed", "relative"))
# test replications elapsed relative
# 2 cpp(newCus2, localD2)            3    0.86    1.000
# 1   r(newCus2, localD2)            3    9.11   10.593

比以前的解决方案快大约 10 倍。

更新

更正函数:

cppFunction(
  "List m(std::vector<std::string> & names, NumericMatrix & x,
  std::vector<std::string> & names2, NumericMatrix & y){

    unsigned int nx = x.nrow();
    unsigned int ny = y.nrow();
    std::vector<std::string> c1;
    std::vector<std::string> c2;
    NumericVector c3;
    double d; double d1; double d2;

    for (unsigned int i=0; i<nx; ++i) {
      for (unsigned int j=0; j<ny; ++j)  {
        d1 = x(i, 0) - y(j, 0);
        d2 = x(i, 1) - y(j, 1);
        d = sqrt(d1*d1 + d2*d2);

        if ((d < 0.3)) {
          c1.push_back(names[i]);
          c2.push_back(names2[j]);
          c3.push_back(d);
        }
      }
    }
    return List::create( 
      _[\"U_ID\"]  = c1, 
      _[\"D_ID\"]  = c2, 
      _[\"dist\"] = c3);
  }")

cpp <- function(x, y) { # function which prepears data and executes Cpp function
  n1 <- as.character(x[[1]])
  c1 <- as.matrix(x[,-1])
  n2 <- as.character(y[[2]])
  c2 <- as.matrix(y[, -(1:2)])
  r <- m(n1, c1, n2, c2)
  r <- as.data.table(r)
  setkey(r, U_ID, dist)
  r[, head(.SD, 5), U_ID][]
  }

现在你应该得到你想要的结果。