R: split-apply-combine 地理距离

R: split-apply-combine for geographic distance

我已经从人口普查局下载了美国所有城镇和城市等的列表。这是一个随机样本:

dput(somewhere)
structure(list(state = structure(c(30L, 31L, 5L, 31L, 24L, 36L, 
13L, 21L, 6L, 10L, 31L, 28L, 10L, 5L, 5L, 8L, 23L, 11L, 34L, 
19L, 29L, 4L, 24L, 13L, 21L, 31L, 2L, 3L, 29L, 24L, 1L, 13L, 
15L, 10L, 11L, 33L, 35L, 8L, 11L, 12L, 36L, 28L, 9L, 31L, 8L, 
14L, 11L, 12L, 36L, 13L, 8L, 5L, 29L, 8L, 7L, 23L, 25L, 39L, 
16L, 28L, 10L, 29L, 26L, 8L, 32L, 40L, 28L, 23L, 37L, 31L, 18L, 
5L, 1L, 31L, 18L, 13L, 11L, 10L, 25L, 18L, 21L, 18L, 11L, 35L, 
31L, 36L, 20L, 19L, 38L, 2L, 40L, 13L, 36L, 11L, 29L, 27L, 22L, 
17L, 12L, 20L), .Label = c("ak", "al", "ar", "az", "ca", "co", 
"fl", "ga", "hi", "ia", "il", "in", "ks", "ky", "la", "md", "mi", 
"mo", "ms", "mt", "nc", "nd", "ne", "nj", "nm", "nv", "ny", "oh", 
"ok", "or", "pa", "pr", "ri", "sc", "sd", "tx", "ut", "va", "wa", 
"wi"), class = "factor"), geoid = c(4120100L, 4280962L, 668028L, 
4243944L, 3460600L, 4871948L, 2046000L, 3747695L, 839965L, 1909910L, 
4244824L, 3902204L, 1963750L, 622360L, 669088L, 1382972L, 3125230L, 
1722736L, 4539265L, 2804705L, 4039625L, 451465L, 3467020L, 2077150L, 
3765260L, 4221792L, 133904L, 566320L, 4033150L, 3463180L, 223460L, 
2013675L, 2232405L, 1951600L, 1752142L, 4445010L, 4655684L, 1336416L, 
1729080L, 1840842L, 4804672L, 3932207L, 1523675L, 4260384L, 1321912L, 
2159232L, 1735307L, 1867176L, 4839304L, 2057350L, 1309656L, 655380L, 
4082250L, 1350680L, 1275475L, 3147745L, 3505010L, 5352285L, 2483337L, 
3909834L, 1912945L, 4068200L, 3227900L, 1366304L, 7286831L, 5505350L, 
3982390L, 3149915L, 4974480L, 4249440L, 2943346L, 677430L, 280770L, 
4247872L, 2902242L, 2039075L, 1735281L, 1932565L, 3580120L, 2973852L, 
3722620L, 2943238L, 1755938L, 4643100L, 4251904L, 4830920L, 3056575L, 
2801940L, 5156048L, 137000L, 5508925L, 2057300L, 4861172L, 1736477L, 
4021200L, 3677783L, 3832060L, 2614900L, 1820332L, 3043750L), 
    ansicode = c(2410344L, 2390453L, 2411793L, 1214759L, 885360L, 
    2412035L, 485621L, 2403359L, 2412812L, 2583481L, 2390095L, 
    2397971L, 2396237L, 2585422L, 2411819L, 2405746L, 2398338L, 
    2394628L, 2812929L, 2586582L, 2408478L, 2582836L, 885393L, 
    2397270L, 2402898L, 2584453L, 2405811L, 2405518L, 2412737L, 
    2389752L, 2418574L, 2393549L, 2402559L, 2629970L, 2399453L, 
    2378109L, 2812999L, 2402563L, 2398956L, 2396699L, 2409759L, 
    2393028L, 2414061L, 2805542L, 2404192L, 2404475L, 2398514L, 
    2629884L, 2408486L, 2396265L, 2405306L, 2411363L, 2413515L, 
    2405064L, 2402989L, 2583899L, 2629103L, 2585016L, 2390487L, 
    2397481L, 2393811L, 2413298L, 2583927L, 2812702L, 2415078L, 
    1582764L, 2400116L, 2400036L, 2412013L, 2633665L, 2787908L, 
    2583158L, 2418866L, 1214943L, 2393998L, 485611L, 2398513L, 
    2394969L, 2806756L, 2397053L, 2406485L, 2395719L, 2399572L, 
    1267480L, 2389516L, 2410660L, 2409026L, 2806379L, 2584894L, 
    2404746L, 2586459L, 2396263L, 2411528L, 2398556L, 2412443L, 
    2584298L, 1036064L, 2806333L, 2396920L, 2804282L), city = c("donald", 
    "warminster heights", "san juan capistrano", "littlestown", 
    "port republic", "taylor", "merriam", "northlakes", "julesburg", 
    "california junction", "lower allen", "antwerp", "pleasantville", 
    "el rancho", "santa clarita", "willacoochee", "kennard", 
    "effingham", "la france", "beechwood", "keys", "orange grove mobile manor", 
    "shiloh", "west mineral", "stony point", "east salem", "heath", 
    "stamps", "haworth", "rio grande", "ester", "clayton", "hackberry", 
    "middle amana", "new baden", "melville", "rolland colony", 
    "hannahs mill", "germantown hills", "la fontaine", "aurora", 
    "green meadows", "kaiminani", "pinecroft", "dawson", "park city", 
    "hinsdale", "st. meinrad", "kingsland", "powhattan", "bowersville", 
    "palos verdes estates", "wyandotte", "meigs", "waverly", 
    "sunol", "arroyo hondo", "outlook", "west pocomoke", "buchtel", 
    "chatsworth", "smith village", "glenbrook", "rock spring", 
    "villalba", "bayfield", "waynesfield", "utica", "sunset", 
    "milford square", "lithium", "swall meadows", "unalaska", 
    "martinsburg", "ashland", "leawood", "hindsboro", "gray", 
    "turley", "trimble", "falcon", "linn", "olympia fields", 
    "mitchell", "mount pleasant mills", "greenville", "park city", 
    "arkabutla", "new river", "huntsville", "boulder junction", 
    "potwin", "red lick", "huey", "dougherty", "wadsworth", "grand forks", 
    "chassell", "edgewood", "lindsay"), lsad = c("25", "57", 
    "25", "21", "25", "25", "25", "57", "43", "57", "57", "47", 
    "25", "57", "25", "25", "47", "25", "57", "57", "57", "57", 
    "21", "25", "57", "57", "43", "25", "43", "57", "57", "25", 
    "57", "57", "47", "57", "57", "57", "47", "43", "25", "57", 
    "57", "57", "25", "25", "47", "57", "57", "25", "43", "25", 
    "43", "25", "57", "57", "57", "57", "57", "47", "25", "43", 
    "57", "57", "62", "25", "47", "47", "25", "57", "57", "57", 
    "25", "21", "25", "25", "47", "25", "57", "25", "43", "25", 
    "47", "25", "57", "25", "57", "57", "57", "25", "57", "25", 
    "25", "47", "43", "57", "25", "57", "43", "57"), funcstat = c("a", 
    "s", "a", "a", "a", "a", "a", "s", "a", "s", "s", "a", "a", 
    "s", "a", "a", "a", "a", "s", "s", "s", "s", "a", "a", "s", 
    "s", "a", "a", "a", "s", "s", "a", "s", "s", "a", "s", "s", 
    "s", "a", "a", "a", "s", "s", "s", "a", "a", "a", "s", "s", 
    "a", "a", "a", "a", "a", "s", "s", "s", "s", "s", "a", "a", 
    "a", "s", "s", "s", "a", "a", "a", "a", "s", "s", "s", "a", 
    "a", "a", "a", "a", "a", "s", "a", "a", "a", "a", "a", "s", 
    "a", "s", "s", "s", "a", "s", "a", "a", "a", "a", "s", "a", 
    "s", "a", "s"), latitude = c(45.221487, 40.18837, 33.500889, 
    39.74517, 39.534798, 30.573263, 39.017607, 35.780523, 40.984864, 
    41.56017, 40.226748, 41.180176, 41.387011, 36.220684, 34.414083, 
    31.335094, 41.474697, 39.120662, 34.616281, 32.336723, 35.802786, 
    32.598451, 39.462418, 37.283906, 35.867809, 40.608713, 31.344839, 
    33.354959, 33.840898, 39.019051, 64.879056, 39.736866, 29.964958, 
    41.794765, 38.536765, 41.559549, 44.3437, 32.937302, 40.768954, 
    40.673893, 33.055942, 39.867193, 19.757709, 40.564189, 31.771864, 
    37.093499, 41.800683, 38.168142, 30.666141, 39.761734, 34.373065, 
    33.774271, 36.807143, 31.071788, 27.985282, 41.154105, 36.534599, 
    46.331153, 38.096527, 39.463511, 42.916301, 35.45079, 39.100123, 
    34.81467, 18.127809, 46.81399, 40.602442, 40.895279, 41.13806, 
    40.433182, 37.831844, 37.50606, 53.910255, 40.310917, 38.812464, 
    38.907263, 39.684775, 41.841711, 36.736661, 39.476152, 35.194804, 
    38.478798, 41.521996, 43.730057, 40.724697, 33.111939, 45.630946, 
    34.700227, 37.142945, 34.782275, 46.1148, 37.938624, 33.485081, 
    38.605285, 34.399808, 42.821447, 47.921291, 47.036116, 40.103208, 
    47.224885), longitude = c(-122.837813, -75.084089, -117.654388, 
    -77.089213, -74.476099, -97.427116, -94.693955, -81.367835, 
    -102.262708, -95.994752, -76.902769, -84.736099, -93.272787, 
    -119.068357, -118.494729, -83.044003, -96.203696, -88.550859, 
    -82.770697, -90.808692, -94.941358, -114.660588, -75.29244, 
    -94.926801, -81.044121, -77.23694, -86.46905, -93.497879, 
    -94.657035, -74.87787, -148.041153, -100.176484, -93.410178, 
    -91.901539, -89.707193, -71.301933, -96.59226, -84.340945, 
    -89.462982, -85.722023, -97.509615, -83.945334, -156.001765, 
    -78.353464, -84.443499, -86.048077, -87.928172, -86.832128, 
    -98.454026, -95.634011, -83.084305, -118.425754, -94.729305, 
    -84.092683, -81.625304, -102.762746, -105.666602, -120.092812, 
    -75.579197, -82.180426, -96.514499, -97.457006, -119.927289, 
    -85.238869, -66.481897, -90.822546, -83.973881, -97.345349, 
    -112.028388, -75.405024, -89.88325, -118.642656, -166.529029, 
    -78.324286, -92.239531, -94.62524, -88.134729, -94.985863, 
    -107.792147, -94.561898, -78.65389, -91.844989, -87.691648, 
    -98.029974, -77.026451, -96.110256, -108.925311, -90.121565, 
    -80.595817, -86.532599, -89.654438, -97.01835, -94.161474, 
    -89.289973, -97.05148, -77.893875, -97.08933, -88.530745, 
    -85.737461, -105.152791), designation = c("city", "cdp", 
    "city", "borough", "city", "city", "city", "cdp", "town", 
    "cdp", "cdp", "village", "city", "cdp", "city", "city", "village", 
    "city", "cdp", "cdp", "cdp", "cdp", "borough", "city", "cdp", 
    "cdp", "town", "city", "town", "cdp", "cdp", "city", "cdp", 
    "cdp", "village", "cdp", "cdp", "cdp", "village", "town", 
    "city", "cdp", "cdp", "cdp", "city", "city", "village", "cdp", 
    "cdp", "city", "town", "city", "town", "city", "cdp", "cdp", 
    "cdp", "cdp", "cdp", "village", "city", "town", "cdp", "cdp", 
    "urbana", "city", "village", "village", "city", "cdp", "cdp", 
    "cdp", "city", "borough", "city", "city", "village", "city", 
    "cdp", "city", "town", "city", "village", "city", "cdp", 
    "city", "cdp", "cdp", "cdp", "city", "cdp", "city", "city", 
    "village", "town", "cdp", "city", "cdp", "town", "cdp")), row.names = c(22769L, 
24845L, 3314L, 24015L, 17360L, 28139L, 10085L, 19881L, 3886L, 
8750L, 24027L, 20585L, 9362L, 2499L, 3333L, 6041L, 16321L, 6847L, 
25249L, 14051L, 22233L, 1210L, 17425L, 10353L, 20053L, 23545L, 
253L, 1951L, 22166L, 17386L, 685L, 9771L, 11134L, 9225L, 7386L, 
25001L, 25862L, 5663L, 6950L, 8239L, 26555L, 20991L, 6108L, 24388L, 
5551L, 10772L, 7056L, 8470L, 27292L, 10202L, 5451L, 3116L, 22660L, 
5776L, 5317L, 16546L, 17582L, 29958L, 12103L, 20709L, 8779L, 
22515L, 16665L, 5902L, 31901L, 30658L, 21745L, 16574L, 28632L, 
24127L, 15046L, 3455L, 930L, 24087L, 14494L, 10016L, 7055L, 8993L, 
18048L, 15434L, 19615L, 15043L, 7454L, 25775L, 24194L, 27115L, 
15857L, 14038L, 29305L, 276L, 30693L, 10201L, 27863L, 7075L, 
22046L, 19267L, 20311L, 12502L, 8093L, 15798L), class = "data.frame")

我想使用 longitude 和 latitude 列以及 gdist 函数计算 city 列中每个条目之间的距离。我知道以下 for 循环有效且易于阅读:

# Distance from every city to all cities, one row of `somewhere` at a time.
# Improvements over the original: the result list is preallocated instead of
# being grown inside the loop, and seq_len() is used instead of 1:nrow(),
# which misbehaves (yields c(1, 0)) when the data frame has zero rows.
dist_list <- vector("list", nrow(somewhere))
for (i in seq_len(nrow(somewhere))) {

  # Each iteration: distances (in miles) from city i to every city.
  # NOTE(review): Imap::gdist may not be properly vectorized over lon.1/lat.1
  # (see discussion below), which is why this loop is slow on 31K+ rows.
  dist_list[[i]] <- gdist(lon.1 = somewhere$longitude[i],
                          lat.1 = somewhere$latitude[i],
                          lon.2 = somewhere$longitude,
                          lat.2 = somewhere$latitude,
                          units = "miles")

}

但是:在完整数据集(31K+ 行)上,它需要运行非常久——以小时计。我正在寻找可以加快计算速度的方法。我认为 split-apply-combine 方法会很好用,因为我最终想使用一对分组变量 geo_block 和 ansi_block,但老实说,任何方法都会比我现有的更好。

我试过以下方法:

# NOTE(review): this attempt does not run as intended, for several reasons:
# - base::split() takes a single grouping argument `f`; here `.$ansi_block`
#   is silently matched to split()'s `drop` argument rather than used for
#   grouping. Use split(., interaction(.$geo_block, .$ansi_block)) or
#   dplyr::group_by() instead.
# - split() returns a plain list, so the following mutate() call fails.
# - Imap::gdist() has no `longlat` argument; coordinates must be passed via
#   lon.1/lat.1/lon.2/lat.2 (see the answer below).
somewhere$geo_block <- substr(somewhere$geoid, 1, 1)
somewhere$ansi_block <- substr(somewhere$ansicode, 1, 1)

somewhere <- somewhere %>%
  split(.$geo_block, .$ansi_block) %>%
  mutate(dist = gdist(longlat = somewhere[, c("longitude", "latitude")]))

但我不确定如何在标准 for 循环之外指定第二组 long-lat 输入。

我的问题:

  1. 如何使用以 geo_block 和 ansi_block 作为分组变量的 split-apply-combine 方法来解决上面的问题?我想返回最短的距离、对应的 city 名称,以及该距离所对应的 geo_block 值。

欢迎所有建议。理想情况下,所需的方法会相当快,因为我正在使用的实际数据集非常大。由于我在这里有点不知所措,因此我为这个问题增加了悬赏金以引起更多兴趣,并希望能从各种可能的答案中学习。非常感谢!

我有这样的解决办法。我很惊讶自己使用了两个循环 for!!令人难以置信的是,我做到了。要事第一。

我的建议是基于一个简化。但是,你在近距离犯的错误会相对较小。但是时间增益是巨大的!

好吧,我建议用笛卡尔坐标计算距离,而不是球坐标。

所以我们需要一个简单的函数,基于 latitude 和 longitude 两个参数计算笛卡尔坐标。这是我们的 LatLong2Cart 函数。

LatLong2Cart = function(latitude, longitude, REarth = 6371) {
  ## Convert latitude/longitude given in DEGREES into Cartesian (x, y, z)
  ## coordinates on a sphere of radius REarth (default 6371 -> kilometres).
  ## Returns a tibble with columns x, y, z, one row per input point.
  ##
  ## BUG FIX: R's cos()/sin() expect RADIANS. The original passed raw
  ## degrees, which made the resulting "coordinates" -- and every distance
  ## derived from them -- wrong (e.g. two Alabama towns came out ~12,600 km
  ## apart in the output shown below). Convert degrees to radians first.
  lat = latitude * pi / 180
  lon = longitude * pi / 180
  tibble(
    x = REarth * cos(lat) * cos(lon),
    y = REarth * cos(lat) * sin(lon),
    z = REarth * sin(lat))
}

立即简短评论,我的函数以公里为单位计算距离(毕竟,它是 SI 单位)。如果您愿意,可以通过以英里、码、英尺或任何其他您想要的任何其他单位输入半径来更改它。

让我们看看它是如何工作的。但是让我先将您的 data.frame 转换为 tibble

library(tidyverse)
somewhere = somewhere %>% as_tibble()

# Preview: mutate() with a data-frame-returning call splices the x, y, z
# columns from LatLong2Cart() onto the tibble.
somewhere %>%  
  mutate(LatLong2Cart(latitude, longitude))

输出

# A tibble: 100 x 12
   state   geoid ansicode city                lsad  funcstat latitude longitude designation      x      y      z
   <fct>   <int>    <int> <chr>               <chr> <chr>       <dbl>     <dbl> <chr>        <dbl>  <dbl>  <dbl>
 1 or    4120100  2410344 donald              25    a            45.2    -123.  city        -1972.   644.  6024.
 2 pa    4280962  2390453 warminster heights  57    s            40.2     -75.1 cdp         -4815. -1564.  3867.
 3 ca     668028  2411793 san juan capistrano 25    a            33.5    -118.  city          485. -3096.  5547.
 4 pa    4243944  1214759 littlestown         21    a            39.7     -77.1 borough       350.  2894.  5665.
 5 nj    3460600   885360 port republic       25    a            39.5     -74.5 city        -1008. -1329.  6149.
 6 tx    4871948  2412035 taylor              25    a            30.6     -97.4 city        -4237.   160. -4755.
 7 ks    2046000   485621 merriam             25    a            39.0     -94.7 city         1435.  -686.  6169.
 8 nc    3747695  2403359 northlakes          57    s            35.8     -81.4 cdp         -2066.  -670. -5990.
 9 co     839965  2412812 julesburg           43    a            41.0    -102.  town         1010.  6223.  -915.
10 ia    1909910  2583481 california junction 57    s            41.6     -96.0 cdp           840.  4718. -4198.
# ... with 90 more rows

有了笛卡尔坐标后,我们来计算合适的距离。我为此准备了 calcDist 函数。

calcDist = function(data, key){
  ## All-pairs Euclidean distances between the rows of `data`, which must
  ## carry Cartesian coordinate columns x, y, z (see LatLong2Cart).
  ## `key` is an unquoted column whose values label the pairs.
  ## Returns a tibble of n*n rows: Var1, Var2 (factor labels) and dist,
  ## flattened column-major exactly like the original dist[i + (j-1)*n].
  if(!all(c("x", "y", "z") %in% names(data))) {
    # Typo fix: the original message said "date" instead of "data".
    stop("data must contain the variables x, y and z!")
  }
  key = enquo(key)
  x = data$x
  y = data$y
  z = data$z
  keys = data %>% pull(!!key)
  ## Vectorized replacement for the original O(n^2) R-level double loop:
  ## outer() builds each pairwise-difference matrix in one compiled call,
  ## and as.vector() flattens column-major, matching the original layout.
  dist = as.vector(sqrt(outer(x, x, "-")^2 +
                        outer(y, y, "-")^2 +
                        outer(z, z, "-")^2))
  tibble(
    expand.grid(factor(keys), factor(keys)),
    dist = dist
    )
}

反正就是这个地方。在这里我使用了这两个 for 循环。 希望得到原谅!

好吧,让我们看看这些循环是否可以在那里做些什么。

# Ungrouped use: add Cartesian coordinates, then all-pairs distances.
somewhere %>%  
  mutate(LatLong2Cart(latitude, longitude)) %>% 
  calcDist(city)

输出

# A tibble: 10,000 x 3
   Var1                Var2     dist
   <fct>               <fct>   <dbl>
 1 donald              donald     0 
 2 warminster heights  donald  4197.
 3 san juan capistrano donald  4500.
 4 littlestown         donald  3253.
 5 port republic       donald  2200.
 6 taylor              donald 11025.
 7 merriam             donald  3660.
 8 northlakes          donald 12085.
 9 julesburg           donald  9390.
10 california junction donald 11358.
# ... with 9,990 more rows

如您所见,一切正常。好的,但是如何在分组数据上使用它呢?这没什么难的。让我们按州分组。

# Split-apply-combine by state: nest each state's rows, compute the
# within-state pairwise distance table, then unnest back into one tibble.
somewhere %>% 
  mutate(LatLong2Cart(latitude, longitude)) %>% 
  nest_by(state) %>% 
  mutate(dist = list(calcDist(data, city))) %>% 
  select(-data) %>% 
  unnest(dist)

输出

# A tibble: 400 x 4
# Groups:   state [40]
   state Var1                      Var2                        dist
   <fct> <fct>                     <fct>                      <dbl>
 1 ak    ester                     ester                         0 
 2 ak    unalaska                  ester                      9245.
 3 ak    ester                     unalaska                   9245.
 4 ak    unalaska                  unalaska                      0 
 5 al    heath                     heath                         0 
 6 al    huntsville                heath                     12597.
 7 al    heath                     huntsville                12597.
 8 al    huntsville                huntsville                    0 
 9 ar    stamps                    stamps                        0 
10 az    orange grove mobile manor orange grove mobile manor     0 
# ... with 390 more rows

这里可能有一点数据冗余(从A市到B市,从B市到A市都有距离),但整个事情相对容易实现,你可能会承认自己。

好的。现在是时候看看当我们按 geo_block 和 ansi_block 分组时它会如何工作了。

# Group by the first digit of geoid and of ansicode, then compute
# all within-group pairwise distances.
somewhere %>% 
  mutate(LatLong2Cart(latitude, longitude)) %>% 
  mutate(
    geo_block = substr(geoid, 1, 1),
    ansi_block = substr(ansicode, 1, 1)) %>% 
  nest_by(geo_block, ansi_block) %>% 
  mutate(dist = list(calcDist(data, city))) %>% 
  select(-data) %>% 
  unnest(dist)

输出

# A tibble: 1,716 x 5
# Groups:   geo_block, ansi_block [13]
   geo_block ansi_block Var1                Var2                  dist
   <chr>     <chr>      <fct>               <fct>                <dbl>
 1 1         2          california junction california junction     0 
 2 1         2          pleasantville       california junction 10051.
 3 1         2          willacoochee        california junction 11545.
 4 1         2          effingham           california junction 11735.
 5 1         2          heath               california junction  4097.
 6 1         2          middle amana        california junction  7618.
 7 1         2          new baden           california junction 12720.
 8 1         2          hannahs mill        california junction 11681.
 9 1         2          germantown hills    california junction  5097.
10 1         2          la fontaine         california junction 11397.
# ... with 1,706 more rows

如您所见,这没有问题。

最后,一个实用的提示。使用此解决方案时,请确保每个分组不超过大约 20,000 行。对于更大的数据,这可能会导致内存分配问题。请注意,20,000 行的分组会产生 20,000 * 20,000 行的结果,这相当多,您可能会在某个时候耗尽内存。不过计算本身应该不会超过一分钟。

请检查它如何处理您的数据并提供一些反馈。我也希望您喜欢我的解决方案,并且笛卡尔坐标计算产生的距离误差不会以任何特定方式打扰您。

我将展示 tidyverse 和 base R 方法来拆分-应用-组合。我的理解是,对于每个城市组中的每个城市(无论您的分组变量可能是什么),您想要报告最近的组内城市以及到最近城市的距离。

首先,关于 Imap::gdist 的一些评论:

  • 它没有 lonlat 参数。您必须使用参数 (lon|lat).(1|2) 来传递坐标。
  • 它可能没有正确矢量化。它的主体包含一个循环while (abs(lamda - lamda.old) > 1e-11) {...},其中lamda的值为(lon.1 - lon.2) * pi / 180。对于长度为 1 的循环条件(它应该),参数 lon.1lon.2 的长度必须为 1。

所以,至少现在,我的回答将基于更规范的 geosphere::distm。它将整个 n-by-n 对称距离矩阵存储在内存中,因此您可能会达到内存分配限制,但在拆分应用组合中发生这种情况的可能性要小得多,其中 n 是组内(而不是总数)城市数。

我将处理您数据框的这一部分:

library("dplyr")

# Working subset: grouping factors (first digit of geoid/ansicode),
# the city id column, and the two coordinate columns.
dd <- somewhere %>%
  as_tibble() %>%
  mutate(geo_block = as.factor(as.integer(substr(geoid, 1L, 1L))),
         ansi_block = as.factor(as.integer(substr(ansicode, 1L, 1L)))) %>%
  select(geo_block, ansi_block, city, longitude, latitude)
dd
# # A tibble: 100 × 5
#    geo_block ansi_block city                longitude latitude
#    <fct>     <fct>      <chr>                   <dbl>    <dbl>
#  1 4         2          donald                 -123.      45.2
#  2 4         2          warminster heights      -75.1     40.2
#  3 6         2          san juan capistrano    -118.      33.5
#  4 4         1          littlestown             -77.1     39.7
#  5 3         8          port republic           -74.5     39.5
#  6 4         2          taylor                  -97.4     30.6
#  7 2         4          merriam                 -94.7     39.0
#  8 3         2          northlakes              -81.4     35.8
#  9 8         2          julesburg              -102.      41.0
# 10 1         2          california junction     -96.0     41.6
# # … with 90 more rows

以下函数 find_nearest 执行识别最近邻的基本任务,给定 d 维度量 space 中的一组 n 点。其参数如下:

  1. data:具有 n 行(每个点一个),d 个变量的数据框 指定点坐标,以及 1 个或多个 ID 变量 与每个点相关联。
  2. dist:一个函数将坐标的 n-by-d 矩阵作为 一个参数并返回相应的 n-by-n 对称 距离矩阵。
  3. coordvar:字符向量列出坐标变量名称。
  4. idvar:字符向量列表 ID 变量名称。

在我们的数据框中,坐标变量是longitudelatitude,并且有一个ID变量city。为简单起见,我相应地定义了 coordvaridvar 的默认值。

find_nearest <- function(data, dist,
                         coordvar = c("longitude", "latitude"),
                         idvar = "city") {
  ## For every row (point) of `data`, locate the nearest OTHER row under the
  ## distance function `dist`, and report its id together with the distance.
  ##
  ## data:     data frame with coordinate columns `coordvar` and one or more
  ##           id columns `idvar`.
  ## dist:     function mapping an n-by-d coordinate matrix to the n-by-n
  ##           symmetric distance matrix.
  ## Returns a data frame holding the non-coordinate columns of `data`, the
  ## nearest neighbour id(s) (suffixed "_nearest"), and `distance`; both are
  ## NA when the input has fewer than two rows.
  coord_cols <- match(coordvar, names(data), 0L)
  n_pts <- nrow(data)
  if (n_pts < 2L) {
    ## NA_integer_[0] is integer(0), NA_integer_[1] is NA: length tracks n_pts.
    nearest_idx <- NA_integer_[n_pts]
    distance <- NA_real_[n_pts]
  } else {
    dmat <- dist(as.matrix(data[coord_cols]))
    ## Every point is trivially closest to itself, so mask the diagonal.
    diag(dmat) <- Inf
    nearest_idx <- apply(dmat, 2L, which.min)
    distance <- dmat[cbind(nearest_idx, seq_len(n_pts))]
  }
  base_cols <- data[-coord_cols]
  nearest_ids <- data[nearest_idx, idvar, drop = FALSE]
  names(nearest_ids) <- paste0(idvar, "_nearest")
  data.frame(base_cols, nearest_ids, distance,
             row.names = NULL, stringsAsFactors = FALSE)
}

这是应用于我们的数据框的 find_nearest 的输出,没有分组,距离由 Vincenty 椭球法计算。

dist_vel <- function(x) {
  ## Full pairwise distance matrix for lon/lat matrix `x`,
  ## using the Vincenty ellipsoid model (accurate but slow).
  geosphere::distm(x, fun = geosphere::distVincentyEllipsoid)
}

# Nearest neighbour for every city, ignoring the grouping variables.
res <- find_nearest(dd, dist = dist_vel)
head(res, 10L)
#    geo_block ansi_block                city         city_nearest  distance
# 1          4          2              donald              outlook 246536.39
# 2          4          2  warminster heights       milford square  38512.79
# 3          6          2 san juan capistrano palos verdes estates  77722.35
# 4          4          1         littlestown          lower allen  55792.53
# 5          3          8       port republic           rio grande  66935.70
# 6          4          2              taylor            kingsland  98997.90
# 7          2          4             merriam              leawood  13620.87
# 8          3          2          northlakes          stony point  30813.46
# 9          8          2           julesburg                sunol  46037.81
# 10         1          2 california junction              kennard  19857.41

这是 tidyverse split-apply-combine,在 geo_blockansi_block 上分组:

# Grouped search: distances are only computed within each
# (geo_block, ansi_block) group, keeping the matrices small.
dd %>%
  group_by(geo_block, ansi_block) %>%
  group_modify(~find_nearest(., dist = dist_vel))
# # A tibble: 100 × 5
# # Groups:   geo_block, ansi_block [13]
#    geo_block ansi_block city                city_nearest  distance
#    <fct>     <fct>      <chr>               <chr>            <dbl>
#  1 1         2          california junction gray            89610.
#  2 1         2          pleasantville       middle amana   122974.
#  3 1         2          willacoochee        meigs          104116.
#  4 1         2          effingham           hindsboro       72160.
#  5 1         2          heath               dawson         198052.
#  6 1         2          middle amana        pleasantville  122974.
#  7 1         2          new baden           huey            37147.
#  8 1         2          hannahs mill        dawson         129599.
#  9 1         2          germantown hills    hindsboro      165140.
# 10 1         2          la fontaine         edgewood        63384.
# # … with 90 more rows

例如,请注意,在该分组下,被认为最接近加利福尼亚交界处的城市如何从肯纳德变为距离更远的格雷。

这里是基础 R split-apply-combine,建立在基础函数 by 之上。除了结果中组的排序之外,它是等价的:

find_nearest_by <- function(data, by, ...) {
  ## Base-R split-apply-combine: run find_nearest() within each group
  ## defined by `by`, then stack the per-group results row-wise.
  ## `...` is forwarded to find_nearest() (e.g. dist = ...).
  pieces <- base::by(data, by, find_nearest, ...)
  do.call(rbind, pieces)
}

# Same computation as the tidyverse version; only group ordering differs.
res <- find_nearest_by(dd, by = dd[c("geo_block", "ansi_block")], dist = dist_vel)
head(res, 10L)
#    geo_block ansi_block                city city_nearest   distance
# 1          3          1         grand forks         <NA>         NA
# 2          4          1         littlestown  martinsburg  122718.95
# 3          4          1         martinsburg  littlestown  122718.95
# 4          4          1            mitchell  martinsburg 1671365.58
# 5          5          1            bayfield         <NA>         NA
# 6          1          2 california junction         gray   89609.71
# 7          1          2       pleasantville middle amana  122974.32
# 8          1          2        willacoochee        meigs  104116.21
# 9          1          2           effingham    hindsboro   72160.43
# 10         1          2               heath       dawson  198051.76

此排序显示 find_nearest 对仅包含一个城市的组返回 NA。如果我们打印整个 tibble,也会在 tidyverse 结果中看到这些 NA。

FWIW,这里是在 geosphere 中实现的用于计算测地线距离的各种函数的基准,不谈准确性,尽管您可以从结果中推测 Vincenty 椭球距离切角最少(哈哈)...

# Benchmark find_nearest() under the various geodesic distance functions
# implemented in geosphere; accuracy differs between them.
fn <- function(dist) find_nearest(dd, dist = dist)

library("geosphere")
dist_geo <- function(x) distm(x, fun = distGeo)
dist_cos <- function(x) distm(x, fun = distCosine)
dist_hav <- function(x) distm(x, fun = distHaversine)
dist_vsp <- function(x) distm(x, fun = distVincentySphere)
dist_vel <- function(x) distm(x, fun = distVincentyEllipsoid)
dist_mee <- function(x) distm(x, fun = distMeeus)

microbenchmark::microbenchmark(
  fn(dist_geo),
  fn(dist_cos),
  fn(dist_hav),
  fn(dist_vsp),
  fn(dist_vel),
  fn(dist_mee),
  times = 1000L
)
# Unit: milliseconds
#          expr        min         lq       mean     median         uq       max neval
#  fn(dist_geo)   6.143276   6.291737   6.718329   6.362257   6.459345  45.91131  1000
#  fn(dist_cos)   4.239236   4.399977   4.918079   4.461804   4.572033  45.70233  1000
#  fn(dist_hav)   4.005331   4.156067   4.641016   4.210721   4.307542  41.91619  1000
#  fn(dist_vsp)   3.827227   3.979829   4.446428   4.033621   4.123924  44.29160  1000
#  fn(dist_vel) 129.712069 132.549638 135.006170 133.935479 135.248135 174.88874  1000
#  fn(dist_mee)   3.716814   3.830999   4.234231   3.883582   3.962712  42.12947  1000

Imap::gdist 似乎是 Vincenty 椭球距离的纯 R 实现。这可能是它运行缓慢的原因...

一些最后的评论:

  • find_nearestdist 参数不需要建立在 geosphere 距离之一上。您可以传递满足我上面所述约束的任何函数。
  • find_nearest 可以兼容返回 class "dist" 对象的函数 dist。这些对象仅存储 n-by-n 对称距离矩阵的 n*(n-1)/2 个下三角元素(参见 ?dist)。这将改进对大型 n 的支持,并使 find_nearest 的内存效率更高(例如评论中 @dww 建议的节省内存的 Haversine 距离实现)。只需要想清楚如何转换像 apply(D, 2L, which.min) 这样的操作。[更新:我已经在下面的另一个回答中对 find_nearest 实现了这一更改,并在其中提供了新的基准。]

您的问题与另一个已有的问题非常相似。一种选择是使用 spatialrisk::haversine():


library(spatialrisk)
library(data.table)
library(optiRum)
library(sf)
#> Linking to GEOS 3.9.1, GDAL 3.2.3, PROJ 7.2.1
library(s2)

# Create data.table
# NOTE(review): here x is LATITUDE and y is LONGITUDE -- the reverse of the
# usual (x = lon, y = lat) convention; downstream calls must match this.
s_dt = data.table::data.table(city = somewhere$city,
                              x = somewhere$latitude, 
                              y = somewhere$longitude)

# Cross join two data tables
coordinates_dt <- optiRum::CJ.dt(s_dt, s_dt)
# Add the pairwise distance by reference (:=), then for each city keep the
# non-self pair (dist_m > 0) with the minimum distance.
distance_m <- coordinates_dt[, dist_m := spatialrisk::haversine(y, x, i.y, i.x)][
  dist_m > 0, .SD[which.min(dist_m)], by = .(city, x, y)]
head(distance_m)
#>                   city        x          y               i.city      i.x
#> 1:  warminster heights 40.18837  -75.08409               shiloh 39.46242
#> 2: san juan capistrano 33.50089 -117.65439 palos verdes estates 33.77427
#> 3:         littlestown 39.74517  -77.08921          lower allen 40.22675
#> 4:       port republic 39.53480  -74.47610           rio grande 39.01905
#> 5:              taylor 30.57326  -97.42712               aurora 33.05594
#> 6:             merriam 39.01761  -94.69396              leawood 38.90726
#>           i.y    dist_m
#> 1:  -75.29244 31059.909
#> 2: -118.42575 87051.444
#> 3:  -76.90277 24005.689
#> 4:  -74.87787 47227.822
#> 5:  -97.50961 37074.461
#> 6:  -94.62524  7714.126

您还可以使用精彩的 sf 包。例如 sf::st_nearest_feature() 给出:

# Build sf point geometry and find each city's nearest other city.
# NOTE(review): coords = c("x", "y") passes (latitude, longitude) given how
# s_dt was built above -- confirm sf/s2 receive the intended axis order.
s_sf <- sf::st_as_sf(s_dt, coords = c("x", "y"))
cities <- sf::st_as_sfc(s2::as_s2_geography(s_sf))
s_dt$city_nearest <-  s_dt$city[sf::st_nearest_feature(cities)]

比较两种方法的速度:

# Compare data.table cross-join (method_1) with sf nearest-feature (method_2).
# NOTE(review): method_1 re-runs := on coordinates_dt, which mutates the
# table in place on every benchmark iteration.
method_1 <- function(){
  coordinates_dt[, dist_m := spatialrisk::haversine(y, x, i.y, i.x)][
    dist_m > 0, .SD[which.min(dist_m)], by = .(city, x, y)]
}

method_2 <- function(){
  s_dt$city[sf::st_nearest_feature(cities)]
}

microbenchmark::microbenchmark(
  method_1(), 
  method_2(), 
  times = 100
)
#> Unit: milliseconds
#>        expr        min         lq       mean     median         uq       max
#>  method_1()   5.385391   5.652444   6.234329   5.772923   6.003445  11.60981
#>  method_2() 182.730850 188.408202 203.348667 199.049937 211.682795 303.14904
#>  neval
#>    100
#>    100

reprex package (v2.0.1)

于 2021-11-14 创建

根据我在上一个回答末尾留下的评论,我想分享一个更强大的 find_nearest 版本,它也接受返回 class "dist" 对象的函数 dist。这些对象是双精度向量,仅存储 n-by-n 距离矩阵的 n*(n-1)/2 个非冗余元素(参见 ?dist)。当我们使用 "dist" 对象而不是矩阵时,在不达到内存限制的情况下一次可以处理的经纬度对的最大数量从 N 增加到大约 sqrt(2)*N。

如果你跳到最后,你会发现将 find_nearest2 与 @dww 的快速 calc_dist 函数(见下文)结合使用——它返回 "dist" 对象——允许我在不到两分钟的时间内处理 n = 40000 个未分组的经纬度对。

find_nearest2 依赖于一个为 class "dist" 定义的子集方法:它像操作相应的矩阵那样操作 "dist" 对象 x,而不实际构造那些矩阵。更准确地说,x[i, j] 应该返回 as.matrix(x)[i, j] 的值而不构造 as.matrix(x)。幸运的是,为了我们的目的,这种优化只在两种特殊情况下是必要的:(i)列提取和(ii)使用矩阵索引的元素提取。

## S3 subset method for "dist" objects that answers two subset patterns
## (single-column extraction and matrix-index extraction) directly from the
## packed lower-triangular vector, never materializing the full n-by-n
## matrix. Any other subset falls through to as.matrix() coercion.
## The packed layout is column-major over the strict lower triangle:
## entry (r, c) with r > c lives at (c-1)*(n - c/2 - ...) + (r - c); see the
## index conversions below.
`[.dist` <- function(x, i, j, drop = TRUE) {
  class(x) <- NULL
  p <- length(x)
  ## Recover n from the packed length by inverting p = n * (n - 1) / 2.
  n <- as.integer(round(0.5 * (1 + sqrt(1 + 8 * p)))) # p = n * (n - 1) / 2
  
  ## Column extraction
  if (missing(i) && !missing(j) && is.integer(j) && length(j) == 1L && !is.na(j) && j >= 1L && j <= n) {
    if (j == 1L) {
      return(c(0, x[seq_len(n - 1L)]))
    }
    ## Column j of the symmetric matrix is assembled from (j, 1:(j-1))
    ## (row j of the lower triangle) plus (j:(n-1)+1, j) (below-diagonal).
    ## Convert 2-ary index of 'D' to 1-ary index of 'D[lower.tri(D)]'
    ii <- rep.int(j - 1L, j - 1L)
    jj <- 1L:(j - 1L)
    if (j < n) {
      ii <- c(ii, j:(n - 1L))
      jj <- c(jj, rep.int(j, n - j))
    }
    kk <- ii + round(0.5 * (2L * (n - 1L) - jj) * (jj - 1L))
    ## Extract
    res <- double(n)
    res[-j] <- x[kk]
    nms <- attr(x, "Labels")
    if (drop) {
      names(res) <- nms
    } else {
      dim(res) <- c(n, 1L)
      dimnames(res) <- list(nms, nms[j])
    }
    return(res)
  }
  
  ## Element extraction with matrix indices
  if (missing(j) && !missing(i) && is.matrix(i) && dim(i)[2L] == 2L && is.integer(i) && !anyNA(i) && all(i >= 1L & i <= n)) {
    m <- dim(i)[1L]
    ## Subset off-diagonal entries
    d <- i[, 1L] == i[, 2L]
    i <- i[!d, , drop = FALSE]
    ## Transpose upper triangular entries
    u <- i[, 2L] > i[, 1L]
    i[u, 1:2] <- i[u, 2:1]
    ## Convert 2-ary index of 'D' to 1-ary index of 'D[lower.tri(D)]'
    ii <- i[, 1L] - 1L
    jj <- i[, 2L]
    kk <- ii + (jj > 1L) * round(0.5 * (2L * (n - 1L) - jj) * (jj - 1L))
    ## Extract
    ## Diagonal entries (d) are zero by definition and were never stored.
    res <- double(m)
    res[!d] <- x[kk] 
    return(res)
  }

  ## Fall back on coercion for any other subset operation
  as.matrix(x)[i, j, drop = drop]
}

这是一个简单的测试。

# Sanity check: the custom `[.dist` method must agree with full-matrix
# subsetting for (i) column extraction and (ii) matrix-index extraction.
n <- 6L
do <- dist(seq_len(n))
dm <- unname(as.matrix(do))
# Use n rather than the original hard-coded 6L so the check follows the
# size chosen above.
ij <- cbind(sample(n), sample(n))
identical(do[, 4L], dm[, 4L]) # TRUE
identical(do[ij], dm[ij]) # TRUE

这里是 find_nearest2。请注意子集运算符 [ 应用于对象 D 的几处实例。得益于我们子集方法的优化,其中没有一处涉及从 "dist" 到 "matrix" 的强制转换。

find_nearest2 <- function(data, dist, coordvar, idvar) {
  ## Like find_nearest, but `dist` may return either a matrix or a "dist"
  ## object; columns are pulled one at a time so a "dist" object (with the
  ## optimized `[.dist` method in scope) is never expanded to a full matrix.
  ##
  ## data:     data frame with coordinate columns `coordvar` and id
  ##           columns `idvar`.
  ## dist:     function mapping the coordinate columns to an n-by-n
  ##           distance structure ("dist" or matrix).
  ## Returns non-coordinate columns, "<idvar>_nearest", and `distance`
  ## (NA when fewer than two rows).
  coord_cols <- match(coordvar, names(data), 0L)
  n <- nrow(data)
  if (n < 2L) {
    ## Length-n_pts NA results via the NA_integer_[0 or 1] idiom.
    argmin <- NA_integer_[n]
    distance <- NA_real_[n]
  } else {
    D <- dist(data[coord_cols])
    argmin <- integer(n)
    index <- seq_len(n)
    for (j in index) {
      column <- D[, j]  # one column at a time; no full-matrix coercion
      column[j] <- Inf  # exclude the point itself
      argmin[j] <- which.min(column)
    }
    distance <- D[cbind(argmin, index)]
  }
  ## Assemble: focal point data, nearest neighbour id, distance.
  r1 <- data[-coord_cols]
  r2 <- data[argmin, idvar, drop = FALSE]
  names(r2) <- paste0(idvar, "_nearest")
  data.frame(r1, r2, distance, row.names = NULL, stringsAsFactors = FALSE)
}

让我们获取@dww 的 C++ 代码,以便 calc_dist 在我们的 R 会话中可用。

# C++ (Rcpp) source for @dww's calc_dist(): a Haversine implementation that
# fills only the n*(n-1)/2 lower-triangular distances, in the same packed
# order a "dist" object uses. The string below is compiled verbatim by
# Rcpp::sourceCpp(); it is runtime data, so do not edit it casually.
code <- '#include <Rcpp.h>
using namespace Rcpp;

double distanceHaversine(double latf, double lonf, double latt, double lont, double tolerance) {
  double d;
  double dlat = latt - latf;
  double dlon =  lont - lonf;
  d = (sin(dlat * 0.5) * sin(dlat * 0.5)) + (cos(latf) * cos(latt)) * (sin(dlon * 0.5) * sin(dlon * 0.5));
  if(d > 1 && d <= tolerance){
    d = 1;
  }
  return 2 * atan2(sqrt(d), sqrt(1 - d)) * 6378137.0;
}

double toRadians(double deg){
  return deg * 0.01745329251;  // PI / 180;
}

// [[Rcpp::export]]
NumericVector calc_dist(Rcpp::NumericVector lat, 
                        Rcpp::NumericVector lon, 
                        double tolerance = 10000000000.0) {
  std::size_t nlat = lat.size();
  std::size_t nlon = lon.size();
  if (nlat != nlon) throw std::range_error("lat and lon different lengths");
  if (nlat < 2) throw std::range_error("Need at least 2 points");
  std::size_t size = nlat * (nlat - 1) / 2;
  NumericVector ans(size);
  std::size_t k = 0;
  double latf;
  double latt;
  double lonf;
  double lont;
  
  for (std::size_t j = 0; j < (nlat-1); j++) {
    for (std::size_t i = j + 1; i < nlat; i++) {
      latf = toRadians(lat[i]);
      lonf = toRadians(lon[i]);
      latt = toRadians(lat[j]);
      lont = toRadians(lon[j]);
      ans[k++] = distanceHaversine(latf, lonf, latt, lont, tolerance);
    }
  }
  
  return ans;
}
'
# Compile the snippet and expose calc_dist() in the current R session.
Rcpp::sourceCpp(code = code)

让我们比较一下 find_nearest2 的性能,因为 n 等于 100、20000 和 40000,并且对于两个函数 dist,其中一个使用 geosphere::distm 来构造一个全距离矩阵,另一个使用@dww 的 calc_dist 构造一个 "dist" 对象。这两个函数都计算 Haversine 距离。

rx <- function(n) {
  ## Synthetic benchmark data: n points with an integer id and
  ## standard-normal lon/lat coordinates.
  ids <- seq_len(n)
  data.frame(id = ids, lon = rnorm(n), lat = rnorm(n))
}
dist_hav <- function(x) {
  ## Full n-by-n Haversine distance matrix via geosphere; memory use is
  ## quadratic in nrow(x), unlike the packed dist_dww alternative below.
  geosphere::distm(x, fun = geosphere::distHaversine)
}
dist_dww <- function(x) {
  ## Wrap calc_dist()'s packed lower-triangular vector as a "dist" object,
  ## so find_nearest2 can consume it without forming the full matrix.
  ## Column 2 of x is latitude and column 1 is longitude, matching rx().
  structure(
    calc_dist(x[, 2L], x[, 1L]),
    class = "dist",
    Size = nrow(x),
    Diag = FALSE,
    Upper = FALSE,
    call = match.call()
  )
}

这是基准:

fn2 <- function(data, dist) {
  ## Benchmark helper: find_nearest2 with the column names rx() produces.
  find_nearest2(data, dist = dist, coordvar = c("lon", "lat"), idvar = "id")
}

# Benchmarks at n = 100, 20000 and 40000: packed "dist" storage (dist_dww)
# versus the full matrix (dist_hav). At n = 40000 the full matrix no
# longer fits in memory, so only dist_dww is timed.
x1 <- rx(100L)
microbenchmark::microbenchmark(
  fn2(x1, dist_hav), 
  fn2(x1, dist_dww), 
  times = 1000L
)
# Unit: microseconds
#               expr      min       lq     mean   median       uq       max neval
#  fn2(x1, dist_hav) 3768.310 3886.452 4680.300 3977.492 4131.796 34461.361  1000
#  fn2(x1, dist_dww)  930.044  992.241 1128.272 1017.005 1045.746  7006.326  1000

x2 <- rx(20000L)
microbenchmark::microbenchmark(
  fn2(x2, dist_hav),
  fn2(x2, dist_dww),
  times = 100L
)
# Unit: seconds
#               expr      min       lq     mean   median       uq      max neval
#  fn2(x2, dist_hav) 29.60596 30.04249 30.29052 30.14016 30.45054 31.53976   100
#  fn2(x2, dist_dww) 18.71327 19.01204 19.12311 19.09058 19.26680 19.62273   100

x3 <- rx(40000L)
microbenchmark::microbenchmark(
  # fn2(x3, dist_hav), # runs out of memory
  fn2(x3, dist_dww),
  times = 10L
)
# Unit: seconds
#               expr      min      lq     mean  median       uq      max neval
#  fn2(x3, dist_dww) 104.8912 105.762 109.1512 109.653 112.2543 112.9265    10

因此,至少在我的机器上,如果应用于完整的未分组数据集(n 大约为 32000),find_nearest2(dist = dist_dww) 将在不到两分钟的时间内完成,而不会耗尽内存。