如何计算多边形之间的所有成对相互作用以及 R 中 sf 的百分比覆盖率?

How to compute all pairwise interaction between polygons and the the percentage coverage in R with sf?

我有多个多边形,我想在其中计算它们之间的重叠百分比。这个想法是,当一个多边形与另一个多边形相交时,可以在一个多边形或另一个多边形的角度(即最大值)上计算百分比。因此,我想制作一个脚本,生成多边形之间的覆盖百分比,采用一个多边形和另一个多边形的重叠百分比,并将所有结果放入数据框中。

这是我目前拥有的代码:

set.seed(131)
library(sf)
library(mapview)
m = rbind(c(0,0), c(1,0), c(1,1), c(0,1), c(0,0))
p = st_polygon(list(m))
n = 5
l = vector("list", n)
for (i in 1:n)
  l[[i]] = p + 2 * runif(2)
s = st_sfc(l)
s.f = st_sf(s)
s.f$id = c(1,1,2,2,3)
s.f.2 = s.f %>% group_by(id) %>%   summarise(geometry = sf::st_union(s)) # %>% summarise(area = st_area(s))

s.f.2$area = st_area(s.f.2)

i = s.f.2 %>% 
  st_intersection(.) %>% 
  mutate(intersect_area = st_area(.)) #%>%

st_intersection(s.f.2) %>% 
  mutate(intersect_area = st_area(.),
         # id.int = sapply(i$origins, function(x) paste0(as.character(hr.pol$id)[x], collapse = ", ")),
         id1 = sapply(i$origins, function(x) paste0(as.character(s.f.2$id)[x][1])),
         id2 = sapply(i$origins, function(x) paste0(as.character(s.f.2$id)[x][2])),
         area.id1 = sapply(i$origins, function(x) s.f.2$area[x][1]),
         area.id2 = sapply(i$origins, function(x) s.f.2$area[x][2]),
         perc1 = as.vector(intersect_area/area.id1),
         perc2 = as.vector(intersect_area/area.id2)) %>%   # create new column with shape area
  filter(n.overlaps ==2) %>% 
  dplyr::select(id, intersect_area, id1, id2, 
                # id.int, 
                perc1,perc2) %>%   # only select columns needed to merge
  st_drop_geometry() %>%  # drop geometry as we don't need it
  select(-id) %>% 
  pivot_longer(#names_prefix = "id", 
    names_to = "perc",
    cols = starts_with("perc"))

这段代码给出了多边形之间重叠的百分比(我只针对 2 次重叠进行计算,但如果这可以推广到超过 1 次重叠就更好了!)

mapview(s.f.2,zcol = "id")

最后,我要找的是这样的:

id   `1`   `2`   `3`
1     100   31.6  0
2     27.0  100   0
3     0     0     100

所以多边形“1”覆盖多边形“2”面积的 31.6%,多边形“2”覆盖多边形“1”面积的 27.0%。

我现在拥有的是(但是非常慢):

data.sp = s.f.2 %>%  
  st_as_sf(.) %>%
  mutate(area.m =  st_area(geometry),
         area.ha = units::set_units(area.m, ha)) %>%
  select(-c(area,area.m))

id.sort = sort(unique(data.sp$id)) # used to reorder columns based on ID

df.fill =data.frame(id1 = NULL, id2=NULL, area =NULL, over1 = NULL, over2 = NULL)

for (k in 1:length(id.sort)) {
  for (op in 1:length(id.sort)) {
    int.out = st_intersection(data.sp[data.sp$id==id.sort[k],], 
                              data.sp[data.sp$id==id.sort[op],])
    # int.out
    if(nrow(int.out) != 0) {
      area.tmp = st_area(int.out)#/10000
      over1 = area.tmp/int.out$area.ha
      over2 = area.tmp/int.out$area.ha.1
    } else {area.tmp = 0;over1=0;over2=0}
    
    df.fill.tmp = data.frame(id1 = id.sort[k], id2=id.sort[op], 
                             area = area.tmp,
                             over1 = over1*100,
                             over2 = over2*100)
    df.fill = rbind(df.fill,df.fill.tmp)
  }
}
df.fill$over1 = as.numeric(df.fill$over1)
df.fill$over2 = as.numeric(df.fill$over2)
df.fill %>% 
  select(-c(area, over2)) %>% 
  pivot_wider(names_from = id2,values_from = over1, 
              values_fill = 0)

简单的问题,但不是显而易见的答案!我建议的解决方案遵循与您的略有不同的策略,并且不涉及任何 for 循环。首先,我开发了一个函数(即 area_cover_()),该函数仅使用至少具有一个交点的那些多边形生成十字 table。然后,在第二步中,我开发了另一个函数(即 add_isolated_poly()),它在第一步生成的交叉 table 的末端添加没有交点的多边形。如果您有许多没有相交的多边形,这会使最终的 table 更容易阅读。所以,请找到下面的代表。

注意:reprex 的输入数据对应于您的 sf 对象 s.f.2area

Reprex

1.第一步:创建一个十字table,只包括至少有一个交点的多边形(不包括没有交点的多边形使得十字table的读取效率更高)。为此,我开发了函数 area_cover()

  • area_cover() 函数的代码
library(sf)
library(dplyr)
library(tidyr)


area_cover <- function(x) {
  x %>% 
  st_intersection() %>% 
  filter(n.overlaps>1) %>%
  mutate(area_inter = st_area(.)) %>%  
  unnest(., cols = c(origins, geometry)) %>% 
  left_join(., as.data.frame(x), by = c("origins" = "id")) %>% 
  mutate(cover_percent = area_inter/area.y*100) %>% 
  select(.,origins, area.y, area_inter, cover_percent) %>% 
  rename("id" = "origins", "area" = "area.y") %>% 
  st_drop_geometry() %>%  
  group_by(area_inter) %>% 
  mutate(poly_X_id = rev(id)) %>% 
  relocate(poly_X_id, .before = area_inter) %>% 
  xtabs(cover_percent ~ id + poly_X_id, data = .) %>%  
  replace(.== 0, 100) %>% 
  round(., digits = 1)
  }
  • area_cover() 函数的输出
Results <- area_cover(s.f.2)

Results      # Only polygons with at least one intersection are present in this cross table
#>    poly_X_id
#> id      1     2
#>   1 100.0  31.6
#>   2  27.0 100.0  

class(Results) 
#> [1] "xtabs" "table"  # you can convert 'Results' into matrix with 'as.matrix()' if needed.

2。第二步(可选): 在交叉 table 'Results' 的末端添加孤立的多边形(即没有交点)(即上一步的结果)。为此,我开发了函数 add_isolated_poly(),它创建了一个数据框,其中 n 列对应于孤立多边形的 n 个 id,并填充了 0s

  • add_isolated_poly() 函数的代码
add_isolated_poly <- function(y, z){ # 'y arg.' = s.f.2 and 'z arg.' = result of the function area_cover()

id_isolated_poly <- setdiff(y$id, colnames(z))

df_isolated_poly <- y %>% 
  filter(.,id %in% id_isolated_poly) %>% 
  st_drop_geometry() %>% 
  select(., id) %>% 
  t() %>% 
  as.data.frame() %>% 
  `colnames<-`(., id_isolated_poly) %>% 
  rbind(.,rep(list(rep(0, nrow(y))), length(id_isolated_poly))) %>% 
  slice(-c(1))

cbind.fill <- function(...){
  nm <- list(...)
  nm <- lapply(nm, as.matrix)
  n <- max(sapply(nm, nrow))
  do.call(cbind, lapply(nm, function (x)
    rbind(x, matrix(0, n-nrow(x), ncol(x)))))
}

Results %>% 
  cbind.fill(., df_isolated_poly) %>%  
  replace(., col(.) == row(.), 100) %>%  
  `rownames<-`(., c(colnames(z), id_isolated_poly))
}
  • add_isolated_poly() 函数的输出
Results_2 <- add_isolated_poly(s.f.2, Results)

Results_2
#>     1     2   3
#> 1 100  31.6   0
#> 2  27 100.0   0
#> 3   0   0.0 100

class(Results_2)
#> [1] "matrix" "array

reprex package (v2.0.1)

于 2021-11-19 创建

重要编辑

虽然他们用建议的最小示例产生了正确的结果,但我上面提出的两个函数不可推广并且产生错误的结果。经过大量的反复试验,这里有一个简单、非常快速的函数……而且,这次,对了!!!所以,请找到下面的代表。

解决方案

  • area_cover() 函数的代码
library(sf)
library(dplyr)

area_cover <- function(x){
  Results <- x %>% 
    st_intersection(.,.) %>% 
    mutate(area_inter = st_area(.),
           cover = area_inter/area.1*100) %>% 
    st_drop_geometry() %>% 
    xtabs(cover ~ id.1 + id, data = ., sparse = TRUE) %>%  
    round(., digits = 1) %>% 
    as.matrix(.) 
  
  names(dimnames(Results)) <- NULL
  
  return(Results)
}
  • area_cover() 函数的输出
area_cover(s.f.2)

#>     1     2   3
#> 1 100  31.6   0
#> 2  27 100.0   0
#> 3   0   0.0 100

reprex package (v2.0.1)

创建于 2021-11-22

基准测试

我根据@M 提供的新数据集,将我的函数与@wibeasley 的经过验证的解决方案进行了比较。 Beausoleil(见下面的评论)。

为了使比较有效,我稍微修改了@wibeasley 的函数,以便输出是一个包含百分比的矩阵(即与我的函数相同的输出)

  • 数据
set.seed(1234)
m = rbind(c(0,0), c(1,0), c(1,1), c(0,1), c(0,0))
p = st_polygon(list(m))
n = 100
l = vector("list", n)
for (i in 1:n)
  l[[i]] = p + 7 * runif(2)
s = st_sfc(l)
s.f = st_sf(s)
s.f$id = c(1,1,2,2,3,4,5,5,6,7,7,8,9,8,7,3,4,5,5,6)
s.f.2 = s.f %>% group_by(id) %>%   summarise(geometry = sf::st_union(s)) # %>% summarise(area = st_area(s))
s.f.2$area = st_area(s.f.2)
  • 比较两个解决方案的代码
library(sf)
library(dplyr)
library(tidyr)
library(bench)

bench_results <- bench::mark(
  "wibeasley_validated_answer" = {  #!!!!!  NB: lighlty modified function for the comparison to be valid!!!!!
    wibeasley <- sf::st_intersection(s.f.2, s.f.2) %>% 
      dplyr::mutate(
        area       = sf::st_area(.),
        proportion = area / area.1 * 100
      ) %>%
      tibble::as_tibble() %>%
      dplyr::select(
        id_1 = id,
        id_2 = id.1,
        proportion,
      ) %>% 
      # tidyr::complete(id_1, id_2, fill = list(proportion = 0))
      tidyr::pivot_wider(
        names_from = id_1,
        values_from = proportion,
        values_fill = 0
      ) %>% 
      as.matrix(.,rownames.force = TRUE) %>% 
      `<-`(., .[,-c(1)]) %>%  
      round(.,1)
    },  
  "lovalery_answer" = {
    lovalery <- area_cover(s.f.2)
  },
  min_iterations = 1000,
  relative = TRUE, 
  check = TRUE)
  • 基准测试结果(相对值)
bench_results
#> # A tibble: 2 x 6
#>   expression                   min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>                 <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
#> 1 wibeasley_validated_answer  1.13   1.10      1          1       1.16
#> 2 lovalery_answer             1      1         1.10      24.1     1
  • 基准测试结果(绝对值)
bench_results
#> # A tibble: 2 x 6
#>   expression                      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>                 <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 wibeasley_validated_answer   46.4ms   49.1ms      20.2    2.54MB    1.11 
#> 2 lovalery_answer              41.5ms   43.7ms      22.7   61.09MB    0.969
  • 最终检查两个函数是否产生相同的结果
all.equal(wibeasley, lovalery)
#> [1] TRUE

wibeasley
#>       1     2     3     4     5     6     7     8     9
#> 1 100.0  19.0  13.3   4.6  22.6  21.2  12.8   3.7  11.6
#> 2  18.3 100.0  28.7  31.9  33.0  14.3  32.2  25.1   5.1
#> 3  14.6  32.7 100.0  23.3  35.5   7.2  28.8   7.0  13.7
#> 4   5.1  36.4  23.3 100.0  20.3  23.7  26.2  26.8  14.7
#> 5  13.0  19.8  18.7  10.7 100.0  10.3  27.0  15.6  12.4
#> 6  21.3  15.0   6.6  21.8  18.0 100.0  26.9  24.4  15.8
#> 7   9.5  24.8  19.5  17.7  34.7  19.9 100.0  11.8  17.9
#> 8   3.8  26.8   6.6  25.1  27.7  24.9  16.3 100.0   1.3
#> 9  22.1  10.0  23.7  25.5  41.0  30.0  46.0   2.3 100.0

lovalery
#>       1     2     3     4     5     6     7     8     9
#> 1 100.0  19.0  13.3   4.6  22.6  21.2  12.8   3.7  11.6
#> 2  18.3 100.0  28.7  31.9  33.0  14.3  32.2  25.1   5.1
#> 3  14.6  32.7 100.0  23.3  35.5   7.2  28.8   7.0  13.7
#> 4   5.1  36.4  23.3 100.0  20.3  23.7  26.2  26.8  14.7
#> 5  13.0  19.8  18.7  10.7 100.0  10.3  27.0  15.6  12.4
#> 6  21.3  15.0   6.6  21.8  18.0 100.0  26.9  24.4  15.8
#> 7   9.5  24.8  19.5  17.7  34.7  19.9 100.0  11.8  17.9
#> 8   3.8  26.8   6.6  25.1  27.7  24.9  16.3 100.0   1.3
#> 9  22.1  10.0  23.7  25.5  41.0  30.0  46.0   2.3 100.0

reprex package (v2.0.1)

创建于 2021-11-22

有几项我要优化。我无法真正评估只有三个多边形的优化,我猜相交计算是唯一真正昂贵的部分,所以我会从那里开始。

如果您不计算 (1) 多边形 A 和 B,然后 (2) 多边形 B 和 A,您将立即减少约 50%。从某种意义上说,只计算 upper triangle,reuse/reflect 三角形的值。


# I think you wanted to create 5 empty columns.  Use `numeric(0)` instead of `NULL`
df.fill=data.frame(id1=numeric(0), id2=numeric(0), area=numeric(0), over1=numeric(0), over2=numeric(0))

for (k in seq_along(id.sort)) {
  for (op in seq(from = k, to = length(id.sort), by = 1)) { # Avoid the lower triangle
    int.out = st_intersection(
      data.sp[data.sp$id==id.sort[k],], 
      data.sp[data.sp$id==id.sort[op],]
    )
    
    if(nrow(int.out) != 0) {
      area.tmp = st_area(int.out)#/10000
      over1 = area.tmp/int.out$area.ha
      over2 = area.tmp/int.out$area.ha.1 
    } else {area.tmp = 0;over1=0;over2=0}
    
    df.fill.tmp.upper = data.frame(id1 = id.sort[k], id2=id.sort[op], 
                                   area = area.tmp,
                                   over1 = over1,
                                   over2 = over2)
    df.fill.tmp.lower = data.frame(id1 = id.sort[op], id2=id.sort[k], 
                                   area = area.tmp,
                                   over1 = over2,
                                   over2 = over1)
    df.fill <- 
      if (k == op) rbind(df.fill, df.fill.tmp.upper)
      else         rbind(df.fill, df.fill.tmp.upper, df.fill.tmp.lower)
  }
}
df.fill %>% 
  dplyr::mutate(
    over1 = as.numeric(over1) * 100
    over2 = as.numeric(over2) * 100
  ) %>%
  select(-area, -over2) %>% 
  pivot_wider(
    names_from = id2,
    values_from = over1, 
    values_fill = 0
  )

输出:

# A tibble: 3 x 4
    id1   `1`   `2`   `3`
  <dbl> <dbl> <dbl> <dbl>
1     1 100    31.6     0
2     2  27.0 100       0
3     3   0     0     100

没有真实的基准测试示例,我不确定它是否比您的解决方案更快。但它更简单易懂(至少对我的大脑而言)。

sf::st_intersection() 被矢量化。所以它会为你找到 & return 第一个和第二个参数的所有交集。在这种情况下,两个参数是同一组多边形。

sf::st_intersection(s.f.2, s.f.2) %>% 
  dplyr::mutate(
    area       = sf::st_area(.),
    proportion = area / area.1
  ) %>%
  tibble::as_tibble() %>%
  dplyr::select(
    id_1 = id,
    id_2 = id.1,
    proportion,
  ) %>% 
  # tidyr::complete(id_1, id_2, fill = list(proportion = 0))
  tidyr::pivot_wider(
    names_from = id_1,
    values_from = proportion,
    values_fill = 0
  )

输出:

# A tibble: 3 x 4
   id_2   `1`   `2`   `3`
  <dbl> <dbl> <dbl> <dbl>
1     1 1     0.316     0
2     2 0.270 1         0
3     3 0     0         1

需要考虑的事项:

  • 保持区域的比例,而不是百分比。一般以后再计算比较好。
  • 保持多头,不要转向。通常以后计算更好,因为您可以加入 id_1id_2.
  • 如果你转向宽,你可能希望它作为矩阵,而不是 data.frame。

如果相交计算与其余代码相比真的很昂贵,则此解决方案可能比我的第二个解决方案更快。与@lovalery 的解决方案一样,只有一个 data.frame 被传递给 sf:st_intersection()。根据我的解释,主要区别在于此解决方案保持较长时间并使用交叉连接来枚举所有可能的组合,并在最后一步转向广泛。当事情很长而不是很宽时,(对我来说)更容易操纵。

intersections <-
  s.f.2 %>% 
  sf::st_intersection() %>% 
  dplyr::filter(2L <= n.overlaps) %>%
  dplyr::mutate(
    numerator = sf::st_area(.)
  ) %>%
  tibble::as_tibble() %>%
  tidyr::unnest_wider(origins) %>%
  dplyr::select(
    id_1 = `...1`, 
    id_2 = `...2`, 
    numerator
  )

denominators <- 
  s.f.2 %>% 
  tibble::as_tibble() %>% 
  dplyr::select(id, area) 
  
denominators %>% 
  dplyr::full_join(denominators, by = character()) %>% 
  dplyr::select(
    id_1 = id.x,
    id_2 = id.y,
    area = area.y
  ) %>% 
  dplyr::left_join(intersections, by = c("id_1" = "id_1",  "id_2" = "id_2")) %>%
  dplyr::left_join(intersections, by = c("id_1" = "id_2",  "id_2" = "id_1")) %>% 
  dplyr::mutate(
    numerator   = dplyr::coalesce(numerator.x, numerator.y, 0),
    proportion  = dplyr::if_else(id_1 == id_2, 1, numerator / area),
  ) %>% 
  dplyr::select(id_1, id_2, proportion) %>% 
  tidyr::pivot_wider(
    names_from  = id_1,
    values_from = proportion,
    values_fill = 0
  )

输出:

# A tibble: 3 × 4
   id_2   `1`   `2`   `3`
  <dbl> <dbl> <dbl> <dbl>
1     1 1     0.316     0
2     2 0.270 1         0
3     3 0     0         1