按R中最近的特征和日期空间连接两个数据框
Spatial join two data frames by nearest feature and date in R
这个问题基于 this and 话题。简而言之,这些线程中的代码尝试根据最近的特征在空间上连接两个空间数据帧。我感兴趣的额外复杂层是,除了按最近的特征加入外,还按日期加入。但是,我正在努力让代码正常工作。
以下是我尝试过(但失败)的数据框和代码:
df1 <- structure(list(lat1 = c(4417391, 4517826, 4435680, 4509372, 4449390,
4449390), long1 = c(5557780, 5358439, 5328731, 5323168, 5519670,
5519670), daydate = structure(c(16085, 16085, 16087, 16087, 16088,
16088), class = "Date")), row.names = c(NA, -6L), class = "data.frame")
df2 <- structure(list(lat2 = c(4394822, 4488830, 4417257, 4517995, 4435679
), long2 = c(5293795, 5418630, 5557927, 5358272, 5328084), daydate = structure(c(16085,
16085, 16087, 16087, 16088), class = "Date"), temp = c(6L, 26L,
13L, 30L, 8L)), row.names = c(NA, -5L), class = "data.frame")
df1
lat1 long1 daydate
1 4417391 5557780 2014-01-15
2 4517826 5358439 2014-01-15
3 4435680 5328731 2014-01-17
4 4509372 5323168 2014-01-17
5 4449390 5519670 2014-01-18
6 4449390 5519670 2014-01-18
df2
lat2 long2 daydate temp
1 4394822 5293795 2014-01-15 6
2 4488830 5418630 2014-01-15 26
3 4417257 5557927 2014-01-17 13
4 4517995 5358272 2014-01-17 30
5 4435679 5328084 2014-01-18 8
# Make df & df1 sf objects, and keep the coordinates as columns just in case.
df1 <- df1 %>% st_as_sf(coords = c("long1", "lat1"), remove = FALSE) %>%
st_set_crs(2193)
df2 <- df2 %>% st_as_sf(coords = c("long2", "lat2"), remove = FALSE) %>%
st_set_crs(2193)
# Join df with df1, based on the nearest feature:
df_near <- st_join(df1, df1, join = st_nearest_feature) %>%
group_by(daydate)
Error in `st_as_sf()`:
! Must group by variables found in `.data`.
x Column `daydate` is not found.
返回的错误 100% 有意义,因为代码是按顺序步骤编写的,但我不知道如何告诉 R 同时考虑这两个步骤。主要的 objective 是从 df2 获取临时值到 df1 的正确行。
我的实际数据中的额外信息:在我的实际 df1 中,坐标对(lat1 和 long1)以及日期可能重复。我的实际 df2 具有重复的坐标对和日期,但坐标对和日期的组合始终是唯一的,即 df2 的每一行都是唯一的。
这大概就是您要查找的内容吗?我为此使用了 {base},因为在协调两个不同表格的部分时,IMO 更容易处理......但我认为 lapply(split(df1, 1:nrow(df1), ...)
与应用 st_join()
dplyr::rowwise()
大致相同?
对于df1
中的每个唯一记录/行,计算子集中的最近特征 df2
具有 相同 daydate
:
library(sf)
#> Linking to GEOS 3.9.1, GDAL 3.2.1, PROJ 7.2.1; sf_use_s2() is TRUE
library(dplyr, warn = FALSE)
df1 <- data.frame(
lat1 = c(4417391, 4517826, 4435680, 4509372, 4449390, 4449390),
long1 = c(5557780, 5358439, 5328731, 5323168, 5519670, 5519670),
daydate = structure(c(16085, 16085, 16087, 16087, 16088, 16088), class = "Date")
)
df2 <- data.frame(
lat2 = c(4394822, 4488830, 4417257, 4517995, 4435679),
long2 = c(5293795, 5418630, 5557927, 5358272, 5328084),
daydate = structure(c(16085, 16085, 16087, 16087, 16088), class = "Date"),
temp = c(6L, 26L, 13L, 30L, 8L)
)
df1 <- df1 %>%
st_as_sf(coords = c("long1", "lat1"), remove = FALSE) %>%
st_set_crs(2193)
df2 <- df2 %>%
st_as_sf(coords = c("long2", "lat2"), remove = FALSE) %>%
st_set_crs(2193)
res <- do.call('rbind', lapply(split(df1, 1:nrow(df1)), function(x) {
st_join(x, df2[df2$daydate == unique(x$daydate),], join = st_nearest_feature)
}))
res
#> Simple feature collection with 6 features and 7 fields
#> Geometry type: POINT
#> Dimension: XY
#> Bounding box: xmin: 5323168 ymin: 4417391 xmax: 5557780 ymax: 4517826
#> Projected CRS: NZGD2000 / New Zealand Transverse Mercator 2000
#> lat1 long1 daydate.x lat2 long2 daydate.y temp
#> 1 4417391 5557780 2014-01-15 4488830 5418630 2014-01-15 26
#> 2 4517826 5358439 2014-01-15 4488830 5418630 2014-01-15 26
#> 3 4435680 5328731 2014-01-17 4517995 5358272 2014-01-17 30
#> 4 4509372 5323168 2014-01-17 4517995 5358272 2014-01-17 30
#> 5 4449390 5519670 2014-01-18 4435679 5328084 2014-01-18 8
#> 6 4449390 5519670 2014-01-18 4435679 5328084 2014-01-18 8
#> geometry
#> 1 POINT (5557780 4417391)
#> 2 POINT (5358439 4517826)
#> 3 POINT (5328731 4435680)
#> 4 POINT (5323168 4509372)
#> 5 POINT (5519670 4449390)
#> 6 POINT (5519670 4449390)
plot(st_geometry(res))
plot(df2 %>%
st_as_sf(coords = c("long2", "lat2"), remove = FALSE) %>%
st_set_crs(2193) %>%
st_geometry(), add = T, pch = "*")
编辑:这里使用 {data.table}
也是一样的
df1 <- data.table(df1)
.nearest_samedate <- function(x) {
st_join(st_as_sf(x), df2[df2$daydate == unique(x$daydate),], join = st_nearest_feature)
}
res <- df1[, .nearest_samedate(.SD), by = list(1:nrow(df1))]
您(几乎)做对了所有事情。 df_near
没有可作为分组依据的名为 daydate
的列。由于 df_1
和 df_2
都有一个名为 daydate
的列,因此输出有两个名为 daydate.x
和 daydate.y
的日期列。一个来自 left-hand 侧 (df1),另一个来自 right-hand 侧 (df2)。
使用 group_by(daydate.x)
应该可以,但您可能想检查数据帧之间的日期列是否相同(或者至少与您期望的相同)。
library(sf)
library(tidyverse)
df1 <- df1 %>% st_as_sf(coords = c("long1", "lat1"), remove = FALSE) %>%
st_set_crs(2193)
df2 <- df2 %>% st_as_sf(coords = c("long2", "lat2"), remove = FALSE) %>%
st_set_crs(2193)
# Join df with df1, based on the nearest feature:
df_near <- st_join(df1, df2, join = st_nearest_feature) %>%
group_by(daydate.x)
df_near
#> Simple feature collection with 6 features and 7 fields
#> Geometry type: POINT
#> Dimension: XY
#> Bounding box: xmin: 5323168 ymin: 4417391 xmax: 5557780 ymax: 4517826
#> Projected CRS: NZGD2000 / New Zealand Transverse Mercator 2000
#> # A tibble: 6 × 8
#> # Groups: daydate.x [3]
#> lat1 long1 daydate.x lat2 long2 daydate.y temp
#> <dbl> <dbl> <date> <dbl> <dbl> <date> <int>
#> 1 4417391 5557780 2014-01-15 4417257 5557927 2014-01-17 13
#> 2 4517826 5358439 2014-01-15 4517995 5358272 2014-01-17 30
#> 3 4435680 5328731 2014-01-17 4435679 5328084 2014-01-18 8
#> 4 4509372 5323168 2014-01-17 4517995 5358272 2014-01-17 30
#> 5 4449390 5519670 2014-01-18 4417257 5557927 2014-01-17 13
#> 6 4449390 5519670 2014-01-18 4417257 5557927 2014-01-17 13
#> # … with 1 more variable: geometry <POINT [m]>
由 reprex package (v2.0.1)
于 2022-04-21 创建
这个问题基于 this and
以下是我尝试过(但失败)的数据框和代码:
df1 <- structure(list(lat1 = c(4417391, 4517826, 4435680, 4509372, 4449390,
4449390), long1 = c(5557780, 5358439, 5328731, 5323168, 5519670,
5519670), daydate = structure(c(16085, 16085, 16087, 16087, 16088,
16088), class = "Date")), row.names = c(NA, -6L), class = "data.frame")
df2 <- structure(list(lat2 = c(4394822, 4488830, 4417257, 4517995, 4435679
), long2 = c(5293795, 5418630, 5557927, 5358272, 5328084), daydate = structure(c(16085,
16085, 16087, 16087, 16088), class = "Date"), temp = c(6L, 26L,
13L, 30L, 8L)), row.names = c(NA, -5L), class = "data.frame")
df1
lat1 long1 daydate
1 4417391 5557780 2014-01-15
2 4517826 5358439 2014-01-15
3 4435680 5328731 2014-01-17
4 4509372 5323168 2014-01-17
5 4449390 5519670 2014-01-18
6 4449390 5519670 2014-01-18
df2
lat2 long2 daydate temp
1 4394822 5293795 2014-01-15 6
2 4488830 5418630 2014-01-15 26
3 4417257 5557927 2014-01-17 13
4 4517995 5358272 2014-01-17 30
5 4435679 5328084 2014-01-18 8
# Make df & df1 sf objects, and keep the coordinates as columns just in case.
df1 <- df1 %>% st_as_sf(coords = c("long1", "lat1"), remove = FALSE) %>%
st_set_crs(2193)
df2 <- df2 %>% st_as_sf(coords = c("long2", "lat2"), remove = FALSE) %>%
st_set_crs(2193)
# Join df with df1, based on the nearest feature:
df_near <- st_join(df1, df1, join = st_nearest_feature) %>%
group_by(daydate)
Error in `st_as_sf()`:
! Must group by variables found in `.data`.
x Column `daydate` is not found.
返回的错误 100% 有意义,因为代码是按顺序步骤编写的,但我不知道如何告诉 R 同时考虑这两个步骤。主要的 objective 是从 df2 获取临时值到 df1 的正确行。
我的实际数据中的额外信息:在我的实际 df1 中,坐标对(lat1 和 long1)以及日期可能重复。我的实际 df2 具有重复的坐标对和日期,但坐标对和日期的组合始终是唯一的,即 df2 的每一行都是唯一的。
这大概就是您要查找的内容吗?我为此使用了 {base},因为在协调两个不同表格的部分时,IMO 更容易处理......但我认为 lapply(split(df1, 1:nrow(df1), ...)
与应用 st_join()
dplyr::rowwise()
大致相同?
对于df1
中的每个唯一记录/行,计算子集中的最近特征 df2
具有 相同 daydate
:
library(sf)
#> Linking to GEOS 3.9.1, GDAL 3.2.1, PROJ 7.2.1; sf_use_s2() is TRUE
library(dplyr, warn = FALSE)
df1 <- data.frame(
lat1 = c(4417391, 4517826, 4435680, 4509372, 4449390, 4449390),
long1 = c(5557780, 5358439, 5328731, 5323168, 5519670, 5519670),
daydate = structure(c(16085, 16085, 16087, 16087, 16088, 16088), class = "Date")
)
df2 <- data.frame(
lat2 = c(4394822, 4488830, 4417257, 4517995, 4435679),
long2 = c(5293795, 5418630, 5557927, 5358272, 5328084),
daydate = structure(c(16085, 16085, 16087, 16087, 16088), class = "Date"),
temp = c(6L, 26L, 13L, 30L, 8L)
)
df1 <- df1 %>%
st_as_sf(coords = c("long1", "lat1"), remove = FALSE) %>%
st_set_crs(2193)
df2 <- df2 %>%
st_as_sf(coords = c("long2", "lat2"), remove = FALSE) %>%
st_set_crs(2193)
res <- do.call('rbind', lapply(split(df1, 1:nrow(df1)), function(x) {
st_join(x, df2[df2$daydate == unique(x$daydate),], join = st_nearest_feature)
}))
res
#> Simple feature collection with 6 features and 7 fields
#> Geometry type: POINT
#> Dimension: XY
#> Bounding box: xmin: 5323168 ymin: 4417391 xmax: 5557780 ymax: 4517826
#> Projected CRS: NZGD2000 / New Zealand Transverse Mercator 2000
#> lat1 long1 daydate.x lat2 long2 daydate.y temp
#> 1 4417391 5557780 2014-01-15 4488830 5418630 2014-01-15 26
#> 2 4517826 5358439 2014-01-15 4488830 5418630 2014-01-15 26
#> 3 4435680 5328731 2014-01-17 4517995 5358272 2014-01-17 30
#> 4 4509372 5323168 2014-01-17 4517995 5358272 2014-01-17 30
#> 5 4449390 5519670 2014-01-18 4435679 5328084 2014-01-18 8
#> 6 4449390 5519670 2014-01-18 4435679 5328084 2014-01-18 8
#> geometry
#> 1 POINT (5557780 4417391)
#> 2 POINT (5358439 4517826)
#> 3 POINT (5328731 4435680)
#> 4 POINT (5323168 4509372)
#> 5 POINT (5519670 4449390)
#> 6 POINT (5519670 4449390)
plot(st_geometry(res))
plot(df2 %>%
st_as_sf(coords = c("long2", "lat2"), remove = FALSE) %>%
st_set_crs(2193) %>%
st_geometry(), add = T, pch = "*")
编辑:这里使用 {data.table}
也是一样的df1 <- data.table(df1)
.nearest_samedate <- function(x) {
st_join(st_as_sf(x), df2[df2$daydate == unique(x$daydate),], join = st_nearest_feature)
}
res <- df1[, .nearest_samedate(.SD), by = list(1:nrow(df1))]
您(几乎)做对了所有事情。 df_near
没有可作为分组依据的名为 daydate
的列。由于 df_1
和 df_2
都有一个名为 daydate
的列,因此输出有两个名为 daydate.x
和 daydate.y
的日期列。一个来自 left-hand 侧 (df1),另一个来自 right-hand 侧 (df2)。
使用 group_by(daydate.x)
应该可以,但您可能想检查数据帧之间的日期列是否相同(或者至少与您期望的相同)。
library(sf)
library(tidyverse)
df1 <- df1 %>% st_as_sf(coords = c("long1", "lat1"), remove = FALSE) %>%
st_set_crs(2193)
df2 <- df2 %>% st_as_sf(coords = c("long2", "lat2"), remove = FALSE) %>%
st_set_crs(2193)
# Join df with df1, based on the nearest feature:
df_near <- st_join(df1, df2, join = st_nearest_feature) %>%
group_by(daydate.x)
df_near
#> Simple feature collection with 6 features and 7 fields
#> Geometry type: POINT
#> Dimension: XY
#> Bounding box: xmin: 5323168 ymin: 4417391 xmax: 5557780 ymax: 4517826
#> Projected CRS: NZGD2000 / New Zealand Transverse Mercator 2000
#> # A tibble: 6 × 8
#> # Groups: daydate.x [3]
#> lat1 long1 daydate.x lat2 long2 daydate.y temp
#> <dbl> <dbl> <date> <dbl> <dbl> <date> <int>
#> 1 4417391 5557780 2014-01-15 4417257 5557927 2014-01-17 13
#> 2 4517826 5358439 2014-01-15 4517995 5358272 2014-01-17 30
#> 3 4435680 5328731 2014-01-17 4435679 5328084 2014-01-18 8
#> 4 4509372 5323168 2014-01-17 4517995 5358272 2014-01-17 30
#> 5 4449390 5519670 2014-01-18 4417257 5557927 2014-01-17 13
#> 6 4449390 5519670 2014-01-18 4417257 5557927 2014-01-17 13
#> # … with 1 more variable: geometry <POINT [m]>
由 reprex package (v2.0.1)
于 2022-04-21 创建