dplyr::inner_join -- 如何包含某些观察结果,即使它们没有同时出现在两个数据框中
dplyr::inner_join -- how to include certain observations even if they don't appear in both dataframes
我有两个数据框,我想通过一个公共变量合并它们。一些观察结果出现在一个数据中,但没有出现在另一个数据中,反之亦然。合并时,我只想保留出现在两个数据框中的观察结果,因此 dplyr::inner_join()
是合适的。
但是,有一个例外:有一些观察结果无论如何我都想保留在合并后的数据中,即使它们没有同时出现在两个原始数据框中。哪些观察结果属于需要保留的“特殊”观察结果,由某一列中的特定取值来标识。
例子
我想合并以下数据框(df_population
和 df_gdp
)
1. df_population
library(tibble)
library(dplyr)
## helper function
# Generate `n` random ID-like strings: five uppercase letters, a zero-padded
# four-digit number drawn from 1..9999, and one trailing uppercase letter.
# All draws use R's RNG, so call set.seed() first for reproducible output.
myFun <- function(n = 5000) {
  # One vector of n letters per character position of the five-letter prefix.
  letter_draws <- lapply(
    seq_len(5),
    function(i) sample(LETTERS, size = n, replace = TRUE)
  )
  prefix <- do.call(paste0, letter_draws)
  digits <- sprintf("%04d", sample(9999, size = n, replace = TRUE))
  suffix <- sample(LETTERS, size = n, replace = TRUE)
  paste0(prefix, digits, suffix)
}
# Fix the RNG seed so the random strings generated below are reproducible.
set.seed(2021)

# Example population table. The last col_of_strings entry is the sentinel
# "dont_leave_me_behind", which marks brazil as a row that must be kept.
df_population <- tibble(
  country = c(
    "australia", "united_kingdom", "france",
    "spain", "canada", "brazil"
  ),
  population = c(24.99, 66.65, 66.99, 46.94, 37.59, 209.5),
  col_of_strings = c(myFun(5), "dont_leave_me_behind")
)
## # A tibble: 6 x 3
## country population col_of_strings
## <chr> <dbl> <chr>
## 1 australia 25.0 GLNWN9968R
## 2 united_kingdom 66.6 FTELH3426F
## 3 france 67.0 NFOSZ6335V
## 4 spain 46.9 ZFGRD8875F
## 5 canada 37.6 GFICE2875O
## 6 brazil 210. dont_leave_me_behind
2. df_gdp
# Example GDP-growth table. sample() shuffles the five random strings and the
# sentinel "dont_leave_me_behind", so the sentinel lands on a random row.
df_gdp <- tibble(
  country = c(
    "australia", "united_kingdom", "france",
    "spain", "canada", "greece"
  ),
  gdp_growth = c(2.9, 1.4, 1.7, 2.4, 1.9, 1.9),
  col_of_strings = sample(c(myFun(5), "dont_leave_me_behind"))
)
## # A tibble: 6 x 3
## country gdp_growth col_of_strings
## <chr> <dbl> <chr>
## 1 australia 2.9 dont_leave_me_behind
## 2 united_kingdom 1.4 RQHHI9679V
## 3 france 1.7 PFSZX1552L
## 4 spain 2.4 BQTBY7537E
## 5 canada 1.9 OECIK9698V
## 6 greece 1.9 VXDQQ4718J
我的问题
通常我会选择
# Plain inner join: keeps only the countries present in both data frames.
dplyr::inner_join(
  x = df_population,
  y = df_gdp,
  by = "country"
)
但是:
虽然我只想要两个数据框共有的国家/地区,但我仍然想包括任何具有 col_of_strings == dont_leave_me_behind
的国家/地区
我希望有一个简单的解决方案。谢谢!
想到了三种方法。
1. 按照 @M.Viking 的建议,先 full_join
再过滤。
# Approach 1: full join, then keep matched rows plus flagged unmatched rows.
# Explicit presence markers record which source frame each row came from.
# Testing is.na(col_of_strings.x / .y) instead would mistake a genuine NA in
# col_of_strings for "row absent from that frame" and drop a matched row.
dplyr::full_join(
  dplyr::mutate(df_population, .in_population = TRUE),
  dplyr::mutate(df_gdp, .in_gdp = TRUE),
  by = "country"
) %>%
  dplyr::filter(
    # A row missing from df_population survives only if df_gdp flagged it.
    col_of_strings.y %in% "dont_leave_me_behind" | !is.na(.in_population),
    # A row missing from df_gdp survives only if df_population flagged it.
    col_of_strings.x %in% "dont_leave_me_behind" | !is.na(.in_gdp)
  ) %>%
  # Drop the helper markers so the output has the same columns as before.
  dplyr::select(-.in_population, -.in_gdp)
# # A tibble: 6 x 5
# country population col_of_strings.x gdp_growth col_of_strings.y
# <chr> <dbl> <chr> <dbl> <chr>
# 1 australia 25.0 LQMPB3662R 2.9 VKBCE2969H
# 2 united_kingdom 66.6 WDXVX4684T 1.4 FMAKF4470M
# 3 france 67.0 VJHBH0078U 1.7 dont_leave_me_behind
# 4 spain 46.9 XFJPD7687T 2.4 RMPYK2467U
# 5 canada 37.6 AQRCR0724P 1.9 JXMMZ3736X
# 6 brazil 210. dont_leave_me_behind NA <NA>
2. 执行内部联接,从每个数据框中提取缺失的行,然后用 bind_rows 将它们重新加入。
由于联接后的列名带有 .x/.y 后缀,需要先对这些行中相应的列进行重命名。
# Step 1: ordinary inner join -- countries found in both data frames.
tmp1 <- df_population %>%
  dplyr::inner_join(df_gdp, by = "country")

# Step 2: flagged population rows that the inner join dropped. The column is
# renamed to the ".x" name it carries in tmp1 so bind_rows() lines it up.
missing_pop <- df_population %>%
  dplyr::filter(
    col_of_strings == "dont_leave_me_behind" & !(country %in% tmp1$country)
  ) %>%
  dplyr::rename(col_of_strings.x = col_of_strings)
missing_pop
# # A tibble: 1 x 3
#   country population col_of_strings.x
#   <chr>        <dbl> <chr>
# 1 brazil        210. dont_leave_me_behind

# Step 3: the same for df_gdp, using the ".y" name.
missing_gdp <- df_gdp %>%
  dplyr::filter(
    col_of_strings == "dont_leave_me_behind" & !(country %in% tmp1$country)
  ) %>%
  dplyr::rename(col_of_strings.y = col_of_strings)
missing_gdp
# # A tibble: 0 x 3
# # ... with 3 variables: country <chr>, gdp_growth <dbl>, col_of_strings.y <chr>

# Step 4: stack the matched rows and the rescued rows.
out <- dplyr::bind_rows(tmp1, missing_pop, missing_gdp)
out
# # A tibble: 6 x 5
# country population col_of_strings.x gdp_growth col_of_strings.y
# <chr> <dbl> <chr> <dbl> <chr>
# 1 australia 25.0 LQMPB3662R 2.9 VKBCE2969H
# 2 united_kingdom 66.6 WDXVX4684T 1.4 FMAKF4470M
# 3 france 67.0 VJHBH0078U 1.7 dont_leave_me_behind
# 4 spain 46.9 XFJPD7687T 2.4 RMPYK2467U
# 5 canada 37.6 AQRCR0724P 1.9 JXMMZ3736X
# 6 brazil 210. dont_leave_me_behind NA <NA>
3. 类似于方法 2,但使用 anti_join:
# Approach 3: inner join, then re-attach flagged rows that had no match.
# anti_join() keeps only the flagged rows whose country is absent from tmp1.
# Every dplyr verb is namespace-qualified, matching the other approaches.
tmp1 <- dplyr::inner_join(df_population, df_gdp, by = "country")
out <- dplyr::bind_rows(
  tmp1,
  # Flagged population rows dropped by the join; renamed to the ".x" column
  # name used in tmp1 so bind_rows() aligns them.
  df_population %>%
    dplyr::filter(col_of_strings == "dont_leave_me_behind") %>%
    dplyr::anti_join(tmp1, by = "country") %>%
    dplyr::rename(col_of_strings.x = col_of_strings),
  # Flagged GDP rows dropped by the join; renamed to the ".y" column name.
  df_gdp %>%
    dplyr::filter(col_of_strings == "dont_leave_me_behind") %>%
    dplyr::anti_join(tmp1, by = "country") %>%
    dplyr::rename(col_of_strings.y = col_of_strings)
)
基准测试显示,后两种方法的表现大致相同:
bench::mark(full1=..., inner2=..., inner3=...)
# # A tibble: 3 x 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory
# <bch:expr> <bch:t> <bch:t> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list>
# 1 full1 5.83ms 7.72ms 127. 1.36KB 4.32 59 2 463ms <tibb~ <Rpro~
# 2 inner2 9.54ms 11.46ms 84.9 11.28KB 2.07 41 1 483ms <tibb~ <Rpro~
# 3 inner3 13.95ms 14.92ms 62.9 11.28KB 4.49 28 2 445ms <tibb~ <Rpro~
# # ... with 2 more variables: time <list>, gc <list>
在这种情况下,full_join 的效果更好。对于更大的数据,表现可能明显不同,我还没有测试过。
我有两个数据框,我想通过一个公共变量合并它们。一些观察结果出现在一个数据中,但没有出现在另一个数据中,反之亦然。合并时,我只想保留出现在两个数据框中的观察结果,因此 dplyr::inner_join()
是合适的。
但是,有一个例外:有一些观察结果无论如何我都想保留在合并后的数据中,即使它们没有同时出现在两个原始数据框中。哪些观察结果属于需要保留的“特殊”观察结果,由某一列中的特定取值来标识。
例子
我想合并以下数据框(df_population
和 df_gdp
)
1. df_population
library(tibble)
library(dplyr)
## helper function
# Generate `n` random ID-like strings: five uppercase letters, a zero-padded
# four-digit number drawn from 1..9999, and one trailing uppercase letter.
# All draws use R's RNG, so call set.seed() first for reproducible output.
myFun <- function(n = 5000) {
  # One vector of n letters per character position of the five-letter prefix.
  letter_draws <- lapply(
    seq_len(5),
    function(i) sample(LETTERS, size = n, replace = TRUE)
  )
  prefix <- do.call(paste0, letter_draws)
  digits <- sprintf("%04d", sample(9999, size = n, replace = TRUE))
  suffix <- sample(LETTERS, size = n, replace = TRUE)
  paste0(prefix, digits, suffix)
}
# Fix the RNG seed so the random strings generated below are reproducible.
set.seed(2021)

# Example population table. The last col_of_strings entry is the sentinel
# "dont_leave_me_behind", which marks brazil as a row that must be kept.
df_population <- tibble(
  country = c(
    "australia", "united_kingdom", "france",
    "spain", "canada", "brazil"
  ),
  population = c(24.99, 66.65, 66.99, 46.94, 37.59, 209.5),
  col_of_strings = c(myFun(5), "dont_leave_me_behind")
)
## # A tibble: 6 x 3
## country population col_of_strings
## <chr> <dbl> <chr>
## 1 australia 25.0 GLNWN9968R
## 2 united_kingdom 66.6 FTELH3426F
## 3 france 67.0 NFOSZ6335V
## 4 spain 46.9 ZFGRD8875F
## 5 canada 37.6 GFICE2875O
## 6 brazil 210. dont_leave_me_behind
2. df_gdp
# Example GDP-growth table. sample() shuffles the five random strings and the
# sentinel "dont_leave_me_behind", so the sentinel lands on a random row.
df_gdp <- tibble(
  country = c(
    "australia", "united_kingdom", "france",
    "spain", "canada", "greece"
  ),
  gdp_growth = c(2.9, 1.4, 1.7, 2.4, 1.9, 1.9),
  col_of_strings = sample(c(myFun(5), "dont_leave_me_behind"))
)
## # A tibble: 6 x 3
## country gdp_growth col_of_strings
## <chr> <dbl> <chr>
## 1 australia 2.9 dont_leave_me_behind
## 2 united_kingdom 1.4 RQHHI9679V
## 3 france 1.7 PFSZX1552L
## 4 spain 2.4 BQTBY7537E
## 5 canada 1.9 OECIK9698V
## 6 greece 1.9 VXDQQ4718J
我的问题
通常我会选择
# Plain inner join: keeps only the countries present in both data frames.
dplyr::inner_join(
  x = df_population,
  y = df_gdp,
  by = "country"
)
但是:
虽然我只想要两个数据框共有的国家/地区,但我仍然想包括任何具有 col_of_strings == dont_leave_me_behind 的国家/地区。
我希望有一个简单的解决方案。谢谢!
想到了三种方法。
1. 按照 @M.Viking 的建议,先 full_join 再过滤。
dplyr::full_join(df_population, df_gdp, by = "country") %>%
  dplyr::filter(
    col_of_strings.y == "dont_leave_me_behind" | !is.na(col_of_strings.x),
    col_of_strings.x == "dont_leave_me_behind" | !is.na(col_of_strings.y)
  )
# # A tibble: 6 x 5
#   country        population col_of_strings.x     gdp_growth col_of_strings.y
#   <chr>               <dbl> <chr>                     <dbl> <chr>
# 1 australia            25.0 LQMPB3662R                  2.9 VKBCE2969H
# 2 united_kingdom       66.6 WDXVX4684T                  1.4 FMAKF4470M
# 3 france               67.0 VJHBH0078U                  1.7 dont_leave_me_behind
# 4 spain                46.9 XFJPD7687T                  2.4 RMPYK2467U
# 5 canada               37.6 AQRCR0724P                  1.9 JXMMZ3736X
# 6 brazil              210.  dont_leave_me_behind       NA   <NA>
2. 执行内部联接,从每个数据框中提取缺失的行,然后用 bind_rows 将它们重新加入。
由于联接后的列名带有 .x/.y 后缀,需要先对这些行中相应的列进行重命名。
tmp1 <- dplyr::inner_join(df_population, df_gdp, by = "country")
missing_pop <- df_population %>%
  dplyr::filter(
    col_of_strings == "dont_leave_me_behind",
    !country %in% tmp1$country
  ) %>%
  dplyr::rename(col_of_strings.x = col_of_strings)
missing_pop
# # A tibble: 1 x 3
#   country population col_of_strings.x
#   <chr>        <dbl> <chr>
# 1 brazil        210. dont_leave_me_behind
missing_gdp <- df_gdp %>%
  dplyr::filter(
    col_of_strings == "dont_leave_me_behind",
    !country %in% tmp1$country
  ) %>%
  dplyr::rename(col_of_strings.y = col_of_strings)
missing_gdp
# # A tibble: 0 x 3
# # ... with 3 variables: country <chr>, gdp_growth <dbl>, col_of_strings.y <chr>
out <- dplyr::bind_rows(tmp1, missing_pop, missing_gdp)
out
# # A tibble: 6 x 5
#   country        population col_of_strings.x     gdp_growth col_of_strings.y
#   <chr>               <dbl> <chr>                     <dbl> <chr>
# 1 australia            25.0 LQMPB3662R                  2.9 VKBCE2969H
# 2 united_kingdom       66.6 WDXVX4684T                  1.4 FMAKF4470M
# 3 france               67.0 VJHBH0078U                  1.7 dont_leave_me_behind
# 4 spain                46.9 XFJPD7687T                  2.4 RMPYK2467U
# 5 canada               37.6 AQRCR0724P                  1.9 JXMMZ3736X
# 6 brazil              210.  dont_leave_me_behind       NA   <NA>
3. 类似于方法 2,但使用 anti_join:
tmp1 <- dplyr::inner_join(df_population, df_gdp, by = "country")
out <- dplyr::bind_rows(
  tmp1,
  dplyr::filter(df_population, col_of_strings == "dont_leave_me_behind") %>%
    dplyr::anti_join(., tmp1, by = "country") %>%
    dplyr::rename(col_of_strings.x = col_of_strings),
  dplyr::filter(df_gdp, col_of_strings == "dont_leave_me_behind") %>%
    anti_join(., tmp1, by = "country") %>%
    dplyr::rename(col_of_strings.y = col_of_strings)
)
基准测试显示,后两种方法的表现大致相同:
bench::mark(full1=..., inner2=..., inner3=...)
# # A tibble: 3 x 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory
# <bch:expr> <bch:t> <bch:t> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list>
# 1 full1 5.83ms 7.72ms 127. 1.36KB 4.32 59 2 463ms <tibb~ <Rpro~
# 2 inner2 9.54ms 11.46ms 84.9 11.28KB 2.07 41 1 483ms <tibb~ <Rpro~
# 3 inner3 13.95ms 14.92ms 62.9 11.28KB 4.49 28 2 445ms <tibb~ <Rpro~
# # ... with 2 more variables: time <list>, gc <list>
在这种情况下,full_join 的效果更好。对于更大的数据,表现可能明显不同,我还没有测试过。