将部分数据框合并到另一个数据框中,同时保留以前的数据
Merging part of data frame into another data frame while preserving previous data
我有一个如下所示的数据框:
> data
Lake_name Lake_name_percent surface_area percent2 Lake_name_percent2 prev.surface_area percent3 X
1 AlanHenry AlanHenry0.705 2109.350 0.705 AlanHenry0.708 2116.203 0.708 277
2 AlanHenry AlanHenry0.82 2354.878 0.820 AlanHenry0.821 2355.950 0.821 303
3 AlanHenry AlanHenry0.82 2354.878 0.820 AlanHenry0.856 2426.508 0.856 341
4 AlanHenry AlanHenry0.912 2553.133 0.912 AlanHenry0.886 2505.336 0.886 243
5 AlanHenry AlanHenry0.966 2660.238 0.966 AlanHenry0.958 2637.476 0.958 211
6 AlanHenry AlanHenry1.009 2779.621 1.009 AlanHenry0.989 2694.901 0.989 173
17 <NA> AmonGCarter0.724 NA NA AmonGCarter0.67 NA NA 155
18 <NA> AmonGCarter0.74 NA NA AmonGCarter0.629 NA NA 568
19 <NA> AmonGCarter0.8 NA NA AmonGCarter0.885 NA NA 456
20 <NA> AmonGCarter0.885 NA NA AmonGCarter0.842 NA NA 244
21 <NA> AmonGCarter0.914 NA NA AmonGCarter0.958 NA NA 36
22 <NA> AmonGCarter0.958 NA NA AmonGCarter1.035 NA NA 417
23 <NA> AmonGCarter0.976 NA NA AmonGCarter1.075 NA NA 58
24 <NA> AmonGCarter1.018 NA NA AmonGCarter0.92 NA NA 603
25 <NA> AmonGCarter1.035 NA NA AmonGCarter0.918 NA NA 194
> dput(data)
structure(list(Lake_name = c("AlanHenry", "AlanHenry", "AlanHenry",
"AlanHenry", "AlanHenry", "AlanHenry", NA, NA, NA, NA, NA, NA,
NA, NA, NA), Lake_name_percent = c("AlanHenry0.705", "AlanHenry0.82",
"AlanHenry0.82", "AlanHenry0.912", "AlanHenry0.966", "AlanHenry1.009",
"AmonGCarter0.724", "AmonGCarter0.74", "AmonGCarter0.8", "AmonGCarter0.885",
"AmonGCarter0.914", "AmonGCarter0.958", "AmonGCarter0.976", "AmonGCarter1.018",
"AmonGCarter1.035"), surface_area = c(2109.35, 2354.878, 2354.878,
2553.1325, 2660.238125, 2779.62076923077, NA, NA, NA, NA, NA,
NA, NA, NA, NA), percent2 = c(0.705, 0.82, 0.82, 0.912, 0.966,
1.009, NA, NA, NA, NA, NA, NA, NA, NA, NA), Lake_name_percent2 = c("AlanHenry0.708",
"AlanHenry0.821", "AlanHenry0.856", "AlanHenry0.886", "AlanHenry0.958",
"AlanHenry0.989", "AmonGCarter0.67", "AmonGCarter0.629", "AmonGCarter0.885",
"AmonGCarter0.842", "AmonGCarter0.958", "AmonGCarter1.035", "AmonGCarter1.075",
"AmonGCarter0.92", "AmonGCarter0.918"), prev.surface_area = c(2116.20333333333,
2355.95, 2426.50833333333, 2505.33642857143, 2637.47595744681,
2694.90068965517, NA, NA, NA, NA, NA, NA, NA, NA, NA), percent3 = c(0.708,
0.821, 0.856, 0.886, 0.958, 0.989, NA, NA, NA, NA, NA, NA, NA,
NA, NA), X = c(277L, 303L, 341L, 243L, 211L, 173L, 155L, 568L,
456L, 244L, 36L, 417L, 58L, 603L, 194L)), class = "data.frame", row.names = c(1L,
2L, 3L, 4L, 5L, 6L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L
))
我正在尝试将此数据框与如下所示的数据框合并:
surface_area Lake_name percent2 Lake_name_percent
<dbl> <chr> <dbl> <chr>
1 1159. AmonGCarter 0.724 AmonGCarter0.724
2 1176. AmonGCarter 0.74 AmonGCarter0.74
3 1240. AmonGCarter 0.8 AmonGCarter0.8
4 1329. AmonGCarter 0.885 AmonGCarter0.885
5 1360. AmonGCarter 0.914 AmonGCarter0.914
6 1407. AmonGCarter 0.958 AmonGCarter0.958
7 1426. AmonGCarter 0.976 AmonGCarter0.976
8 1468 AmonGCarter 1.02 AmonGCarter1.018
9 1484. AmonGCarter 1.03 AmonGCarter1.035
> dput(AmonGCarter.sa)
structure(list(surface_area = c(1159.1, 1175.8, 1239.65714285714,
1329.2, 1359.6, 1406.7037037037, 1426.2, 1468, 1484.2), Lake_name = c("AmonGCarter",
"AmonGCarter", "AmonGCarter", "AmonGCarter", "AmonGCarter", "AmonGCarter",
"AmonGCarter", "AmonGCarter", "AmonGCarter"), percent2 = c(0.724,
0.74, 0.8, 0.885, 0.914, 0.958, 0.976, 1.018, 1.035), Lake_name_percent = c("AmonGCarter0.724",
"AmonGCarter0.74", "AmonGCarter0.8", "AmonGCarter0.885", "AmonGCarter0.914",
"AmonGCarter0.958", "AmonGCarter0.976", "AmonGCarter1.018", "AmonGCarter1.035"
)), row.names = c(NA, -9L), groups = structure(list(percent2 = c(0.724,
0.74, 0.8, 0.885, 0.914, 0.958, 0.976, 1.018, 1.035), .rows = structure(list(
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -9L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
当我像这样合并数据集时:
# With no `by`, merge() joins on every column name the two frames share; with
# only `all.y` set, unmatched rows of AmonGCarter.sa are kept while unmatched
# rows of `data` are dropped.
merge(data,AmonGCarter.sa, all.y = T)
合并后的结果中缺少 AlanHenry 的数据,如下所示:
> merge(data,AmonGCarter.sa, all.y = T)
Lake_name Lake_name_percent surface_area percent2 Lake_name_percent2 prev.surface_area percent3 X
1 AmonGCarter AmonGCarter0.724 1159.100 0.724 <NA> NA NA NA
2 AmonGCarter AmonGCarter0.74 1175.800 0.740 <NA> NA NA NA
3 AmonGCarter AmonGCarter0.8 1239.657 0.800 <NA> NA NA NA
4 AmonGCarter AmonGCarter0.885 1329.200 0.885 <NA> NA NA NA
5 AmonGCarter AmonGCarter0.914 1359.600 0.914 <NA> NA NA NA
6 AmonGCarter AmonGCarter0.958 1406.704 0.958 <NA> NA NA NA
7 AmonGCarter AmonGCarter0.976 1426.200 0.976 <NA> NA NA NA
8 AmonGCarter AmonGCarter1.018 1468.000 1.018 <NA> NA NA NA
9 AmonGCarter AmonGCarter1.035 1484.200 1.035 <NA> NA NA NA
> dput(merge(data,AmonGCarter.sa, all.y = T))
structure(list(Lake_name = c("AmonGCarter", "AmonGCarter", "AmonGCarter",
"AmonGCarter", "AmonGCarter", "AmonGCarter", "AmonGCarter", "AmonGCarter",
"AmonGCarter"), Lake_name_percent = c("AmonGCarter0.724", "AmonGCarter0.74",
"AmonGCarter0.8", "AmonGCarter0.885", "AmonGCarter0.914", "AmonGCarter0.958",
"AmonGCarter0.976", "AmonGCarter1.018", "AmonGCarter1.035"),
surface_area = c(1159.1, 1175.8, 1239.65714285714, 1329.2,
1359.6, 1406.7037037037, 1426.2, 1468, 1484.2), percent2 = c(0.724,
0.74, 0.8, 0.885, 0.914, 0.958, 0.976, 1.018, 1.035), Lake_name_percent2 = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_
), prev.surface_area = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), percent3 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_), X = c(NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_)), row.names = c(NA, -9L), class = "data.frame")
如何将 AmonGCarter
中的数据添加到合并后的数据框中,同时保留之前的 AlanHenry 数据?
您可以在 tidyverse
中使用 full_join
。这会创建额外的列(我们用后缀来控制其名称),不过我们可以使用 coalesce
来组合类似的列。然后,我们可以删除多余的列。
library(tidyverse)
# Full-join on the key, then fill each NA in a column that came from `data`
# (suffix `_x`) with the value from its AmonGCarter.sa counterpart (suffix `_y`).
data %>%
  full_join(
    AmonGCarter.sa,
    by = "Lake_name_percent",
    suffix = c("_x", "_y")
  ) %>%
  mutate(across(
    ends_with("_x"),
    # `$` anchors the pattern so only the trailing suffix is rewritten, even
    # when a column name contains "_x" somewhere else.
    ~ coalesce(.x, get(sub("_x$", "_y", cur_column()))),
    .names = '{sub("_x$", "", .col)}'
  )) %>%
  # Selecting exactly the original columns, in their original order, also
  # drops the `_x` / `_y` helper columns.
  select(all_of(names(data)))
另一种简单的方法是使用 rquery
包中的 natural_join
。它会将 data
中的 NA
值替换为 AmonGCarter.sa
中的值。然后,要按 data
的原始顺序获取列,您可以使用 dplyr
中的 select
。
library(rquery)
library(rqdatatable)
library(dplyr)
# Full natural join on the key column, then put the columns back in the
# order they have in `data`.
dplyr::select(
  rquery::natural_join(
    data,
    AmonGCarter.sa,
    by = "Lake_name_percent",
    jointype = "FULL"
  ),
  names(data)
)
输出
Lake_name Lake_name_percent surface_area percent2 Lake_name_percent2 prev.surface_area percent3 X
1 AlanHenry AlanHenry0.705 2109.350 0.705 AlanHenry0.708 2116.203 0.708 277
2 AlanHenry AlanHenry0.82 2354.878 0.820 AlanHenry0.821 2355.950 0.821 303
3 AlanHenry AlanHenry0.82 2354.878 0.820 AlanHenry0.856 2426.508 0.856 341
4 AlanHenry AlanHenry0.912 2553.133 0.912 AlanHenry0.886 2505.336 0.886 243
5 AlanHenry AlanHenry0.966 2660.238 0.966 AlanHenry0.958 2637.476 0.958 211
6 AlanHenry AlanHenry1.009 2779.621 1.009 AlanHenry0.989 2694.901 0.989 173
7 AmonGCarter AmonGCarter0.724 1159.100 0.724 AmonGCarter0.67 NA NA 155
8 AmonGCarter AmonGCarter0.74 1175.800 0.740 AmonGCarter0.629 NA NA 568
9 AmonGCarter AmonGCarter0.8 1239.657 0.800 AmonGCarter0.885 NA NA 456
10 AmonGCarter AmonGCarter0.885 1329.200 0.885 AmonGCarter0.842 NA NA 244
11 AmonGCarter AmonGCarter0.914 1359.600 0.914 AmonGCarter0.958 NA NA 36
12 AmonGCarter AmonGCarter0.958 1406.704 0.958 AmonGCarter1.035 NA NA 417
13 AmonGCarter AmonGCarter0.976 1426.200 0.976 AmonGCarter1.075 NA NA 58
14 AmonGCarter AmonGCarter1.018 1468.000 1.018 AmonGCarter0.92 NA NA 603
15 AmonGCarter AmonGCarter1.035 1484.200 1.035 AmonGCarter0.918 NA NA 194
我有一个如下所示的数据框:
> data
Lake_name Lake_name_percent surface_area percent2 Lake_name_percent2 prev.surface_area percent3 X
1 AlanHenry AlanHenry0.705 2109.350 0.705 AlanHenry0.708 2116.203 0.708 277
2 AlanHenry AlanHenry0.82 2354.878 0.820 AlanHenry0.821 2355.950 0.821 303
3 AlanHenry AlanHenry0.82 2354.878 0.820 AlanHenry0.856 2426.508 0.856 341
4 AlanHenry AlanHenry0.912 2553.133 0.912 AlanHenry0.886 2505.336 0.886 243
5 AlanHenry AlanHenry0.966 2660.238 0.966 AlanHenry0.958 2637.476 0.958 211
6 AlanHenry AlanHenry1.009 2779.621 1.009 AlanHenry0.989 2694.901 0.989 173
17 <NA> AmonGCarter0.724 NA NA AmonGCarter0.67 NA NA 155
18 <NA> AmonGCarter0.74 NA NA AmonGCarter0.629 NA NA 568
19 <NA> AmonGCarter0.8 NA NA AmonGCarter0.885 NA NA 456
20 <NA> AmonGCarter0.885 NA NA AmonGCarter0.842 NA NA 244
21 <NA> AmonGCarter0.914 NA NA AmonGCarter0.958 NA NA 36
22 <NA> AmonGCarter0.958 NA NA AmonGCarter1.035 NA NA 417
23 <NA> AmonGCarter0.976 NA NA AmonGCarter1.075 NA NA 58
24 <NA> AmonGCarter1.018 NA NA AmonGCarter0.92 NA NA 603
25 <NA> AmonGCarter1.035 NA NA AmonGCarter0.918 NA NA 194
> dput(data)
structure(list(Lake_name = c("AlanHenry", "AlanHenry", "AlanHenry",
"AlanHenry", "AlanHenry", "AlanHenry", NA, NA, NA, NA, NA, NA,
NA, NA, NA), Lake_name_percent = c("AlanHenry0.705", "AlanHenry0.82",
"AlanHenry0.82", "AlanHenry0.912", "AlanHenry0.966", "AlanHenry1.009",
"AmonGCarter0.724", "AmonGCarter0.74", "AmonGCarter0.8", "AmonGCarter0.885",
"AmonGCarter0.914", "AmonGCarter0.958", "AmonGCarter0.976", "AmonGCarter1.018",
"AmonGCarter1.035"), surface_area = c(2109.35, 2354.878, 2354.878,
2553.1325, 2660.238125, 2779.62076923077, NA, NA, NA, NA, NA,
NA, NA, NA, NA), percent2 = c(0.705, 0.82, 0.82, 0.912, 0.966,
1.009, NA, NA, NA, NA, NA, NA, NA, NA, NA), Lake_name_percent2 = c("AlanHenry0.708",
"AlanHenry0.821", "AlanHenry0.856", "AlanHenry0.886", "AlanHenry0.958",
"AlanHenry0.989", "AmonGCarter0.67", "AmonGCarter0.629", "AmonGCarter0.885",
"AmonGCarter0.842", "AmonGCarter0.958", "AmonGCarter1.035", "AmonGCarter1.075",
"AmonGCarter0.92", "AmonGCarter0.918"), prev.surface_area = c(2116.20333333333,
2355.95, 2426.50833333333, 2505.33642857143, 2637.47595744681,
2694.90068965517, NA, NA, NA, NA, NA, NA, NA, NA, NA), percent3 = c(0.708,
0.821, 0.856, 0.886, 0.958, 0.989, NA, NA, NA, NA, NA, NA, NA,
NA, NA), X = c(277L, 303L, 341L, 243L, 211L, 173L, 155L, 568L,
456L, 244L, 36L, 417L, 58L, 603L, 194L)), class = "data.frame", row.names = c(1L,
2L, 3L, 4L, 5L, 6L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L
))
我正在尝试将此数据框与如下所示的数据框合并:
surface_area Lake_name percent2 Lake_name_percent
<dbl> <chr> <dbl> <chr>
1 1159. AmonGCarter 0.724 AmonGCarter0.724
2 1176. AmonGCarter 0.74 AmonGCarter0.74
3 1240. AmonGCarter 0.8 AmonGCarter0.8
4 1329. AmonGCarter 0.885 AmonGCarter0.885
5 1360. AmonGCarter 0.914 AmonGCarter0.914
6 1407. AmonGCarter 0.958 AmonGCarter0.958
7 1426. AmonGCarter 0.976 AmonGCarter0.976
8 1468 AmonGCarter 1.02 AmonGCarter1.018
9 1484. AmonGCarter 1.03 AmonGCarter1.035
> dput(AmonGCarter.sa)
structure(list(surface_area = c(1159.1, 1175.8, 1239.65714285714,
1329.2, 1359.6, 1406.7037037037, 1426.2, 1468, 1484.2), Lake_name = c("AmonGCarter",
"AmonGCarter", "AmonGCarter", "AmonGCarter", "AmonGCarter", "AmonGCarter",
"AmonGCarter", "AmonGCarter", "AmonGCarter"), percent2 = c(0.724,
0.74, 0.8, 0.885, 0.914, 0.958, 0.976, 1.018, 1.035), Lake_name_percent = c("AmonGCarter0.724",
"AmonGCarter0.74", "AmonGCarter0.8", "AmonGCarter0.885", "AmonGCarter0.914",
"AmonGCarter0.958", "AmonGCarter0.976", "AmonGCarter1.018", "AmonGCarter1.035"
)), row.names = c(NA, -9L), groups = structure(list(percent2 = c(0.724,
0.74, 0.8, 0.885, 0.914, 0.958, 0.976, 1.018, 1.035), .rows = structure(list(
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -9L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
当我像这样合并数据集时:
# With no `by`, merge() joins on every column name the two frames share; with
# only `all.y` set, unmatched rows of AmonGCarter.sa are kept while unmatched
# rows of `data` are dropped.
merge(data,AmonGCarter.sa, all.y = T)
合并后的结果中缺少 AlanHenry 的数据,如下所示:
> merge(data,AmonGCarter.sa, all.y = T)
Lake_name Lake_name_percent surface_area percent2 Lake_name_percent2 prev.surface_area percent3 X
1 AmonGCarter AmonGCarter0.724 1159.100 0.724 <NA> NA NA NA
2 AmonGCarter AmonGCarter0.74 1175.800 0.740 <NA> NA NA NA
3 AmonGCarter AmonGCarter0.8 1239.657 0.800 <NA> NA NA NA
4 AmonGCarter AmonGCarter0.885 1329.200 0.885 <NA> NA NA NA
5 AmonGCarter AmonGCarter0.914 1359.600 0.914 <NA> NA NA NA
6 AmonGCarter AmonGCarter0.958 1406.704 0.958 <NA> NA NA NA
7 AmonGCarter AmonGCarter0.976 1426.200 0.976 <NA> NA NA NA
8 AmonGCarter AmonGCarter1.018 1468.000 1.018 <NA> NA NA NA
9 AmonGCarter AmonGCarter1.035 1484.200 1.035 <NA> NA NA NA
> dput(merge(data,AmonGCarter.sa, all.y = T))
structure(list(Lake_name = c("AmonGCarter", "AmonGCarter", "AmonGCarter",
"AmonGCarter", "AmonGCarter", "AmonGCarter", "AmonGCarter", "AmonGCarter",
"AmonGCarter"), Lake_name_percent = c("AmonGCarter0.724", "AmonGCarter0.74",
"AmonGCarter0.8", "AmonGCarter0.885", "AmonGCarter0.914", "AmonGCarter0.958",
"AmonGCarter0.976", "AmonGCarter1.018", "AmonGCarter1.035"),
surface_area = c(1159.1, 1175.8, 1239.65714285714, 1329.2,
1359.6, 1406.7037037037, 1426.2, 1468, 1484.2), percent2 = c(0.724,
0.74, 0.8, 0.885, 0.914, 0.958, 0.976, 1.018, 1.035), Lake_name_percent2 = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_
), prev.surface_area = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), percent3 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_), X = c(NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_)), row.names = c(NA, -9L), class = "data.frame")
如何将 AmonGCarter
中的数据添加到合并后的数据框中,同时保留之前的 AlanHenry 数据?
您可以在 tidyverse
中使用 full_join
。这会创建额外的列(我们用后缀来控制其名称),不过我们可以使用 coalesce
来组合类似的列。然后,我们可以删除多余的列。
library(tidyverse)
# Full-join on the key, then fill each NA in a column that came from `data`
# (suffix `_x`) with the value from its AmonGCarter.sa counterpart (suffix `_y`).
data %>%
  full_join(
    AmonGCarter.sa,
    by = "Lake_name_percent",
    suffix = c("_x", "_y")
  ) %>%
  mutate(across(
    ends_with("_x"),
    # `$` anchors the pattern so only the trailing suffix is rewritten, even
    # when a column name contains "_x" somewhere else.
    ~ coalesce(.x, get(sub("_x$", "_y", cur_column()))),
    .names = '{sub("_x$", "", .col)}'
  )) %>%
  # Selecting exactly the original columns, in their original order, also
  # drops the `_x` / `_y` helper columns.
  select(all_of(names(data)))
另一种简单的方法是使用 rquery
包中的 natural_join
。它会将 data
中的 NA
值替换为 AmonGCarter.sa
中的值。然后,要按 data
的原始顺序获取列,您可以使用 dplyr
中的 select
。
library(rquery)
library(rqdatatable)
library(dplyr)
# Full natural join on the key column, then put the columns back in the
# order they have in `data`.
dplyr::select(
  rquery::natural_join(
    data,
    AmonGCarter.sa,
    by = "Lake_name_percent",
    jointype = "FULL"
  ),
  names(data)
)
输出
Lake_name Lake_name_percent surface_area percent2 Lake_name_percent2 prev.surface_area percent3 X
1 AlanHenry AlanHenry0.705 2109.350 0.705 AlanHenry0.708 2116.203 0.708 277
2 AlanHenry AlanHenry0.82 2354.878 0.820 AlanHenry0.821 2355.950 0.821 303
3 AlanHenry AlanHenry0.82 2354.878 0.820 AlanHenry0.856 2426.508 0.856 341
4 AlanHenry AlanHenry0.912 2553.133 0.912 AlanHenry0.886 2505.336 0.886 243
5 AlanHenry AlanHenry0.966 2660.238 0.966 AlanHenry0.958 2637.476 0.958 211
6 AlanHenry AlanHenry1.009 2779.621 1.009 AlanHenry0.989 2694.901 0.989 173
7 AmonGCarter AmonGCarter0.724 1159.100 0.724 AmonGCarter0.67 NA NA 155
8 AmonGCarter AmonGCarter0.74 1175.800 0.740 AmonGCarter0.629 NA NA 568
9 AmonGCarter AmonGCarter0.8 1239.657 0.800 AmonGCarter0.885 NA NA 456
10 AmonGCarter AmonGCarter0.885 1329.200 0.885 AmonGCarter0.842 NA NA 244
11 AmonGCarter AmonGCarter0.914 1359.600 0.914 AmonGCarter0.958 NA NA 36
12 AmonGCarter AmonGCarter0.958 1406.704 0.958 AmonGCarter1.035 NA NA 417
13 AmonGCarter AmonGCarter0.976 1426.200 0.976 AmonGCarter1.075 NA NA 58
14 AmonGCarter AmonGCarter1.018 1468.000 1.018 AmonGCarter0.92 NA NA 603
15 AmonGCarter AmonGCarter1.035 1484.200 1.035 AmonGCarter0.918 NA NA 194