使用 dplyr 包将两个数据框合并为一个,但将值添加到合并数据集的特定列
Use dplyr package to merge two dataframes into one but add values to specific columns of the merged dataset
我有以下 2 个数据集:
original_1
id type city state zip
<dbl> <chr> <chr> <chr> <chr>
1 1 type1 city1 state1 zip1
2 2 type2 city2 state2 zip2
3 3 type3 city3 state3 zip3
4 4 type4 city4 state4 zip4
5 5 type5 city5 state5 zip5
6 6 type6 city6 state6 zip6
7 7 type7 city7 state7 zip7
8 8 type8 city8 state8 zip8
9 9 type9 city9 state9 zip9
10 10 type10 city10 state10 zip10
11 11 type11 city11 state11 zip11
原创2
id1 type1 city1 state1 zip1 data data_1 data_2
<dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 12 type12 city12 state12 zip12 NA Non_changing_data Non_changing_data
2 13 type13 city13 state13 zip13 NA Non_changing_data Non_changing_data
3 14 type14 city14 state14 zip14 NA Non_changing_data Non_changing_data
4 15 type15 city15 state15 zip15 NA Non_changing_data Non_changing_data
5 16 type16 city16 state16 zip16 NA Non_changing_data Non_changing_data
6 17 type17 city17 state17 zip17 NA Non_changing_data Non_changing_data
7 18 type18 city18 state18 zip18 NA Non_changing_data Non_changing_data
8 19 type19 city19 state19 zip19 NA Non_changing_data Non_changing_data
9 20 type20 city20 state20 zip20 data1 Non_changing_data Non_changing_data
10 NA NA NA NA NA NA Non_changing_data Non_changing_data
11 NA NA NA NA NA data2 Non_changing_data Non_changing_data
并且我想以第二个数据集将是第一个数据集的延续的方式绑定它们。然后我想用来自第二个数据集的值填充 data1
data2
列中的 NAs
使用 dplyr 包。我还想删除 type
列。 original2
中存在的列 data
的值应移动到其初始行位置 9
和 11
以及任何不包含 [=22= 的值的行] 必须删除。
id city state data data_1 data_2
<dbl> <chr> <chr> <chr> <chr> <chr>
1 1 city1 state1 NA Non_changing_data Non_changing_data
2 2 city2 state2 NA Non_changing_data Non_changing_data
3 3 city3 state3 NA Non_changing_data Non_changing_data
4 4 city4 state4 NA Non_changing_data Non_changing_data
5 5 city5 state5 NA Non_changing_data Non_changing_data
6 6 city6 state6 NA Non_changing_data Non_changing_data
7 7 city7 state7 NA Non_changing_data Non_changing_data
8 8 city8 state8 NA Non_changing_data Non_changing_data
9 9 city9 state9 data1 Non_changing_data Non_changing_data
10 10 city10 state10 NA Non_changing_data Non_changing_data
11 11 city11 state11 data2 Non_changing_data Non_changing_data
12 12 city12 state12 NA Non_changing_data Non_changing_data
13 13 city13 state13 NA Non_changing_data Non_changing_data
14 14 city14 state14 NA Non_changing_data Non_changing_data
15 15 city15 state15 NA Non_changing_data Non_changing_data
16 16 city16 state16 NA Non_changing_data Non_changing_data
17 17 city17 state17 NA Non_changing_data Non_changing_data
18 18 city18 state18 NA Non_changing_data Non_changing_data
19 19 city19 state19 NA Non_changing_data Non_changing_data
20 20 city20 state20 NA Non_changing_data Non_changing_data
数据集
original_1<-structure(list(id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), type = c("type1",
"type2", "type3", "type4", "type5", "type6", "type7", "type8",
"type9", "type10", "type11"), city = c("city1", "city2", "city3",
"city4", "city5", "city6", "city7", "city8", "city9", "city10",
"city11"), state = c("state1", "state2", "state3", "state4",
"state5", "state6", "state7", "state8", "state9", "state10",
"state11"), zip = c("zip1", "zip2", "zip3", "zip4", "zip5", "zip6",
"zip7", "zip8", "zip9", "zip10", "zip11")), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -11L))
original2<- structure(list(id1 = c(12, 13, 14, 15, 16, 17, 18, 19, 20, NA,
NA), type1 = c("type12", "type13", "type14", "type15", "type16",
"type17", "type18", "type19", "type20", NA, NA), city1 = c("city12",
"city13", "city14", "city15", "city16", "city17", "city18", "city19",
"city20", NA, NA), state1 = c("state12", "state13", "state14",
"state15", "state16", "state17", "state18", "state19", "state20",
NA, NA), zip1 = c("zip12", "zip13", "zip14", "zip15", "zip16",
"zip17", "zip18", "zip19", "zip20", NA, NA), data = c(NA, NA,
NA, NA, NA, NA, NA, NA, "data1", NA, "data2"), data_1 = c("Non_changing_data",
"Non_changing_data", "Non_changing_data", "Non_changing_data",
"Non_changing_data", "Non_changing_data", "Non_changing_data",
"Non_changing_data", "Non_changing_data", "Non_changing_data",
"Non_changing_data"), data_2 = c("Non_changing_data", "Non_changing_data",
"Non_changing_data", "Non_changing_data", "Non_changing_data",
"Non_changing_data", "Non_changing_data", "Non_changing_data",
"Non_changing_data", "Non_changing_data", "Non_changing_data"
)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-11L))
函数 full_join 是否回答了您的问题?
test <- full_join(original_1, original2, by = c("id" = "id1",
"type" = "type1",
"city" = "city1",
"state" = "state1",
"zip" = "zip1"))
这是您要找的结果吗?
您可以将 dplyr::bind_rows
与 tidyr::fill
结合使用来获取它。不过,列名需要一些清理。
library(dplyr)
# Clean up column names to add second dataset to the first using rename() to remove the numbers
original2 %>%
rename(id = id1,
type = type1,
city = city1,
state = state1,
zip = zip1) %>%
# Add dataset 1 and cleaned up dataset 2 together
bind_rows(original_1, .) %>%
# Fill NAs with data from dataset 2 using tidyr::fill()
tidyr::fill(data_1, .direction = "up") %>%
tidyr::fill(data_2, .direction = "up") %>%
# Remove "type" column
select(-type) %>%
# Artificially replace values "data1" and "data2" in "data" to row 9 and 11 respectively
mutate(data = case_when(row_number() == 9 ~ "data1",
row_number() == 11 ~ "data2",
TRUE ~ NA_character_)) %>%
# Remove rows that do not contain a value for "id"
filter(! is.na(id))
# A tibble: 20 x 7
id city state zip data data_1 data_2
<dbl> <chr> <chr> <chr> <chr> <chr> <chr>
1 1 city1 state1 zip1 NA Non_changing_data Non_changing_data
2 2 city2 state2 zip2 NA Non_changing_data Non_changing_data
3 3 city3 state3 zip3 NA Non_changing_data Non_changing_data
4 4 city4 state4 zip4 NA Non_changing_data Non_changing_data
5 5 city5 state5 zip5 NA Non_changing_data Non_changing_data
6 6 city6 state6 zip6 NA Non_changing_data Non_changing_data
7 7 city7 state7 zip7 NA Non_changing_data Non_changing_data
8 8 city8 state8 zip8 NA Non_changing_data Non_changing_data
9 9 city9 state9 zip9 data1 Non_changing_data Non_changing_data
10 10 city10 state10 zip10 NA Non_changing_data Non_changing_data
11 11 city11 state11 zip11 data2 Non_changing_data Non_changing_data
12 12 city12 state12 zip12 NA Non_changing_data Non_changing_data
13 13 city13 state13 zip13 NA Non_changing_data Non_changing_data
14 14 city14 state14 zip14 NA Non_changing_data Non_changing_data
15 15 city15 state15 zip15 NA Non_changing_data Non_changing_data
16 16 city16 state16 zip16 NA Non_changing_data Non_changing_data
17 17 city17 state17 zip17 NA Non_changing_data Non_changing_data
18 18 city18 state18 zip18 NA Non_changing_data Non_changing_data
19 19 city19 state19 zip19 NA Non_changing_data Non_changing_data
20 20 city20 state20 zip20 NA Non_changing_data Non_changing_data
我有以下 2 个数据集:
original_1
id type city state zip
<dbl> <chr> <chr> <chr> <chr>
1 1 type1 city1 state1 zip1
2 2 type2 city2 state2 zip2
3 3 type3 city3 state3 zip3
4 4 type4 city4 state4 zip4
5 5 type5 city5 state5 zip5
6 6 type6 city6 state6 zip6
7 7 type7 city7 state7 zip7
8 8 type8 city8 state8 zip8
9 9 type9 city9 state9 zip9
10 10 type10 city10 state10 zip10
11 11 type11 city11 state11 zip11
原创2
id1 type1 city1 state1 zip1 data data_1 data_2
<dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 12 type12 city12 state12 zip12 NA Non_changing_data Non_changing_data
2 13 type13 city13 state13 zip13 NA Non_changing_data Non_changing_data
3 14 type14 city14 state14 zip14 NA Non_changing_data Non_changing_data
4 15 type15 city15 state15 zip15 NA Non_changing_data Non_changing_data
5 16 type16 city16 state16 zip16 NA Non_changing_data Non_changing_data
6 17 type17 city17 state17 zip17 NA Non_changing_data Non_changing_data
7 18 type18 city18 state18 zip18 NA Non_changing_data Non_changing_data
8 19 type19 city19 state19 zip19 NA Non_changing_data Non_changing_data
9 20 type20 city20 state20 zip20 data1 Non_changing_data Non_changing_data
10 NA NA NA NA NA NA Non_changing_data Non_changing_data
11 NA NA NA NA NA data2 Non_changing_data Non_changing_data
并且我想以第二个数据集将是第一个数据集的延续的方式绑定它们。然后我想用来自第二个数据集的值填充 data1
data2
列中的 NAs
使用 dplyr 包。我还想删除 type
列。 original2
中存在的列 data
的值应移动到其初始行位置 9
和 11
以及任何不包含 [=22= 的值的行] 必须删除。
id city state data data_1 data_2
<dbl> <chr> <chr> <chr> <chr> <chr>
1 1 city1 state1 NA Non_changing_data Non_changing_data
2 2 city2 state2 NA Non_changing_data Non_changing_data
3 3 city3 state3 NA Non_changing_data Non_changing_data
4 4 city4 state4 NA Non_changing_data Non_changing_data
5 5 city5 state5 NA Non_changing_data Non_changing_data
6 6 city6 state6 NA Non_changing_data Non_changing_data
7 7 city7 state7 NA Non_changing_data Non_changing_data
8 8 city8 state8 NA Non_changing_data Non_changing_data
9 9 city9 state9 data1 Non_changing_data Non_changing_data
10 10 city10 state10 NA Non_changing_data Non_changing_data
11 11 city11 state11 data2 Non_changing_data Non_changing_data
12 12 city12 state12 NA Non_changing_data Non_changing_data
13 13 city13 state13 NA Non_changing_data Non_changing_data
14 14 city14 state14 NA Non_changing_data Non_changing_data
15 15 city15 state15 NA Non_changing_data Non_changing_data
16 16 city16 state16 NA Non_changing_data Non_changing_data
17 17 city17 state17 NA Non_changing_data Non_changing_data
18 18 city18 state18 NA Non_changing_data Non_changing_data
19 19 city19 state19 NA Non_changing_data Non_changing_data
20 20 city20 state20 NA Non_changing_data Non_changing_data
数据集
original_1<-structure(list(id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), type = c("type1",
"type2", "type3", "type4", "type5", "type6", "type7", "type8",
"type9", "type10", "type11"), city = c("city1", "city2", "city3",
"city4", "city5", "city6", "city7", "city8", "city9", "city10",
"city11"), state = c("state1", "state2", "state3", "state4",
"state5", "state6", "state7", "state8", "state9", "state10",
"state11"), zip = c("zip1", "zip2", "zip3", "zip4", "zip5", "zip6",
"zip7", "zip8", "zip9", "zip10", "zip11")), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -11L))
original2<- structure(list(id1 = c(12, 13, 14, 15, 16, 17, 18, 19, 20, NA,
NA), type1 = c("type12", "type13", "type14", "type15", "type16",
"type17", "type18", "type19", "type20", NA, NA), city1 = c("city12",
"city13", "city14", "city15", "city16", "city17", "city18", "city19",
"city20", NA, NA), state1 = c("state12", "state13", "state14",
"state15", "state16", "state17", "state18", "state19", "state20",
NA, NA), zip1 = c("zip12", "zip13", "zip14", "zip15", "zip16",
"zip17", "zip18", "zip19", "zip20", NA, NA), data = c(NA, NA,
NA, NA, NA, NA, NA, NA, "data1", NA, "data2"), data_1 = c("Non_changing_data",
"Non_changing_data", "Non_changing_data", "Non_changing_data",
"Non_changing_data", "Non_changing_data", "Non_changing_data",
"Non_changing_data", "Non_changing_data", "Non_changing_data",
"Non_changing_data"), data_2 = c("Non_changing_data", "Non_changing_data",
"Non_changing_data", "Non_changing_data", "Non_changing_data",
"Non_changing_data", "Non_changing_data", "Non_changing_data",
"Non_changing_data", "Non_changing_data", "Non_changing_data"
)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-11L))
函数 full_join 是否回答了您的问题?
test <- full_join(original_1, original2, by = c("id" = "id1",
"type" = "type1",
"city" = "city1",
"state" = "state1",
"zip" = "zip1"))
这是您要找的结果吗?
您可以将 dplyr::bind_rows
与 tidyr::fill
结合使用来获取它。不过,列名需要一些清理。
library(dplyr)
# Clean up column names to add second dataset to the first using rename() to remove the numbers
original2 %>%
rename(id = id1,
type = type1,
city = city1,
state = state1,
zip = zip1) %>%
# Add dataset 1 and cleaned up dataset 2 together
bind_rows(original_1, .) %>%
# Fill NAs with data from dataset 2 using tidyr::fill()
tidyr::fill(data_1, .direction = "up") %>%
tidyr::fill(data_2, .direction = "up") %>%
# Remove "type" column
select(-type) %>%
# Artificially replace values "data1" and "data2" in "data" to row 9 and 11 respectively
mutate(data = case_when(row_number() == 9 ~ "data1",
row_number() == 11 ~ "data2",
TRUE ~ NA_character_)) %>%
# Remove rows that do not contain a value for "id"
filter(! is.na(id))
# A tibble: 20 x 7
id city state zip data data_1 data_2
<dbl> <chr> <chr> <chr> <chr> <chr> <chr>
1 1 city1 state1 zip1 NA Non_changing_data Non_changing_data
2 2 city2 state2 zip2 NA Non_changing_data Non_changing_data
3 3 city3 state3 zip3 NA Non_changing_data Non_changing_data
4 4 city4 state4 zip4 NA Non_changing_data Non_changing_data
5 5 city5 state5 zip5 NA Non_changing_data Non_changing_data
6 6 city6 state6 zip6 NA Non_changing_data Non_changing_data
7 7 city7 state7 zip7 NA Non_changing_data Non_changing_data
8 8 city8 state8 zip8 NA Non_changing_data Non_changing_data
9 9 city9 state9 zip9 data1 Non_changing_data Non_changing_data
10 10 city10 state10 zip10 NA Non_changing_data Non_changing_data
11 11 city11 state11 zip11 data2 Non_changing_data Non_changing_data
12 12 city12 state12 zip12 NA Non_changing_data Non_changing_data
13 13 city13 state13 zip13 NA Non_changing_data Non_changing_data
14 14 city14 state14 zip14 NA Non_changing_data Non_changing_data
15 15 city15 state15 zip15 NA Non_changing_data Non_changing_data
16 16 city16 state16 zip16 NA Non_changing_data Non_changing_data
17 17 city17 state17 zip17 NA Non_changing_data Non_changing_data
18 18 city18 state18 zip18 NA Non_changing_data Non_changing_data
19 19 city19 state19 zip19 NA Non_changing_data Non_changing_data
20 20 city20 state20 zip20 NA Non_changing_data Non_changing_data