合并两个包含相同数据的数据框,并在取值冲突时优先保留其中一个取值
Combine two dataframes that have the same data but prefer one entry over another
我有两个数据框,它们包含大致相同的两列数据:'User ID'(例如 37f879fb-9667-4de2-9cc9-b48918fcaa1f)和 'Frequent User'(取值为 'Frequent' 或 'Infrequent')。
我想把这两个数据框合并在一起,但如果同一个用户在一个数据框中是 'Frequent',而在另一个数据框中是 'Infrequent',我希望每次都优先保留(或替换为)'Frequent'。我不确定该如何实现。
df1
# Sample data labelled `df1` in the surrounding text (looks like dput()
# output): six users keyed by user_id; the first is "Frequent" and the
# remaining five are "Infrequent".
structure(list(user_id = c("000f2b1e-dde2-4227-9122-d197c674dea8",
"001abd72-53f1-436a-a26f-e3187543afb6", "001c1e12-a8f7-4459-9b23-6b8bdc5d2625",
"002d8272-8ee2-4e1a-a523-5ba0813c20f9", "0037abe8-7623-4ac7-9fbb-7398f505c1f6",
"003911c6-c013-43b7-9996-e771dbe3ac83"), frequent_user = c("Frequent",
"Infrequent", "Infrequent", "Infrequent", "Infrequent", "Infrequent"
)), row.names = c(NA, 6L), class = "data.frame")
df2
# Sample data labelled `df2` in the surrounding text (looks like dput()
# output): the same six user_id values as the df1 sample, all marked
# "Infrequent".
structure(list(user_id = c("000f2b1e-dde2-4227-9122-d197c674dea8",
"001abd72-53f1-436a-a26f-e3187543afb6", "001c1e12-a8f7-4459-9b23-6b8bdc5d2625",
"002d8272-8ee2-4e1a-a523-5ba0813c20f9", "0037abe8-7623-4ac7-9fbb-7398f505c1f6",
"003911c6-c013-43b7-9996-e771dbe3ac83"), frequent_user = c("Infrequent",
"Infrequent", "Infrequent", "Infrequent", "Infrequent", "Infrequent"
)), row.names = c(NA, 6L), class = "data.frame")
试试下面的代码
# Full outer join of df1 and df2 on user_id, then collapse the two status
# columns (frequent_user.x / frequent_user.y) into a single frequent_user
# column that is "Frequent" whenever either source says so.
#
# na.rm = TRUE is required because all = TRUE leaves NA in one of the status
# columns for users that appear in only one data frame; without it rowSums()
# returns NA for those rows and the lookup yields NA instead of the status
# known from the other data frame.
transform(
  merge(df1, df2, by = "user_id", all = TRUE),
  frequent_user = c("Infrequent", "Frequent")[
    # FALSE -> index 1 ("Infrequent"), TRUE -> index 2 ("Frequent")
    1 + (rowSums(
      cbind(frequent_user.x, frequent_user.y) == "Frequent",
      na.rm = TRUE
    ) > 0)
  ]
)[c("user_id", "frequent_user")]
这给出了
user_id frequent_user
1 000f2b1e-dde2-4227-9122-d197c674dea8 Frequent
2 001abd72-53f1-436a-a26f-e3187543afb6 Infrequent
3 001c1e12-a8f7-4459-9b23-6b8bdc5d2625 Infrequent
4 002d8272-8ee2-4e1a-a523-5ba0813c20f9 Infrequent
5 0037abe8-7623-4ac7-9fbb-7398f505c1f6 Infrequent
6 003911c6-c013-43b7-9996-e771dbe3ac83 Infrequent
这是一个使用 tidyverse 中 left_join 和 transmute 的解决方案:
library(tidyverse)
# Join df2's status onto df1 by user_id and keep one status per user,
# preferring "Frequent" whenever either source reports it.
left_join(
  df1, df2,
  by = "user_id"
) %>%
  # Grouping by user_id keeps that column in transmute()'s output.
  group_by(user_id) %>%
  transmute(
    # %in% never returns NA, so a user missing from df2 (frequent_user.y is
    # NA after the left join) keeps df1's status instead of falling through
    # to a catch-all branch and being labelled "Frequent".
    frequent_user = case_when(
      frequent_user.x %in% "Frequent" |
        frequent_user.y %in% "Frequent" ~ "Frequent",
      TRUE ~ "Infrequent"
    )
  )
给出以下输出,
# A tibble: 6 x 2
# Groups: user_id [6]
user_id frequent_user
<chr> <chr>
1 000f2b1e-dde2-4227-9122-d197c674dea8 Frequent
2 001abd72-53f1-436a-a26f-e3187543afb6 Infrequent
3 001c1e12-a8f7-4459-9b23-6b8bdc5d2625 Infrequent
4 002d8272-8ee2-4e1a-a523-5ba0813c20f9 Infrequent
5 0037abe8-7623-4ac7-9fbb-7398f505c1f6 Infrequent
6 003911c6-c013-43b7-9996-e771dbe3ac83 Infrequent
这是一个不使用 join 函数的 dplyr 解决方案:
library(dplyr)
# Stack both data frames and sort by user and status; "Frequent" sorts
# before "Infrequent", so each user's preferred row comes first. Then keep
# only the first row per user and drop the grouping.
bind_rows(df1, df2) %>%
  arrange(user_id, frequent_user) %>%
  group_by(user_id) %>%
  slice(1) %>%
  ungroup()
返回
# A tibble: 6 x 2
user_id frequent_user
<chr> <chr>
1 000f2b1e-dde2-4227-9122-d197c674dea8 Frequent
2 001abd72-53f1-436a-a26f-e3187543afb6 Infrequent
3 001c1e12-a8f7-4459-9b23-6b8bdc5d2625 Infrequent
4 002d8272-8ee2-4e1a-a523-5ba0813c20f9 Infrequent
5 0037abe8-7623-4ac7-9fbb-7398f505c1f6 Infrequent
6 003911c6-c013-43b7-9996-e771dbe3ac83 Infrequent
我有两个数据框,它们包含大致相同的两列数据:'User ID'(例如 37f879fb-9667-4de2-9cc9-b48918fcaa1f)和 'Frequent User'(取值为 'Frequent' 或 'Infrequent')。
我想把这两个数据框合并在一起,但如果同一个用户在一个数据框中是 'Frequent',而在另一个数据框中是 'Infrequent',我希望每次都优先保留(或替换为)'Frequent'。我不确定该如何实现。
df1
# Sample data labelled `df1` in the surrounding text (looks like dput()
# output): six users keyed by user_id; the first is "Frequent" and the
# remaining five are "Infrequent".
structure(list(user_id = c("000f2b1e-dde2-4227-9122-d197c674dea8",
"001abd72-53f1-436a-a26f-e3187543afb6", "001c1e12-a8f7-4459-9b23-6b8bdc5d2625",
"002d8272-8ee2-4e1a-a523-5ba0813c20f9", "0037abe8-7623-4ac7-9fbb-7398f505c1f6",
"003911c6-c013-43b7-9996-e771dbe3ac83"), frequent_user = c("Frequent",
"Infrequent", "Infrequent", "Infrequent", "Infrequent", "Infrequent"
)), row.names = c(NA, 6L), class = "data.frame")
df2
# Sample data labelled `df2` in the surrounding text (looks like dput()
# output): the same six user_id values as the df1 sample, all marked
# "Infrequent".
structure(list(user_id = c("000f2b1e-dde2-4227-9122-d197c674dea8",
"001abd72-53f1-436a-a26f-e3187543afb6", "001c1e12-a8f7-4459-9b23-6b8bdc5d2625",
"002d8272-8ee2-4e1a-a523-5ba0813c20f9", "0037abe8-7623-4ac7-9fbb-7398f505c1f6",
"003911c6-c013-43b7-9996-e771dbe3ac83"), frequent_user = c("Infrequent",
"Infrequent", "Infrequent", "Infrequent", "Infrequent", "Infrequent"
)), row.names = c(NA, 6L), class = "data.frame")
试试下面的代码
# Full outer join of df1 and df2 on user_id, then collapse the two status
# columns (frequent_user.x / frequent_user.y) into a single frequent_user
# column that is "Frequent" whenever either source says so.
#
# na.rm = TRUE is required because all = TRUE leaves NA in one of the status
# columns for users that appear in only one data frame; without it rowSums()
# returns NA for those rows and the lookup yields NA instead of the status
# known from the other data frame.
transform(
  merge(df1, df2, by = "user_id", all = TRUE),
  frequent_user = c("Infrequent", "Frequent")[
    # FALSE -> index 1 ("Infrequent"), TRUE -> index 2 ("Frequent")
    1 + (rowSums(
      cbind(frequent_user.x, frequent_user.y) == "Frequent",
      na.rm = TRUE
    ) > 0)
  ]
)[c("user_id", "frequent_user")]
这给出了
user_id frequent_user
1 000f2b1e-dde2-4227-9122-d197c674dea8 Frequent
2 001abd72-53f1-436a-a26f-e3187543afb6 Infrequent
3 001c1e12-a8f7-4459-9b23-6b8bdc5d2625 Infrequent
4 002d8272-8ee2-4e1a-a523-5ba0813c20f9 Infrequent
5 0037abe8-7623-4ac7-9fbb-7398f505c1f6 Infrequent
6 003911c6-c013-43b7-9996-e771dbe3ac83 Infrequent
这是一个使用 tidyverse 中 left_join 和 transmute 的解决方案:
library(tidyverse)
# Join df2's status onto df1 by user_id and keep one status per user,
# preferring "Frequent" whenever either source reports it.
left_join(
  df1, df2,
  by = "user_id"
) %>%
  # Grouping by user_id keeps that column in transmute()'s output.
  group_by(user_id) %>%
  transmute(
    # %in% never returns NA, so a user missing from df2 (frequent_user.y is
    # NA after the left join) keeps df1's status instead of falling through
    # to a catch-all branch and being labelled "Frequent".
    frequent_user = case_when(
      frequent_user.x %in% "Frequent" |
        frequent_user.y %in% "Frequent" ~ "Frequent",
      TRUE ~ "Infrequent"
    )
  )
给出以下输出,
# A tibble: 6 x 2
# Groups: user_id [6]
user_id frequent_user
<chr> <chr>
1 000f2b1e-dde2-4227-9122-d197c674dea8 Frequent
2 001abd72-53f1-436a-a26f-e3187543afb6 Infrequent
3 001c1e12-a8f7-4459-9b23-6b8bdc5d2625 Infrequent
4 002d8272-8ee2-4e1a-a523-5ba0813c20f9 Infrequent
5 0037abe8-7623-4ac7-9fbb-7398f505c1f6 Infrequent
6 003911c6-c013-43b7-9996-e771dbe3ac83 Infrequent
这是一个不使用 join 函数的 dplyr 解决方案:
library(dplyr)
# Stack both data frames and sort by user and status; "Frequent" sorts
# before "Infrequent", so each user's preferred row comes first. Then keep
# only the first row per user and drop the grouping.
bind_rows(df1, df2) %>%
  arrange(user_id, frequent_user) %>%
  group_by(user_id) %>%
  slice(1) %>%
  ungroup()
返回
# A tibble: 6 x 2
user_id frequent_user
<chr> <chr>
1 000f2b1e-dde2-4227-9122-d197c674dea8 Frequent
2 001abd72-53f1-436a-a26f-e3187543afb6 Infrequent
3 001c1e12-a8f7-4459-9b23-6b8bdc5d2625 Infrequent
4 002d8272-8ee2-4e1a-a523-5ba0813c20f9 Infrequent
5 0037abe8-7623-4ac7-9fbb-7398f505c1f6 Infrequent
6 003911c6-c013-43b7-9996-e771dbe3ac83 Infrequent