跨不同数据框的条件匹配列
Conditional match columns across different dataframes
我正在处理两个数据集 - 一组有成对的项目:
# Paired items: each row lists the two labels that make up one pair.
original <- data.frame(
  label1 = c("cat", "cat", "dog", "dog", "cat", "tiger", "tiger", "cow"),
  label2 = c("dog", "dog", "cat", "cat", "dog", "cow", "cow", "tiger")
)
original
label1 label2
1 cat dog
2 cat dog
3 dog cat
4 dog cat
5 cat dog
6 tiger cow
7 tiger cow
8 cow tiger
第二个数据集包含第一组项目的索引代码:
# Lookup table: the code (1 or 0) attached to each item.
index <- data.frame(
  item = c("cat", "dog", "tiger", "cow"),
  code = c(1, 0, 1, 0)
)
index
item code
1 cat 1
2 dog 0
3 tiger 1
4 cow 0
我正在寻找一种方法来创建两个新列:tag0
和 tag1
以便它看起来像这样:
# Desired result: the paired labels plus 'tag1' (the label coded 1) and
# 'tag0' (the label coded 0) for every row.
new <- data.frame(
  label1 = c("cat", "cat", "dog", "dog", "cat", "tiger", "tiger", "cow"),
  label2 = c("dog", "dog", "cat", "cat", "dog", "cow", "cow", "tiger"),
  tag1 = c("cat", "cat", "cat", "cat", "cat", "tiger", "tiger", "tiger"),
  tag0 = c("dog", "dog", "dog", "dog", "dog", "cow", "cow", "cow")
)
new
label1 label2 tag1 tag0
1 cat dog cat dog
2 cat dog cat dog
3 dog cat cat dog
4 dog cat cat dog
5 cat dog cat dog
6 tiger cow tiger cow
7 tiger cow tiger cow
8 cow tiger tiger cow
tag0 指 index 数据框中 code=0 对应的 label,tag1 指 index 数据框中 code=1 对应的 label。
谁能帮我提供一个 tidyverse 的解决方案?
这里有两个 tidyverse 解决方案。虽然第一种适用于这种特殊情况,但我更喜欢第二种,它更优雅且可扩展。
解决方案 1:每个 label*
一个 JOIN
首先导入 tidyverse
并生成数据集 original
和 index
。
library(tidyverse)
# ...
# Code to generate 'original' and 'index' datasets.
# ...
然后应用此工作流程。
# Tag every row of 'original' with the item coded 1 ('tag1') and the item
# coded 0 ('tag0'), using one join against 'index' per 'label*' column.
original %>%
  # Uniquely identify each row (for pivoting later).
  mutate(row_id = row_number()) %>%
  # Match 'label1' to the tags; 'keep = TRUE' retains the 'item' key column.
  left_join(
    index,
    by = c("label1" = "item"),
    keep = TRUE
  ) %>%
  # Match 'label2' to the tags. 'item' and 'code' now exist on both sides,
  # so the suffixes turn them into 'item.1'/'code.1' and 'item.2'/'code.2'.
  left_join(
    index,
    by = c("label2" = "item"),
    keep = TRUE,
    suffix = c(".1", ".2")
  ) %>%
  # Pivot 'item.1 | ... | item.n | code.1 | ... | code.n' into a consolidated
  # 'item | code' form. The backslashes are doubled because "\." and "\d" are
  # not valid escapes in an R string literal and would fail to parse.
  pivot_longer(
    cols = matches("^(item|code)\\.(\\d+)?$"),
    names_pattern = "^(item|code)\\.(\\d+)?$",
    # '.value' keeps 'item'/'code' as columns; NA discards the numeric suffix.
    names_to = c(".value", NA)
  ) %>%
  # Pivot back into a 'tag1 | tag0' form.
  pivot_wider(
    values_from = item,
    names_from = code,
    names_glue = "tag{code}"
  ) %>%
  # Omit unique identifier.
  select(!row_id)
结果
给定 original
和 index
数据集,如此处复制的数据集
# Paired labels: every row lists the two items that make up one pair.
original <- data.frame(
  label1 = c(
    "cat", "cat", "dog", "dog",
    "cat", "tiger", "tiger", "cow"
  ),
  label2 = c(
    "dog", "dog", "cat", "cat",
    "dog", "cow", "cow", "tiger"
  )
)

# Lookup table: the code (1 or 0) attached to each item.
index <- data.frame(
  item = c("cat", "dog", "tiger", "cow"),
  code = c(1, 0, 1, 0)
)
此解决方案应产生以下结果:
# A tibble: 8 x 4
label1 label2 tag1 tag0
<chr> <chr> <chr> <chr>
1 cat dog cat dog
2 cat dog cat dog
3 dog cat cat dog
4 dog cat cat dog
5 cat dog cat dog
6 tiger cow tiger cow
7 tiger cow tiger cow
8 cow tiger tiger cow
备注
如果您的 original 数据集还有更多 label* 列,您将需要对其中每一列执行额外的 JOIN。
解决方案 2:单个 CROSS JOIN
这是一个更优雅的工作流程,它也更灵活:它适用于 original 中任意数量的 label* 列,以及 index 中的任意一组 code。
# Tag every row of 'original' with the item coded 1 ('tag1') and the item
# coded 0 ('tag0') by cross-joining with 'index' and keeping the matches.
original %>%
  # Uniquely identify each row (for pivoting later).
  mutate(row_id = row_number()) %>%
  # Perform a cross join to compare every 'item' to every 'label*'.
  # NOTE(review): dplyr 1.1.0+ deprecates 'by = character()' in favour of
  # cross_join(); confirm the installed version before switching.
  full_join(
    index,
    by = character()
  ) %>%
  # Keep only those rows where 'item' matches a 'label*'. The backslash is
  # doubled because "\d" is not a valid escape in an R string literal.
  rowwise() %>%
  filter(item %in% c_across(matches("^label\\d+"))) %>%
  # Drop the row-wise grouping so it does not leak into later steps.
  ungroup() %>%
  # Pivot into a 'tag1 | tag0' form.
  pivot_wider(
    values_from = item,
    names_from = code,
    names_glue = "tag{code}"
  ) %>%
  # Omit unique identifier.
  select(!row_id)
结果
结果保持一致。
# A tibble: 8 x 4
label1 label2 tag1 tag0
<chr> <chr> <chr> <chr>
1 cat dog cat dog
2 cat dog cat dog
3 dog cat cat dog
4 dog cat cat dog
5 cat dog cat dog
6 tiger cow tiger cow
7 tiger cow tiger cow
8 cow tiger tiger cow
备注
唯一的缺点是它必须执行 CROSS JOIN
,这可能会影响更大数据集的性能。
另一个可能的解决方案:
library(tidyverse)

# Paired labels: every row lists the two items that make up one pair.
original <- data.frame(
  label1 = c("cat", "cat", "dog", "dog", "cat", "tiger", "tiger", "cow"),
  label2 = c("dog", "dog", "cat", "cat", "dog", "cow", "cow", "tiger")
)

# Lookup table: the code (1 or 0) attached to each item.
index <- data.frame(
  item = c("cat", "dog", "tiger", "cow"),
  code = c(1, 0, 1, 0)
)

original %>%
  # Look up the code of each label; the default suffixes name the two code
  # columns 'code.x' (for label1) and 'code.y' (for label2).
  # left_join() keeps exactly the rows of 'original', whereas a full join
  # would append extra rows for any 'index' item absent from the labels.
  left_join(index, by = c("label1" = "item")) %>%
  left_join(index, by = c("label2" = "item")) %>%
  mutate(
    # 'tag1' is whichever label carries code 1.
    tag1 = if_else(code.x == 1, label1, label2),
    # 'tag0' is the other label; this assumes each pair holds exactly one
    # item coded 1 and one item coded 0.
    tag0 = if_else(code.y == 1, label1, label2)
  ) %>%
  select(!starts_with("code"))
#>   label1 label2  tag1 tag0
#> 1    cat    dog   cat  dog
#> 2    cat    dog   cat  dog
#> 3    dog    cat   cat  dog
#> 4    dog    cat   cat  dog
#> 5    cat    dog   cat  dog
#> 6  tiger    cow tiger  cow
#> 7  tiger    cow tiger  cow
#> 8    cow  tiger tiger  cow
我正在处理两个数据集 - 一组有成对的项目:
# Paired items: each row lists the two labels that make up one pair.
original <- data.frame(
  label1 = c("cat", "cat", "dog", "dog", "cat", "tiger", "tiger", "cow"),
  label2 = c("dog", "dog", "cat", "cat", "dog", "cow", "cow", "tiger")
)
original
label1 label2
1 cat dog
2 cat dog
3 dog cat
4 dog cat
5 cat dog
6 tiger cow
7 tiger cow
8 cow tiger
第二个数据集包含第一组项目的索引代码:
# Lookup table: the code (1 or 0) attached to each item.
index <- data.frame(
  item = c("cat", "dog", "tiger", "cow"),
  code = c(1, 0, 1, 0)
)
index
item code
1 cat 1
2 dog 0
3 tiger 1
4 cow 0
我正在寻找一种方法来创建两个新列:tag0
和 tag1
以便它看起来像这样:
# Desired result: the paired labels plus 'tag1' (the label coded 1) and
# 'tag0' (the label coded 0) for every row.
new <- data.frame(
  label1 = c("cat", "cat", "dog", "dog", "cat", "tiger", "tiger", "cow"),
  label2 = c("dog", "dog", "cat", "cat", "dog", "cow", "cow", "tiger"),
  tag1 = c("cat", "cat", "cat", "cat", "cat", "tiger", "tiger", "tiger"),
  tag0 = c("dog", "dog", "dog", "dog", "dog", "cow", "cow", "cow")
)
new
label1 label2 tag1 tag0
1 cat dog cat dog
2 cat dog cat dog
3 dog cat cat dog
4 dog cat cat dog
5 cat dog cat dog
6 tiger cow tiger cow
7 tiger cow tiger cow
8 cow tiger tiger cow
tag0 指 index 数据框中 code=0 对应的 label,tag1 指 index 数据框中 code=1 对应的 label。
谁能帮我提供一个 tidyverse 的解决方案?
这里有两个 tidyverse 解决方案。虽然第一种适用于这种特殊情况,但我更喜欢第二种,它更优雅且可扩展。
解决方案 1:每个 label*
一个 JOIN
首先导入 tidyverse
并生成数据集 original
和 index
。
library(tidyverse)
# ...
# Code to generate 'original' and 'index' datasets.
# ...
然后应用此工作流程。
# Tag every row of 'original' with the item coded 1 ('tag1') and the item
# coded 0 ('tag0'), using one join against 'index' per 'label*' column.
original %>%
  # Uniquely identify each row (for pivoting later).
  mutate(row_id = row_number()) %>%
  # Match 'label1' to the tags; 'keep = TRUE' retains the 'item' key column.
  left_join(
    index,
    by = c("label1" = "item"),
    keep = TRUE
  ) %>%
  # Match 'label2' to the tags. 'item' and 'code' now exist on both sides,
  # so the suffixes turn them into 'item.1'/'code.1' and 'item.2'/'code.2'.
  left_join(
    index,
    by = c("label2" = "item"),
    keep = TRUE,
    suffix = c(".1", ".2")
  ) %>%
  # Pivot 'item.1 | ... | item.n | code.1 | ... | code.n' into a consolidated
  # 'item | code' form. The backslashes are doubled because "\." and "\d" are
  # not valid escapes in an R string literal and would fail to parse.
  pivot_longer(
    cols = matches("^(item|code)\\.(\\d+)?$"),
    names_pattern = "^(item|code)\\.(\\d+)?$",
    # '.value' keeps 'item'/'code' as columns; NA discards the numeric suffix.
    names_to = c(".value", NA)
  ) %>%
  # Pivot back into a 'tag1 | tag0' form.
  pivot_wider(
    values_from = item,
    names_from = code,
    names_glue = "tag{code}"
  ) %>%
  # Omit unique identifier.
  select(!row_id)
结果
给定 original
和 index
数据集,如此处复制的数据集
# Paired labels: every row lists the two items that make up one pair.
original <- data.frame(
  label1 = c(
    "cat", "cat", "dog", "dog",
    "cat", "tiger", "tiger", "cow"
  ),
  label2 = c(
    "dog", "dog", "cat", "cat",
    "dog", "cow", "cow", "tiger"
  )
)

# Lookup table: the code (1 or 0) attached to each item.
index <- data.frame(
  item = c("cat", "dog", "tiger", "cow"),
  code = c(1, 0, 1, 0)
)
此解决方案应产生以下结果:
# A tibble: 8 x 4
label1 label2 tag1 tag0
<chr> <chr> <chr> <chr>
1 cat dog cat dog
2 cat dog cat dog
3 dog cat cat dog
4 dog cat cat dog
5 cat dog cat dog
6 tiger cow tiger cow
7 tiger cow tiger cow
8 cow tiger tiger cow
备注
如果您的 original 数据集还有更多 label* 列,您将需要对其中每一列执行额外的 JOIN。
解决方案 2:单个 CROSS JOIN
这是一个更优雅的工作流程,它也更灵活:它适用于 original 中任意数量的 label* 列,以及 index 中的任意一组 code。
# Tag every row of 'original' with the item coded 1 ('tag1') and the item
# coded 0 ('tag0') by cross-joining with 'index' and keeping the matches.
original %>%
  # Uniquely identify each row (for pivoting later).
  mutate(row_id = row_number()) %>%
  # Perform a cross join to compare every 'item' to every 'label*'.
  # NOTE(review): dplyr 1.1.0+ deprecates 'by = character()' in favour of
  # cross_join(); confirm the installed version before switching.
  full_join(
    index,
    by = character()
  ) %>%
  # Keep only those rows where 'item' matches a 'label*'. The backslash is
  # doubled because "\d" is not a valid escape in an R string literal.
  rowwise() %>%
  filter(item %in% c_across(matches("^label\\d+"))) %>%
  # Drop the row-wise grouping so it does not leak into later steps.
  ungroup() %>%
  # Pivot into a 'tag1 | tag0' form.
  pivot_wider(
    values_from = item,
    names_from = code,
    names_glue = "tag{code}"
  ) %>%
  # Omit unique identifier.
  select(!row_id)
结果
结果保持一致。
# A tibble: 8 x 4
label1 label2 tag1 tag0
<chr> <chr> <chr> <chr>
1 cat dog cat dog
2 cat dog cat dog
3 dog cat cat dog
4 dog cat cat dog
5 cat dog cat dog
6 tiger cow tiger cow
7 tiger cow tiger cow
8 cow tiger tiger cow
备注
唯一的缺点是它必须执行 CROSS JOIN
,这可能会影响更大数据集的性能。
另一个可能的解决方案:
library(tidyverse)

# Paired labels: every row lists the two items that make up one pair.
original <- data.frame(
  label1 = c("cat", "cat", "dog", "dog", "cat", "tiger", "tiger", "cow"),
  label2 = c("dog", "dog", "cat", "cat", "dog", "cow", "cow", "tiger")
)

# Lookup table: the code (1 or 0) attached to each item.
index <- data.frame(
  item = c("cat", "dog", "tiger", "cow"),
  code = c(1, 0, 1, 0)
)

original %>%
  # Look up the code of each label; the default suffixes name the two code
  # columns 'code.x' (for label1) and 'code.y' (for label2).
  # left_join() keeps exactly the rows of 'original', whereas a full join
  # would append extra rows for any 'index' item absent from the labels.
  left_join(index, by = c("label1" = "item")) %>%
  left_join(index, by = c("label2" = "item")) %>%
  mutate(
    # 'tag1' is whichever label carries code 1.
    tag1 = if_else(code.x == 1, label1, label2),
    # 'tag0' is the other label; this assumes each pair holds exactly one
    # item coded 1 and one item coded 0.
    tag0 = if_else(code.y == 1, label1, label2)
  ) %>%
  select(!starts_with("code"))
#>   label1 label2  tag1 tag0
#> 1    cat    dog   cat  dog
#> 2    cat    dog   cat  dog
#> 3    dog    cat   cat  dog
#> 4    dog    cat   cat  dog
#> 5    cat    dog   cat  dog
#> 6  tiger    cow tiger  cow
#> 7  tiger    cow tiger  cow
#> 8    cow  tiger tiger  cow