根据其他 Dataframe 用 TRUE 或 FALSE 填充列
Fill Columns with TRUE or FALSE based on other Dataframe
很遗憾,我遇到了问题,需要您的支持。
我有一份最受欢迎的宠物名字列表。从数据框 x 看来,“Jerry”是最受欢迎的老鼠名字,“Garfield”是第二受欢迎的猫名字,依此类推。
# Popularity table: three ranked names per animal type (rank 1 = most popular).
x <- data.frame(
  animals = rep(c("Mice", "Cats", "Dogs", "Birds"), each = 3),
  names = c(
    "Jerry", "Speedy", "Feivel",
    "Tom", "Garfield", "Pumpkin",
    "Snoopy", "Pluto", "Sirius",
    "Tweety", "Donald", "Hedwig"
  ),
  rank = rep(c(1, 2, 3), times = 4)
)
数据框 y 还包含宠物及其名称:
# Pets to look up: one row per (animal type, name) pair.
y <- data.frame(
  animals = rep(c("Mice", "Cats", "Dogs", "Birds"), times = c(1, 2, 1, 1)),
  names = c("Pam", "Michael", "Garfield", "Sirius", "Tweety")
)
我现在想知道数据框 y 中的动物名称是否属于最受欢迎的动物名称,如果是,该名称的排名是多少。因此,应该为三个等级中的每一个创建三个列。如果与最流行的宠物名字匹配,则应在具有相应排名的列中输入 TRUE。例如,“Pam”不是最流行的鼠标名称之一,因此在三列中的每一列中都输入了 FALSE(或 NA,我不关心)。另一方面,加菲猫是最流行的猫名之一,因此应在 rank2 列中输入值 TRUE。此结果应存储在数据框 z 中。
# Expected result: the rows of y plus one logical column per rank, TRUE where
# the pet's name holds that rank.
z <- data.frame(
  animals = rep(c("Mice", "Cats", "Dogs", "Birds"), times = c(1, 2, 1, 1)),
  names = c("Pam", "Michael", "Garfield", "Sirius", "Tweety"),
  rank1 = replace(logical(5), 5, TRUE), # row 5 (Tweety) is rank 1
  rank2 = replace(logical(5), 3, TRUE), # row 3 (Garfield) is rank 2
  rank3 = replace(logical(5), 4, TRUE)  # row 4 (Sirius) is rank 3
)
我已经考虑过使用 tidyr
包中的函数 spread()
或 pivot_wider()
,但不幸的是我无法得到正确的结果。
如果你能提供帮助,我将非常高兴,在此先感谢你!
一种选择是将left_join()
x
序列化到y
上,然后在最后将NA
转换为logical
。
library(tidyverse)

# Popularity table: three ranked names per animal type (rank 1 = most popular).
x <- data.frame(
  animals = c(
    "Mice", "Mice", "Mice", "Cats", "Cats", "Cats",
    "Dogs", "Dogs", "Dogs", "Birds", "Birds", "Birds"
  ),
  names = c(
    "Jerry", "Speedy", "Feivel", "Tom", "Garfield", "Pumpkin",
    "Snoopy", "Pluto", "Sirius", "Tweety", "Donald", "Hedwig"
  ),
  rank = rep(c(1, 2, 3), 4)
)

# Pets to look up.
y <- data.frame(
  animals = c("Mice", "Cats", "Cats", "Dogs", "Birds"),
  names = c("Pam", "Michael", "Garfield", "Sirius", "Tweety")
)

# Join each rank's slice of x onto y in turn. Every join adds one column
# (rank1, rank2, rank3) that holds the rank for matching pets and NA otherwise.
# The join keys are spelled out so the joins do not depend on whichever
# column names the two tables happen to share.
y %>%
  left_join(
    x %>% filter(rank == 1) %>% rename(rank1 = rank),
    by = c("animals", "names")
  ) %>%
  left_join(
    x %>% filter(rank == 2) %>% rename(rank2 = rank),
    by = c("animals", "names")
  ) %>%
  left_join(
    x %>% filter(rank == 3) %>% rename(rank3 = rank),
    by = c("animals", "names")
  ) %>%
  # A non-NA cell means the pet matched at that rank.
  mutate(across(starts_with("rank"), ~ !is.na(.x)))
#> animals names rank1 rank2 rank3
#> 1 Mice Pam FALSE FALSE FALSE
#> 2 Cats Michael FALSE FALSE FALSE
#> 3 Cats Garfield FALSE TRUE FALSE
#> 4 Dogs Sirius FALSE FALSE TRUE
#> 5 Birds Tweety TRUE FALSE FALSE
由 reprex package (v2.0.1)
创建于 2022-04-07
您可以在一个 ìnner_join
之后使用 pivot_wider
将排名虚拟编码为多个逻辑列。这将丢弃未出现在 x
中并因此排名未知的动物。
library(tidyverse)

# Popularity table: three ranked names per animal type (rank 1 = most popular).
x <- data.frame(
  animals = rep(c("Mice", "Cats", "Dogs", "Birds"), each = 3),
  names = c(
    "Jerry", "Speedy", "Feivel",
    "Tom", "Garfield", "Pumpkin",
    "Snoopy", "Pluto", "Sirius",
    "Tweety", "Donald", "Hedwig"
  ),
  rank = rep(c(1, 2, 3), times = 4)
)

# Pets to look up.
y <- data.frame(
  animals = rep(c("Mice", "Cats", "Dogs", "Birds"), times = c(1, 2, 1, 1)),
  names = c("Pam", "Michael", "Garfield", "Sirius", "Tweety")
)

# Keep only the pets of y that are ranked in x, flag each surviving row, then
# spread the flag into one logical column per rank (missing cells -> FALSE).
inner_join(y, x) %>%
  mutate(present = TRUE) %>%
  pivot_wider(
    names_from = rank,
    values_from = present,
    names_prefix = "rank",
    values_fill = list(present = FALSE)
  )
#> Joining, by = c("animals", "names")
#> # A tibble: 3 × 5
#> animals names rank2 rank3 rank1
#> <chr> <chr> <lgl> <lgl> <lgl>
#> 1 Cats Garfield TRUE FALSE FALSE
#> 2 Dogs Sirius FALSE TRUE FALSE
#> 3 Birds Tweety FALSE FALSE TRUE
由 reprex package (v2.0.0)
于 2022-04-07 创建
获得 y
对象排名的最简单方法是使用 left_join
。由于您只想创建三个简单的列,因此使用 mutate
:
可能是最简单的
library(dplyr)

# Popularity table: three ranked names per animal type (rank 1 = most popular).
x <- data.frame(
  animals = rep(c("Mice", "Cats", "Dogs", "Birds"), each = 3),
  names = c(
    "Jerry", "Speedy", "Feivel",
    "Tom", "Garfield", "Pumpkin",
    "Snoopy", "Pluto", "Sirius",
    "Tweety", "Donald", "Hedwig"
  ),
  rank = rep(c(1, 2, 3), times = 4)
)

# Pets to look up.
y <- data.frame(
  animals = rep(c("Mice", "Cats", "Dogs", "Birds"), times = c(1, 2, 1, 1)),
  names = c("Pam", "Michael", "Garfield", "Sirius", "Tweety")
)

# Attach each pet's rank, then derive one logical column per rank. Pets with
# no match have rank = NA, so all three comparisons yield NA for them.
z <- y %>%
  left_join(x) %>%
  mutate(
    rank1 = rank == 1,
    rank2 = rank == 2,
    rank3 = rank == 3
  ) %>%
  select(-rank) # the helper rank column is no longer needed
print(z)
#Joining, by = c("animals", "names")
# animals names rank1 rank2 rank3
#1 Mice Pam NA NA NA
#2 Cats Michael NA NA NA
#3 Cats Garfield FALSE TRUE FALSE
#4 Dogs Sirius FALSE FALSE TRUE
#5 Birds Tweety TRUE FALSE FALSE
left_join
的另一种方法。与上述解决方案不同的是,我只使用一个 left_join
,并使用 ifelse
语句将 NA
分配给 FALSE
,其他分配给 TRUE
。
library(tidyverse)

# A single left join keeps every row of y; pets with no match in x get
# rank = NA.
left_join(y, x, by = c("animals", "names")) %>%
  # Spread rank into one column per rank value. Unmatched rows end up in a
  # "rankNA" column.
  pivot_wider(
    names_from = "rank", values_from = "rank",
    names_prefix = "rank", names_sort = TRUE
  ) %>%
  # A non-NA cell means the pet holds that rank.
  mutate(across(starts_with("rank"), ~ ifelse(is.na(.x), FALSE, TRUE))) %>%
  # any_of() tolerates the case where every pet matched, in which case no
  # "rankNA" column exists and a bare select(-rankNA) would error.
  select(-any_of("rankNA"))
# A tibble: 5 × 5
animals names rank1 rank2 rank3
<chr> <chr> <lgl> <lgl> <lgl>
1 Mice Pam FALSE FALSE FALSE
2 Cats Michael FALSE FALSE FALSE
3 Cats Garfield FALSE TRUE FALSE
4 Dogs Sirius FALSE FALSE TRUE
5 Birds Tweety TRUE FALSE FALSE
很遗憾,我遇到了问题,需要您的支持。
我有一份最受欢迎的宠物名字列表。从数据框 x 看来,“Jerry”是最受欢迎的老鼠名字,“Garfield”是第二受欢迎的猫名字,依此类推。
# Popularity table: three ranked names per animal type (rank 1 = most popular).
x <- data.frame(
  animals = rep(c("Mice", "Cats", "Dogs", "Birds"), each = 3),
  names = c(
    "Jerry", "Speedy", "Feivel",
    "Tom", "Garfield", "Pumpkin",
    "Snoopy", "Pluto", "Sirius",
    "Tweety", "Donald", "Hedwig"
  ),
  rank = rep(c(1, 2, 3), times = 4)
)
数据框 y 还包含宠物及其名称:
# Pets to look up: one row per (animal type, name) pair.
y <- data.frame(
  animals = rep(c("Mice", "Cats", "Dogs", "Birds"), times = c(1, 2, 1, 1)),
  names = c("Pam", "Michael", "Garfield", "Sirius", "Tweety")
)
我现在想知道数据框 y 中的动物名称是否属于最受欢迎的动物名称,如果是,该名称的排名是多少。因此,应该为三个等级中的每一个创建三个列。如果与最流行的宠物名字匹配,则应在具有相应排名的列中输入 TRUE。例如,“Pam”不是最流行的鼠标名称之一,因此在三列中的每一列中都输入了 FALSE(或 NA,我不关心)。另一方面,加菲猫是最流行的猫名之一,因此应在 rank2 列中输入值 TRUE。此结果应存储在数据框 z 中。
# Expected result: the rows of y plus one logical column per rank, TRUE where
# the pet's name holds that rank.
z <- data.frame(
  animals = rep(c("Mice", "Cats", "Dogs", "Birds"), times = c(1, 2, 1, 1)),
  names = c("Pam", "Michael", "Garfield", "Sirius", "Tweety"),
  rank1 = replace(logical(5), 5, TRUE), # row 5 (Tweety) is rank 1
  rank2 = replace(logical(5), 3, TRUE), # row 3 (Garfield) is rank 2
  rank3 = replace(logical(5), 4, TRUE)  # row 4 (Sirius) is rank 3
)
我已经考虑过使用 tidyr
包中的函数 spread()
或 pivot_wider()
,但不幸的是我无法得到正确的结果。
如果你能提供帮助,我将非常高兴,在此先感谢你!
一种选择是将left_join()
x
序列化到y
上,然后在最后将NA
转换为logical
。
library(tidyverse)

# Popularity table: three ranked names per animal type (rank 1 = most popular).
x <- data.frame(
  animals = c(
    "Mice", "Mice", "Mice", "Cats", "Cats", "Cats",
    "Dogs", "Dogs", "Dogs", "Birds", "Birds", "Birds"
  ),
  names = c(
    "Jerry", "Speedy", "Feivel", "Tom", "Garfield", "Pumpkin",
    "Snoopy", "Pluto", "Sirius", "Tweety", "Donald", "Hedwig"
  ),
  rank = rep(c(1, 2, 3), 4)
)

# Pets to look up.
y <- data.frame(
  animals = c("Mice", "Cats", "Cats", "Dogs", "Birds"),
  names = c("Pam", "Michael", "Garfield", "Sirius", "Tweety")
)

# Join each rank's slice of x onto y in turn. Every join adds one column
# (rank1, rank2, rank3) that holds the rank for matching pets and NA otherwise.
# The join keys are spelled out so the joins do not depend on whichever
# column names the two tables happen to share.
y %>%
  left_join(
    x %>% filter(rank == 1) %>% rename(rank1 = rank),
    by = c("animals", "names")
  ) %>%
  left_join(
    x %>% filter(rank == 2) %>% rename(rank2 = rank),
    by = c("animals", "names")
  ) %>%
  left_join(
    x %>% filter(rank == 3) %>% rename(rank3 = rank),
    by = c("animals", "names")
  ) %>%
  # A non-NA cell means the pet matched at that rank.
  mutate(across(starts_with("rank"), ~ !is.na(.x)))
#> animals names rank1 rank2 rank3
#> 1 Mice Pam FALSE FALSE FALSE
#> 2 Cats Michael FALSE FALSE FALSE
#> 3 Cats Garfield FALSE TRUE FALSE
#> 4 Dogs Sirius FALSE FALSE TRUE
#> 5 Birds Tweety TRUE FALSE FALSE
由 reprex package (v2.0.1)
创建于 2022-04-07您可以在一个 ìnner_join
之后使用 pivot_wider
将排名虚拟编码为多个逻辑列。这将丢弃未出现在 x
中并因此排名未知的动物。
library(tidyverse)

# Popularity table: three ranked names per animal type (rank 1 = most popular).
x <- data.frame(
  animals = rep(c("Mice", "Cats", "Dogs", "Birds"), each = 3),
  names = c(
    "Jerry", "Speedy", "Feivel",
    "Tom", "Garfield", "Pumpkin",
    "Snoopy", "Pluto", "Sirius",
    "Tweety", "Donald", "Hedwig"
  ),
  rank = rep(c(1, 2, 3), times = 4)
)

# Pets to look up.
y <- data.frame(
  animals = rep(c("Mice", "Cats", "Dogs", "Birds"), times = c(1, 2, 1, 1)),
  names = c("Pam", "Michael", "Garfield", "Sirius", "Tweety")
)

# Keep only the pets of y that are ranked in x, flag each surviving row, then
# spread the flag into one logical column per rank (missing cells -> FALSE).
inner_join(y, x) %>%
  mutate(present = TRUE) %>%
  pivot_wider(
    names_from = rank,
    values_from = present,
    names_prefix = "rank",
    values_fill = list(present = FALSE)
  )
#> Joining, by = c("animals", "names")
#> # A tibble: 3 × 5
#> animals names rank2 rank3 rank1
#> <chr> <chr> <lgl> <lgl> <lgl>
#> 1 Cats Garfield TRUE FALSE FALSE
#> 2 Dogs Sirius FALSE TRUE FALSE
#> 3 Birds Tweety FALSE FALSE TRUE
由 reprex package (v2.0.0)
于 2022-04-07 创建获得 y
对象排名的最简单方法是使用 left_join
。由于您只想创建三个简单的列,因此使用 mutate
:
library(dplyr)

# Popularity table: three ranked names per animal type (rank 1 = most popular).
x <- data.frame(
  animals = rep(c("Mice", "Cats", "Dogs", "Birds"), each = 3),
  names = c(
    "Jerry", "Speedy", "Feivel",
    "Tom", "Garfield", "Pumpkin",
    "Snoopy", "Pluto", "Sirius",
    "Tweety", "Donald", "Hedwig"
  ),
  rank = rep(c(1, 2, 3), times = 4)
)

# Pets to look up.
y <- data.frame(
  animals = rep(c("Mice", "Cats", "Dogs", "Birds"), times = c(1, 2, 1, 1)),
  names = c("Pam", "Michael", "Garfield", "Sirius", "Tweety")
)

# Attach each pet's rank, then derive one logical column per rank. Pets with
# no match have rank = NA, so all three comparisons yield NA for them.
z <- y %>%
  left_join(x) %>%
  mutate(
    rank1 = rank == 1,
    rank2 = rank == 2,
    rank3 = rank == 3
  ) %>%
  select(-rank) # the helper rank column is no longer needed
print(z)
#Joining, by = c("animals", "names")
# animals names rank1 rank2 rank3
#1 Mice Pam NA NA NA
#2 Cats Michael NA NA NA
#3 Cats Garfield FALSE TRUE FALSE
#4 Dogs Sirius FALSE FALSE TRUE
#5 Birds Tweety TRUE FALSE FALSE
left_join
的另一种方法。与上述解决方案不同的是,我只使用一个 left_join
,并使用 ifelse
语句将 NA
分配给 FALSE
,其他分配给 TRUE
。
library(tidyverse)

# A single left join keeps every row of y; pets with no match in x get
# rank = NA.
left_join(y, x, by = c("animals", "names")) %>%
  # Spread rank into one column per rank value. Unmatched rows end up in a
  # "rankNA" column.
  pivot_wider(
    names_from = "rank", values_from = "rank",
    names_prefix = "rank", names_sort = TRUE
  ) %>%
  # A non-NA cell means the pet holds that rank.
  mutate(across(starts_with("rank"), ~ ifelse(is.na(.x), FALSE, TRUE))) %>%
  # any_of() tolerates the case where every pet matched, in which case no
  # "rankNA" column exists and a bare select(-rankNA) would error.
  select(-any_of("rankNA"))
# A tibble: 5 × 5
animals names rank1 rank2 rank3
<chr> <chr> <lgl> <lgl> <lgl>
1 Mice Pam FALSE FALSE FALSE
2 Cats Michael FALSE FALSE FALSE
3 Cats Garfield FALSE TRUE FALSE
4 Dogs Sirius FALSE FALSE TRUE
5 Birds Tweety TRUE FALSE FALSE