连接两个数据框,使某一列包含多个值
join two dataframes so that a column contains multiple values
我的数据是这样的:
df1
#> Artist Album Year
#> 1 Beatles Sgt. Pepper's 1967
#> 2 Rolling Stones Sticky Fingers 1971
和
df2
#> Artist Members
#> 1 Beatles George
#> 2 Beatles Ringo
#> 3 Beatles Paul
#> 4 Beatles John
我想以一种我认为 "untidy"(不整洁)的方式连接这两个 df。尽管不整洁,但让最终输出看起来像下面的示例对我很有帮助:每个乐队(艺术家)只占一行,乐队成员全部放在同一列中,并以逗号分隔:
Desired Output
#> Artist Album Members Year
#> 1 Beatles Sgt. Pepper's George, Ringo, Paul, John 1967
#> 2 Rolling Stones Sticky Fingers 1971
我已经能够接近解决方案(如下),但是:
- 有更简单的方法吗?
- 如何让我的代码更通用,使得乐队有 11 名或 13 名成员时代码仍然有效?
- 当数据缺失时(例如滚石乐队),值会显示为 "NA"。能否方便地改为留空?
# Reproducible example: build both tables, join them, then collapse the
# members of each artist into a single comma-separated column.
library(tidyverse)

# One row per album.
df1 <- data.frame(
  Artist = c("Beatles", "Rolling Stones"),
  Album = c("Sgt. Pepper's", "Sticky Fingers"),
  Year = c(1967, 1971),
  stringsAsFactors = FALSE
)

# One row per band member; "Rolling Stones" has no rows in this table.
df2 <- data.frame(
  Artist = rep("Beatles", 4),
  Members = c("George", "Ringo", "Paul", "John"),
  stringsAsFactors = FALSE
)

df <- df1 %>%
  left_join(df2, by = "Artist") %>%
  group_by(Artist) %>%
  # Number the members within each artist so they can be spread to columns.
  mutate(member_number = seq_along(Members)) %>%
  # One column per member position: member_number_1, member_number_2, ...
  spread(key = "member_number", value = "Members", sep = "_") %>%
  # The column range is hard-coded to four members, and missing members end
  # up in the united string as the text "NA".
  unite(col = "members", member_number_1:member_number_4, sep = ",")
给出输出
df
#> # A tibble: 2 x 4
#> # Groups: Artist [2]
#> Artist Album Year members
#> <chr> <chr> <dbl> <chr>
#> 1 Beatles Sgt. Pepper's 1967 George,Ringo,Paul,John
#> 2 Rolling Stones Sticky Fingers 1971 NA,NA,NA,NA
我们可以先 left_join,然后对多个列进行 summarise,将每一列折叠成由唯一值组成的、以逗号分隔的字符串。
library(dplyr)

# Attach the members to every album row, then reduce each artist to one row.
# Within a group, every column from Album through Members is collapsed to the
# comma-separated string of its distinct values, so all summarised columns
# (including Year) come back as character.
# across() replaces the superseded summarise_at(vars(...)) form and needs
# dplyr >= 1.0.0.
left_join(df1, df2, by = "Artist") %>%
  group_by(Artist) %>%
  summarise(across(Album:Members, ~ toString(unique(.x))))
# A tibble: 2 x 4
# Artist Album Year Members
# <chr> <chr> <chr> <chr>
#1 Beatles Sgt. Pepper's 1967 George, Ringo, Paul, John
#2 Rolling Stones Sticky Fingers 1971 NA
略有不同:
library(dplyr)

# Group by every column of df1 so Album and Year keep their original types,
# then paste each group's members into one comma-separated string.
# The join key is spelled out so the join cannot silently pick up additional
# shared column names if either table gains columns later.
left_join(df1, df2, by = "Artist") %>%
  group_by(Artist, Album, Year) %>%
  summarise(members = paste(Members, collapse = ","))
# A tibble: 2 x 4
# Groups: Artist, Album [?]
# Artist Album Year members
# <chr> <chr> <dbl> <chr>
# 1 Beatles Sgt. Pepper's 1967 George,Ringo,Paul,John
# 2 Rolling Stones Sticky Fingers 1971 NA
使用data.table
library(data.table)

# Look up each df1 row in df2 by Artist, then collapse the members of every
# (Artist, Album, Year) group into one comma-separated string.
# Note: setDT() converts df2 to a data.table in place.
setDT(df2)[df1, on = "Artist"][
  , .(members = toString(Members)), by = .(Artist, Album, Year)
]
# Artist Album Year members
#1: Beatles Sgt. Pepper's 1967 George, Ringo, Paul, John
#2: Rolling Stones Sticky Fingers 1971 NA
我的包 safejoin 允许在连接时按连接变量对被连接的表进行聚合:
# devtools::install_github("moodymudskipper/safejoin")
library(safejoin)
library(dplyr)

# Join df2 onto df1, aggregating df2's matching rows with toString() so that
# each df1 row stays a single row; the join column is inferred.
eat(df1, df2, .agg = toString)
# Joining, by = "Artist"
# Artist Album Year Members
# 1 Beatles Sgt. Pepper's 1967 George, Ringo, Paul, John
# 2 Rolling Stones Sticky Fingers 1971 <NA>
我的数据是这样的:
df1
#> Artist Album Year
#> 1 Beatles Sgt. Pepper's 1967
#> 2 Rolling Stones Sticky Fingers 1971
和
df2
#> Artist Members
#> 1 Beatles George
#> 2 Beatles Ringo
#> 3 Beatles Paul
#> 4 Beatles John
我想以一种我认为 "untidy"(不整洁)的方式连接这两个 df。尽管不整洁,但让最终输出看起来像下面的示例对我很有帮助:每个乐队(艺术家)只占一行,乐队成员全部放在同一列中,并以逗号分隔:
Desired Output
#> Artist Album Members Year
#> 1 Beatles Sgt. Pepper's George, Ringo, Paul, John 1967
#> 2 Rolling Stones Sticky Fingers 1971
我已经能够接近解决方案(如下),但是:
- 有更简单的方法吗?
- 如何让我的代码更通用,使得乐队有 11 名或 13 名成员时代码仍然有效?
- 当数据缺失时(例如滚石乐队),值会显示为 "NA"。能否方便地改为留空?
# Reproducible example: build both tables, join them, then collapse the
# members of each artist into a single comma-separated column.
library(tidyverse)

# One row per album.
df1 <- data.frame(
  Artist = c("Beatles", "Rolling Stones"),
  Album = c("Sgt. Pepper's", "Sticky Fingers"),
  Year = c(1967, 1971),
  stringsAsFactors = FALSE
)

# One row per band member; "Rolling Stones" has no rows in this table.
df2 <- data.frame(
  Artist = rep("Beatles", 4),
  Members = c("George", "Ringo", "Paul", "John"),
  stringsAsFactors = FALSE
)

df <- df1 %>%
  left_join(df2, by = "Artist") %>%
  group_by(Artist) %>%
  # Number the members within each artist so they can be spread to columns.
  mutate(member_number = seq_along(Members)) %>%
  # One column per member position: member_number_1, member_number_2, ...
  spread(key = "member_number", value = "Members", sep = "_") %>%
  # The column range is hard-coded to four members, and missing members end
  # up in the united string as the text "NA".
  unite(col = "members", member_number_1:member_number_4, sep = ",")
给出输出
df
#> # A tibble: 2 x 4
#> # Groups: Artist [2]
#> Artist Album Year members
#> <chr> <chr> <dbl> <chr>
#> 1 Beatles Sgt. Pepper's 1967 George,Ringo,Paul,John
#> 2 Rolling Stones Sticky Fingers 1971 NA,NA,NA,NA
我们可以先 left_join,然后对多个列进行 summarise,将每一列折叠成由唯一值组成的、以逗号分隔的字符串。
library(dplyr)

# Attach the members to every album row, then reduce each artist to one row.
# Within a group, every column from Album through Members is collapsed to the
# comma-separated string of its distinct values, so all summarised columns
# (including Year) come back as character.
# across() replaces the superseded summarise_at(vars(...)) form and needs
# dplyr >= 1.0.0.
left_join(df1, df2, by = "Artist") %>%
  group_by(Artist) %>%
  summarise(across(Album:Members, ~ toString(unique(.x))))
# A tibble: 2 x 4
# Artist Album Year Members
# <chr> <chr> <chr> <chr>
#1 Beatles Sgt. Pepper's 1967 George, Ringo, Paul, John
#2 Rolling Stones Sticky Fingers 1971 NA
略有不同:
library(dplyr)

# Group by every column of df1 so Album and Year keep their original types,
# then paste each group's members into one comma-separated string.
# The join key is spelled out so the join cannot silently pick up additional
# shared column names if either table gains columns later.
left_join(df1, df2, by = "Artist") %>%
  group_by(Artist, Album, Year) %>%
  summarise(members = paste(Members, collapse = ","))
# A tibble: 2 x 4
# Groups: Artist, Album [?]
# Artist Album Year members
# <chr> <chr> <dbl> <chr>
# 1 Beatles Sgt. Pepper's 1967 George,Ringo,Paul,John
# 2 Rolling Stones Sticky Fingers 1971 NA
使用data.table
library(data.table)

# Look up each df1 row in df2 by Artist, then collapse the members of every
# (Artist, Album, Year) group into one comma-separated string.
# Note: setDT() converts df2 to a data.table in place.
setDT(df2)[df1, on = "Artist"][
  , .(members = toString(Members)), by = .(Artist, Album, Year)
]
# Artist Album Year members
#1: Beatles Sgt. Pepper's 1967 George, Ringo, Paul, John
#2: Rolling Stones Sticky Fingers 1971 NA
我的包 safejoin 允许在连接时按连接变量对被连接的表进行聚合:
# devtools::install_github("moodymudskipper/safejoin")
library(safejoin)
library(dplyr)

# Join df2 onto df1, aggregating df2's matching rows with toString() so that
# each df1 row stays a single row; the join column is inferred.
eat(df1, df2, .agg = toString)
# Joining, by = "Artist"
# Artist Album Year Members
# 1 Beatles Sgt. Pepper's 1967 George, Ringo, Paul, John
# 2 Rolling Stones Sticky Fingers 1971 <NA>