spread() 将非唯一值放入新列中
spread() where non-unique values are put into a new column
我有一些数据,看起来像这样(生成数据的代码附在最后):
#> artist album year source id
#> 1 Beatles Sgt. Pepper's 1967 amazon B0025KVLTM
#> 2 Beatles Sgt. Pepper's 1967 spotify 6QaVfG1pHYl1z15ZxkvVDW
#> 3 Beatles Sgt. Pepper's 1967 amazon B06WGVMLJY
#> 4 Rolling Stones Sticky Fingers 1971 spotify 29m6DinzdaD0OPqWKGyMdz
我想整理 'id' 列(其中包含来自多个来源的 ID,来源由 'source' 列标明)。
这本应是一个简单的 spread(),但麻烦的是,有时同一个来源会出现多个 ID:请参见上面的第 1 行和第 3 行。
有没有简单的方法来执行 spread(),并把同一来源的多个 ID 放到新列中?
我想要的结果是:
#> artist album year source amazon_id amazon_id_2
#> 1 Beatles Sgt. Pepper's 1967 amazon B0025KVLTM B06WGVMLJY
#> 2 Rolling Stones Sticky Fingers 1971 spotify <NA> <NA>
#> spotify
#> 1 6QaVfG1pHYl1z15ZxkvVDW
#> 2 29m6DinzdaD0OPqWKGyMdz
下面的代码是输入示例数据:
# Example input: one row per ID. The Beatles album carries two different
# amazon IDs (rows 1 and 3) in addition to its spotify ID.
df <- data.frame(
  artist = c(rep("Beatles", 3), "Rolling Stones"),
  album = c(rep("Sgt. Pepper's", 3), "Sticky Fingers"),
  year = c(rep(1967, 3), 1971),
  source = c("amazon", "spotify", "amazon", "spotify"),
  id = c(
    "B0025KVLTM", "6QaVfG1pHYl1z15ZxkvVDW",
    "B06WGVMLJY", "29m6DinzdaD0OPqWKGyMdz"
  ),
  stringsAsFactors = FALSE
)
df
一种可能是:
# Number the IDs within each (artist, album, year, source) group and use
# "<source>_<n>" as the spread key, so repeated IDs from one source get their
# own columns. `source` is still present when spreading, so it remains an
# identifying column in the result.
df %>%
  group_by(artist, album, year, source) %>%
  mutate(source_key = paste(source, row_number(), sep = "_")) %>%
  spread(key = source_key, value = id) %>%
  ungroup()
artist album year source amazon_1 amazon_2 spotify_1
<chr> <chr> <dbl> <chr> <chr> <chr> <chr>
1 Beatles Sgt. Pepper's 1967 amazon B0025KVLTM B06WGVMLJY <NA>
2 Beatles Sgt. Pepper's 1967 spotify <NA> <NA> 6QaVfG1pHYl1z15ZxkvVDW
3 Rolling Stones Sticky Fingers 1971 spotify <NA> <NA> 29m6DinzdaD0OPqWKGyMdz
请注意,这里的输出有三行:因为 source 列被保留为标识列,而披头士专辑有 amazon 和 spotify 两个不同的 "source" 值,所以该专辑占了两行。
如果您仍然想要两行,您可以这样做:
# Build the "<source>_<n>" key per (artist, album, year, source) group, then
# drop `source` before spreading so that only artist/album/year identify a
# row in the wide result.
df %>%
  group_by(artist, album, year, source) %>%
  mutate(source_key = paste(source, row_number(), sep = "_")) %>%
  ungroup() %>%
  select(-source) %>%
  spread(key = source_key, value = id)
artist album year amazon_1 amazon_2 spotify_1
<chr> <chr> <dbl> <chr> <chr> <chr>
1 Beatles Sgt. Pepper's 1967 B0025KVLTM B06WGVMLJY 6QaVfG1pHYl1z15ZxkvVDW
2 Rolling Stones Sticky Fingers 1971 <NA> <NA> 29m6DinzdaD0OPqWKGyMdz
如果您还想拥有 "source" 列:
# Build the "<source>_<n>" key per (artist, album, year, source) group, then
# regroup by album and overwrite `source` with the comma-separated list of
# that album's distinct sources, so it is constant within an album when
# spreading.
df %>%
  group_by(artist, album, year, source) %>%
  mutate(source_key = paste(source, row_number(), sep = "_")) %>%
  group_by(artist, album, year) %>%
  mutate(source = paste(unique(source), collapse = ", ")) %>%
  spread(key = source_key, value = id) %>%
  ungroup()
artist album year source amazon_1 amazon_2 spotify_1
<chr> <chr> <dbl> <chr> <chr> <chr> <chr>
1 Beatles Sgt. Pepper's 1967 amazon, spotify B0025KVL… B06WGVML… 6QaVfG1pHYl1z15ZxkvV…
2 Rolling Stones Sticky Fingers 1971 spotify <NA> <NA> 29m6DinzdaD0OPqWKGyM…
这可以通过 data.table 中的 dcast 在一行中完成,因此我认为非常优雅。
library(data.table)
# data.table's dcast() only has a method for data.table input, so convert the
# data.frame first; as.data.table() makes a copy and leaves `df` untouched.
# rowid(artist, source) numbers repeated (artist, source) pairs 1, 2, ...,
# which yields column names such as "amazon_1" and "amazon_2".
# value.var is spelled out so dcast() does not have to guess the value column.
dcast(
  as.data.table(df),
  artist + album + year ~ paste(source, rowid(artist, source), sep = "_"),
  value.var = "id"
)
# artist album year amazon_1 amazon_2 spotify_1
#1 Beatles Sgt. Pepper's 1967 B0025KVLTM B06WGVMLJY 6QaVfG1pHYl1z15ZxkvVDW
#2 Rolling Stones Sticky Fingers 1971 <NA> <NA> 29m6DinzdaD0OPqWKGyMdz
这是一种方法。
# Count occurrences within each (artist, source) pair, glue the counter onto
# `source` ("amazon_1", "amazon_2", ...) and spread the IDs over those labels.
# No ungroup() is called, so the result remains a grouped tibble.
df %>%
  group_by(artist, source) %>%
  mutate(occurrence = row_number()) %>%
  unite(col = source, source, occurrence, sep = "_") %>%
  spread(key = source, value = id)
# A tibble: 2 x 6
# Groups: artist [2]
artist album year amazon_1 amazon_2 spotify_1
<chr> <chr> <dbl> <chr> <chr> <chr>
1 Beatles Sgt. Pepper's 1967 B0025KVLTM B06WGVMLJY 6QaVfG1pHYl1z15ZxkvVDW
2 Rolling Stones Sticky Fingers 1971 NA NA 29m6DinzdaD0OPqWKGyMdz
也可以用基础 R 中的 ave 和 reshape 来实现。
# Append a per-album occurrence counter to each source label ("amazon_1",
# "amazon_2", ...) so that several IDs from the same source end up in
# separate columns after reshaping to wide format.
# `sep = "_"` has to be an argument of paste(): if it is passed to with()
# instead, it is silently swallowed by `...` and the labels become "amazon 1".
df$source <- with(df, paste(
  source,
  # seq_along() inside each (artist, album, year, source) group numbers the
  # occurrences 1, 2, 3, ... independently of how the rows are ordered.
  ave(seq_along(source), artist, album, year, source, FUN = seq_along),
  sep = "_"
))
reshape(df, timevar = "source", idvar = c("artist", "album", "year"),
        direction = "wide")
# artist album year id.amazon_1 id.spotify_1 id.amazon_2 id.amazon_3
# 1 Beatles Sgt. Pepper's 1967 B0025KVLTM 6QaVfG1pHYl1z15ZxkvVDW B06WGVMLJY SoMeFoO
# 4 Rolling Stones Sticky Fingers 1971 <NA> 29m6DinzdaD0OPqWKGyMdz <NA> <NA>
数据
# Example data: the four sample rows plus a fifth row that copies row 1 but
# carries a different id, giving the Beatles album three amazon IDs.
df <- data.frame(
  artist = c(rep("Beatles", 3), "Rolling Stones"),
  album = c(rep("Sgt. Pepper's", 3), "Sticky Fingers"),
  year = c(rep(1967, 3), 1971),
  source = c("amazon", "spotify", "amazon", "spotify"),
  id = c(
    "B0025KVLTM", "6QaVfG1pHYl1z15ZxkvVDW",
    "B06WGVMLJY", "29m6DinzdaD0OPqWKGyMdz"
  ),
  stringsAsFactors = FALSE
)
df <- rbind(df, df[1, ])
df[5, "id"] <- "SoMeFoO"
我有一些数据,看起来像这样(生成数据的代码附在最后):
#> artist album year source id
#> 1 Beatles Sgt. Pepper's 1967 amazon B0025KVLTM
#> 2 Beatles Sgt. Pepper's 1967 spotify 6QaVfG1pHYl1z15ZxkvVDW
#> 3 Beatles Sgt. Pepper's 1967 amazon B06WGVMLJY
#> 4 Rolling Stones Sticky Fingers 1971 spotify 29m6DinzdaD0OPqWKGyMdz
我想整理 'id' 列(其中包含来自多个来源的 ID,来源由 'source' 列标明)。
这本应是一个简单的 spread(),但麻烦的是,有时同一个来源会出现多个 ID:请参见上面的第 1 行和第 3 行。
有没有简单的方法来执行 spread(),并把同一来源的多个 ID 放到新列中?
我想要的结果是:
#> artist album year source amazon_id amazon_id_2
#> 1 Beatles Sgt. Pepper's 1967 amazon B0025KVLTM B06WGVMLJY
#> 2 Rolling Stones Sticky Fingers 1971 spotify <NA> <NA>
#> spotify
#> 1 6QaVfG1pHYl1z15ZxkvVDW
#> 2 29m6DinzdaD0OPqWKGyMdz
下面的代码是输入示例数据:
# Example input: one row per ID. The Beatles album carries two different
# amazon IDs (rows 1 and 3) in addition to its spotify ID.
df <- data.frame(
  artist = c(rep("Beatles", 3), "Rolling Stones"),
  album = c(rep("Sgt. Pepper's", 3), "Sticky Fingers"),
  year = c(rep(1967, 3), 1971),
  source = c("amazon", "spotify", "amazon", "spotify"),
  id = c(
    "B0025KVLTM", "6QaVfG1pHYl1z15ZxkvVDW",
    "B06WGVMLJY", "29m6DinzdaD0OPqWKGyMdz"
  ),
  stringsAsFactors = FALSE
)
df
一种可能是:
# Number the IDs within each (artist, album, year, source) group and use
# "<source>_<n>" as the spread key, so repeated IDs from one source get their
# own columns. `source` is still present when spreading, so it remains an
# identifying column in the result.
df %>%
  group_by(artist, album, year, source) %>%
  mutate(source_key = paste(source, row_number(), sep = "_")) %>%
  spread(key = source_key, value = id) %>%
  ungroup()
artist album year source amazon_1 amazon_2 spotify_1
<chr> <chr> <dbl> <chr> <chr> <chr> <chr>
1 Beatles Sgt. Pepper's 1967 amazon B0025KVLTM B06WGVMLJY <NA>
2 Beatles Sgt. Pepper's 1967 spotify <NA> <NA> 6QaVfG1pHYl1z15ZxkvVDW
3 Rolling Stones Sticky Fingers 1971 spotify <NA> <NA> 29m6DinzdaD0OPqWKGyMdz
请注意,这里的输出有三行:因为 source 列被保留为标识列,而披头士专辑有 amazon 和 spotify 两个不同的 "source" 值,所以该专辑占了两行。
如果您仍然想要两行,您可以这样做:
# Build the "<source>_<n>" key per (artist, album, year, source) group, then
# drop `source` before spreading so that only artist/album/year identify a
# row in the wide result.
df %>%
  group_by(artist, album, year, source) %>%
  mutate(source_key = paste(source, row_number(), sep = "_")) %>%
  ungroup() %>%
  select(-source) %>%
  spread(key = source_key, value = id)
artist album year amazon_1 amazon_2 spotify_1
<chr> <chr> <dbl> <chr> <chr> <chr>
1 Beatles Sgt. Pepper's 1967 B0025KVLTM B06WGVMLJY 6QaVfG1pHYl1z15ZxkvVDW
2 Rolling Stones Sticky Fingers 1971 <NA> <NA> 29m6DinzdaD0OPqWKGyMdz
如果您还想拥有 "source" 列:
# Build the "<source>_<n>" key per (artist, album, year, source) group, then
# regroup by album and overwrite `source` with the comma-separated list of
# that album's distinct sources, so it is constant within an album when
# spreading.
df %>%
  group_by(artist, album, year, source) %>%
  mutate(source_key = paste(source, row_number(), sep = "_")) %>%
  group_by(artist, album, year) %>%
  mutate(source = paste(unique(source), collapse = ", ")) %>%
  spread(key = source_key, value = id) %>%
  ungroup()
artist album year source amazon_1 amazon_2 spotify_1
<chr> <chr> <dbl> <chr> <chr> <chr> <chr>
1 Beatles Sgt. Pepper's 1967 amazon, spotify B0025KVL… B06WGVML… 6QaVfG1pHYl1z15ZxkvV…
2 Rolling Stones Sticky Fingers 1971 spotify <NA> <NA> 29m6DinzdaD0OPqWKGyM…
这可以通过 data.table 中的 dcast 在一行中完成,因此我认为非常优雅。
library(data.table)
# data.table's dcast() only has a method for data.table input, so convert the
# data.frame first; as.data.table() makes a copy and leaves `df` untouched.
# rowid(artist, source) numbers repeated (artist, source) pairs 1, 2, ...,
# which yields column names such as "amazon_1" and "amazon_2".
# value.var is spelled out so dcast() does not have to guess the value column.
dcast(
  as.data.table(df),
  artist + album + year ~ paste(source, rowid(artist, source), sep = "_"),
  value.var = "id"
)
# artist album year amazon_1 amazon_2 spotify_1
#1 Beatles Sgt. Pepper's 1967 B0025KVLTM B06WGVMLJY 6QaVfG1pHYl1z15ZxkvVDW
#2 Rolling Stones Sticky Fingers 1971 <NA> <NA> 29m6DinzdaD0OPqWKGyMdz
这是一种方法。
# Count occurrences within each (artist, source) pair, glue the counter onto
# `source` ("amazon_1", "amazon_2", ...) and spread the IDs over those labels.
# No ungroup() is called, so the result remains a grouped tibble.
df %>%
  group_by(artist, source) %>%
  mutate(occurrence = row_number()) %>%
  unite(col = source, source, occurrence, sep = "_") %>%
  spread(key = source, value = id)
# A tibble: 2 x 6
# Groups: artist [2]
artist album year amazon_1 amazon_2 spotify_1
<chr> <chr> <dbl> <chr> <chr> <chr>
1 Beatles Sgt. Pepper's 1967 B0025KVLTM B06WGVMLJY 6QaVfG1pHYl1z15ZxkvVDW
2 Rolling Stones Sticky Fingers 1971 NA NA 29m6DinzdaD0OPqWKGyMdz
也可以用基础 R 中的 ave 和 reshape 来实现。
# Append a per-album occurrence counter to each source label ("amazon_1",
# "amazon_2", ...) so that several IDs from the same source end up in
# separate columns after reshaping to wide format.
# `sep = "_"` has to be an argument of paste(): if it is passed to with()
# instead, it is silently swallowed by `...` and the labels become "amazon 1".
df$source <- with(df, paste(
  source,
  # seq_along() inside each (artist, album, year, source) group numbers the
  # occurrences 1, 2, 3, ... independently of how the rows are ordered.
  ave(seq_along(source), artist, album, year, source, FUN = seq_along),
  sep = "_"
))
reshape(df, timevar = "source", idvar = c("artist", "album", "year"),
        direction = "wide")
# artist album year id.amazon_1 id.spotify_1 id.amazon_2 id.amazon_3
# 1 Beatles Sgt. Pepper's 1967 B0025KVLTM 6QaVfG1pHYl1z15ZxkvVDW B06WGVMLJY SoMeFoO
# 4 Rolling Stones Sticky Fingers 1971 <NA> 29m6DinzdaD0OPqWKGyMdz <NA> <NA>
数据
# Example data: the four sample rows plus a fifth row that copies row 1 but
# carries a different id, giving the Beatles album three amazon IDs.
df <- data.frame(
  artist = c(rep("Beatles", 3), "Rolling Stones"),
  album = c(rep("Sgt. Pepper's", 3), "Sticky Fingers"),
  year = c(rep(1967, 3), 1971),
  source = c("amazon", "spotify", "amazon", "spotify"),
  id = c(
    "B0025KVLTM", "6QaVfG1pHYl1z15ZxkvVDW",
    "B06WGVMLJY", "29m6DinzdaD0OPqWKGyMdz"
  ),
  stringsAsFactors = FALSE
)
df <- rbind(df, df[1, ])
df[5, "id"] <- "SoMeFoO"