如何根据列匹配有效地将这些 imdb 电影标题 ID 替换为实际标题?
How to efficiently replace these imdb movie title IDs with the actual title based on a column match?
我一直在用 R 处理 IMDB 发布的一些数据,今天已经被这个问题卡了很久。
primaryName tconst primaryTitle knownForTitles
1 Aaron Lim tt2317744 My Friend Bernard tt0268228,tt0891369,tt2317744,tt3709694
2 Aaron Woodley tt3228088 Spark: A Space Tail tt0326065,tt1650535,tt4426464,tt3228088
3 Abdelkader Belhedi tt11069302 The Carthage Castaways tt11698758,tt11069302,tt0485746
我正在努力想出一种方法来将 knownForTitles
ID 与 tconst
列中的 ID 相匹配。匹配后,我想用 primaryTitle
中的实际标题名称替换 knownForTitles
中的 ID,如下所示。
primaryName tconst primaryTitle knownForTitles
1 Aaron Lim tt2317744 My Friend Bernard Movie Title,Movie Title,Movie Title,Movie Title
2 Aaron Woodley tt3228088 Spark: A Space Tail Movie Title,Movie Title,Movie Title,Movie Title
3 Abdelkader Belhedi tt11069302 The Carthage Castaways Movie Title,Movie Title,Movie Title
我只能想到使用一堆 for 循环,这对于数千行来说可能非常低效。如果有人能指出我更好的方向,那就太棒了。
代码是这样的。解释如下。
代码。
# Replace the IDs in knownForTitles with the titles found by matching them
# against tconst. dplyr supplies %>%, mutate, select, the joins, group_by and
# summarise used below.
library(dplyr)

# Toy data: one row per person, with a title ID, its title, and a
# comma-separated string of title IDs.
df <- data.frame(
  primaryName = c("Aaron Lim", "Aaron Woodley"),
  tconst = c("tt2317744", "tt3228088"),
  primaryTitle = c("My friend Ron", "Spark: Some Title"),
  knownForTitles = c(
    "tt0268228,tt0891369,tt2317744,tt3709694",
    "tt0326065,tt1650535,tt4426464,tt3228088"
  ),
  stringsAsFactors = FALSE
)
df$tconst <- as.character(df$tconst)

# One row per (person, known-for ID); the split ID is stored in V2.
Names <- df %>%
  mutate(V2 = strsplit(as.character(knownForTitles), ",")) %>%
  tidyr::unnest(V2) %>%
  select(-knownForTitles) %>%
  as.data.frame()

# Lookup table of title ID -> title, selected by name and de-duplicated so a
# repeated ID cannot multiply rows in the join.
Movies <- unique(df[, c("tconst", "primaryTitle")])

# Both tables have a primaryTitle column, so the join result carries
# primaryTitle.x (from Names) and primaryTitle.y (the title looked up for V2).
Modi <- left_join(Names, Movies, by = c("V2" = "tconst"))
Modi$primaryTitle.y <- as.character(Modi$primaryTitle.y)
# IDs with no matching tconst come back as NA; mark them with a placeholder.
Modi$primaryTitle.y[is.na(Modi$primaryTitle.y)] <- "Test"

# Collapse the looked-up titles to one string per tconst and attach it to the
# original rows as primNew, naming the join key explicitly.
Modi %>%
  group_by(tconst) %>%
  summarise(primNew = stringr::str_c(primaryTitle.y, collapse = ", ")) %>%
  inner_join(df, ., by = "tconst")
输出。
primaryName tconst primaryTitle knownForTitles
1 Aaron Lim tt2317744 My friend Ron tt0268228,tt0891369,tt2317744,tt3709694
2 Aaron Woodley tt3228088 Spark: Some Title tt0326065,tt1650535,tt4426464,tt3228088
primNew
1 Test, Test, My friend Ron, Test
2 Test, Test, Test, Spark: Some Title
解释。
让我们定义一些玩具数据。
# Toy data: one row per person, with a title ID (tconst), its title, and a
# comma-separated string of title IDs (knownForTitles). The titles are the ones
# that appear in the printed results of the later steps.
df <- data.frame(
  primaryName = c("Aaron Lim", "Aaron Woodley"),
  tconst = c("tt2317744", "tt3228088"),
  primaryTitle = c("My friend Ron", "Spark: Some Title"),
  knownForTitles = c(
    "tt0268228,tt0891369,tt2317744,tt3709694",
    "tt0326065,tt1650535,tt4426464,tt3228088"
  ),
  stringsAsFactors = FALSE
)
df$tconst <- as.character(df$tconst)
然后你可以使用 tidyr 的 unnest
函数将所有列字符串拆分为行,像这样
# Split every knownForTitles string on commas and give each ID its own row
# (column V2); the original comma-separated column is then dropped.
Names <- df %>%
  mutate(
    V2 = strsplit(as.character(knownForTitles), ",")
  ) %>%
  tidyr::unnest(V2) %>%
  select(-knownForTitles) %>%
  as.data.frame()
结果
> Names
primaryName tconst primaryTitle V2
1 Aaron Lim tt2317744 My friend Ron tt0268228
2 Aaron Lim tt2317744 My friend Ron tt0891369
3 Aaron Lim tt2317744 My friend Ron tt2317744
4 Aaron Lim tt2317744 My friend Ron tt3709694
5 Aaron Woodley tt3228088 Spark: Some Title tt0326065
6 Aaron Woodley tt3228088 Spark: Some Title tt1650535
7 Aaron Woodley tt3228088 Spark: Some Title tt4426464
8 Aaron Woodley tt3228088 Spark: Some Title tt3228088
然后通过左连接,为每个拆分出来的 ID(V2 列)查出 tconst 对应的电影名称
# Lookup table of title ID -> title. Columns are picked by name rather than by
# position, and duplicate rows are dropped so that a repeated ID cannot
# multiply rows in the join.
Movies <- unique(df[, c("tconst", "primaryTitle")])
# Both inputs have a primaryTitle column, so the result carries
# primaryTitle.x (from Names) and primaryTitle.y (the title looked up for V2).
Modi <- left_join(Names, Movies, by = c("V2" = "tconst"))
结果
primaryName tconst primaryTitle.x V2 primaryTitle.y
1 Aaron Lim tt2317744 My friend Ron tt0268228 <NA>
2 Aaron Lim tt2317744 My friend Ron tt0891369 <NA>
3 Aaron Lim tt2317744 My friend Ron tt2317744 My friend Ron
4 Aaron Lim tt2317744 My friend Ron tt3709694 <NA>
5 Aaron Woodley tt3228088 Spark: Some Title tt0326065 <NA>
6 Aaron Woodley tt3228088 Spark: Some Title tt1650535 <NA>
7 Aaron Woodley tt3228088 Spark: Some Title tt4426464 <NA>
8 Aaron Woodley tt3228088 Spark: Some Title tt3228088 Spark: Some Title
因为这是玩具数据,大部分 ID 找不到匹配,结果中会出现 NA 值,
这会带来一些麻烦,所以我们用
# Make sure the looked-up title column is character, then replace the NA
# entries (IDs with no match) with the placeholder "Test".
Modi$primaryTitle.y <- as.character(Modi$primaryTitle.y)
Modi$primaryTitle.y[is.na(Modi$primaryTitle.y)] <- "Test"
来应对。
最后,按 tconst 分组,把匹配到的电影标题合并成一个字符串,再连接回原始数据
# Collapse the looked-up titles back to one comma-separated string per tconst
# and attach it to the original rows as primNew.
Modi %>%
  group_by(tconst) %>%
  summarise(primNew = stringr::str_c(primaryTitle.y, collapse = ", ")) %>%
  # Name the key explicitly instead of relying on the common-column default.
  inner_join(df, ., by = "tconst")
结果
primaryName tconst primaryTitle knownForTitles
1 Aaron Lim tt2317744 My friend Ron tt0268228,tt0891369,tt2317744,tt3709694
2 Aaron Woodley tt3228088 Spark: Some Title tt0326065,tt1650535,tt4426464,tt3228088
primNew
1 Test, Test, My friend Ron, Test
2 Test, Test, Test, Spark: Some Title
我们可以先用 separate_rows 把 knownForTitles 拆成多行,再用 match
将其中的 ID 与 tconst 匹配,得到对应的 primaryTitle 值,最后按每个人(primaryName)汇总。
library(dplyr)

# Put each known-for ID on its own row, translate it to a title by matching
# against tconst (unmatched IDs become NA), then paste the matched titles back
# into one string per original row. Grouping by all three identifying columns
# keeps tconst and primaryTitle in the result instead of dropping them.
df %>%
  tidyr::separate_rows(knownForTitles, sep = ",") %>%
  mutate(knownForTitles = primaryTitle[match(knownForTitles, tconst)]) %>%
  group_by(primaryName, tconst, primaryTitle) %>%
  summarise(
    knownForTitles = toString(na.omit(knownForTitles)),
    .groups = "drop"
  )
在 base R 中,我们可以先用 strsplit 拆分字符串,再用 match 进行匹配
# For each row, split the comma-separated IDs, look each one up in tconst and
# join the matching titles with ", " (IDs without a match are dropped).
# vapply guarantees one string per row; as.character keeps strsplit working
# when the column is a factor.
df$knownForTitles <- vapply(
  strsplit(as.character(df$knownForTitles), ",", fixed = TRUE),
  function(ids) toString(na.omit(df$primaryTitle[match(ids, df$tconst)])),
  character(1)
)
我一直在用 R 处理 IMDB 发布的一些数据,今天已经被这个问题卡了很久。
primaryName tconst primaryTitle knownForTitles
1 Aaron Lim tt2317744 My Friend Bernard tt0268228,tt0891369,tt2317744,tt3709694
2 Aaron Woodley tt3228088 Spark: A Space Tail tt0326065,tt1650535,tt4426464,tt3228088
3 Abdelkader Belhedi tt11069302 The Carthage Castaways tt11698758,tt11069302,tt0485746
我正在努力想出一种方法来将 knownForTitles
ID 与 tconst
列中的 ID 相匹配。匹配后,我想用 primaryTitle
中的实际标题名称替换 knownForTitles
中的 ID,如下所示。
primaryName tconst primaryTitle knownForTitles
1 Aaron Lim tt2317744 My Friend Bernard Movie Title,Movie Title,Movie Title,Movie Title
2 Aaron Woodley tt3228088 Spark: A Space Tail Movie Title,Movie Title,Movie Title,Movie Title
3 Abdelkader Belhedi tt11069302 The Carthage Castaways Movie Title,Movie Title,Movie Title
我只能想到使用一堆 for 循环,这对于数千行来说可能非常低效。如果有人能指出我更好的方向,那就太棒了。
代码是这样的。解释如下。
代码。
# Replace the IDs in knownForTitles with the titles found by matching them
# against tconst. dplyr supplies %>%, mutate, select, the joins, group_by and
# summarise used below.
library(dplyr)

# Toy data: one row per person, with a title ID, its title, and a
# comma-separated string of title IDs.
df <- data.frame(
  primaryName = c("Aaron Lim", "Aaron Woodley"),
  tconst = c("tt2317744", "tt3228088"),
  primaryTitle = c("My friend Ron", "Spark: Some Title"),
  knownForTitles = c(
    "tt0268228,tt0891369,tt2317744,tt3709694",
    "tt0326065,tt1650535,tt4426464,tt3228088"
  ),
  stringsAsFactors = FALSE
)
df$tconst <- as.character(df$tconst)

# One row per (person, known-for ID); the split ID is stored in V2.
Names <- df %>%
  mutate(V2 = strsplit(as.character(knownForTitles), ",")) %>%
  tidyr::unnest(V2) %>%
  select(-knownForTitles) %>%
  as.data.frame()

# Lookup table of title ID -> title, selected by name and de-duplicated so a
# repeated ID cannot multiply rows in the join.
Movies <- unique(df[, c("tconst", "primaryTitle")])

# Both tables have a primaryTitle column, so the join result carries
# primaryTitle.x (from Names) and primaryTitle.y (the title looked up for V2).
Modi <- left_join(Names, Movies, by = c("V2" = "tconst"))
Modi$primaryTitle.y <- as.character(Modi$primaryTitle.y)
# IDs with no matching tconst come back as NA; mark them with a placeholder.
Modi$primaryTitle.y[is.na(Modi$primaryTitle.y)] <- "Test"

# Collapse the looked-up titles to one string per tconst and attach it to the
# original rows as primNew, naming the join key explicitly.
Modi %>%
  group_by(tconst) %>%
  summarise(primNew = stringr::str_c(primaryTitle.y, collapse = ", ")) %>%
  inner_join(df, ., by = "tconst")
输出。
primaryName tconst primaryTitle knownForTitles
1 Aaron Lim tt2317744 My friend Ron tt0268228,tt0891369,tt2317744,tt3709694
2 Aaron Woodley tt3228088 Spark: Some Title tt0326065,tt1650535,tt4426464,tt3228088
primNew
1 Test, Test, My friend Ron, Test
2 Test, Test, Test, Spark: Some Title
解释。
让我们定义一些玩具数据。
# Toy data: one row per person, with a title ID (tconst), its title, and a
# comma-separated string of title IDs (knownForTitles). The titles are the ones
# that appear in the printed results of the later steps.
df <- data.frame(
  primaryName = c("Aaron Lim", "Aaron Woodley"),
  tconst = c("tt2317744", "tt3228088"),
  primaryTitle = c("My friend Ron", "Spark: Some Title"),
  knownForTitles = c(
    "tt0268228,tt0891369,tt2317744,tt3709694",
    "tt0326065,tt1650535,tt4426464,tt3228088"
  ),
  stringsAsFactors = FALSE
)
df$tconst <- as.character(df$tconst)
然后你可以使用 tidyr 的 unnest
函数将所有列字符串拆分为行,像这样
# Split every knownForTitles string on commas and give each ID its own row
# (column V2); the original comma-separated column is then dropped.
Names <- df %>%
  mutate(
    V2 = strsplit(as.character(knownForTitles), ",")
  ) %>%
  tidyr::unnest(V2) %>%
  select(-knownForTitles) %>%
  as.data.frame()
结果
> Names
primaryName tconst primaryTitle V2
1 Aaron Lim tt2317744 My friend Ron tt0268228
2 Aaron Lim tt2317744 My friend Ron tt0891369
3 Aaron Lim tt2317744 My friend Ron tt2317744
4 Aaron Lim tt2317744 My friend Ron tt3709694
5 Aaron Woodley tt3228088 Spark: Some Title tt0326065
6 Aaron Woodley tt3228088 Spark: Some Title tt1650535
7 Aaron Woodley tt3228088 Spark: Some Title tt4426464
8 Aaron Woodley tt3228088 Spark: Some Title tt3228088
然后通过左连接,为每个拆分出来的 ID(V2 列)查出 tconst 对应的电影名称
# Lookup table of title ID -> title. Columns are picked by name rather than by
# position, and duplicate rows are dropped so that a repeated ID cannot
# multiply rows in the join.
Movies <- unique(df[, c("tconst", "primaryTitle")])
# Both inputs have a primaryTitle column, so the result carries
# primaryTitle.x (from Names) and primaryTitle.y (the title looked up for V2).
Modi <- left_join(Names, Movies, by = c("V2" = "tconst"))
结果
primaryName tconst primaryTitle.x V2 primaryTitle.y
1 Aaron Lim tt2317744 My friend Ron tt0268228 <NA>
2 Aaron Lim tt2317744 My friend Ron tt0891369 <NA>
3 Aaron Lim tt2317744 My friend Ron tt2317744 My friend Ron
4 Aaron Lim tt2317744 My friend Ron tt3709694 <NA>
5 Aaron Woodley tt3228088 Spark: Some Title tt0326065 <NA>
6 Aaron Woodley tt3228088 Spark: Some Title tt1650535 <NA>
7 Aaron Woodley tt3228088 Spark: Some Title tt4426464 <NA>
8 Aaron Woodley tt3228088 Spark: Some Title tt3228088 Spark: Some Title
因为这是玩具数据,大部分 ID 找不到匹配,结果中会出现 NA 值,
这会带来一些麻烦,所以我们用
# Make sure the looked-up title column is character, then replace the NA
# entries (IDs with no match) with the placeholder "Test".
Modi$primaryTitle.y <- as.character(Modi$primaryTitle.y)
Modi$primaryTitle.y[is.na(Modi$primaryTitle.y)] <- "Test"
来应对。
最后,按 tconst 分组,把匹配到的电影标题合并成一个字符串,再连接回原始数据
# Collapse the looked-up titles back to one comma-separated string per tconst
# and attach it to the original rows as primNew.
Modi %>%
  group_by(tconst) %>%
  summarise(primNew = stringr::str_c(primaryTitle.y, collapse = ", ")) %>%
  # Name the key explicitly instead of relying on the common-column default.
  inner_join(df, ., by = "tconst")
结果
primaryName tconst primaryTitle knownForTitles
1 Aaron Lim tt2317744 My friend Ron tt0268228,tt0891369,tt2317744,tt3709694
2 Aaron Woodley tt3228088 Spark: Some Title tt0326065,tt1650535,tt4426464,tt3228088
primNew
1 Test, Test, My friend Ron, Test
2 Test, Test, Test, Spark: Some Title
我们可以先用 separate_rows 把 knownForTitles 拆成多行,再用 match
将其中的 ID 与 tconst 匹配,得到对应的 primaryTitle 值,最后按每个人(primaryName)汇总。
library(dplyr)

# Put each known-for ID on its own row, translate it to a title by matching
# against tconst (unmatched IDs become NA), then paste the matched titles back
# into one string per original row. Grouping by all three identifying columns
# keeps tconst and primaryTitle in the result instead of dropping them.
df %>%
  tidyr::separate_rows(knownForTitles, sep = ",") %>%
  mutate(knownForTitles = primaryTitle[match(knownForTitles, tconst)]) %>%
  group_by(primaryName, tconst, primaryTitle) %>%
  summarise(
    knownForTitles = toString(na.omit(knownForTitles)),
    .groups = "drop"
  )
在 base R 中,我们可以先用 strsplit 拆分字符串,再用 match 进行匹配
# For each row, split the comma-separated IDs, look each one up in tconst and
# join the matching titles with ", " (IDs without a match are dropped).
# vapply guarantees one string per row; as.character keeps strsplit working
# when the column is a factor.
df$knownForTitles <- vapply(
  strsplit(as.character(df$knownForTitles), ",", fixed = TRUE),
  function(ids) toString(na.omit(df$primaryTitle[match(ids, df$tconst)])),
  character(1)
)