在 R 中,当 ID 列存在但不相关时如何应用 VLOOKUP?
In R, how do I apply a VLOOKUP when an ID column is present but not otherwise relevant?
关于这个问题已经有很多相关的提问,尤其是使用 dplyr 中的 left_join 的,但我还是没能弄明白。
我想做的就是:根据与 df 中 Language 列的匹配,从 lookup 中返回 LanguageClean;如果没有匹配,则直接返回 NA。我希望将 LanguageClean 作为新列添加到 df 中。
我发现下面的代码会使 ID 重复出现(即行被复制),但这不是我想要的。ID 列与我这里的目的无关,不过我需要在最终的数据框中保留它。
# Example input. `ID` is a 10 x 3 integer matrix column whose three columns
# repeat the same ten ids; `Language` holds the codes to look up ("" where
# none is given); `Geo` is an accompanying character column.
df <- structure(
  list(
    ID = matrix(
      rep(c(18L, 89L, 42L, 161L, 88L, 71L, 175L, 181L, 133L, 56L), times = 3L),
      nrow = 10L,
      ncol = 3L
    ),
    Language = c("en", "", "lv", "en", "en", "de", "en", "ms", "", "en"),
    Geo = c("us", "", "-", "us", "us", "gb", "ca", "us", "-", "us")
  ),
  class = "data.frame",
  row.names = c(NA, -10L)
)
# Lookup table with 124 rows mapping a language code (`Language`) to its
# display name (`LanguageClean`). Many codes occur more than once (e.g. "ar",
# "en", "es"), each time paired with the same name, so `Language` is not a
# unique key in this table.
lookup <- structure(list(Language = c("af", "ar", "ar", "ar", "ar", "ar",
"ar", "ar", "ar", "eu", "be", "zh", "zh", "hr", "da", "nl", "en",
"en", "en", "en", "en", "en", "fo", "fi", "fr", "fr", "gd", "de",
"de", "de", "he", "hu", "id", "it", "ko", "lv", "mk", "mt", "no",
"pt", "rm", "ro", "ru", "sr", "sk", "sb", "es", "es", "es", "es",
"es", "es", "es", "es", "es", "sx", "sv", "ts", "tr", "ur", "vi",
"ji", "sq", "ar", "ar", "ar", "ar", "ar", "ar", "ar", "ar", "bg",
"ca", "zh", "zh", "cs", "nl", "en", "en", "en", "en", "en", "en",
"et", "fa", "fr", "fr", "fr", "ga", "de", "de", "el", "hi", "is",
"it", "ja", "ko", "lt", "ms", "no", "pl", "pt", "ro", "ru", "sz",
"sr", "sl", "es", "es", "es", "es", "es", "es", "es", "es", "es",
"es", "sv", "th", "tn", "uk", "ve", "xh", "zu"), LanguageClean = c("Afrikaans",
"Arabic", "Arabic", "Arabic", "Arabic", "Arabic", "Arabic", "Arabic",
"Arabic", "Basque", "Belarusian", "Chinese", "Chinese", "Croatian",
"Danish", "Dutch", "English", "English", "English", "English",
"English", "English", "Faeroese", "Finnish", "French", "French",
"Gaelic", "German", "German", "German", "Hebrew", "Hungarian",
"Indonesian", "Italian", "Korean", "Latvian", "Macedonian", "Maltese",
"Norwegian", "Portuguese", "Rhaeto-Romanic", "Romanian", "Russian",
"Serbian", "Slovak", "Sorbian", "Spanish", "Spanish", "Spanish",
"Spanish", "Spanish", "Spanish", "Spanish", "Spanish", "Spanish",
"Sutu", "Swedish", "Tsonga", "Turkish", "Urdu", "Vietnamese",
"Yiddish", "Albanian", "Arabic", "Arabic", "Arabic", "Arabic",
"Arabic", "Arabic", "Arabic", "Arabic", "Bulgarian", "Catalan",
"Chinese", "Chinese", "Czech", "Dutch", "English", "English",
"English", "English", "English", "English", "Estonian", "Farsi",
"French", "French", "French", "Irish", "German", "German", "Greek",
"Hindi", "Icelandic", "Italian", "Japanese", "Korean", "Lithuanian",
"Malaysian", "Norwegian", "Polish", "Portuguese", "Romanian",
"Russian", "Sami", "Serbian", "Slovenian", "Spanish", "Spanish",
"Spanish", "Spanish", "Spanish", "Spanish", "Spanish", "Spanish",
"Spanish", "Spanish", "Swedish", "Thai", "Tswana", "Ukrainian",
"Venda", "Xhosa", "Zulu")), class = "data.frame", row.names = c(NA,
-124L))
# Attempted lookup: a plain left join on `Language`. `lookup` lists some
# codes several times, so every matching row of `df` comes back once per
# matching lookup row (this is the row duplication the question is about).
df <- left_join(x = df, y = lookup, by = "Language")
问题在于您的查找表(lookup)中某些语言有多条记录,因此连接时会得到多个匹配项。要解决这个问题,可以先用 dplyr::distinct 从查找表中筛选出唯一的 (Language, LanguageClean) 组合,再进行连接:
library(dplyr)

# Reduce `lookup` to its unique (Language, LanguageClean) pairs so that
# repeated identical entries no longer multiply rows, then attach
# LanguageClean to `df` by matching on Language. Codes without a match
# get NA.
df <- df %>%
  left_join(
    lookup %>% distinct(Language, LanguageClean),
    by = "Language"
  )
df
#>    ID.1 ID.2 ID.3 Language Geo LanguageClean
#> 1    18   18   18       en  us       English
#> 2    89   89   89                       <NA>
#> 3    42   42   42       lv   -       Latvian
#> 4   161  161  161       en  us       English
#> 5    88   88   88       en  us       English
#> 6    71   71   71       de  gb        German
#> 7   175  175  175       en  ca       English
#> 8   181  181  181       ms  us     Malaysian
#> 9   133  133  133            -          <NA>
#> 10   56   56   56       en  us       English
关于这个问题已经有很多相关的提问,尤其是使用 dplyr 中的 left_join 的,但我还是没能弄明白。
我想做的就是:根据与 df 中 Language 列的匹配,从 lookup 中返回 LanguageClean;如果没有匹配,则直接返回 NA。我希望将 LanguageClean 作为新列添加到 df 中。
我发现下面的代码会使 ID 重复出现(即行被复制),但这不是我想要的。ID 列与我这里的目的无关,不过我需要在最终的数据框中保留它。
# Example input. `ID` is a 10 x 3 integer matrix column whose three columns
# repeat the same ten ids; `Language` holds the codes to look up ("" where
# none is given); `Geo` is an accompanying character column.
df <- structure(
  list(
    ID = matrix(
      rep(c(18L, 89L, 42L, 161L, 88L, 71L, 175L, 181L, 133L, 56L), times = 3L),
      nrow = 10L,
      ncol = 3L
    ),
    Language = c("en", "", "lv", "en", "en", "de", "en", "ms", "", "en"),
    Geo = c("us", "", "-", "us", "us", "gb", "ca", "us", "-", "us")
  ),
  class = "data.frame",
  row.names = c(NA, -10L)
)
# Lookup table with 124 rows mapping a language code (`Language`) to its
# display name (`LanguageClean`). Many codes occur more than once (e.g. "ar",
# "en", "es"), each time paired with the same name, so `Language` is not a
# unique key in this table.
lookup <- structure(list(Language = c("af", "ar", "ar", "ar", "ar", "ar",
"ar", "ar", "ar", "eu", "be", "zh", "zh", "hr", "da", "nl", "en",
"en", "en", "en", "en", "en", "fo", "fi", "fr", "fr", "gd", "de",
"de", "de", "he", "hu", "id", "it", "ko", "lv", "mk", "mt", "no",
"pt", "rm", "ro", "ru", "sr", "sk", "sb", "es", "es", "es", "es",
"es", "es", "es", "es", "es", "sx", "sv", "ts", "tr", "ur", "vi",
"ji", "sq", "ar", "ar", "ar", "ar", "ar", "ar", "ar", "ar", "bg",
"ca", "zh", "zh", "cs", "nl", "en", "en", "en", "en", "en", "en",
"et", "fa", "fr", "fr", "fr", "ga", "de", "de", "el", "hi", "is",
"it", "ja", "ko", "lt", "ms", "no", "pl", "pt", "ro", "ru", "sz",
"sr", "sl", "es", "es", "es", "es", "es", "es", "es", "es", "es",
"es", "sv", "th", "tn", "uk", "ve", "xh", "zu"), LanguageClean = c("Afrikaans",
"Arabic", "Arabic", "Arabic", "Arabic", "Arabic", "Arabic", "Arabic",
"Arabic", "Basque", "Belarusian", "Chinese", "Chinese", "Croatian",
"Danish", "Dutch", "English", "English", "English", "English",
"English", "English", "Faeroese", "Finnish", "French", "French",
"Gaelic", "German", "German", "German", "Hebrew", "Hungarian",
"Indonesian", "Italian", "Korean", "Latvian", "Macedonian", "Maltese",
"Norwegian", "Portuguese", "Rhaeto-Romanic", "Romanian", "Russian",
"Serbian", "Slovak", "Sorbian", "Spanish", "Spanish", "Spanish",
"Spanish", "Spanish", "Spanish", "Spanish", "Spanish", "Spanish",
"Sutu", "Swedish", "Tsonga", "Turkish", "Urdu", "Vietnamese",
"Yiddish", "Albanian", "Arabic", "Arabic", "Arabic", "Arabic",
"Arabic", "Arabic", "Arabic", "Arabic", "Bulgarian", "Catalan",
"Chinese", "Chinese", "Czech", "Dutch", "English", "English",
"English", "English", "English", "English", "Estonian", "Farsi",
"French", "French", "French", "Irish", "German", "German", "Greek",
"Hindi", "Icelandic", "Italian", "Japanese", "Korean", "Lithuanian",
"Malaysian", "Norwegian", "Polish", "Portuguese", "Romanian",
"Russian", "Sami", "Serbian", "Slovenian", "Spanish", "Spanish",
"Spanish", "Spanish", "Spanish", "Spanish", "Spanish", "Spanish",
"Spanish", "Spanish", "Swedish", "Thai", "Tswana", "Ukrainian",
"Venda", "Xhosa", "Zulu")), class = "data.frame", row.names = c(NA,
-124L))
# Attempted lookup: a plain left join on `Language`. `lookup` lists some
# codes several times, so every matching row of `df` comes back once per
# matching lookup row (this is the row duplication the question is about).
df <- left_join(x = df, y = lookup, by = "Language")
问题在于您的查找表(lookup)中某些语言有多条记录,因此连接时会得到多个匹配项。要解决这个问题,可以先用 dplyr::distinct 从查找表中筛选出唯一的 (Language, LanguageClean) 组合,再进行连接:
library(dplyr)

# Reduce `lookup` to its unique (Language, LanguageClean) pairs so that
# repeated identical entries no longer multiply rows, then attach
# LanguageClean to `df` by matching on Language. Codes without a match
# get NA.
df <- df %>%
  left_join(
    lookup %>% distinct(Language, LanguageClean),
    by = "Language"
  )
df
#>    ID.1 ID.2 ID.3 Language Geo LanguageClean
#> 1    18   18   18       en  us       English
#> 2    89   89   89                       <NA>
#> 3    42   42   42       lv   -       Latvian
#> 4   161  161  161       en  us       English
#> 5    88   88   88       en  us       English
#> 6    71   71   71       de  gb        German
#> 7   175  175  175       en  ca       English
#> 8   181  181  181       ms  us     Malaysian
#> 9   133  133  133            -          <NA>
#> 10   56   56   56       en  us       English