dplyr:根据字符串中多个位置/字符的子字符串使用 mutate 创建新列
dplyr: mutate from multiple areas/characters of a substring
我是初学者,正在尝试用 dplyr 的 mutate/case_when 根据一个 10 位字符串中的多个子字符串条件进行分类。字符串的每一位代表一个族裔。例如,第 1-3 位中任意一位为“Y”而其余各位均为“N”的字符串应定义为“Latino”。我想知道如何正确处理这样的字符串:第 1-3 位中任意一位为“Y”,同时第 4-5 位(“Asian”)中任意一位也为“Y”。我希望将其定义为“Multi-Ethnic”。能否提供正确的代码,使这类字符串的结果为“Multi-Ethnic”?非常感谢这个网站!
library(dplyr)
# Sample applicant codes: every APP_AC value is a 10-character string made of
# "Y"/"N" flags, one flag per position.
data <- data.frame(
  APP_AC = c(
    "YNNNNNNNNN", "YYNNNNNNNN", "YYYNNNNNNN",
    "YNYNNNNNNN", "NNNYNNNNNN", "YNNYNNNNNN",
    "NNNNNYNNNN", "YNNNNYNNNY", "NNNNNNNNNN"
  )
)
# First attempt: label each APP_AC code by testing one position at a time.
# NOTE: case_when() uses the first condition that is TRUE for a row, so a code
# with "Y" in several groups (e.g. "YNNYNNNNNN") only ever receives the label
# of its left-most "Y" -- this is the behaviour the question asks how to
# change. Base substr() is used so the snippet runs with only dplyr attached
# (str_sub() would additionally require library(stringr)).
data %>%
  mutate(ETHNICITY = case_when(
    substr(APP_AC, 1, 1) == "Y" ~ "Latino",
    substr(APP_AC, 2, 2) == "Y" ~ "Latino",
    substr(APP_AC, 3, 3) == "Y" ~ "Latino",
    substr(APP_AC, 4, 4) == "Y" ~ "Asian",
    substr(APP_AC, 5, 5) == "Y" ~ "Asian",
    substr(APP_AC, 6, 6) == "Y" ~ "Black",
    substr(APP_AC, 7, 7) == "Y" ~ "Native_American_Alaskan",
    substr(APP_AC, 8, 8) == "Y" ~ "Pacific_Islander",
    substr(APP_AC, 9, 9) == "Y" ~ "Pacific_Islander",
    substr(APP_AC, 10, 10) == "Y" ~ "White",
    TRUE ~ "Unknown"
  ))
APP_AC ETHNICITY
1 YNNNNNNNNN Latino
2 YYNNNNNNNN Latino
3 YYYNNNNNNN Latino
4 YNYNNNNNNN Latino
5 NNNYNNNNNN Asian
6 YNNYNNNNNN Latino
7 NNNNNYNNNN Black
8 YNNNNYNNNY Latino
9 NNNNNNNNNN Unknown
期望的输出:
APP_AC ETHNICITY
1 YNNNNNNNNN Latino
2 YYNNNNNNNN Latino
3 YYYNNNNNNN Latino
4 YNYNNNNNNN Latino
5 NNNYNNNNNN Asian
6 YNNYNNNNNN Multi-Ethnic
7 NNNNNYNNNN Black
8 YNNNNYNNNY Multi-Ethnic
9 NNNNNNNNNN Unknown
您可以使用 str_detect:
library(dplyr)
library(stringr)
# Classify each APP_AC code into a single ETHNICITY label.
# Positions map to groups: 1-3 Latino, 4-5 Asian, 6 Black,
# 7 Native_American_Alaskan, 8-9 Pacific_Islander, 10 White.
# "Multi-Ethnic" must mean "Y" in more than one *group*: counting raw "Y"
# characters would mislabel e.g. "YYNNNNNNNN" (two Latino flags) as
# Multi-Ethnic instead of Latino.
data %>%
  mutate(
    # Each str_detect() yields TRUE/FALSE per row; adding them counts how many
    # distinct groups contain at least one "Y".
    n_groups = str_detect(str_sub(APP_AC, 1, 3), "Y") +
      str_detect(str_sub(APP_AC, 4, 5), "Y") +
      str_detect(str_sub(APP_AC, 6, 6), "Y") +
      str_detect(str_sub(APP_AC, 7, 7), "Y") +
      str_detect(str_sub(APP_AC, 8, 9), "Y") +
      str_detect(str_sub(APP_AC, 10, 10), "Y"),
    ETHNICITY = case_when(
      # Must come first: case_when() takes the first matching condition.
      n_groups > 1 ~ "Multi-Ethnic",
      str_sub(APP_AC, 1, 1) == "Y" ~ "Latino",
      str_sub(APP_AC, 2, 2) == "Y" ~ "Latino",
      str_sub(APP_AC, 3, 3) == "Y" ~ "Latino",
      str_sub(APP_AC, 4, 4) == "Y" ~ "Asian",
      str_sub(APP_AC, 5, 5) == "Y" ~ "Asian",
      str_sub(APP_AC, 6, 6) == "Y" ~ "Black",
      str_sub(APP_AC, 7, 7) == "Y" ~ "Native_American_Alaskan",
      str_sub(APP_AC, 8, 8) == "Y" ~ "Pacific_Islander",
      str_sub(APP_AC, 9, 9) == "Y" ~ "Pacific_Islander",
      str_sub(APP_AC, 10, 10) == "Y" ~ "White",
      TRUE ~ "Unknown"
    )
  ) %>%
  # Drop the helper so the result keeps only the input columns plus ETHNICITY.
  select(-n_groups)
# APP_AC ETHNICITY
#1 YNNNNNNNNN Latino
#2 NNNYNNNNNN Asian
#3 YNNYNNNNNN Multi-Ethnic
#4 NNNNNYNNNN Black
#5 NNNNNNNNNN Unknown
同样,您也可以把属于同一族裔的多个条件合并为一个,使代码更短。
# Shorter variant: one condition per group instead of one per position.
# Groups by position: 1-3 Latino, 4-5 Asian, 6 Black,
# 7 Native_American_Alaskan, 8-9 Pacific_Islander, 10 White.
# "Multi-Ethnic" is decided by the number of *groups* containing a "Y", not by
# the number of "Y" characters, so "YYNNNNNNNN" stays Latino.
data %>%
  mutate(
    # Sum of per-group logical flags = number of distinct groups flagged.
    n_groups = str_detect(str_sub(APP_AC, 1, 3), "Y") +
      str_detect(str_sub(APP_AC, 4, 5), "Y") +
      (str_sub(APP_AC, 6, 6) == "Y") +
      (str_sub(APP_AC, 7, 7) == "Y") +
      str_detect(str_sub(APP_AC, 8, 9), "Y") +
      (str_sub(APP_AC, 10, 10) == "Y"),
    ETHNICITY = case_when(
      # Must come first: case_when() takes the first matching condition.
      n_groups > 1 ~ "Multi-Ethnic",
      str_detect(str_sub(APP_AC, 1, 3), "Y") ~ "Latino",
      str_detect(str_sub(APP_AC, 4, 5), "Y") ~ "Asian",
      str_sub(APP_AC, 6, 6) == "Y" ~ "Black",
      str_sub(APP_AC, 7, 7) == "Y" ~ "Native_American_Alaskan",
      str_detect(str_sub(APP_AC, 8, 9), "Y") ~ "Pacific_Islander",
      str_sub(APP_AC, 10, 10) == "Y" ~ "White",
      TRUE ~ "Unknown"
    )
  ) %>%
  # Drop the helper so the result keeps only the input columns plus ETHNICITY.
  select(-n_groups)
已根据评论中描述的逻辑以及更新后的问题修改了答案。
代码
# Classify APP_AC codes; conditions are tried top to bottom and the first TRUE
# one supplies the label.
data %>%
  mutate(
    ETHNICITY = case_when(
      # "Y" somewhere in positions 1-3 and no "Y" anywhere in positions 4-10.
      str_detect(substr(APP_AC, 1, 3), "Y") &
        !str_detect(substr(APP_AC, 4, 10), "Y") ~ "Latino",
      # Still "Y" in 1-3 here, so positions 4-10 hold at least one "Y".
      str_detect(substr(APP_AC, 1, 3), "Y") ~ "Multi-Ethnic",
      str_detect(substr(APP_AC, 4, 5), "Y") ~ "Asian",
      substr(APP_AC, 6, 6) == "Y" ~ "Black",
      substr(APP_AC, 7, 7) == "Y" ~ "Native_American_Alaskan",
      str_detect(substr(APP_AC, 8, 9), "Y") ~ "Pacific_Islander",
      substr(APP_AC, 10, 10) == "Y" ~ "White",
      # No "Y" in any position.
      TRUE ~ "Unknown"
    )
  )
输出
APP_AC ETHNICITY
1 YNNNNNNNNN Latino
2 YYNNNNNNNN Latino
3 YYYNNNNNNN Latino
4 YNYNNNNNNN Latino
5 NNNYNNNNNN Asian
6 YNNYNNNNNN Multi-Ethnic
7 NNNNNYNNNN Black
8 YNNNNYNNNY Multi-Ethnic
9 NNNNNNNNNN Unknown
我是初学者,正在尝试用 dplyr 的 mutate/case_when 根据一个 10 位字符串中的多个子字符串条件进行分类。字符串的每一位代表一个族裔。例如,第 1-3 位中任意一位为“Y”而其余各位均为“N”的字符串应定义为“Latino”。我想知道如何正确处理这样的字符串:第 1-3 位中任意一位为“Y”,同时第 4-5 位(“Asian”)中任意一位也为“Y”。我希望将其定义为“Multi-Ethnic”。能否提供正确的代码,使这类字符串的结果为“Multi-Ethnic”?非常感谢这个网站!
library(dplyr)
# Sample applicant codes: every APP_AC value is a 10-character string made of
# "Y"/"N" flags, one flag per position.
data <- data.frame(
  APP_AC = c(
    "YNNNNNNNNN", "YYNNNNNNNN", "YYYNNNNNNN",
    "YNYNNNNNNN", "NNNYNNNNNN", "YNNYNNNNNN",
    "NNNNNYNNNN", "YNNNNYNNNY", "NNNNNNNNNN"
  )
)
# First attempt: label each APP_AC code by testing one position at a time.
# NOTE: case_when() uses the first condition that is TRUE for a row, so a code
# with "Y" in several groups (e.g. "YNNYNNNNNN") only ever receives the label
# of its left-most "Y" -- this is the behaviour the question asks how to
# change. Base substr() is used so the snippet runs with only dplyr attached
# (str_sub() would additionally require library(stringr)).
data %>%
  mutate(ETHNICITY = case_when(
    substr(APP_AC, 1, 1) == "Y" ~ "Latino",
    substr(APP_AC, 2, 2) == "Y" ~ "Latino",
    substr(APP_AC, 3, 3) == "Y" ~ "Latino",
    substr(APP_AC, 4, 4) == "Y" ~ "Asian",
    substr(APP_AC, 5, 5) == "Y" ~ "Asian",
    substr(APP_AC, 6, 6) == "Y" ~ "Black",
    substr(APP_AC, 7, 7) == "Y" ~ "Native_American_Alaskan",
    substr(APP_AC, 8, 8) == "Y" ~ "Pacific_Islander",
    substr(APP_AC, 9, 9) == "Y" ~ "Pacific_Islander",
    substr(APP_AC, 10, 10) == "Y" ~ "White",
    TRUE ~ "Unknown"
  ))
APP_AC ETHNICITY
1 YNNNNNNNNN Latino
2 YYNNNNNNNN Latino
3 YYYNNNNNNN Latino
4 YNYNNNNNNN Latino
5 NNNYNNNNNN Asian
6 YNNYNNNNNN Latino
7 NNNNNYNNNN Black
8 YNNNNYNNNY Latino
9 NNNNNNNNNN Unknown
期望的输出:
APP_AC ETHNICITY
1 YNNNNNNNNN Latino
2 YYNNNNNNNN Latino
3 YYYNNNNNNN Latino
4 YNYNNNNNNN Latino
5 NNNYNNNNNN Asian
6 YNNYNNNNNN Multi-Ethnic
7 NNNNNYNNNN Black
8 YNNNNYNNNY Multi-Ethnic
9 NNNNNNNNNN Unknown
您可以使用 str_detect:
library(dplyr)
library(stringr)
# Classify each APP_AC code into a single ETHNICITY label.
# Positions map to groups: 1-3 Latino, 4-5 Asian, 6 Black,
# 7 Native_American_Alaskan, 8-9 Pacific_Islander, 10 White.
# "Multi-Ethnic" must mean "Y" in more than one *group*: counting raw "Y"
# characters would mislabel e.g. "YYNNNNNNNN" (two Latino flags) as
# Multi-Ethnic instead of Latino.
data %>%
  mutate(
    # Each str_detect() yields TRUE/FALSE per row; adding them counts how many
    # distinct groups contain at least one "Y".
    n_groups = str_detect(str_sub(APP_AC, 1, 3), "Y") +
      str_detect(str_sub(APP_AC, 4, 5), "Y") +
      str_detect(str_sub(APP_AC, 6, 6), "Y") +
      str_detect(str_sub(APP_AC, 7, 7), "Y") +
      str_detect(str_sub(APP_AC, 8, 9), "Y") +
      str_detect(str_sub(APP_AC, 10, 10), "Y"),
    ETHNICITY = case_when(
      # Must come first: case_when() takes the first matching condition.
      n_groups > 1 ~ "Multi-Ethnic",
      str_sub(APP_AC, 1, 1) == "Y" ~ "Latino",
      str_sub(APP_AC, 2, 2) == "Y" ~ "Latino",
      str_sub(APP_AC, 3, 3) == "Y" ~ "Latino",
      str_sub(APP_AC, 4, 4) == "Y" ~ "Asian",
      str_sub(APP_AC, 5, 5) == "Y" ~ "Asian",
      str_sub(APP_AC, 6, 6) == "Y" ~ "Black",
      str_sub(APP_AC, 7, 7) == "Y" ~ "Native_American_Alaskan",
      str_sub(APP_AC, 8, 8) == "Y" ~ "Pacific_Islander",
      str_sub(APP_AC, 9, 9) == "Y" ~ "Pacific_Islander",
      str_sub(APP_AC, 10, 10) == "Y" ~ "White",
      TRUE ~ "Unknown"
    )
  ) %>%
  # Drop the helper so the result keeps only the input columns plus ETHNICITY.
  select(-n_groups)
# APP_AC ETHNICITY
#1 YNNNNNNNNN Latino
#2 NNNYNNNNNN Asian
#3 YNNYNNNNNN Multi-Ethnic
#4 NNNNNYNNNN Black
#5 NNNNNNNNNN Unknown
同样,您也可以把属于同一族裔的多个条件合并为一个,使代码更短。
# Shorter variant: one condition per group instead of one per position.
# Groups by position: 1-3 Latino, 4-5 Asian, 6 Black,
# 7 Native_American_Alaskan, 8-9 Pacific_Islander, 10 White.
# "Multi-Ethnic" is decided by the number of *groups* containing a "Y", not by
# the number of "Y" characters, so "YYNNNNNNNN" stays Latino.
data %>%
  mutate(
    # Sum of per-group logical flags = number of distinct groups flagged.
    n_groups = str_detect(str_sub(APP_AC, 1, 3), "Y") +
      str_detect(str_sub(APP_AC, 4, 5), "Y") +
      (str_sub(APP_AC, 6, 6) == "Y") +
      (str_sub(APP_AC, 7, 7) == "Y") +
      str_detect(str_sub(APP_AC, 8, 9), "Y") +
      (str_sub(APP_AC, 10, 10) == "Y"),
    ETHNICITY = case_when(
      # Must come first: case_when() takes the first matching condition.
      n_groups > 1 ~ "Multi-Ethnic",
      str_detect(str_sub(APP_AC, 1, 3), "Y") ~ "Latino",
      str_detect(str_sub(APP_AC, 4, 5), "Y") ~ "Asian",
      str_sub(APP_AC, 6, 6) == "Y" ~ "Black",
      str_sub(APP_AC, 7, 7) == "Y" ~ "Native_American_Alaskan",
      str_detect(str_sub(APP_AC, 8, 9), "Y") ~ "Pacific_Islander",
      str_sub(APP_AC, 10, 10) == "Y" ~ "White",
      TRUE ~ "Unknown"
    )
  ) %>%
  # Drop the helper so the result keeps only the input columns plus ETHNICITY.
  select(-n_groups)
已根据评论中描述的逻辑以及更新后的问题修改了答案。
代码
# Classify APP_AC codes; conditions are tried top to bottom and the first TRUE
# one supplies the label.
data %>%
  mutate(
    ETHNICITY = case_when(
      # "Y" somewhere in positions 1-3 and no "Y" anywhere in positions 4-10.
      str_detect(substr(APP_AC, 1, 3), "Y") &
        !str_detect(substr(APP_AC, 4, 10), "Y") ~ "Latino",
      # Still "Y" in 1-3 here, so positions 4-10 hold at least one "Y".
      str_detect(substr(APP_AC, 1, 3), "Y") ~ "Multi-Ethnic",
      str_detect(substr(APP_AC, 4, 5), "Y") ~ "Asian",
      substr(APP_AC, 6, 6) == "Y" ~ "Black",
      substr(APP_AC, 7, 7) == "Y" ~ "Native_American_Alaskan",
      str_detect(substr(APP_AC, 8, 9), "Y") ~ "Pacific_Islander",
      substr(APP_AC, 10, 10) == "Y" ~ "White",
      # No "Y" in any position.
      TRUE ~ "Unknown"
    )
  )
输出
APP_AC ETHNICITY
1 YNNNNNNNNN Latino
2 YYNNNNNNNN Latino
3 YYYNNNNNNN Latino
4 YNYNNNNNNN Latino
5 NNNYNNNNNN Asian
6 YNNYNNNNNN Multi-Ethnic
7 NNNNNYNNNN Black
8 YNNNNYNNNY Multi-Ethnic
9 NNNNNNNNNN Unknown