创建从多个选定列中提取和标记检测到的字符串模式的列
Create columns that extract and flag a detected string pattern from multiple selected columns
我有下面的数据框,其中每个观察都有一串不同对象的键和值 (k:v)。每个对象由 \n
分隔
df = tribble(
~col, ~summary,
"a", "Oranges: 1\nApples: 1\nPeaches: 6\nBananas: 2",
"b", "Apples: 2\nBananas: 1",
"c", "Oranges: 4\nPeaches: 2\nBananas: 5",
"d", "Peaches: 2"
)
我将这些值分开如下:
df_sep = df %>%
separate(summary,c("col1","col2","col3","col4"),sep = "\n")
我想使用 dplyr 获得以下输出,如果适用,可能使用 across
和 any
。
# A tibble: 4 x 10
col summary Apples_flag Bananas_flag oranges_flag Peaches_flag_value Apples_value Bananas_value oranges_value Peaches_value
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 b Apples: 2, Bananas: 1 1 1 0 0 2 1 0 0
2 a Oranges: 1, Apples: 1, Peaches: 6, Bananas: 2 1 1 1 1 1 2 1 6
3 c Oranges: 4, Peaches: 2, Bananas: 5 0 1 1 1 0 5 4 2
4 d Peaches: 2 0 0 0 1 0 0 0 2
这是 tidyverse
中的一种可能性,它依赖于旋转数据:
df %>%
separate(summary,c("col1","col2","col3","col4"),sep = "\n", fill = "right") %>%
gather(dummy, value, -col) %>%
filter(!is.na(value)) %>%
group_by(col) %>%
transmute(
col,
summary = paste(value, collapse = ", "),
fruit = gsub(": \d+", "", value),
n = parse_number(value)
) %>%
arrange(fruit) %>%
pivot_wider(names_from = fruit, values_from = n, values_fill = 0) %>%
mutate(across(-summary, list(flag = sign)))
# A tibble: 4 x 10
# Groups: col [4]
col summary Apples Bananas oranges Peaches Apples_flag Bananas_flag oranges_flag Peaches_flag
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 b Apples: 2, Bananas: 1 2 1 0 0 1 1 0 0
2 a oranges: 1, Apples: 1, Peaches: 6, Bananas: 2 1 2 1 6 1 1 1 1
3 c oranges: 4, Peaches: 2, Bananas: 5 0 5 4 2 0 1 1 1
4 d Peaches: 2 0 0 0 2 0 0 0 1
下面是 tidyverse
中另一个可能的解决方案,利用 c_cross
、any
、str_detect
和 str_match
。
它通过将 str_detect
应用到 match
参数中指定列的 any
并使用 +
添加 1 作为标志到适当的列来进行标记.
它通过使用 str_match
应用正则表达式来提取值,并将其添加到适当的列中。
df %>%
mutate(summary_clone = summary) %>%
separate(summary_clone,c("col1","col2","col3","col4"),sep = "\n", fill = "right") %>%
rowwise() %>%
# Flagging
mutate(Apples_flag = +any(str_detect(c_across(matches("col[1-4]")), "Apples")),
Oranges_flag = +any(str_detect(c_across(matches("col[1-4]")), "oranges")),
Bananas_flag = +any(str_detect(c_across(matches("col[1-4]")), "Bananas")),
Peaches_flag = +any(str_detect(c_across(matches("col[1-4]")), "Peaches"))
) %>%
mutate(summary = str_replace_all(summary,"\n"," ")) %>%
# Parse matched values
mutate(Apples_value = str_match(summary,"Apples: (\d+)")[2],
Oranges_value = str_match(summary,"Oranges: (\d+)")[2],
Bananas_value = str_match(summary,"Bananas: (\d+)")[2],
Peaches_value = str_match(summary,"Peaches: (\d+)")[2]
) %>%
select(-matches("col[1-4]")) %>%
mutate(across(3:10, .fns = ~replace_na(as.numeric(.),0)))
# A tibble: 4 x 10
# Rowwise:
col summary Apples_flag Oranges_flag Bananas_flag Peaches_flag Apples_value Oranges_value Bananas_value Peaches_value
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 a Oranges: 1 Apples: 1 Peaches: 6 Bananas: 2 1 1 1 1 1 0 2 6
2 b Apples: 2 Bananas: 1 1 0 1 0 2 0 1 0
3 c Oranges: 4 Peaches: 2 Bananas: 5 0 1 1 1 0 0 5 2
4 d Peaches: 2 0 0 0 0 0 0 0 0
另一种方式是:
df %>%
left_join(df%>%
separate_rows(summary, sep='\n') %>%
separate(summary, c('name', 'value'), convert = TRUE) %>%
pivot_wider(col,names_glue = '{name}_value', values_fill = 0) %>%
mutate(across(-col, sign, .names = '{.col}_flag')))
我有下面的数据框,其中每个观察都有一串不同对象的键和值 (k:v)。每个对象由 \n
df = tribble(
~col, ~summary,
"a", "Oranges: 1\nApples: 1\nPeaches: 6\nBananas: 2",
"b", "Apples: 2\nBananas: 1",
"c", "Oranges: 4\nPeaches: 2\nBananas: 5",
"d", "Peaches: 2"
)
我将这些值分开如下:
df_sep = df %>%
separate(summary,c("col1","col2","col3","col4"),sep = "\n")
我想使用 dplyr 获得以下输出,如果适用,可能使用 across
和 any
。
# A tibble: 4 x 10
col summary Apples_flag Bananas_flag oranges_flag Peaches_flag_value Apples_value Bananas_value oranges_value Peaches_value
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 b Apples: 2, Bananas: 1 1 1 0 0 2 1 0 0
2 a Oranges: 1, Apples: 1, Peaches: 6, Bananas: 2 1 1 1 1 1 2 1 6
3 c Oranges: 4, Peaches: 2, Bananas: 5 0 1 1 1 0 5 4 2
4 d Peaches: 2 0 0 0 1 0 0 0 2
这是 tidyverse
中的一种可能性,它依赖于旋转数据:
df %>%
separate(summary,c("col1","col2","col3","col4"),sep = "\n", fill = "right") %>%
gather(dummy, value, -col) %>%
filter(!is.na(value)) %>%
group_by(col) %>%
transmute(
col,
summary = paste(value, collapse = ", "),
fruit = gsub(": \d+", "", value),
n = parse_number(value)
) %>%
arrange(fruit) %>%
pivot_wider(names_from = fruit, values_from = n, values_fill = 0) %>%
mutate(across(-summary, list(flag = sign)))
# A tibble: 4 x 10
# Groups: col [4]
col summary Apples Bananas oranges Peaches Apples_flag Bananas_flag oranges_flag Peaches_flag
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 b Apples: 2, Bananas: 1 2 1 0 0 1 1 0 0
2 a oranges: 1, Apples: 1, Peaches: 6, Bananas: 2 1 2 1 6 1 1 1 1
3 c oranges: 4, Peaches: 2, Bananas: 5 0 5 4 2 0 1 1 1
4 d Peaches: 2 0 0 0 2 0 0 0 1
下面是 tidyverse
中另一个可能的解决方案,利用 c_cross
、any
、str_detect
和 str_match
。
它通过将 str_detect
应用到 match
参数中指定列的 any
并使用 +
添加 1 作为标志到适当的列来进行标记.
它通过使用 str_match
应用正则表达式来提取值,并将其添加到适当的列中。
df %>%
mutate(summary_clone = summary) %>%
separate(summary_clone,c("col1","col2","col3","col4"),sep = "\n", fill = "right") %>%
rowwise() %>%
# Flagging
mutate(Apples_flag = +any(str_detect(c_across(matches("col[1-4]")), "Apples")),
Oranges_flag = +any(str_detect(c_across(matches("col[1-4]")), "oranges")),
Bananas_flag = +any(str_detect(c_across(matches("col[1-4]")), "Bananas")),
Peaches_flag = +any(str_detect(c_across(matches("col[1-4]")), "Peaches"))
) %>%
mutate(summary = str_replace_all(summary,"\n"," ")) %>%
# Parse matched values
mutate(Apples_value = str_match(summary,"Apples: (\d+)")[2],
Oranges_value = str_match(summary,"Oranges: (\d+)")[2],
Bananas_value = str_match(summary,"Bananas: (\d+)")[2],
Peaches_value = str_match(summary,"Peaches: (\d+)")[2]
) %>%
select(-matches("col[1-4]")) %>%
mutate(across(3:10, .fns = ~replace_na(as.numeric(.),0)))
# A tibble: 4 x 10
# Rowwise:
col summary Apples_flag Oranges_flag Bananas_flag Peaches_flag Apples_value Oranges_value Bananas_value Peaches_value
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 a Oranges: 1 Apples: 1 Peaches: 6 Bananas: 2 1 1 1 1 1 0 2 6
2 b Apples: 2 Bananas: 1 1 0 1 0 2 0 1 0
3 c Oranges: 4 Peaches: 2 Bananas: 5 0 1 1 1 0 0 5 2
4 d Peaches: 2 0 0 0 0 0 0 0 0
另一种方式是:
df %>%
left_join(df%>%
separate_rows(summary, sep='\n') %>%
separate(summary, c('name', 'value'), convert = TRUE) %>%
pivot_wider(col,names_glue = '{name}_value', values_fill = 0) %>%
mutate(across(-col, sign, .names = '{.col}_flag')))