如何优雅地 str_detect 跨多个列并有条件地填充新列
How do I elegantly str_detect across multiple columns and populating new columns conditionally
如您所见,我正在处理一些严重的脏数据。这段代码有效,但看起来有点笨拙。有没有更有效和动态的方式来实现最终结果而不需要那么多编码?
我必须分阶段执行此操作,首先标记内容类型,然后利用内容类型将它们填充到相应的列类型中。
感谢您的帮助
#load library
library(dplyr)
library(stringr)
library(lubridate)
#create sample data
df <- tibble(c1 = c('9996155', '4001096', '4001525', '4000590','2020-01-23', '2019-12-23', '2020-01-20', '2019-12-08'),
c2 = c('4001902', '5000009', '2020-01-23', '2019-12-23', '2020-01-20', '2019-12-08', '4000461', '4000311'),
c3 = c('W-7', 'W-8', 'W-9', 'W-2', 'W-1', 'W-1','3.527E+20', '3.498E+20'),
c4 = c('B09/20', 'B04/20', 'B05/20', 'B10/20', 'B06/20', '3.408E+20', '3.229E+20', '3.225E+20')
)
数据是这样的
> df
# A tibble: 8 x 4
c1 c2 c3 c4
<chr> <chr> <chr> <chr>
1 9996155 4001902 W-7 B09/20
2 4001096 5000009 W-8 B04/20
3 4001525 2020-01-23 W-9 B05/20
4 4000590 2019-12-23 W-2 B10/20
5 2020-01-23 2020-01-20 W-1 B06/20
6 2019-12-23 2019-12-08 W-1 3.408E+20
7 2020-01-20 4000461 3.527E+20 3.229E+20
8 2019-12-08 4000311 3.498E+20 3.225E+20
我做了这样的事情来让它成形
df %>%
mutate(across(#flag them now to allow next step for data population
starts_with('c'),
~ case_when(
is.na(.) ~ NA_character_,
str_detect(., regex('(^20[1,2][0-9]\-)|(\/20[1,2][0-9]$)')) ~ 'date',
str_detect(., regex('\d\.\d{3}[eE][+-]\d{2}+')) ~ 'numericScientificNotation',
str_detect(.,regex('(^[a-zA-Z][0-9]{2}\/2[0-1]{1}$)|(^[A-Z]{1,2}\-\d.*[a-zA-Z]*$)|(^[a-zA-Z][0-9]{2})|(^[A-Z][0-9]$)')) ~ 'batches',
str_detect(., regex('^-?\d+$')) ~ 'integers',
TRUE ~ NA_character_
),
.names = paste0('test', "_{col}")
)) %>% #casewhen to populate new columns
mutate(integer = case_when(test_c1 == 'integers' ~ c1,
test_c2 == 'integers' ~ c2,
test_c3 == 'integers' ~ c3,
test_c4 == 'integers' ~ c4),
date = case_when(test_c1 == 'date' ~ c1,
test_c2 == 'date' ~ c2,
test_c3 == 'date' ~ c3,
test_c4 == 'date' ~ c4),
batches = case_when(test_c1 == 'batches' ~ c1,
test_c2 == 'batches' ~ c2,
test_c3 == 'batches' ~ c3,
test_c4 == 'batches' ~ c4),
numericScientificNotation = case_when(test_c1 == 'numericScientificNotation' ~ c1,
test_c2 == 'numericScientificNotation' ~ c2,
test_c3 == 'numericScientificNotation' ~ c3,
test_c4 == 'numericScientificNotation' ~ c4)
) %>%
select(9:12) #this is all that i need
只需要这种有条理的输出。
谢谢!
# A tibble: 8 x 4
integer date batches numericScientificNotation
<chr> <chr> <chr> <chr>
1 9996155 NA W-7 NA
2 4001096 NA W-8 NA
3 4001525 2020-01-23 W-9 NA
4 4000590 2019-12-23 W-2 NA
5 NA 2020-01-23 W-1 NA
6 NA 2019-12-23 W-1 3.408E+20
7 4000461 2020-01-20 NA 3.527E+20
8 4000311 2019-12-08 NA 3.498E+20
这里有一种方法可以简化它并减少重复:
library(dplyr)
regex_list <- list(date = '(^20[1,2][0-9]\-)|(\/20[1,2][0-9]$)',
numericScientificNotation = '\d\.\d{3}[eE][+-]\d{2}+',
batches = '(^[a-zA-Z][0-9]{2}\/2[0-1]{1}$)|(^[A-Z]{1,2}\-\d.*[a-zA-Z]*$)|(^[a-zA-Z][0-9]{2})|(^[A-Z][0-9]$)',
integers = '^-?\d+$')
purrr::imap_dfc(regex_list, function(x, y)
df %>%
mutate(across(.fns = ~ifelse(str_detect(.x, x), .x, NA))) %>%
transmute(!!y := do.call(coalesce, .)))
# date numericScientificNotation batches integers
# <chr> <chr> <chr> <chr>
#1 NA NA W-7 9996155
#2 NA NA W-8 4001096
#3 2020-01-23 NA W-9 4001525
#4 2019-12-23 NA W-2 4000590
#5 2020-01-23 NA W-1 NA
#6 2019-12-23 3.408E+20 W-1 NA
#7 2020-01-20 3.527E+20 NA 4000461
#8 2019-12-08 3.498E+20 NA 4000311
你也可以这样做
df <- data.frame(c1 = c('9996155', '4001096', '4001525', '4000590','2020-01-23', '2019-12-23', '2020-01-20', '2019-12-08'),
c2 = c('4001902', '5000009', '2020-01-23', '2019-12-23', '2020-01-20', '2019-12-08', '4000461', '4000311'),
c3 = c('W-7', 'W-8', 'W-9', 'W-2', 'W-1', 'W-1','3.527E+20', '3.498E+20'),
c4 = c('B09/20', 'B04/20', 'B05/20', 'B10/20', 'B06/20', '3.408E+20', '3.229E+20', '3.225E+20')
)
library(tidyverse)
df %>% mutate(rowid = row_number()) %>%
pivot_longer(!rowid) %>%
mutate(new = case_when(str_detect(value, '(^20[1,2][0-9]\-)|(\/20[1,2][0-9]$)') ~ 'date',
str_detect(value, '\d\.\d{3}[eE][+-]\d{2}+') ~ 'numeric',
str_detect(value, '(^[a-zA-Z][0-9]{2}\/2[0-1]{1}$)|(^[A-Z]{1,2}\-\d.*[a-zA-Z]*$)|(^[a-zA-Z][0-9]{2})|(^[A-Z][0-9]$)') ~ 'Batches',
str_detect(value, '^-?\d+$') ~ 'Integer',
TRUE ~ 'Other')) %>%
pivot_wider(id_cols = rowid, names_from = new, values_from = value, values_fill = NA, values_fn = first)
#> # A tibble: 8 x 5
#> rowid Integer Batches date numeric
#> <int> <chr> <chr> <chr> <chr>
#> 1 1 9996155 W-7 <NA> <NA>
#> 2 2 4001096 W-8 <NA> <NA>
#> 3 3 4001525 W-9 2020-01-23 <NA>
#> 4 4 4000590 W-2 2019-12-23 <NA>
#> 5 5 <NA> W-1 2020-01-23 <NA>
#> 6 6 <NA> W-1 2019-12-23 3.408E+20
#> 7 7 4000461 <NA> 2020-01-20 3.527E+20
#> 8 8 4000311 <NA> 2019-12-08 3.498E+20
由 reprex package (v2.0.0)
于 2021-05-13 创建
如您所见,我正在处理一些严重的脏数据。这段代码有效,但看起来有点笨拙。有没有更有效和动态的方式来实现最终结果而不需要那么多编码?
我必须分阶段执行此操作,首先标记内容类型,然后利用内容类型将它们填充到相应的列类型中。
感谢您的帮助
#load library
library(dplyr)
library(stringr)
library(lubridate)
#create sample data
df <- tibble(c1 = c('9996155', '4001096', '4001525', '4000590','2020-01-23', '2019-12-23', '2020-01-20', '2019-12-08'),
c2 = c('4001902', '5000009', '2020-01-23', '2019-12-23', '2020-01-20', '2019-12-08', '4000461', '4000311'),
c3 = c('W-7', 'W-8', 'W-9', 'W-2', 'W-1', 'W-1','3.527E+20', '3.498E+20'),
c4 = c('B09/20', 'B04/20', 'B05/20', 'B10/20', 'B06/20', '3.408E+20', '3.229E+20', '3.225E+20')
)
数据是这样的
> df
# A tibble: 8 x 4
c1 c2 c3 c4
<chr> <chr> <chr> <chr>
1 9996155 4001902 W-7 B09/20
2 4001096 5000009 W-8 B04/20
3 4001525 2020-01-23 W-9 B05/20
4 4000590 2019-12-23 W-2 B10/20
5 2020-01-23 2020-01-20 W-1 B06/20
6 2019-12-23 2019-12-08 W-1 3.408E+20
7 2020-01-20 4000461 3.527E+20 3.229E+20
8 2019-12-08 4000311 3.498E+20 3.225E+20
我做了这样的事情来让它成形
df %>%
mutate(across(#flag them now to allow next step for data population
starts_with('c'),
~ case_when(
is.na(.) ~ NA_character_,
str_detect(., regex('(^20[1,2][0-9]\-)|(\/20[1,2][0-9]$)')) ~ 'date',
str_detect(., regex('\d\.\d{3}[eE][+-]\d{2}+')) ~ 'numericScientificNotation',
str_detect(.,regex('(^[a-zA-Z][0-9]{2}\/2[0-1]{1}$)|(^[A-Z]{1,2}\-\d.*[a-zA-Z]*$)|(^[a-zA-Z][0-9]{2})|(^[A-Z][0-9]$)')) ~ 'batches',
str_detect(., regex('^-?\d+$')) ~ 'integers',
TRUE ~ NA_character_
),
.names = paste0('test', "_{col}")
)) %>% #casewhen to populate new columns
mutate(integer = case_when(test_c1 == 'integers' ~ c1,
test_c2 == 'integers' ~ c2,
test_c3 == 'integers' ~ c3,
test_c4 == 'integers' ~ c4),
date = case_when(test_c1 == 'date' ~ c1,
test_c2 == 'date' ~ c2,
test_c3 == 'date' ~ c3,
test_c4 == 'date' ~ c4),
batches = case_when(test_c1 == 'batches' ~ c1,
test_c2 == 'batches' ~ c2,
test_c3 == 'batches' ~ c3,
test_c4 == 'batches' ~ c4),
numericScientificNotation = case_when(test_c1 == 'numericScientificNotation' ~ c1,
test_c2 == 'numericScientificNotation' ~ c2,
test_c3 == 'numericScientificNotation' ~ c3,
test_c4 == 'numericScientificNotation' ~ c4)
) %>%
select(9:12) #this is all that i need
只需要这种有条理的输出。
谢谢!
# A tibble: 8 x 4
integer date batches numericScientificNotation
<chr> <chr> <chr> <chr>
1 9996155 NA W-7 NA
2 4001096 NA W-8 NA
3 4001525 2020-01-23 W-9 NA
4 4000590 2019-12-23 W-2 NA
5 NA 2020-01-23 W-1 NA
6 NA 2019-12-23 W-1 3.408E+20
7 4000461 2020-01-20 NA 3.527E+20
8 4000311 2019-12-08 NA 3.498E+20
这里有一种方法可以简化它并减少重复:
library(dplyr)
regex_list <- list(date = '(^20[1,2][0-9]\-)|(\/20[1,2][0-9]$)',
numericScientificNotation = '\d\.\d{3}[eE][+-]\d{2}+',
batches = '(^[a-zA-Z][0-9]{2}\/2[0-1]{1}$)|(^[A-Z]{1,2}\-\d.*[a-zA-Z]*$)|(^[a-zA-Z][0-9]{2})|(^[A-Z][0-9]$)',
integers = '^-?\d+$')
purrr::imap_dfc(regex_list, function(x, y)
df %>%
mutate(across(.fns = ~ifelse(str_detect(.x, x), .x, NA))) %>%
transmute(!!y := do.call(coalesce, .)))
# date numericScientificNotation batches integers
# <chr> <chr> <chr> <chr>
#1 NA NA W-7 9996155
#2 NA NA W-8 4001096
#3 2020-01-23 NA W-9 4001525
#4 2019-12-23 NA W-2 4000590
#5 2020-01-23 NA W-1 NA
#6 2019-12-23 3.408E+20 W-1 NA
#7 2020-01-20 3.527E+20 NA 4000461
#8 2019-12-08 3.498E+20 NA 4000311
你也可以这样做
df <- data.frame(c1 = c('9996155', '4001096', '4001525', '4000590','2020-01-23', '2019-12-23', '2020-01-20', '2019-12-08'),
c2 = c('4001902', '5000009', '2020-01-23', '2019-12-23', '2020-01-20', '2019-12-08', '4000461', '4000311'),
c3 = c('W-7', 'W-8', 'W-9', 'W-2', 'W-1', 'W-1','3.527E+20', '3.498E+20'),
c4 = c('B09/20', 'B04/20', 'B05/20', 'B10/20', 'B06/20', '3.408E+20', '3.229E+20', '3.225E+20')
)
library(tidyverse)
df %>% mutate(rowid = row_number()) %>%
pivot_longer(!rowid) %>%
mutate(new = case_when(str_detect(value, '(^20[1,2][0-9]\-)|(\/20[1,2][0-9]$)') ~ 'date',
str_detect(value, '\d\.\d{3}[eE][+-]\d{2}+') ~ 'numeric',
str_detect(value, '(^[a-zA-Z][0-9]{2}\/2[0-1]{1}$)|(^[A-Z]{1,2}\-\d.*[a-zA-Z]*$)|(^[a-zA-Z][0-9]{2})|(^[A-Z][0-9]$)') ~ 'Batches',
str_detect(value, '^-?\d+$') ~ 'Integer',
TRUE ~ 'Other')) %>%
pivot_wider(id_cols = rowid, names_from = new, values_from = value, values_fill = NA, values_fn = first)
#> # A tibble: 8 x 5
#> rowid Integer Batches date numeric
#> <int> <chr> <chr> <chr> <chr>
#> 1 1 9996155 W-7 <NA> <NA>
#> 2 2 4001096 W-8 <NA> <NA>
#> 3 3 4001525 W-9 2020-01-23 <NA>
#> 4 4 4000590 W-2 2019-12-23 <NA>
#> 5 5 <NA> W-1 2020-01-23 <NA>
#> 6 6 <NA> W-1 2019-12-23 3.408E+20
#> 7 7 4000461 <NA> 2020-01-20 3.527E+20
#> 8 8 4000311 <NA> 2019-12-08 3.498E+20
由 reprex package (v2.0.0)
于 2021-05-13 创建