你能在 R 中使用 AssertR 提取有缺陷的行吗?

Can you extract defective rows using AssertR in R?

下面是一个简单的示例,它断言列 y 始终为正值 (y>0)。如何把出错的数据(这里是第 3 行的负值)提取出来,放入数据框或其他方便的对象中,同时让工作流继续处理“已清理”的数据?

library(tidyverse)
library(assertr)
#> Warning: package 'assertr' was built under R version 4.0.5

# Toy data built column by column; the third row carries the negative y.
test <- tibble(
  x = c(1, 3, 1, 3),
  y = c(4, 8, -2, 1)
)

# Assert that y lies within [0, 100], then average y per value of x.
# With the default error handling the assertion stops the pipeline.
test %>%
  assert(within_bounds(0, 100), y) %>%
  group_by(x) %>%
  summarise(avg = mean(y))
#> Column 'y' violates assertion 'within_bounds(0, 100)' 1 time
#>     verb redux_fn             predicate column index value
#> 1 assert       NA within_bounds(0, 100)      y     3    -2
#> Error: assertr stopped execution

由 reprex 包 (v0.3.0) 于 2021-04-12 创建

更新

我试用了 @Sirius 的解决方案:第一次运行时有效,但之后的每次运行都失败了,错误消息见下面的 reprex:

library(tidyverse)
library(assertr)
#> Warning: package 'assertr' was built under R version 4.0.5

# Toy data built column by column; only the third row has a negative y.
test <- tibble(
  x = c(1, 3, 1, 3, 5),
  y = c(4, 8, -2, 1, 81)
)


# Custom error handler, passed to assert() through `error_fun` in the
# pipeline below. It reports the violations via just_warn, sets the offending
# rows aside in a variable named `my.failed.rows`, and returns the remaining
# rows so the pipeline can continue.
#
# ...: whatever the handler is called with. The code reads the first
#      (positional) argument as the collection of errors and the argument
#      named `data` as the data frame being checked.
# Returns: `data` without the rows listed in the inspected error.
my_error_fun = function( ... ) {
  args = list(...)
  
  # Forward every argument unchanged to just_warn so the violations are
  # still reported.
  do.call( just_warn, args )
  
  # Only the first element of the first argument is inspected; any further
  # errors in that list are not looked at.
  bad.data = args[[1]][[1]]$error_df
  
  # Rows at the reported indices.
  these.failed.rows = args$data %>% 
    slice( bad.data$index )
  
  # Start from NULL when no `my.failed.rows` is visible from here; otherwise
  # the existing value is picked up and appended to.
  if(!exists("my.failed.rows", inherits=TRUE)) {
    my.failed.rows = NULL
  }
  my.failed.rows = rbind( my.failed.rows, these.failed.rows )
  # NOTE(review): the target is the environment three calling frames up, so
  # where the variable lands depends on how the pipeline is evaluated. In the
  # reprex run below, print(my.failed.rows) reports "object not found".
  assign( "my.failed.rows", my.failed.rows, envir=parent.frame(n=3) )
  
  # Everything except the reported rows.
  good.rows = args$data %>% slice(-bad.data$index)
  
  return(good.rows)
  
}

# Run the range check on y with the custom handler, then average y per x.
# NOTE(review): my_error_fun assigns into parent.frame(n=3), so where its
# side effect lands depends on the exact shape of this call chain.
my.result <- test %>%
  assert(within_bounds(0,100),y, error_fun = my_error_fun) %>% 
  group_by(x) %>%
  summarise(avg = mean(y))
#> Column 'y' violates assertion 'within_bounds(0, 100)' 1 time
#>     verb redux_fn             predicate column index value
#> 1 assert       NA within_bounds(0, 100)      y     3    -2
#> Warning: assertr encountered errors
#> `summarise()` ungrouping output (override with `.groups` argument)

print(my.result)
#> # A tibble: 3 x 2
#>       x   avg
#>   <dbl> <dbl>
#> 1     1   4  
#> 2     3   4.5
#> 3     5  81

print(my.failed.rows)
#> Error in print(my.failed.rows): object 'my.failed.rows' not found

由 reprex 包 (v0.3.0) 于 2021-05-02 创建
devtools::session_info()
#> - Session info ---------------------------------------------------------------
#>  setting  value                       
#>  version  R version 4.0.2 (2020-06-22)
#>  os       Windows 10 x64              
#>  system   x86_64, mingw32             
#>  ui       RTerm                       
#>  language (EN)                        
#>  collate  English_United States.1252  
#>  ctype    English_United States.1252  
#>  tz       Africa/Nairobi              
#>  date     2021-05-02                  
#> 
#> - Packages -------------------------------------------------------------------
#>  package     * version date       lib source        
#>  assertr     * 2.8     2021-01-25 [1] CRAN (R 4.0.5)
#>  assertthat    0.2.1   2019-03-21 [1] CRAN (R 4.0.2)
#>  backports     1.1.9   2020-08-24 [1] CRAN (R 4.0.2)
#>  blob          1.2.1   2020-01-20 [1] CRAN (R 4.0.2)
#>  broom         0.7.0   2020-07-09 [1] CRAN (R 4.0.2)
#>  callr         3.4.4   2020-09-07 [1] CRAN (R 4.0.2)
#>  cellranger    1.1.0   2016-07-27 [1] CRAN (R 4.0.2)
#>  cli           2.0.2   2020-02-28 [1] CRAN (R 4.0.2)
#>  colorspace    1.4-1   2019-03-18 [1] CRAN (R 4.0.2)
#>  crayon        1.3.4   2017-09-16 [1] CRAN (R 4.0.2)
#>  DBI           1.1.0   2019-12-15 [1] CRAN (R 4.0.2)
#>  dbplyr        1.4.4   2020-05-27 [1] CRAN (R 4.0.2)
#>  desc          1.2.0   2018-05-01 [1] CRAN (R 4.0.2)
#>  devtools      2.3.2   2020-09-18 [1] CRAN (R 4.0.3)
#>  digest        0.6.25  2020-02-23 [1] CRAN (R 4.0.2)
#>  dplyr       * 1.0.2   2020-08-18 [1] CRAN (R 4.0.2)
#>  ellipsis      0.3.1   2020-05-15 [1] CRAN (R 4.0.2)
#>  evaluate      0.14    2019-05-28 [1] CRAN (R 4.0.2)
#>  fansi         0.4.1   2020-01-08 [1] CRAN (R 4.0.2)
#>  forcats     * 0.5.0   2020-03-01 [1] CRAN (R 4.0.2)
#>  fs            1.5.0   2020-07-31 [1] CRAN (R 4.0.2)
#>  generics      0.0.2   2018-11-29 [1] CRAN (R 4.0.2)
#>  ggplot2     * 3.3.2   2020-06-19 [1] CRAN (R 4.0.2)
#>  glue          1.4.2   2020-08-27 [1] CRAN (R 4.0.2)
#>  gtable        0.3.0   2019-03-25 [1] CRAN (R 4.0.2)
#>  haven         2.3.1   2020-06-01 [1] CRAN (R 4.0.3)
#>  highr         0.8     2019-03-20 [1] CRAN (R 4.0.2)
#>  hms           0.5.3   2020-01-08 [1] CRAN (R 4.0.2)
#>  htmltools     0.5.0   2020-06-16 [1] CRAN (R 4.0.2)
#>  httr          1.4.2   2020-07-20 [1] CRAN (R 4.0.2)
#>  jsonlite      1.7.1   2020-09-07 [1] CRAN (R 4.0.2)
#>  knitr         1.29    2020-06-23 [1] CRAN (R 4.0.2)
#>  lifecycle     0.2.0   2020-03-06 [1] CRAN (R 4.0.2)
#>  lubridate     1.7.9   2020-06-08 [1] CRAN (R 4.0.2)
#>  magrittr      1.5     2014-11-22 [1] CRAN (R 4.0.2)
#>  memoise       1.1.0   2017-04-21 [1] CRAN (R 4.0.3)
#>  modelr        0.1.8   2020-05-19 [1] CRAN (R 4.0.2)
#>  munsell       0.5.0   2018-06-12 [1] CRAN (R 4.0.2)
#>  pillar        1.4.6   2020-07-10 [1] CRAN (R 4.0.2)
#>  pkgbuild      1.1.0   2020-07-13 [1] CRAN (R 4.0.2)
#>  pkgconfig     2.0.3   2019-09-22 [1] CRAN (R 4.0.2)
#>  pkgload       1.1.0   2020-05-29 [1] CRAN (R 4.0.2)
#>  prettyunits   1.1.1   2020-01-24 [1] CRAN (R 4.0.2)
#>  processx      3.4.4   2020-09-03 [1] CRAN (R 4.0.2)
#>  ps            1.3.4   2020-08-11 [1] CRAN (R 4.0.2)
#>  purrr       * 0.3.4   2020-04-17 [1] CRAN (R 4.0.2)
#>  R6            2.4.1   2019-11-12 [1] CRAN (R 4.0.2)
#>  Rcpp          1.0.5   2020-07-06 [1] CRAN (R 4.0.2)
#>  readr       * 1.3.1   2018-12-21 [1] CRAN (R 4.0.2)
#>  readxl        1.3.1   2019-03-13 [1] CRAN (R 4.0.2)
#>  remotes       2.2.0   2020-07-21 [1] CRAN (R 4.0.3)
#>  reprex        0.3.0   2019-05-16 [1] CRAN (R 4.0.2)
#>  rlang         0.4.7   2020-07-09 [1] CRAN (R 4.0.2)
#>  rmarkdown     2.7     2021-02-19 [1] CRAN (R 4.0.2)
#>  rprojroot     2.0.2   2020-11-15 [1] CRAN (R 4.0.4)
#>  rvest         0.3.6   2020-07-25 [1] CRAN (R 4.0.2)
#>  scales        1.1.1   2020-05-11 [1] CRAN (R 4.0.2)
#>  sessioninfo   1.1.1   2018-11-05 [1] CRAN (R 4.0.3)
#>  stringi       1.5.3   2020-09-09 [1] CRAN (R 4.0.2)
#>  stringr     * 1.4.0   2019-02-10 [1] CRAN (R 4.0.2)
#>  testthat      2.3.2   2020-03-02 [1] CRAN (R 4.0.2)
#>  tibble      * 3.0.3   2020-07-10 [1] CRAN (R 4.0.2)
#>  tidyr       * 1.1.2   2020-08-27 [1] CRAN (R 4.0.2)
#>  tidyselect    1.1.0   2020-05-11 [1] CRAN (R 4.0.2)
#>  tidyverse   * 1.3.0   2019-11-21 [1] CRAN (R 4.0.2)
#>  usethis       1.6.3   2020-09-17 [1] CRAN (R 4.0.3)
#>  utf8          1.1.4   2018-05-24 [1] CRAN (R 4.0.2)
#>  vctrs         0.3.4   2020-08-29 [1] CRAN (R 4.0.2)
#>  withr         2.2.0   2020-04-20 [1] CRAN (R 4.0.2)
#>  xfun          0.16    2020-07-24 [1] CRAN (R 4.0.2)
#>  xml2          1.3.2   2020-04-23 [1] CRAN (R 4.0.2)
#>  yaml          2.2.1   2020-02-01 [1] CRAN (R 4.0.2)
#> 
#> [1] C:/Users/tnkil/OneDrive/Documents/R/win-library/4.0
#> [2] C:/Program Files/R/R-4.0.2/library

这个问题比较棘手,下面的答案并不能 100% 解决它。assertr 现在提供了多种处理错误/中止执行的方式,详见 ?error_stop(这是默认的处理函数)。

您不仅需要过滤掉失败的行,还需要收集(全部)它们以供以后检查。

下面我写了自己的错误处理函数。它取出那些未通过检查的行,把它们从数据中过滤掉,并存储到全局环境的变量 my.failed.rows 中。


library(tidyverse)
library(assertr)
#> Warning: package 'assertr' was built under R version 4.0.5

# Toy data built column by column; the third row carries the negative y.
test <- tibble(
  x = c(1, 3, 1, 3),
  y = c(4, 8, -2, 1)
)

# Error handler for assertr verbs; use it as `error_fun = my_error_fun`.
#
# Instead of stopping the pipeline it
#   1. reports the violations through just_warn,
#   2. appends the offending rows to `my.failed.rows` in the global
#      environment (created on first use), and
#   3. returns the data without those rows, so the pipeline carries on with
#      the "cleaned" data.
#
# Arguments:
#   ...  The arguments the handler is called with: the list of error objects
#        as the first (positional) argument and the checked data frame as
#        `data`.
# Returns:
#   `data` with every row that violated an assertion removed.
#
# `my.failed.rows` keeps growing across calls; remove or reset it once the
# failed rows have been inspected.
my_error_fun <- function(...) {
  args <- list(...)

  # Still report the violations, but as a warning rather than an error.
  do.call(just_warn, args)

  errors <- args[[1]]
  data <- args$data

  # Gather the failing row numbers from every error, not only the first one,
  # so checks that report several errors (e.g. several columns) are handled.
  bad_idx <- as.integer(
    unlist(lapply(errors, function(err) err$error_df$index))
  )
  bad_idx <- sort(unique(bad_idx[!is.na(bad_idx)]))

  # Plain positional indexing; building the keep-set explicitly avoids the
  # empty-negative-index trap (x[-integer(0), ] selects nothing).
  failed_rows <- data[bad_idx, , drop = FALSE]
  good_rows <- data[setdiff(seq_len(nrow(data)), bad_idx), , drop = FALSE]

  # Read and write the accumulator in the global environment explicitly.
  # Relying on parent.frame(n = 3) ties the result to the depth of the call
  # stack, which differs between pipe implementations and evaluation
  # contexts.
  previous <- get0("my.failed.rows", envir = globalenv(), inherits = FALSE)
  assign("my.failed.rows", rbind(previous, failed_rows), envir = globalenv())

  good_rows
}

# Run the range check on y with the custom handler, then average y per x
# over the rows the handler returns.
# NOTE(review): my_error_fun assigns into parent.frame(n=3), so where its
# side effect lands depends on the exact shape of this call chain.
my.result <- test %>%
     assert(within_bounds(0,100),y, error_fun = my_error_fun ) %>% 
     group_by(x) %>%
     summarise(avg = mean(y))

# Per-group averages computed from the rows that passed the check.
print(my.result)

# Rows set aside by my_error_fun (assigned there as a side effect).
print(my.failed.rows)

输出:

> print(my.result)
# A tibble: 2 x 2
      x   avg
  <dbl> <dbl>
1     1   4  
2     3   4.5
> print(my.failed.rows)
# A tibble: 1 x 2
      x     y
  <dbl> <dbl>
1     1    -2

下次运行时,它会继续向 my.failed.rows 追加数据,因此检查完毕后您可以将其清空或删除。我还没有想出自动完成这一步的办法——本质上需要一种方法来检测是否开始了一次新的 dplyr 链式操作。