如何在 R 中使用 dplyr 的 select 选择以日期命名的列

Question

我有一个数据框，其中一部分列名是普通的字符名称，另一部分列名是日期。我想选择所有列名不是日期的列，以及所有列名日期早于当前系统日期 (Sys.Date()) 的列。如何在 dplyr 中使用 select 语句来实现？

下面是数据框

> dput(job_times[1:5,])
structure(list(Skill = c("KAC", "KAC", "KAC", "KAC", "KAC"), 
    Patch = c("A1", "A2", "A3", "A4", "A5"), `Work Code` = c("W01", 
    "W01", "W01", "W01", "W01"), Product = c("KAC Repair", "KAC Repair", 
    "KAC Repair", "KAC Repair", "KAC Repair"), `Visit Time` = c(45.68, 
    42.55, 46.45, 51.86, 43.49), Travel = c(32.5, 21.66, 26.33, 
    28.63, 27.03), Success = c(0.69, 0.66, 0.67, 0.65, 0.67), 
    `Completion Time` = c(1.9, 1.61, 1.8, 2.05, 1.74), `28-12-2020` = c(1.9, 
    1.61, 1.8, 2.05, 1.74), `04-01-2021` = c(1.9, 1.61, 1.8, 
    2.05, 1.74), `11-01-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74), 
    `18-01-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74), `25-01-2021` = c(1.9, 
    1.61, 1.8, 2.05, 1.74), `01-02-2021` = c(1.9, 1.61, 1.8, 
    2.05, 1.74), `08-02-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74), 
    `15-02-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74), `22-02-2021` = c(1.9, 
    1.61, 1.8, 2.05, 1.74), `01-03-2021` = c(1.9, 1.61, 1.8, 
    2.05, 1.74), `08-03-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74), 
    `15-03-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74), `22-03-2021` = c(1.9, 
    1.61, 1.8, 2.05, 1.74), `29-03-2021` = c(1.9, 1.61, 1.8, 
    2.05, 1.74), `05-04-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74), 
    `12-04-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74), `19-04-2021` = c(1.9, 
    1.61, 1.8, 2.05, 1.74), `26-04-2021` = c(1.9, 1.61, 1.8, 
    2.05, 1.74), `03-05-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74), 
    `10-05-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74), `17-05-2021` = c(1.9, 
    1.61, 1.8, 2.05, 1.74), `24-05-2021` = c(1.9, 1.61, 1.8, 
    2.05, 1.74), `31-05-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74), 
    `07-06-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74), `14-06-2021` = c(1.9, 
    1.61, 1.8, 2.05, 1.74), `21-06-2021` = c(1.9, 1.61, 1.8, 
    2.05, 1.74), `28-06-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74), 
    `05-07-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74), `12-07-2021` = c(1.9, 
    1.61, 1.8, 2.05, 1.74), `19-07-2021` = c(1.9, 1.61, 1.8, 
    2.05, 1.74), `26-07-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74), 
    `02-08-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74), `09-08-2021` = c(1.9, 
    1.61, 1.8, 2.05, 1.74), `16-08-2021` = c(1.9, 1.61, 1.8, 
    2.05, 1.74), `23-08-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74), 
    `30-08-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74), `06-09-2021` = c(1.9, 
    1.61, 1.8, 2.05, 1.74), `13-09-2021` = c(1.9, 1.61, 1.8, 
    2.05, 1.74), `20-09-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74), 
    `27-09-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74), `04-10-2021` = c(1.9, 
    1.61, 1.8, 2.05, 1.74), `11-10-2021` = c(1.9, 1.61, 1.8, 
    2.05, 1.74), `18-10-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74), 
    `25-10-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74), `01-11-2021` = c(1.9, 
    1.61, 1.8, 2.05, 1.74), `08-11-2021` = c(1.9, 1.61, 1.8, 
    2.05, 1.74), `15-11-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74), 
    `22-11-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74), `29-11-2021` = c(1.9, 
    1.61, 1.8, 2.05, 1.74), `06-12-2021` = c(1.9, 1.61, 1.8, 
    2.05, 1.74), `13-12-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74), 
    `20-12-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74), `27-12-2021` = c(1.9, 
    1.61, 1.8, 2.05, 1.74), `03-01-2022` = c(1.9, 1.61, 1.8, 
    2.05, 1.74), `10-01-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74), 
    `17-01-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74), `24-01-2022` = c(1.9, 
    1.61, 1.8, 2.05, 1.74), `31-01-2022` = c(1.9, 1.61, 1.8, 
    2.05, 1.74), `07-02-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74), 
    `14-02-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74), `21-02-2022` = c(1.9, 
    1.61, 1.8, 2.05, 1.74), `28-02-2022` = c(1.9, 1.61, 1.8, 
    2.05, 1.74), `07-03-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74), 
    `14-03-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74), `21-03-2022` = c(1.9, 
    1.61, 1.8, 2.05, 1.74), `28-03-2022` = c(1.9, 1.61, 1.8, 
    2.05, 1.74), `04-04-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74), 
    `11-04-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74), `18-04-2022` = c(1.9, 
    1.61, 1.8, 2.05, 1.74), `25-04-2022` = c(1.9, 1.61, 1.8, 
    2.05, 1.74), `02-05-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74), 
    `09-05-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74), `16-05-2022` = c(1.9, 
    1.61, 1.8, 2.05, 1.74), `23-05-2022` = c(1.9, 1.61, 1.8, 
    2.05, 1.74), `30-05-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74), 
    `06-06-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74), `13-06-2022` = c(1.9, 
    1.61, 1.8, 2.05, 1.74), `20-06-2022` = c(1.9, 1.61, 1.8, 
    2.05, 1.74), `27-06-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74), 
    `04-07-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74), `11-07-2022` = c(1.9, 
    1.61, 1.8, 2.05, 1.74), `18-07-2022` = c(1.9, 1.61, 1.8, 
    2.05, 1.74), `25-07-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74), 
    `01-08-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74), `08-08-2022` = c(1.9, 
    1.61, 1.8, 2.05, 1.74), `15-08-2022` = c(1.9, 1.61, 1.8, 
    2.05, 1.74), `22-08-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74), 
    `29-08-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74), `05-09-2022` = c(1.9, 
    1.61, 1.8, 2.05, 1.74), `12-09-2022` = c(1.9, 1.61, 1.8, 
    2.05, 1.74), `19-09-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74), 
    `26-09-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74), `03-10-2022` = c(1.9, 
    1.61, 1.8, 2.05, 1.74), `10-10-2022` = c(1.9, 1.61, 1.8, 
    2.05, 1.74), `17-10-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74), 
    `24-10-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74), `31-10-2022` = c(1.9, 
    1.61, 1.8, 2.05, 1.74), `07-11-2022` = c(1.9, 1.61, 1.8, 
    2.05, 1.74), `14-11-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74), 
    `21-11-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74), `28-11-2022` = c(1.9, 
    1.61, 1.8, 2.05, 1.74), `05-12-2022` = c(1.9, 1.61, 1.8, 
    2.05, 1.74), `12-12-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74), 
    `19-12-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74), `26-12-2022` = c(1.9, 
    1.61, 1.8, 2.05, 1.74)), row.names = c(NA, -5L), class = c("tbl_df", 
"tbl", "data.frame"))

我想要 Skill、Patch、Work Code、Product、Visit Time、Travel、Success、Completion Time 这些列，以及列名日期小于或等于 Sys.Date() 的所有列，并希望使用 dplyr 的 select 语句来实现。

Answer 1

我就是这样解决的-

# Column names that look like dd-mm-yyyy dates. Backslashes must be doubled
# inside an R string literal: a bare '\d' is an unrecognized escape and the
# code would not even parse. The pattern is anchored and uses a 4-digit year
# so that only complete date names are treated as date columns.
cols <- grep("^\\d{2}-\\d{2}-\\d{4}$", names(job_times), value = TRUE)

# Date columns strictly before today (use `<=` to include today's date).
# which() drops names that fail to parse as dates (NA) instead of turning
# them into an NA column index.
past_cols <- cols[which(as.Date(cols, "%d-%m-%Y") < Sys.Date())]

# Keep every non-date column plus the past date columns; drop = FALSE keeps
# a data frame even when only one column is selected.
result <- job_times[, c(setdiff(names(job_times), cols), past_cols),
                    drop = FALSE]

您可以将其集成到 dplyr 管道中，如 -

library(dplyr)

# Select all non-date columns plus the date-named columns that fall strictly
# before today, inside a single dplyr pipeline.
job_times %>%
  select({
    # Doubled backslashes are required: '\d' alone is not a valid escape in
    # an R string literal. Anchored, 4-digit year: match whole dd-mm-yyyy names.
    cols <- grep("^\\d{2}-\\d{2}-\\d{4}$", names(.), value = TRUE)
    # which() drops names that fail to parse (NA) rather than yielding an
    # NA column name.
    c(setdiff(names(.), cols),
      cols[which(Sys.Date() > as.Date(cols, "%d-%m-%Y"))])
  })

Answer 2

Ronak Shah 的回答非常棒。这是我的做法。

    ## All column names of the data frame
    ColumnNames <- names(TestDF)

    ## Names containing at least one letter are treated as non-date columns
    CharacterColumnNames <- ColumnNames[grepl("[[:alpha:]]", ColumnNames)]

    ## Every remaining name is assumed to be a dd-mm-yyyy date
    DateColumns <- setdiff(ColumnNames, CharacterColumnNames)

    ## Keep date columns strictly before today; which() drops names that
    ## fail to parse as dates (NA) instead of producing an NA column index
    RequiredDateColumns <- DateColumns[
      which(as.Date(DateColumns, "%d-%m-%Y") < Sys.Date())
    ]

    ## Subset the data frame; drop = FALSE keeps a data frame even when
    ## only a single column is selected
    ModifiedDF <- TestDF[, c(CharacterColumnNames, RequiredDateColumns),
                         drop = FALSE]

Answer 3

我建议创建一个辅助函数，然后您可以像这样使用 select：

library(tidyverse)
library(lubridate)

#' Test whether strings are day-month-year dates earlier than today
#'
#' @param x A character vector (here: column names) to parse with
#'   `lubridate::dmy()`.
#' @return A logical vector the same length as `x`: `TRUE` where the parsed
#'   date is strictly before `Sys.Date()`, `FALSE` otherwise. Comparisons
#'   that come out `NA` (e.g. unparseable strings) are mapped to `FALSE`.
is_before_today <- function(x) {
  # quiet = TRUE suppresses the parse-failure warning for non-date strings
  parsed <- dmy(x, quiet = TRUE)
  coalesce(parsed < Sys.Date(), FALSE)
}

# Keep columns whose name starts with a non-digit, plus the date-named
# columns that is_before_today() flags as earlier than today.
df %>% 
  select(
    # "^\\D": the backslash must be doubled in an R string literal; a bare
    # "\D" is an unrecognized escape and fails to parse.
    matches("^\\D"), all_of(colnames(.) %>% keep(is_before_today))
  )
#> # A tibble: 5 x 38
#>   Skill Patch `Work Code` Product   `Visit Time` Travel Success `Completion Tim~
#>   <chr> <chr> <chr>       <chr>            <dbl>  <dbl>   <dbl>            <dbl>
#> 1 KAC   A1    W01         KAC Repa~         45.7   32.5    0.69             1.9 
#> 2 KAC   A2    W01         KAC Repa~         42.6   21.7    0.66             1.61
#> 3 KAC   A3    W01         KAC Repa~         46.4   26.3    0.67             1.8 
#> 4 KAC   A4    W01         KAC Repa~         51.9   28.6    0.65             2.05
#> 5 KAC   A5    W01         KAC Repa~         43.5   27.0    0.67             1.74
#> # ... with 30 more variables: 28-12-2020 <dbl>, 04-01-2021 <dbl>,
#> #   11-01-2021 <dbl>, 18-01-2021 <dbl>, 25-01-2021 <dbl>, 01-02-2021 <dbl>,
#> #   08-02-2021 <dbl>, 15-02-2021 <dbl>, 22-02-2021 <dbl>, 01-03-2021 <dbl>,
#> #   08-03-2021 <dbl>, 15-03-2021 <dbl>, 22-03-2021 <dbl>, 29-03-2021 <dbl>,
#> #   05-04-2021 <dbl>, 12-04-2021 <dbl>, 19-04-2021 <dbl>, 26-04-2021 <dbl>,
#> #   03-05-2021 <dbl>, 10-05-2021 <dbl>, 17-05-2021 <dbl>, 24-05-2021 <dbl>,
#> #   31-05-2021 <dbl>, 07-06-2021 <dbl>, 14-06-2021 <dbl>, 21-06-2021 <dbl>,
#> #   28-06-2021 <dbl>, 05-07-2021 <dbl>, 12-07-2021 <dbl>, 19-07-2021 <dbl>

由 reprex 包 (v1.0.0) 于 2021-07-20 创建

如何在 R 中使用 dplyr 的 select 选择以日期命名的列

How to select columns with names as dates using dplyr R

r

data-analysis

dplyr