矢量化日期函数

Vectorise date function

我需要清理一系列大型数据库。一列是日期向量。根据年份的不同,需要对变量进行不同的清理。

我写了一个函数,但它非常非常慢,尤其是当我必须使用它时 rowwise()

我怎样才能让它变得更好?

这是复杂的清理函数:

cleandate_fn <- function(date,year){

  if(year<=2010){
    date = str_pad(date,8,"left","0") 
    date = as.Date(date,format="%d%m%Y")
  } else if (year==2011) {
    date = str_pad(date,6,"left","0") 
    date = ifelse(str_sub(date,1,2)=="02",paste0("20",str_sub(date,3,8)),date)
    date = ifelse(str_sub(date,5,6)=="00",paste0(str_sub(date,1,4),"01",str_sub(date,7,8)),date)
    date = gsub("\\|/","0",date)
    date = ifelse(str_sub(date,5,6)=="00",paste0(str_sub(date,1,4),"01",str_sub(date,7,8)),date)
    date = ifelse(str_sub(date,5,8) %in% c("0229","0230","0231"),paste0(str_sub(date,1,4),"0131"),date)
    date = as.Date(date,format="%Y%m%d")
  } else {
    date = ifelse(str_sub(date,4,6)=="//1",paste0(str_sub(date,1,3),"/19",str_sub(date,7,8)),date)
    date = ifelse(str_sub(date,4,6)=="//2",paste0(str_sub(date,1,3),"/20",str_sub(date,7,8)),date)
    date = ifelse(str_sub(date,5,6)=="/9",paste0(str_sub(date,1,4),"19",str_sub(date,7,8)),date)    
    date = ifelse(str_sub(date,5,6)=="/0",paste0(str_sub(date,1,4),"20",str_sub(date,7,8)),date)
    date = gsub("\\|/","0",date)
    date = ifelse(str_sub(date,1,2)=="00",paste0("01",str_sub(date,3,8)),date)
    date = ifelse(str_sub(date,3,4)=="00",paste0(str_sub(date,1,2),"01",str_sub(date,5,8)),date)
    date = paste0(str_sub(date,1,2),"05",str_sub(date,5,8))
    date = as.Date(date,format="%d%m%Y")}

  return(date)

}

玩具示例:

data <- data.frame(date=c("19052003","29062012","008//210","05/2/001","01011980"),
                   year=rep(2010,5)) 

data <- data %>% 
  rowwise() %>% 
  mutate(date=cleandate_fn(date,unique(data$year))) 

通过'year'进行分组,然后获取year'的第一个元素作为函数的入口可能更容易

library(dplyr)
df1 %>%
    group_by(year) %>%
    mutate(datenew = cleandate_fn(date, first(year))) %>%
    ungroup

-输出

# A tibble: 5 × 3
  date      year datenew   
  <chr>    <dbl> <date>    
1 19052003  2010 2003-05-19
2 29062012  2010 2012-06-29
3 008//210  2010 NA        
4 05/2/001  2010 NA        
5 01011980  2010 1980-01-01