在 R 中,将 "Oct 29 - Nov 1" 转换为“20201029”和“20201101”

In R, convert "Oct 29 - Nov 1" to "20201029" and "20201101"

我正在处理从网站上抓取的凌乱 table,为了使日期栏更有用,我需要清理抓取的内容。我们的数据看起来像这样:

# Scraped sample: each Dates value is a date range, some still carrying the
# "\r\n" + space padding from the web page; points is kept as character.
# Row names are the original (non-contiguous) row indices of the scraped table.
mydata <- data.frame(
  Dates = c(
    "Sep\r\n            \r\n            10 - 13",
    "Oct\r\n            \r\n            8 - 11",
    "Oct 29 - Nov 1",
    "Nov\r\n            \r\n            19 - 22",
    "Jan\r\n            \r\n            21 - 24",
    "Mar\r\n            \r\n            4 - 7",
    "Apr 29 - May 2"
  ),
  points = c("500", "500", "500", "500", "500", "550", "500"),
  row.names = c(1L, 5L, 8L, 11L, 16L, 23L, 32L),
  stringsAsFactors = FALSE
)


> mydata
                                        Dates points
1  Sep\r\n            \r\n            10 - 13    500
5   Oct\r\n            \r\n            8 - 11    500
8                              Oct 29 - Nov 1    500
11 Nov\r\n            \r\n            19 - 22    500
16 Jan\r\n            \r\n            21 - 24    500
23   Mar\r\n            \r\n            4 - 7    550
32                             Apr 29 - May 2    500

Dates 中的每个值都是一个日期范围,实际上应该拆分为 startDate 和 endDate 两列。那么我们要创建的是:

> newdata
     StartDate   EndDate  points
1     20200910  20200913     500
1     20201008  20201011     500
1     20201029  20201101     500
1     20201119  20201122     500
1     20210121  20210124     500
1     20210304  20210307     500
1     20210429  20210502     500

我们可以假设 9 月 - 12 月的所有日期都是 2020 年,1 月 - 8 月的所有日期都是 2021 年。

编辑 1

这可能不是最简洁的代码,但我成功地将 Dates 列拆分为 2 列

  # Split every scraped range at the hyphen into a raw start part and a raw
  # end part, one row per input string.
  cleaning_dates_df <- as.data.frame(
    do.call("rbind", strsplit(mydata$Dates, "-", fixed = TRUE)),
    stringsAsFactors = FALSE
  )
  colnames(cleaning_dates_df) <- c("start", "end")

  # Collapse the scraped "\r\n" + space padding into single spaces and trim.
  # Normalising first matters: splitting "10 - 13" on "-" leaves a trailing
  # space on the start part, and a greedy ".*\\s" applied to the raw text
  # would run through that trailing space and drop the day entirely.
  # (Backslashes in the patterns are doubled; a bare "\s" is not a valid
  # escape in an R string literal.)
  cleaning_dates_df$start <- trimws(gsub("\\s+", " ", cleaning_dates_df$start))
  cleaning_dates_df$end <- trimws(gsub("\\s+", " ", cleaning_dates_df$end))

  # An end part with no letters (e.g. "13") names no month of its own, so it
  # borrows the month from the start part: "Sep 10" / "13" -> "Sep 13".
  end_lacks_month <- !grepl("[[:alpha:]]", cleaning_dates_df$end)
  cleaning_dates_df$end[end_lacks_month] <- paste(
    sub(" .*", "", cleaning_dates_df$start[end_lacks_month]),
    cleaning_dates_df$end[end_lacks_month]
  )

  head(cleaning_dates_df, 8)

...还需要转换成YYYYMMDD

我不会说它很漂亮,但是你可以先使用正则表达式来提取所有部分:

# Pattern groups, in order: start month, the whitespace between month and day
# (one character per repetition; only the last is captured), start day,
# optional end month, end day. The input is lower-cased before matching, so
# [a-z] is sufficient for the month abbreviations.
# Regex escapes are written with doubled backslashes ("\\s", "\\d") because a
# single-backslash "\s" / "\d" is rejected as an unknown escape in an R string
# literal; "\r" and "\n" are valid string escapes and are left as they are.
rgx <- "^([a-z]+)(\r|\n|\\s)+(\\d+)\\s-\\s([a-z]+)*\\s*(\\d+)$"
# One column per capture group; "x" only absorbs the separator group.
td <- strcapture(rgx, tolower(mydata$Dates),
                 proto = list(mth1 = "", x = "", day1 = "", mth2 = "", day2 = ""))

只提到一个月时重复月份:

# Ranges that stay within one month leave the end-month capture empty; reuse
# the start month for those rows. which() drops NA comparisons, so rows that
# failed to match the pattern (all-NA in td) stay NA instead of triggering
# "NAs are not allowed in subscripted assignments".
same_month <- which(td$mth2 == "")
td$mth2[same_month] <- td$mth1[same_month]

将月份转换为数字,然后决定是 2020 年还是 2021 年:

# Turn the lower-cased month abbreviations into month numbers (1-12).
td$mth1 <- match(td$mth1, tolower(month.abb))
td$mth2 <- match(td$mth2, tolower(month.abb))
# Months from September onwards belong to 2020, earlier months to 2021.
td$yr1 <- ifelse(td$mth1 >= 9, 2020, 2021)
td$yr2 <- ifelse(td$mth2 >= 9, 2020, 2021)

从不同的部分构建日期:

# Assemble "year/month/day" strings from the parsed parts and convert them to
# Date columns on the original data frame.
mydata$startdate <- with(td, as.Date(paste(yr1, mth1, day1, sep = "/")))
mydata$enddate <- with(td, as.Date(paste(yr2, mth2, day2, sep = "/")))

完成!:

mydata

#                                        Dates points  startdate    enddate
#1  Sep\r\n            \r\n            10 - 13    500 2020-09-10 2020-09-13
#5   Oct\r\n            \r\n            8 - 11    500 2020-10-08 2020-10-11
#8                              Oct 29 - Nov 1    500 2020-10-29 2020-11-01
#11 Nov\r\n            \r\n            19 - 22    500 2020-11-19 2020-11-22
#16 Jan\r\n            \r\n            21 - 24    500 2021-01-21 2021-01-24
#23   Mar\r\n            \r\n            4 - 7    550 2021-03-04 2021-03-07
#32                             Apr 29 - May 2    500 2021-04-29 2021-05-02

你可以试试:

library(lubridate)
library(dplyr)

# Replace the range hyphen with a space, then split on runs of whitespace
# (this also swallows the scraped "\r\n" padding). Base strsplit() is used so
# the snippet needs no package beyond the two loaded above, and the pattern is
# written "\\s+" because a bare "\s" is not a valid escape in an R string.
# A three-piece result (month, day, day) is a range within one month, so the
# month is repeated in front of the end day to give month/day/month/day.
d <- do.call(
  rbind,
  lapply(
    strsplit(gsub("-", " ", mydata$Dates, fixed = TRUE), "\\s+"),
    function(x) if (length(x) == 3) append(x, x[1], after = 2) else x
  )
)

# Parse everything as a 2020 date first.
# NOTE(review): "%b" matches month abbreviations in the current locale;
# English abbreviations are assumed here.
start_date <- as.Date(paste(d[, 1], d[, 2], "2020", sep = "-"), format = "%b-%d-%Y")
end_date <- as.Date(paste(d[, 3], d[, 4], "2020", sep = "-"), format = "%b-%d-%Y")

# Months before September belong to the following year (2021).
start_date <- if_else(month(start_date) < 9, start_date + years(1), start_date)
end_date <- if_else(month(end_date) < 9, end_date + years(1), end_date)

data.frame(start_date, end_date, mydata$points)

  start_date   end_date mydata.points
1 2020-09-10 2020-09-13           500
2 2020-10-08 2020-10-11           500
3 2020-10-29 2020-11-01           500
4 2020-11-19 2020-11-22           500
5 2021-01-21 2021-01-24           500
6 2021-03-04 2021-03-07           550
7 2021-04-29 2021-05-02           500

除非您有理由不这样做,否则最好以日期格式保存数据。但如果你需要它们作为显示的字符串,你可以使用 format(),例如:

# Render the Date vector as compact "YYYYMMDD" strings (e.g. "20200910").
format(start_date, "%Y%m%d")

这是一个凌乱的 Base R 解决方案:

# Left-pad single-character date parts with a zero ("9" -> "09"); parts that
# are already longer than one character are passed through unchanged.
pad_dateparts <- function(date_vec) {
  is_single_char <- nchar(date_vec) == 1
  ifelse(is_single_char, paste0("0", date_vec), date_vec)
}
   
# Month numbers for each observation: months_ => list with one entry per row.
# Every month abbreviation found in the text is looked up in month.abb and
# zero-padded via pad_dateparts().
months_ <- lapply(
  regmatches(
    mydata$Dates,
    gregexpr(paste0(month.abb, collapse = "|"), mydata$Dates)
  ),
  function(found) {
    # A range inside one month names it only once; use it for both endpoints.
    if (length(found) == 1) {
      found <- rep(found, 2)
    }
    pad_dateparts(match(found, month.abb))
  }
)

# Store the day numbers for each observation: days_ => list of characters
# Every run of non-digits (month names, "\r\n" padding, the range hyphen) is
# collapsed to a single space, leaving e.g. "10 13", which is then split into
# its day numbers and zero-padded.
# The regex backslashes are doubled ("\\D", "\\s") because single-backslash
# "\D" / "\s" are rejected as unknown escapes in R string literals.
# strsplit() is already vectorised and always returns a list, so it is called
# directly instead of through sapply().
days_ <- lapply(
  strsplit(trimws(gsub("\\D+", " ", mydata$Dates), "both"), "\\s+"),
  pad_dateparts
)

# Assign a year to each entry of a chronologically ordered vector of month
# parts: start at start_year and add one every time the month number drops
# relative to the previous entry (i.e. the sequence wrapped into a new year).
increment_years <- function(start_year, ordered_month_vec) {
  month_numbers <- as.integer(ordered_month_vec)
  rolled_over <- c(FALSE, diff(month_numbers) < 0)
  start_year + cumsum(rolled_over)
}

# Store the year parts: years_ => list with one entry per observation, each
# holding the year of the start month and the year of the end month (these
# are plain vectors, not data.frames).
# The months are row-bound into a two-column table (start month, end month);
# increment_years() is applied down each column, which relies on the rows
# being in chronological order. split() then walks the resulting matrix
# column by column, recycling the row index seq_along(months_), so values
# from the same row land in the same list element.
years_ <- split(apply(data.frame(do.call(rbind, months_)), 2, 
                      function(x){increment_years(2020, x)}), seq_along(months_))
    
# Build the final table: clean_df => data.frame
# For every observation the year, month and day parts are pasted together
# element-wise and converted to integers of the form YYYYMMDD (start, end);
# the rows are stacked, named StartDate / EndDate, and the original points
# column is attached as Points.
clean_df <- cbind(
  setNames(
    data.frame(
      do.call(
        rbind,
        Map(
          function(yr, mth, dy) as.integer(paste0(yr, mth, dy)),
          years_, months_, days_
        )
      ),
      row.names = NULL,
      stringsAsFactors = FALSE
    ),
    c("StartDate", "EndDate")
  ),
  Points = mydata$points
)

数据:

# Sample input: scraped date ranges (some still containing the page's
# "\r\n" + space padding) and their points, with the original row indices
# kept as row names.
mydata <- data.frame(
  Dates = c(
    "Sep\r\n            \r\n            10 - 13",
    "Oct\r\n            \r\n            8 - 11",
    "Oct 29 - Nov 1",
    "Nov\r\n            \r\n            19 - 22",
    "Jan\r\n            \r\n            21 - 24",
    "Mar\r\n            \r\n            4 - 7",
    "Apr 29 - May 2"
  ),
  points = c("500", "500", "500", "500", "500", "550", "500"),
  row.names = c(1L, 5L, 8L, 11L, 16L, 23L, 32L),
  stringsAsFactors = FALSE
)