R - 扩展一系列日期之间的值并将其作为列添加到 data.table

R - Expanding a value between a sequence of dates and add as columns to data.table

更新: akrun 提供的建议解决方案对我有效 但是 ,我的问题是 value.var = RATING 中定义的值是 only相应的日期栏。请注意,所有定义为 RATING_DATEVALID_THRU_DATE 之间的时间段的月份都没有填写。

到目前为止我尝试过但失败了: 而不是像这样定义 dcast 操作

dt1 <- dcast(setDT(ratings.dt), ISSUE_ID + RATING_TYPE ~ RATING_DATE, 
      value.var = 'RATING')

我试过了

  dt1 <- dcast(setDT(ratings.dt), 
                 ISSUE_ID + RATING_TYPE ~ (VALID_THRU_DATE - RATING_DATE), 
                 value.var = 'RATING')      


 dt1 <- dcast(setDT(ratings.dt), 
             ISSUE_ID + RATING_TYPE ~ as.yearmon(seq(
             RATING_DATE, VALID_THRU_DATE), frac = 1), 
             value.var = 'RATING')


dt1 <- dcast(setDT(ratings.dt), 
                 ISSUE_ID + RATING_TYPE ~ (RATING_DATE:VALID_THRU_DATE), 
                 value.var = 'RATING')

我认为我可以只使用定义每个评级有效期的 2 列,因为它们都是 dcast() 函数调用中的日期列,但显然该任务背后的逻辑更难以概念化。

现在我通过首先构建一个“骨架 data.table”手动概念化此任务,然后通过在长格式中循环 row-wise 原始评分 data.table 来填充骨架并展开骨架中两个日期之间定义的评级 table。 (我将 RATING 重命名为 RATING_NUM 以区别于“原始”字母数字评级)

# (0) Filter only the most recent rating within a given month
ratings_num.dt <- ratings_num.dt[, 
                                 .SD[.N], 
                                 by = .(ISSUE_ID, RATING_TYPE, RATING_DATE)] 

# (1) Defining start and end date for the rating time series
start_date    <- as.Date("1990-01-01", "%Y-%m-%d")
end_date      <- as.Date("2021-01-31", "%Y-%m-%d")

# (2) Define the dates as new columns for a skeleton data.table
new_cols      <- seq(from = start_date, 
                     to = end_date,
                     by = "month")
new_cols      <- date_ymd_to_m_end(new_cols)
new_col_names <- as.character(new_cols, "%Y-%m-%d")

# (3) Determine how many months the rating time series spans 
N_months <- elapsed_months_lubri(start_date, end_date) + 1 
            # some function to do just what the name implies

MONTH_ID <- c(1:N_months)

# (4) Define the layout of the new skeleton table
# Note: The new table should contain the 3 rows per issue ID, namely the rating time series of each issue ID for every considered rating ageny 

rating_type.vec <- c("FR", "MR", "SPR")    

df_skeleton <- data.frame(rep(issue_IDs.vec, each = 3), rating_type.vec)

someInitialValue <- 0

# Credit to Jonas
to_Add <- setNames(data.frame(matrix(rep(
            someInitialValue, nrow(df_skeleton)*length(new_col_names)), 
            ncol = length(new_col_names), 
            nrow = NROW(df))), 
            new_col_names)

ratings_num_ts.df <- cbind(df_skeleton, to_Add)
ratings_num_ts.dt <- setDT(ratings_num_ts.df)

setnames(ratings_num_ts.dt, 
         c("rep.issue_IDs.vec..each...3.", "rating_type.vec"),
         c("ISSUE_ID", "RATING_TYPE"))

# (5) Create a data.table to join on ratings_num.dt to add month IDs to use for assigning ratings

seq_dates.dt <- setDT(data.frame(new_cols, MONTH_ID))
seq_dates.dt <- setnames(seq_dates.dt, c("new_cols"), c("RATING_DATE"))

ratings_num.dt <- ratings_num.dt[seq_dates.dt, 
                                 on = .(RATING_DATE = RATING_DATE)]

ratings_num.dt <- ratings_num.dt[seq_dates.dt, 
                                 on = .(RATING_VAL_THRU = RATING_DATE)]

# (6) If for the joined MONTH_IDs there is no corresponding RATING_DATE or RATING_VAL_THRU entry, the join will write NA values for these values in the joined table and can be filtered out accordingly

ratings_num.dt <- ratings_num.dt[!is.na(ISSUE_ID)]

# (7) Rename column of second MONTH_ID
setnames(ratings_num.dt,
         c("MONTH_ID", "i.MONTH_ID"),
         c("MONTH_ID_START", "MONTH_ID_END"))

# (8) Sort table by setting keys 
setkey(ratings_num.dt, ISSUE_ID, RATING_TYPE, RATING_DATE)

# (9) Defining logic as loop 
tic()

i <- 1
j <- nrow(ratings_num.dt)
  
id.vec             <- ratings_num.dt[, ISSUE_ID] 
rating_type.vec    <- ratings_num.dt[, RATING_TYPE]
month_ID_start.vec <- (ratings_num.dt[, MONTH_ID_START] + 2)  
month_ID_end.vec   <- (ratings_num.dt[, MONTH_ID_END] + 2)
rating_num.vec     <- ratings_num.dt[, RATING_NUM]

total <- j
pb <- progress_bar$new(format = "[:bar] :current/:total 
                        (:percent) eta: :eta", total = total)

  
spread_ratings_to_ts <- function(dt_source, dt_ts) {
  pb$tick(0)
  for (i in 1:j) {
    id             <- id.vec[i]  # alternatively ROW_ID == i
    rating_type    <- rating_type.vec[i]
    month_ID_start <- month_ID_start.vec[i]  # change to right value
    month_ID_end   <- month_ID_end.vec[i]
    rating_num     <- rating_num.vec[i]
    
    dt_ts[ISSUE_ID == id & RATING_TYPE == rating_type, 
          (month_ID_start:month_ID_end) := rating_num]
    
    if (i %% 50 == 0) {
      pb$tick()
    }  
    
    i <- i + 1
  }
}

spread_ratings_to_ts(ratings_num.dt, ratings_num_ts.dt)

toc() 
## ~ 3,600 sec for ~ 250k rows to loop through ##


# (10) Compute rating means
# Substitute all pre-filled zeros in the table with NA as there is simply no 
# rating available at this point in time

ratings_num_ts.dt <- ratings_num_ts.dt %>% 
                       na_if(0)
ratings_num_ts.dt <- rbind(ratings_num_ts.dt, 
                       ratings_num_ts.dt[, 
                                         c(.(RATING_TYPE = 'Mean'), 
                                             lapply(.SD, mean, na.rm=TRUE)), 
                                         by = .(ISSUE_ID), 
                                         .SDcols = -(1:2)])

setkey(ratings_num_ts.dt, ISSUE_ID, RATING_TYPE)

我尝试使用 foreach(...) %dopar% function(...) 并行化此循环,如下所示,但它目前无法正常工作。这主要是由上面非常低效的循环的运行时所激发的——尽管它工作得很好并且完成了我想要的。在处理 foreach 函数调用时,我特别不确定如何编写一个 suitable 组合函数,我可以将其放入 foreach 调用中,以根据需要包装结果。

i <- 1
j <- nrow(ratings_num.dt)

id.vec             <- ratings_num.dt[, ISSUE_ID]
rating_type.vec    <- ratings_num.dt[, RATING_TYPE]

# col 1+2 not rating but ISSUE_ID and RATING_TYPE
month_ID_start.vec <- (ratings_num.dt[, MONTH_ID_START] + 2) 
month_ID_end.vec   <- (ratings_num.dt[, MONTH_ID_END] + 2)
rating_num.vec     <- ratings_num.dt[, RATING_NUM]

spread_ratings_to_ts <- function(dt_source, dt_ts) {
  id             <- id.vec[i]
  rating_type    <- rating_type.vec[i]
  month_ID_start <- month_ID_start.vec[i]
  month_ID_end   <- month_ID_end.vec[i]
  rating_num     <- rating_num.vec[i]
  
  dt_ts[ISSUE_ID == id & RATING_TYPE == rating_type][, 
        (month_ID_start:month_ID_end) := rating_num]
}   

myCluster <- makeCluster(((detectCores()/2) - 1), type = "PSOCK")
registerDoParallel(myCluster)

clusterEvalQ(cl = myCluster, {
  setMKLthreads(1)
})

foreach(i = 1:j, .combine = 'rbind') %dopar% 
    spread_ratings_to_ts(dt_source = ratings_num.dt,
                         dt_ts = ratings_num_ts.dt)

stopCluster(myCluster)

背景/数据: 理论上这很容易,即使是 3 岁的孩子也可以手动完成这项任务,但即使在解决这个问题将近一周后,我也没有进一步的解决方案。

问题: 我正在处理一个大型财务数据集。它包含由 ISSUE_ID 确定的债券发行及其相应的 RATING,由 3 家评级机构惠誉、穆迪和标准普尔提供,定义为 RATING_TYPE。我为每个评级确定了一个发布日期和一个定义为 RATING_DATEVALID_THRU_DATE 的 valid-thru 日期,两者都是 DATE 类型。所有日期都由 yearmonth() 格式化为给定月份的最后一天,因为它们的评级用于确定 index-inclusion 在月底评估其规则。

ISSUE_ID 的类型是 numeric

RATING 的类型是 character

RATING_TYPE 的类型是 character

我的数据设置为 data.table,名为 ratings.dt,我需要为其添加开始日期和结束日期之间的列。我的目标是每个问题 ID 有 3 行,每个评级机构各自的评级历史记录的时间序列。

data.table 的键设置为 ISSUE_ID、RATING_TYPE 和 RATING_DATE。

数据现在如下所示:

ISSUE_ID  RATING_TYPE  RATING   RATING_DATE   VALID_THRU_DATE RATING_DATE_SEQ
  123       FR           3.33   2000-01-31    2000-04-31             1
  123       FR           4.00   2000-05-31    2000-02-28             2
  123       FR           3.66   2001-03-31    2001-04-31             3
  123       FR           2.00   2001-05-31    2001-04-30             4
  123       FR           2.33   2001-04-30    2003-12-31             5
  123       FR           3.00   2004-01-31    2004-06-30             6
  123       MR           2.33   1999-04-31    1999-12-31             1
  123       MR           2.66   2000-01-31    2000-04-31             2
  123       MR           3.00   2001-03-31    2001-04-30             3
  123       MR           3.33   2001-05-31    2003-01-31             4
  123       MR           3.00   2003-02-28    2003-07-31             5
  123       SP           3.33   1999-04-31    2002-03-31             1
  123       SP           3.00   2002-04-31    2003-05-31             2 
  244       ...

现在我想基本上将 RATING 中定义的评级分布到一系列日期中。 我想这样去做:

 ISSUE_ID  RATING_TYPE   1999-04-30  1999-05-31  ...   2000-01-31  2000-02-28    ...  2004-06-30 
   123        FR                                 ...      3.33         2.33      ...     3.00
   123        MR            2.33         2.33    ...      2.66         2.66      ...
   123        SP            3.33         3.33    ...      3.33         2.66      ...
   244       ...

这样我可以做:

 ISSUE_ID  RATING_TYPE   1999-04-30  1999-05-31  ...   2000-01-31  2000-02-28    ...  2004-06-30 
   123       FR                                  ...      3.33         2.33      ...     3.00
   123       MR            2.33         2.33     ...      2.66         2.66      ...
   123       SP            3.33         3.33     ...      3.33         2.66      ...
   123      Mean           2.83         2.83     ...      3.11         2.55      ... 

然后我可以通过 data.table 这样的语法计算每个问题 ID 每月的平均评分

ratings.dt[, 
           lapply(.SD, mean),
           .SDcols = x:y,       # col indexes of added date sequence columns
           by = .(ISSUE_ID)]

使用我的映射 table 将字母数字评级(例如 AAA、B+、C- 等)转换为数值以允许 numeric-based 算术计算(例如平均值),我可以将数字评分平均回到字母数字评分。那将意味着任务完成!

另外,我现在不确定这个问题是否可以更有效地概念化。非常感谢任何指点!

我们使用 pivot_wider 转换宽格式,按 summarise 进行分组以通过将其他观察值与 mean 值连接来创建 'Mean' 行。使用 dplyr version >=1.0summarise 可以 return 每组多行

library(dplyr)
library(tidyr)
ratings.dt %>%
     select(-VALID_THRU_DATE, -RATING_DATE_SEQ) %>% 
     pivot_wider(names_from = RATING_DATE, values_from = RATING) %>% 
     group_by(ISSUE_ID) %>% 
     summarise(RATING_TYPE = c(RATING_TYPE, "Mean"), 
       across(where(is.numeric), ~ c(., mean(., na.rm = TRUE))), .groups = 'drop')

-输出

# A tibble: 4 x 11
#  ISSUE_ID RATING_TYPE `2000-01-31` `2000-05-31` `2001-03-31` `2001-05-31` `2001-04-30` `2004-01-31` `1999-04-31`
#     <int> <chr>              <dbl>        <dbl>        <dbl>        <dbl>        <dbl>        <dbl>        <dbl>
#1      123 FR                  3.33            4         3.66         2            2.33            3        NA   
#2      123 MR                  2.66           NA         3            3.33        NA              NA         2.33
#3      123 SP                 NA              NA        NA           NA           NA              NA         3.33
#4      123 Mean                3.00            4         3.33         2.66         2.33            3         2.83
# … with 2 more variables: `2003-02-28` <dbl>, `2002-04-31` <dbl>
 

或使用data.table

library(data.table)
dt1 <- dcast(setDT(ratings.dt), ISSUE_ID + RATING_TYPE ~ RATING_DATE, 
      value.var = 'RATING')
rbind(dt1, dt1[, c(.(RATING_TYPE = 'Mean'), lapply(.SD, mean, na.rm = TRUE)), .(ISSUE_ID), .SDcols = -(1:2)])
#   ISSUE_ID RATING_TYPE 1999-04-31 2000-01-31 2000-05-31 2001-03-31 2001-04-30 2001-05-31 2002-04-31 2003-02-28
#1:      123          FR         NA      3.330          4       3.66       2.33      2.000         NA         NA
#2:      123          MR       2.33      2.660         NA       3.00         NA      3.330         NA          3
#3:      123          SP       3.33         NA         NA         NA         NA         NA          3         NA
#4:      123        Mean       2.83      2.995          4       3.33       2.33      2.665          3          3
#   2004-01-31
#1:          3
#2:         NA
#3:         NA
#4:          3

数据

ratings.dt <- structure(list(ISSUE_ID = c(123L, 123L, 123L, 123L, 123L, 123L, 
123L, 123L, 123L, 123L, 123L, 123L, 123L), RATING_TYPE = c("FR", 
"FR", "FR", "FR", "FR", "FR", "MR", "MR", "MR", "MR", "MR", "SP", 
"SP"), RATING = c(3.33, 4, 3.66, 2, 2.33, 3, 2.33, 2.66, 3, 3.33, 
3, 3.33, 3), RATING_DATE = c("2000-01-31", "2000-05-31", "2001-03-31", 
"2001-05-31", "2001-04-30", "2004-01-31", "1999-04-31", "2000-01-31", 
"2001-03-31", "2001-05-31", "2003-02-28", "1999-04-31", "2002-04-31"
), VALID_THRU_DATE = c("2000-04-31", "2000-02-28", "2001-04-31", 
"2001-04-30", "2003-12-31", "2004-06-30", "1999-12-31", "2000-04-31", 
"2001-04-30", "2003-01-31", "2003-07-31", "2002-03-31", "2003-05-31"
), RATING_DATE_SEQ = c(1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 
5L, 1L, 2L)), class = "data.frame", row.names = c(NA, -13L))