R - 扩展一系列日期之间的值并将其作为列添加到 data.table
R - Expanding a value between a sequence of dates and add as columns to data.table
更新:
akrun 提供的建议解决方案对我有效 但是 ,我的问题是 value.var = RATING
中定义的值是 only相应的日期栏。请注意,所有定义为 RATING_DATE
和 VALID_THRU_DATE
之间的时间段的月份都没有填写。
到目前为止我尝试过但失败了:
而不是像这样定义 dcast 操作
dt1 <- dcast(setDT(ratings.dt), ISSUE_ID + RATING_TYPE ~ RATING_DATE,
value.var = 'RATING')
我试过了
dt1 <- dcast(setDT(ratings.dt),
ISSUE_ID + RATING_TYPE ~ (VALID_THRU_DATE - RATING_DATE),
value.var = 'RATING')
dt1 <- dcast(setDT(ratings.dt),
ISSUE_ID + RATING_TYPE ~ as.yearmon(seq(
RATING_DATE, VALID_THRU_DATE), frac = 1),
value.var = 'RATING')
dt1 <- dcast(setDT(ratings.dt),
ISSUE_ID + RATING_TYPE ~ (RATING_DATE:VALID_THRU_DATE),
value.var = 'RATING')
我认为我可以只使用定义每个评级有效期的 2 列,因为它们都是 dcast()
函数调用中的日期列,但显然该任务背后的逻辑更难以概念化。
现在我通过首先构建一个“骨架 data.table”手动概念化此任务,然后通过在长格式中循环 row-wise 原始评分 data.table 来填充骨架并展开骨架中两个日期之间定义的评级 table。 (我将 RATING 重命名为 RATING_NUM 以区别于“原始”字母数字评级)
# (0) Filter only the most recent rating within a given month
ratings_num.dt <- ratings_num.dt[,
.SD[.N],
by = .(ISSUE_ID, RATING_TYPE, RATING_DATE)]
# (1) Defining start and end date for the rating time series
start_date <- as.Date("1990-01-01", "%Y-%m-%d")
end_date <- as.Date("2021-01-31", "%Y-%m-%d")
# (2) Define the dates as new columns for a skeleton data.table
new_cols <- seq(from = start_date,
to = end_date,
by = "month")
new_cols <- date_ymd_to_m_end(new_cols)
new_col_names <- as.character(new_cols, "%Y-%m-%d")
# (3) Determine how many months the rating time series spans
N_months <- elapsed_months_lubri(start_date, end_date) + 1
# some function to do just what the name implies
MONTH_ID <- c(1:N_months)
# (4) Define the layout of the new skeleton table
# Note: The new table should contain the 3 rows per issue ID, namely the rating time series of each issue ID for every considered rating ageny
rating_type.vec <- c("FR", "MR", "SPR")
df_skeleton <- data.frame(rep(issue_IDs.vec, each = 3), rating_type.vec)
someInitialValue <- 0
# Credit to Jonas
to_Add <- setNames(data.frame(matrix(rep(
someInitialValue, nrow(df_skeleton)*length(new_col_names)),
ncol = length(new_col_names),
nrow = NROW(df))),
new_col_names)
ratings_num_ts.df <- cbind(df_skeleton, to_Add)
ratings_num_ts.dt <- setDT(ratings_num_ts.df)
setnames(ratings_num_ts.dt,
c("rep.issue_IDs.vec..each...3.", "rating_type.vec"),
c("ISSUE_ID", "RATING_TYPE"))
# (5) Create a data.table to join on ratings_num.dt to add month IDs to use for assigning ratings
seq_dates.dt <- setDT(data.frame(new_cols, MONTH_ID))
seq_dates.dt <- setnames(seq_dates.dt, c("new_cols"), c("RATING_DATE"))
ratings_num.dt <- ratings_num.dt[seq_dates.dt,
on = .(RATING_DATE = RATING_DATE)]
ratings_num.dt <- ratings_num.dt[seq_dates.dt,
on = .(RATING_VAL_THRU = RATING_DATE)]
# (6) If for the joined MONTH_IDs there is no corresponding RATING_DATE or RATING_VAL_THRU entry, the join will write NA values for these values in the joined table and can be filtered out accordingly
ratings_num.dt <- ratings_num.dt[!is.na(ISSUE_ID)]
# (7) Rename column of second MONTH_ID
setnames(ratings_num.dt,
c("MONTH_ID", "i.MONTH_ID"),
c("MONTH_ID_START", "MONTH_ID_END"))
# (8) Sort table by setting keys
setkey(ratings_num.dt, ISSUE_ID, RATING_TYPE, RATING_DATE)
# (9) Defining logic as loop
tic()
i <- 1
j <- nrow(ratings_num.dt)
id.vec <- ratings_num.dt[, ISSUE_ID]
rating_type.vec <- ratings_num.dt[, RATING_TYPE]
month_ID_start.vec <- (ratings_num.dt[, MONTH_ID_START] + 2)
month_ID_end.vec <- (ratings_num.dt[, MONTH_ID_END] + 2)
rating_num.vec <- ratings_num.dt[, RATING_NUM]
total <- j
pb <- progress_bar$new(format = "[:bar] :current/:total
(:percent) eta: :eta", total = total)
spread_ratings_to_ts <- function(dt_source, dt_ts) {
pb$tick(0)
for (i in 1:j) {
id <- id.vec[i] # alternatively ROW_ID == i
rating_type <- rating_type.vec[i]
month_ID_start <- month_ID_start.vec[i] # change to right value
month_ID_end <- month_ID_end.vec[i]
rating_num <- rating_num.vec[i]
dt_ts[ISSUE_ID == id & RATING_TYPE == rating_type,
(month_ID_start:month_ID_end) := rating_num]
if (i %% 50 == 0) {
pb$tick()
}
i <- i + 1
}
}
spread_ratings_to_ts(ratings_num.dt, ratings_num_ts.dt)
toc()
## ~ 3,600 sec for ~ 250k rows to loop through ##
# (10) Compute rating means
# Substitute all pre-filled zeros in the table with NA as there is simply no
# rating available at this point in time
ratings_num_ts.dt <- ratings_num_ts.dt %>%
na_if(0)
ratings_num_ts.dt <- rbind(ratings_num_ts.dt,
ratings_num_ts.dt[,
c(.(RATING_TYPE = 'Mean'),
lapply(.SD, mean, na.rm=TRUE)),
by = .(ISSUE_ID),
.SDcols = -(1:2)])
setkey(ratings_num_ts.dt, ISSUE_ID, RATING_TYPE)
我尝试使用 foreach(...) %dopar% function(...)
并行化此循环,如下所示,但它目前无法正常工作。这主要是由上面非常低效的循环的运行时所激发的——尽管它工作得很好并且完成了我想要的。在处理 foreach 函数调用时,我特别不确定如何编写一个 suitable 组合函数,我可以将其放入 foreach 调用中,以根据需要包装结果。
i <- 1
j <- nrow(ratings_num.dt)
id.vec <- ratings_num.dt[, ISSUE_ID]
rating_type.vec <- ratings_num.dt[, RATING_TYPE]
# col 1+2 not rating but ISSUE_ID and RATING_TYPE
month_ID_start.vec <- (ratings_num.dt[, MONTH_ID_START] + 2)
month_ID_end.vec <- (ratings_num.dt[, MONTH_ID_END] + 2)
rating_num.vec <- ratings_num.dt[, RATING_NUM]
spread_ratings_to_ts <- function(dt_source, dt_ts) {
id <- id.vec[i]
rating_type <- rating_type.vec[i]
month_ID_start <- month_ID_start.vec[i]
month_ID_end <- month_ID_end.vec[i]
rating_num <- rating_num.vec[i]
dt_ts[ISSUE_ID == id & RATING_TYPE == rating_type][,
(month_ID_start:month_ID_end) := rating_num]
}
myCluster <- makeCluster(((detectCores()/2) - 1), type = "PSOCK")
registerDoParallel(myCluster)
clusterEvalQ(cl = myCluster, {
setMKLthreads(1)
})
foreach(i = 1:j, .combine = 'rbind') %dopar%
spread_ratings_to_ts(dt_source = ratings_num.dt,
dt_ts = ratings_num_ts.dt)
stopCluster(myCluster)
背景/数据:
理论上这很容易,即使是 3 岁的孩子也可以手动完成这项任务,但即使在解决这个问题将近一周后,我也没有进一步的解决方案。
问题:
我正在处理一个大型财务数据集。它包含由 ISSUE_ID
确定的债券发行及其相应的 RATING
,由 3 家评级机构惠誉、穆迪和标准普尔提供,定义为 RATING_TYPE
。我为每个评级确定了一个发布日期和一个定义为 RATING_DATE
和 VALID_THRU_DATE
的 valid-thru 日期,两者都是 DATE
类型。所有日期都由 yearmonth() 格式化为给定月份的最后一天,因为它们的评级用于确定 index-inclusion 在月底评估其规则。
ISSUE_ID
的类型是 numeric
RATING
的类型是 character
RATING_TYPE
的类型是 character
我的数据设置为 data.table,名为 ratings.dt
,我需要为其添加开始日期和结束日期之间的列。我的目标是每个问题 ID 有 3 行,每个评级机构各自的评级历史记录的时间序列。
data.table 的键设置为 ISSUE_ID、RATING_TYPE 和 RATING_DATE。
数据现在如下所示:
ISSUE_ID RATING_TYPE RATING RATING_DATE VALID_THRU_DATE RATING_DATE_SEQ
123 FR 3.33 2000-01-31 2000-04-31 1
123 FR 4.00 2000-05-31 2000-02-28 2
123 FR 3.66 2001-03-31 2001-04-31 3
123 FR 2.00 2001-05-31 2001-04-30 4
123 FR 2.33 2001-04-30 2003-12-31 5
123 FR 3.00 2004-01-31 2004-06-30 6
123 MR 2.33 1999-04-31 1999-12-31 1
123 MR 2.66 2000-01-31 2000-04-31 2
123 MR 3.00 2001-03-31 2001-04-30 3
123 MR 3.33 2001-05-31 2003-01-31 4
123 MR 3.00 2003-02-28 2003-07-31 5
123 SP 3.33 1999-04-31 2002-03-31 1
123 SP 3.00 2002-04-31 2003-05-31 2
244 ...
现在我想基本上将 RATING
中定义的评级分布到一系列日期中。
我想这样去做:
ISSUE_ID RATING_TYPE 1999-04-30 1999-05-31 ... 2000-01-31 2000-02-28 ... 2004-06-30
123 FR ... 3.33 2.33 ... 3.00
123 MR 2.33 2.33 ... 2.66 2.66 ...
123 SP 3.33 3.33 ... 3.33 2.66 ...
244 ...
这样我可以做:
ISSUE_ID RATING_TYPE 1999-04-30 1999-05-31 ... 2000-01-31 2000-02-28 ... 2004-06-30
123 FR ... 3.33 2.33 ... 3.00
123 MR 2.33 2.33 ... 2.66 2.66 ...
123 SP 3.33 3.33 ... 3.33 2.66 ...
123 Mean 2.83 2.83 ... 3.11 2.55 ...
然后我可以通过 data.table 这样的语法计算每个问题 ID 每月的平均评分
ratings.dt[,
lapply(.SD, mean),
.SDcols = x:y, # col indexes of added date sequence columns
by = .(ISSUE_ID)]
使用我的映射 table 将字母数字评级(例如 AAA、B+、C- 等)转换为数值以允许 numeric-based 算术计算(例如平均值),我可以将数字评分平均回到字母数字评分。那将意味着任务完成!
另外,我现在不确定这个问题是否可以更有效地概念化。非常感谢任何指点!
我们使用 pivot_wider
转换宽格式,按 summarise
进行分组以通过将其他观察值与 mean
值连接来创建 'Mean' 行。使用 dplyr
version >=1.0
,summarise
可以 return 每组多行
library(dplyr)
library(tidyr)
ratings.dt %>%
select(-VALID_THRU_DATE, -RATING_DATE_SEQ) %>%
pivot_wider(names_from = RATING_DATE, values_from = RATING) %>%
group_by(ISSUE_ID) %>%
summarise(RATING_TYPE = c(RATING_TYPE, "Mean"),
across(where(is.numeric), ~ c(., mean(., na.rm = TRUE))), .groups = 'drop')
-输出
# A tibble: 4 x 11
# ISSUE_ID RATING_TYPE `2000-01-31` `2000-05-31` `2001-03-31` `2001-05-31` `2001-04-30` `2004-01-31` `1999-04-31`
# <int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 123 FR 3.33 4 3.66 2 2.33 3 NA
#2 123 MR 2.66 NA 3 3.33 NA NA 2.33
#3 123 SP NA NA NA NA NA NA 3.33
#4 123 Mean 3.00 4 3.33 2.66 2.33 3 2.83
# … with 2 more variables: `2003-02-28` <dbl>, `2002-04-31` <dbl>
或使用data.table
library(data.table)
dt1 <- dcast(setDT(ratings.dt), ISSUE_ID + RATING_TYPE ~ RATING_DATE,
value.var = 'RATING')
rbind(dt1, dt1[, c(.(RATING_TYPE = 'Mean'), lapply(.SD, mean, na.rm = TRUE)), .(ISSUE_ID), .SDcols = -(1:2)])
# ISSUE_ID RATING_TYPE 1999-04-31 2000-01-31 2000-05-31 2001-03-31 2001-04-30 2001-05-31 2002-04-31 2003-02-28
#1: 123 FR NA 3.330 4 3.66 2.33 2.000 NA NA
#2: 123 MR 2.33 2.660 NA 3.00 NA 3.330 NA 3
#3: 123 SP 3.33 NA NA NA NA NA 3 NA
#4: 123 Mean 2.83 2.995 4 3.33 2.33 2.665 3 3
# 2004-01-31
#1: 3
#2: NA
#3: NA
#4: 3
数据
ratings.dt <- structure(list(ISSUE_ID = c(123L, 123L, 123L, 123L, 123L, 123L,
123L, 123L, 123L, 123L, 123L, 123L, 123L), RATING_TYPE = c("FR",
"FR", "FR", "FR", "FR", "FR", "MR", "MR", "MR", "MR", "MR", "SP",
"SP"), RATING = c(3.33, 4, 3.66, 2, 2.33, 3, 2.33, 2.66, 3, 3.33,
3, 3.33, 3), RATING_DATE = c("2000-01-31", "2000-05-31", "2001-03-31",
"2001-05-31", "2001-04-30", "2004-01-31", "1999-04-31", "2000-01-31",
"2001-03-31", "2001-05-31", "2003-02-28", "1999-04-31", "2002-04-31"
), VALID_THRU_DATE = c("2000-04-31", "2000-02-28", "2001-04-31",
"2001-04-30", "2003-12-31", "2004-06-30", "1999-12-31", "2000-04-31",
"2001-04-30", "2003-01-31", "2003-07-31", "2002-03-31", "2003-05-31"
), RATING_DATE_SEQ = c(1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L,
5L, 1L, 2L)), class = "data.frame", row.names = c(NA, -13L))
更新:
akrun 提供的建议解决方案对我有效 但是 ,我的问题是 value.var = RATING
中定义的值是 only相应的日期栏。请注意,所有定义为 RATING_DATE
和 VALID_THRU_DATE
之间的时间段的月份都没有填写。
到目前为止我尝试过但失败了: 而不是像这样定义 dcast 操作
dt1 <- dcast(setDT(ratings.dt), ISSUE_ID + RATING_TYPE ~ RATING_DATE,
value.var = 'RATING')
我试过了
dt1 <- dcast(setDT(ratings.dt),
ISSUE_ID + RATING_TYPE ~ (VALID_THRU_DATE - RATING_DATE),
value.var = 'RATING')
dt1 <- dcast(setDT(ratings.dt),
ISSUE_ID + RATING_TYPE ~ as.yearmon(seq(
RATING_DATE, VALID_THRU_DATE), frac = 1),
value.var = 'RATING')
dt1 <- dcast(setDT(ratings.dt),
ISSUE_ID + RATING_TYPE ~ (RATING_DATE:VALID_THRU_DATE),
value.var = 'RATING')
我认为我可以只使用定义每个评级有效期的 2 列,因为它们都是 dcast()
函数调用中的日期列,但显然该任务背后的逻辑更难以概念化。
现在我通过首先构建一个“骨架 data.table”手动概念化此任务,然后通过在长格式中循环 row-wise 原始评分 data.table 来填充骨架并展开骨架中两个日期之间定义的评级 table。 (我将 RATING 重命名为 RATING_NUM 以区别于“原始”字母数字评级)
# (0) Filter only the most recent rating within a given month
ratings_num.dt <- ratings_num.dt[,
.SD[.N],
by = .(ISSUE_ID, RATING_TYPE, RATING_DATE)]
# (1) Defining start and end date for the rating time series
start_date <- as.Date("1990-01-01", "%Y-%m-%d")
end_date <- as.Date("2021-01-31", "%Y-%m-%d")
# (2) Define the dates as new columns for a skeleton data.table
new_cols <- seq(from = start_date,
to = end_date,
by = "month")
new_cols <- date_ymd_to_m_end(new_cols)
new_col_names <- as.character(new_cols, "%Y-%m-%d")
# (3) Determine how many months the rating time series spans
N_months <- elapsed_months_lubri(start_date, end_date) + 1
# some function to do just what the name implies
MONTH_ID <- c(1:N_months)
# (4) Define the layout of the new skeleton table
# Note: The new table should contain the 3 rows per issue ID, namely the rating time series of each issue ID for every considered rating ageny
rating_type.vec <- c("FR", "MR", "SPR")
df_skeleton <- data.frame(rep(issue_IDs.vec, each = 3), rating_type.vec)
someInitialValue <- 0
# Credit to Jonas
to_Add <- setNames(data.frame(matrix(rep(
someInitialValue, nrow(df_skeleton)*length(new_col_names)),
ncol = length(new_col_names),
nrow = NROW(df))),
new_col_names)
ratings_num_ts.df <- cbind(df_skeleton, to_Add)
ratings_num_ts.dt <- setDT(ratings_num_ts.df)
setnames(ratings_num_ts.dt,
c("rep.issue_IDs.vec..each...3.", "rating_type.vec"),
c("ISSUE_ID", "RATING_TYPE"))
# (5) Create a data.table to join on ratings_num.dt to add month IDs to use for assigning ratings
seq_dates.dt <- setDT(data.frame(new_cols, MONTH_ID))
seq_dates.dt <- setnames(seq_dates.dt, c("new_cols"), c("RATING_DATE"))
ratings_num.dt <- ratings_num.dt[seq_dates.dt,
on = .(RATING_DATE = RATING_DATE)]
ratings_num.dt <- ratings_num.dt[seq_dates.dt,
on = .(RATING_VAL_THRU = RATING_DATE)]
# (6) If for the joined MONTH_IDs there is no corresponding RATING_DATE or RATING_VAL_THRU entry, the join will write NA values for these values in the joined table and can be filtered out accordingly
ratings_num.dt <- ratings_num.dt[!is.na(ISSUE_ID)]
# (7) Rename column of second MONTH_ID
setnames(ratings_num.dt,
c("MONTH_ID", "i.MONTH_ID"),
c("MONTH_ID_START", "MONTH_ID_END"))
# (8) Sort table by setting keys
setkey(ratings_num.dt, ISSUE_ID, RATING_TYPE, RATING_DATE)
# (9) Defining logic as loop
tic()
i <- 1
j <- nrow(ratings_num.dt)
id.vec <- ratings_num.dt[, ISSUE_ID]
rating_type.vec <- ratings_num.dt[, RATING_TYPE]
month_ID_start.vec <- (ratings_num.dt[, MONTH_ID_START] + 2)
month_ID_end.vec <- (ratings_num.dt[, MONTH_ID_END] + 2)
rating_num.vec <- ratings_num.dt[, RATING_NUM]
total <- j
pb <- progress_bar$new(format = "[:bar] :current/:total
(:percent) eta: :eta", total = total)
spread_ratings_to_ts <- function(dt_source, dt_ts) {
pb$tick(0)
for (i in 1:j) {
id <- id.vec[i] # alternatively ROW_ID == i
rating_type <- rating_type.vec[i]
month_ID_start <- month_ID_start.vec[i] # change to right value
month_ID_end <- month_ID_end.vec[i]
rating_num <- rating_num.vec[i]
dt_ts[ISSUE_ID == id & RATING_TYPE == rating_type,
(month_ID_start:month_ID_end) := rating_num]
if (i %% 50 == 0) {
pb$tick()
}
i <- i + 1
}
}
spread_ratings_to_ts(ratings_num.dt, ratings_num_ts.dt)
toc()
## ~ 3,600 sec for ~ 250k rows to loop through ##
# (10) Compute rating means
# Substitute all pre-filled zeros in the table with NA as there is simply no
# rating available at this point in time
ratings_num_ts.dt <- ratings_num_ts.dt %>%
na_if(0)
ratings_num_ts.dt <- rbind(ratings_num_ts.dt,
ratings_num_ts.dt[,
c(.(RATING_TYPE = 'Mean'),
lapply(.SD, mean, na.rm=TRUE)),
by = .(ISSUE_ID),
.SDcols = -(1:2)])
setkey(ratings_num_ts.dt, ISSUE_ID, RATING_TYPE)
我尝试使用 foreach(...) %dopar% function(...)
并行化此循环,如下所示,但它目前无法正常工作。这主要是由上面非常低效的循环的运行时所激发的——尽管它工作得很好并且完成了我想要的。在处理 foreach 函数调用时,我特别不确定如何编写一个 suitable 组合函数,我可以将其放入 foreach 调用中,以根据需要包装结果。
i <- 1
j <- nrow(ratings_num.dt)
id.vec <- ratings_num.dt[, ISSUE_ID]
rating_type.vec <- ratings_num.dt[, RATING_TYPE]
# col 1+2 not rating but ISSUE_ID and RATING_TYPE
month_ID_start.vec <- (ratings_num.dt[, MONTH_ID_START] + 2)
month_ID_end.vec <- (ratings_num.dt[, MONTH_ID_END] + 2)
rating_num.vec <- ratings_num.dt[, RATING_NUM]
spread_ratings_to_ts <- function(dt_source, dt_ts) {
id <- id.vec[i]
rating_type <- rating_type.vec[i]
month_ID_start <- month_ID_start.vec[i]
month_ID_end <- month_ID_end.vec[i]
rating_num <- rating_num.vec[i]
dt_ts[ISSUE_ID == id & RATING_TYPE == rating_type][,
(month_ID_start:month_ID_end) := rating_num]
}
myCluster <- makeCluster(((detectCores()/2) - 1), type = "PSOCK")
registerDoParallel(myCluster)
clusterEvalQ(cl = myCluster, {
setMKLthreads(1)
})
foreach(i = 1:j, .combine = 'rbind') %dopar%
spread_ratings_to_ts(dt_source = ratings_num.dt,
dt_ts = ratings_num_ts.dt)
stopCluster(myCluster)
背景/数据: 理论上这很容易,即使是 3 岁的孩子也可以手动完成这项任务,但即使在解决这个问题将近一周后,我也没有进一步的解决方案。
问题:
我正在处理一个大型财务数据集。它包含由 ISSUE_ID
确定的债券发行及其相应的 RATING
,由 3 家评级机构惠誉、穆迪和标准普尔提供,定义为 RATING_TYPE
。我为每个评级确定了一个发布日期和一个定义为 RATING_DATE
和 VALID_THRU_DATE
的 valid-thru 日期,两者都是 DATE
类型。所有日期都由 yearmonth() 格式化为给定月份的最后一天,因为它们的评级用于确定 index-inclusion 在月底评估其规则。
ISSUE_ID
的类型是 numeric
RATING
的类型是 character
RATING_TYPE
的类型是 character
我的数据设置为 data.table,名为 ratings.dt
,我需要为其添加开始日期和结束日期之间的列。我的目标是每个问题 ID 有 3 行,每个评级机构各自的评级历史记录的时间序列。
data.table 的键设置为 ISSUE_ID、RATING_TYPE 和 RATING_DATE。
数据现在如下所示:
ISSUE_ID RATING_TYPE RATING RATING_DATE VALID_THRU_DATE RATING_DATE_SEQ
123 FR 3.33 2000-01-31 2000-04-31 1
123 FR 4.00 2000-05-31 2000-02-28 2
123 FR 3.66 2001-03-31 2001-04-31 3
123 FR 2.00 2001-05-31 2001-04-30 4
123 FR 2.33 2001-04-30 2003-12-31 5
123 FR 3.00 2004-01-31 2004-06-30 6
123 MR 2.33 1999-04-31 1999-12-31 1
123 MR 2.66 2000-01-31 2000-04-31 2
123 MR 3.00 2001-03-31 2001-04-30 3
123 MR 3.33 2001-05-31 2003-01-31 4
123 MR 3.00 2003-02-28 2003-07-31 5
123 SP 3.33 1999-04-31 2002-03-31 1
123 SP 3.00 2002-04-31 2003-05-31 2
244 ...
现在我想基本上将 RATING
中定义的评级分布到一系列日期中。
我想这样去做:
ISSUE_ID RATING_TYPE 1999-04-30 1999-05-31 ... 2000-01-31 2000-02-28 ... 2004-06-30
123 FR ... 3.33 2.33 ... 3.00
123 MR 2.33 2.33 ... 2.66 2.66 ...
123 SP 3.33 3.33 ... 3.33 2.66 ...
244 ...
这样我可以做:
ISSUE_ID RATING_TYPE 1999-04-30 1999-05-31 ... 2000-01-31 2000-02-28 ... 2004-06-30
123 FR ... 3.33 2.33 ... 3.00
123 MR 2.33 2.33 ... 2.66 2.66 ...
123 SP 3.33 3.33 ... 3.33 2.66 ...
123 Mean 2.83 2.83 ... 3.11 2.55 ...
然后我可以通过 data.table 这样的语法计算每个问题 ID 每月的平均评分
ratings.dt[,
lapply(.SD, mean),
.SDcols = x:y, # col indexes of added date sequence columns
by = .(ISSUE_ID)]
使用我的映射 table 将字母数字评级(例如 AAA、B+、C- 等)转换为数值以允许 numeric-based 算术计算(例如平均值),我可以将数字评分平均回到字母数字评分。那将意味着任务完成!
另外,我现在不确定这个问题是否可以更有效地概念化。非常感谢任何指点!
我们使用 pivot_wider
转换宽格式,按 summarise
进行分组以通过将其他观察值与 mean
值连接来创建 'Mean' 行。使用 dplyr
version >=1.0
,summarise
可以 return 每组多行
library(dplyr)
library(tidyr)
ratings.dt %>%
select(-VALID_THRU_DATE, -RATING_DATE_SEQ) %>%
pivot_wider(names_from = RATING_DATE, values_from = RATING) %>%
group_by(ISSUE_ID) %>%
summarise(RATING_TYPE = c(RATING_TYPE, "Mean"),
across(where(is.numeric), ~ c(., mean(., na.rm = TRUE))), .groups = 'drop')
-输出
# A tibble: 4 x 11
# ISSUE_ID RATING_TYPE `2000-01-31` `2000-05-31` `2001-03-31` `2001-05-31` `2001-04-30` `2004-01-31` `1999-04-31`
# <int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 123 FR 3.33 4 3.66 2 2.33 3 NA
#2 123 MR 2.66 NA 3 3.33 NA NA 2.33
#3 123 SP NA NA NA NA NA NA 3.33
#4 123 Mean 3.00 4 3.33 2.66 2.33 3 2.83
# … with 2 more variables: `2003-02-28` <dbl>, `2002-04-31` <dbl>
或使用data.table
library(data.table)
dt1 <- dcast(setDT(ratings.dt), ISSUE_ID + RATING_TYPE ~ RATING_DATE,
value.var = 'RATING')
rbind(dt1, dt1[, c(.(RATING_TYPE = 'Mean'), lapply(.SD, mean, na.rm = TRUE)), .(ISSUE_ID), .SDcols = -(1:2)])
# ISSUE_ID RATING_TYPE 1999-04-31 2000-01-31 2000-05-31 2001-03-31 2001-04-30 2001-05-31 2002-04-31 2003-02-28
#1: 123 FR NA 3.330 4 3.66 2.33 2.000 NA NA
#2: 123 MR 2.33 2.660 NA 3.00 NA 3.330 NA 3
#3: 123 SP 3.33 NA NA NA NA NA 3 NA
#4: 123 Mean 2.83 2.995 4 3.33 2.33 2.665 3 3
# 2004-01-31
#1: 3
#2: NA
#3: NA
#4: 3
数据
ratings.dt <- structure(list(ISSUE_ID = c(123L, 123L, 123L, 123L, 123L, 123L,
123L, 123L, 123L, 123L, 123L, 123L, 123L), RATING_TYPE = c("FR",
"FR", "FR", "FR", "FR", "FR", "MR", "MR", "MR", "MR", "MR", "SP",
"SP"), RATING = c(3.33, 4, 3.66, 2, 2.33, 3, 2.33, 2.66, 3, 3.33,
3, 3.33, 3), RATING_DATE = c("2000-01-31", "2000-05-31", "2001-03-31",
"2001-05-31", "2001-04-30", "2004-01-31", "1999-04-31", "2000-01-31",
"2001-03-31", "2001-05-31", "2003-02-28", "1999-04-31", "2002-04-31"
), VALID_THRU_DATE = c("2000-04-31", "2000-02-28", "2001-04-31",
"2001-04-30", "2003-12-31", "2004-06-30", "1999-12-31", "2000-04-31",
"2001-04-30", "2003-01-31", "2003-07-31", "2002-03-31", "2003-05-31"
), RATING_DATE_SEQ = c(1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L,
5L, 1L, 2L)), class = "data.frame", row.names = c(NA, -13L))