估计面板中组合过去时期的公共集合元素的百分比
Estimating the percentage of common set elements over combined past periods in a panel
我有一个时间序列面板数据集,其结构如下:有 2 只基金在每个时间段各自拥有不同的股票。
df <- data.frame(
fund_id = c(1,1,1,1,1,1,1,1, 1, 2,2,2,2),
time_Q = c(1,1,1,2,2,2,2,3, 3, 1,1,2,2),
stock_id = c("A", "B", "C", "A", "C", "D", "E", "D", "E", "A", "B", "B", "C")
)
> df
fund_id time_Q stock_id
1 1 1 A
2 1 1 B
3 1 1 C
4 1 2 A
5 1 2 C
6 1 2 D
7 1 2 E
8 1 3 D
9 1 3 E
10 2 1 A
11 2 1 B
12 2 2 B
13 2 2 C
对于每只基金,我想计算当前 time_Q 持有的股票在前一到 2 年中曾经 持有的股票的百分比宿舍。所以基本上对于每个基金和每个 time_Q,我想有 2 列过去 1 time_Q,过去 1-2 time_Q 显示当时持有的股票百分比也是出现在过去的任何 time_Q 中。
结果应该是这样的:
result <- data.frame(
fund_id = c(1,1,1,2,2),
time_Q = c(1,2,3,1,2),
past_1Q = c("NA",0.5,1,"NA",0.5),
past_2Q = c("NA",0.5,1,"NA",0.5)
)
> result
fund_id time_Q past_1Q past_1_2Q
1 1 1 NA NA
2 1 2 0.5 0.5
3 1 3 1 1
4 2 1 NA NA
5 2 2 0.5 0.5
我已经问过类似的问题 ,但现在我正在寻找过去任何滞后时期的共同元素。我正在寻找一个 dplyr 或 data.table 可扩展的解决方案,我可以在过去的 12 个季度左右处理多个基金、股票和时间段。
提前致谢!
我的解决方案
# dummy data
df <- data.table(fund_id = c(1,1,1,1,1,1,1,1, 1, 2,2,2,2)
, time_Q = c(1,1,1,2,2,2,2,3, 3, 1,1,2,2)
, stock_id = c("A", "B", "C", "A", "C", "D", "E", "D", "E", "A", "B", "B", "C")
); df
# lower case col names
names(df) <- tolower(names(df))
# unique grouping
x <- df[, .(dummy =.N), .(fund_id, time_q)][, dummy := NULL]
# initialise empty table
y <- NULL
# loop
for(i in 1:nrow(x))
{
# current quarter & before
z <- df[fund_id == x[i, fund_id]
& time_q %between% c( x[i, time_q] - 12, x[i, time_q])
]
# current quarter
a <- z[fund_id == x[i, fund_id]
& time_q == x[i, time_q]
, unique(stock_id)
]
# minus 1 to minus 12 quarter (lapply)
b <- lapply(1:12, \(j) z[fund_id == x[i, fund_id]
& time_q %between% c( x[i, time_q] - j, x[i, time_q] - 1)
, unique(stock_id)
]
)
# results
c <- data.table(fund_id = x[i, fund_id]
, current_q = x[i, time_q]
)
# no. of stocks in current quarter
d <- length(a)
# calculate % for the 12 periods
c[, paste0('past_1_to_', 1:12, '_q') := lapply(1:12, \(j) length(intersect(a,b[[j]])) / d) ]
# collect results
y <- rbind(y, c)
}
基准
x <- 1e3
df <- data.table(fund_id = rep(1:x, each = x/10)
, time_Q = rep(1:4, each = x/4)
, stock_id = sample(letters[1:26], size=20, replace=T)
)
在上面 df
上用了 20 秒,有 100k 行和 1,200 个组(fund_id
,time_q
)
我有一个时间序列面板数据集,其结构如下:有 2 只基金在每个时间段各自拥有不同的股票。
df <- data.frame(
fund_id = c(1,1,1,1,1,1,1,1, 1, 2,2,2,2),
time_Q = c(1,1,1,2,2,2,2,3, 3, 1,1,2,2),
stock_id = c("A", "B", "C", "A", "C", "D", "E", "D", "E", "A", "B", "B", "C")
)
> df
fund_id time_Q stock_id
1 1 1 A
2 1 1 B
3 1 1 C
4 1 2 A
5 1 2 C
6 1 2 D
7 1 2 E
8 1 3 D
9 1 3 E
10 2 1 A
11 2 1 B
12 2 2 B
13 2 2 C
对于每只基金,我想计算当前 time_Q 持有的股票在前一到 2 年中曾经 持有的股票的百分比宿舍。所以基本上对于每个基金和每个 time_Q,我想有 2 列过去 1 time_Q,过去 1-2 time_Q 显示当时持有的股票百分比也是出现在过去的任何 time_Q 中。 结果应该是这样的:
result <- data.frame(
fund_id = c(1,1,1,2,2),
time_Q = c(1,2,3,1,2),
past_1Q = c("NA",0.5,1,"NA",0.5),
past_2Q = c("NA",0.5,1,"NA",0.5)
)
> result
fund_id time_Q past_1Q past_1_2Q
1 1 1 NA NA
2 1 2 0.5 0.5
3 1 3 1 1
4 2 1 NA NA
5 2 2 0.5 0.5
我已经问过类似的问题
提前致谢!
我的解决方案
# dummy data
df <- data.table(fund_id = c(1,1,1,1,1,1,1,1, 1, 2,2,2,2)
, time_Q = c(1,1,1,2,2,2,2,3, 3, 1,1,2,2)
, stock_id = c("A", "B", "C", "A", "C", "D", "E", "D", "E", "A", "B", "B", "C")
); df
# lower case col names
names(df) <- tolower(names(df))
# unique grouping
x <- df[, .(dummy =.N), .(fund_id, time_q)][, dummy := NULL]
# initialise empty table
y <- NULL
# loop
for(i in 1:nrow(x))
{
# current quarter & before
z <- df[fund_id == x[i, fund_id]
& time_q %between% c( x[i, time_q] - 12, x[i, time_q])
]
# current quarter
a <- z[fund_id == x[i, fund_id]
& time_q == x[i, time_q]
, unique(stock_id)
]
# minus 1 to minus 12 quarter (lapply)
b <- lapply(1:12, \(j) z[fund_id == x[i, fund_id]
& time_q %between% c( x[i, time_q] - j, x[i, time_q] - 1)
, unique(stock_id)
]
)
# results
c <- data.table(fund_id = x[i, fund_id]
, current_q = x[i, time_q]
)
# no. of stocks in current quarter
d <- length(a)
# calculate % for the 12 periods
c[, paste0('past_1_to_', 1:12, '_q') := lapply(1:12, \(j) length(intersect(a,b[[j]])) / d) ]
# collect results
y <- rbind(y, c)
}
基准
x <- 1e3
df <- data.table(fund_id = rep(1:x, each = x/10)
, time_Q = rep(1:4, each = x/4)
, stock_id = sample(letters[1:26], size=20, replace=T)
)
在上面 df
上用了 20 秒,有 100k 行和 1,200 个组(fund_id
,time_q
)