R中累积概率质量函数矩阵的快速随机抽样
Fast random sampling from matrix of cumulative probability mass functions in R
我有一个矩阵 (mat_cdf
),表示人口普查区 i
中的个人在给定日期移动到人口普查区 j
的累积概率。给定一个决定不“待在家里”的代理向量,我有一个函数,下面的 GetCTMove
函数,从这个矩阵中随机抽样以确定他们将花时间在哪个人口普查区。
# Random generation
cts <- 500
i <- rgamma(cts, 50, 1)
prop <- 1:cts
# Matrix where rows correspond to probability mass of column integer
mat <- do.call(rbind, lapply(i, function(i){dpois(prop, i)}))
# Convert to cumulative probability mass
mat_cdf <- matrix(NA, cts, cts)
for(i in 1:cts){
# Create cdf for row i
mat_cdf[i,] <- sapply(1:cts, function(j) sum(mat[i,1:j]))
}
GetCTMove <- function(agent_cts, ct_mat_cdf){
# Expand such that every agent has its own row corresponding to CDF of movement from their home ct i to j
mat_expand <- ct_mat_cdf[agent_cts,]
# Probabilistically sample column index for every row by generating random number and then determining corresponding closest column
s <- runif(length(agent_cts))
fin_col <- max.col(s < mat_expand, "first")
return(fin_col)
}
# Sample of 500,000 agents' residence ct
agents <- sample(1:cts, size = 500000, replace = T)
# Run function
system.time(GetCTMove(agents, mat_cdf))
user system elapsed
3.09 1.19 4.30
使用 100 万个代理,每个样本需要大约 10 秒到 运行,乘以许多时间步长导致每次模拟需要数小时,而这个函数是迄今为止模型的速率限制因素。我想知道是否有人对更快地实施这种随机抽样有建议。我已经使用 dqrng
包来加速随机数生成,但与矩阵扩展 (mat_expand
) 和 max.col
调用相比,这是相对微不足道的,后者需要最长的 运行.
您首先可以优化的是以下代码:
max.col(s < mat_expand, "first")
由于 s < mat_expand
returns 是逻辑矩阵,应用 max.col
函数与获取每行中的第一个 TRUE
相同。在这种情况下,使用 which
会更有效率。此外,如下所示,您将所有 CDF 存储在一个矩阵中。
mat <- do.call(rbind, lapply(i, function(i){dpois(prop, i)}))
mat_cdf <- matrix(NA, cts, cts)
for(i in 1:cts){
mat_cdf[i,] <- sapply(1:cts, function(j) sum(mat[i,1:j]))
}
这个结构可能不是最优的。 list
结构更适合应用 which
等函数。 运行 也更快,因为您不必经过 do.call(rbind, ...)
.
# using a list structure to speed up the creation of cdfs
ls_cdf <- lapply(i, function(x) cumsum(dpois(prop, x)))
下面是你的实现:
# Implementation 1
GetCTMove <- function(agent_cts, ct_mat_cdf){
mat_expand <- ct_mat_cdf[agent_cts,]
s <- runif(length(agent_cts))
fin_col <- max.col(s < mat_expand, "first")
return(fin_col)
}
在我的桌面上,运行大约需要 2.68 秒。
> system.time(GetCTMove(agents, mat_cdf))
user system elapsed
2.25 0.41 2.68
使用list
结构和which
函数,运行时间可以减少1s左右
# Implementation 2
GetCTMove2 <- function(agent_cts, ls_cdf){
n <- length(agent_cts)
s <- runif(n)
out <- integer(n)
i <- 1L
while (i <= n) {
out[[i]] <- which(s[[i]] < ls_cdf[[agent_cts[[i]]]])[[1L]]
i <- i + 1L
}
out
}
> system.time(GetCTMove2(agents, ls_cdf))
user system elapsed
1.59 0.02 1.64
据我所知,只有 R 没有其他方法可以进一步加速代码。但是,您确实可以通过 re-writing C++ 中的关键函数 GetCTMove
来提高性能。使用 Rcpp
包,您可以执行以下操作:
# Implementation 3
Rcpp::cppFunction('NumericVector fast_GetCTMove(NumericVector agents, NumericVector s, List cdfs) {
int n = agents.size();
NumericVector out(n);
for (int i = 0; i < n; ++i) {
NumericVector cdf = as<NumericVector>(cdfs[agents[i] - 1]);
int m = cdf.size();
for (int j = 0; j < m; ++j) {
if (s[i] < cdf[j]) {
out[i] = j + 1;
break;
}
}
}
return out;
}')
GetCTMove3 <- function(agent_cts, ls_cdf){
s <- runif(length(agent_cts))
fast_GetCTMove(agent_cts, s, ls_cdf)
}
此实现速度快如闪电,应该可以满足您的所有需求。
> system.time(GetCTMove3(agents, ls_cdf))
user system elapsed
0.07 0.00 0.06
完整脚本如下:
# Random generation
cts <- 500
i <- rgamma(cts, 50, 1)
prop <- 1:cts
agents <- sample(1:cts, size = 500000, replace = T)
# using a list structure to speed up the creation of cdfs
ls_cdf <- lapply(i, function(x) cumsum(dpois(prop, x)))
# below is your code
mat <- do.call(rbind, lapply(i, function(i){dpois(prop, i)}))
mat_cdf <- matrix(NA, cts, cts)
for(i in 1:cts){
mat_cdf[i,] <- sapply(1:cts, function(j) sum(mat[i,1:j]))
}
# Implementation 1
GetCTMove <- function(agent_cts, ct_mat_cdf){
mat_expand <- ct_mat_cdf[agent_cts,]
s <- runif(length(agent_cts))
fin_col <- max.col(s < mat_expand, "first")
return(fin_col)
}
# Implementation 2
GetCTMove2 <- function(agent_cts, ls_cdf){
n <- length(agent_cts)
s <- runif(n)
out <- integer(n)
i <- 1L
while (i <= n) {
out[[i]] <- which(s[[i]] < ls_cdf[[agent_cts[[i]]]])[[1L]]
i <- i + 1L
}
out
}
# Implementation 3
Rcpp::cppFunction('NumericVector fast_GetCTMove(NumericVector agents, NumericVector s, List cdfs) {
int n = agents.size();
NumericVector out(n);
for (int i = 0; i < n; ++i) {
NumericVector cdf = as<NumericVector>(cdfs[agents[i] - 1]);
int m = cdf.size();
for (int j = 0; j < m; ++j) {
if (s[i] < cdf[j]) {
out[i] = j + 1;
break;
}
}
}
return out;
}')
GetCTMove3 <- function(agent_cts, ls_cdf){
s <- runif(length(agent_cts))
fast_GetCTMove(agent_cts, s, ls_cdf)
}
system.time(GetCTMove(agents, mat_cdf))
system.time(GetCTMove2(agents, ls_cdf))
system.time(GetCTMove3(agents, ls_cdf))
我有一个矩阵 (mat_cdf
),表示人口普查区 i
中的个人在给定日期移动到人口普查区 j
的累积概率。给定一个决定不“待在家里”的代理向量,我有一个函数,下面的 GetCTMove
函数,从这个矩阵中随机抽样以确定他们将花时间在哪个人口普查区。
# Random generation
cts <- 500
i <- rgamma(cts, 50, 1)
prop <- 1:cts
# Matrix where rows correspond to probability mass of column integer
mat <- do.call(rbind, lapply(i, function(i){dpois(prop, i)}))
# Convert to cumulative probability mass
mat_cdf <- matrix(NA, cts, cts)
for(i in 1:cts){
# Create cdf for row i
mat_cdf[i,] <- sapply(1:cts, function(j) sum(mat[i,1:j]))
}
GetCTMove <- function(agent_cts, ct_mat_cdf){
# Expand such that every agent has its own row corresponding to CDF of movement from their home ct i to j
mat_expand <- ct_mat_cdf[agent_cts,]
# Probabilistically sample column index for every row by generating random number and then determining corresponding closest column
s <- runif(length(agent_cts))
fin_col <- max.col(s < mat_expand, "first")
return(fin_col)
}
# Sample of 500,000 agents' residence ct
agents <- sample(1:cts, size = 500000, replace = T)
# Run function
system.time(GetCTMove(agents, mat_cdf))
user system elapsed
3.09 1.19 4.30
使用 100 万个代理,每个样本需要大约 10 秒到 运行,乘以许多时间步长导致每次模拟需要数小时,而这个函数是迄今为止模型的速率限制因素。我想知道是否有人对更快地实施这种随机抽样有建议。我已经使用 dqrng
包来加速随机数生成,但与矩阵扩展 (mat_expand
) 和 max.col
调用相比,这是相对微不足道的,后者需要最长的 运行.
您首先可以优化的是以下代码:
max.col(s < mat_expand, "first")
由于 s < mat_expand
returns 是逻辑矩阵,应用 max.col
函数与获取每行中的第一个 TRUE
相同。在这种情况下,使用 which
会更有效率。此外,如下所示,您将所有 CDF 存储在一个矩阵中。
mat <- do.call(rbind, lapply(i, function(i){dpois(prop, i)}))
mat_cdf <- matrix(NA, cts, cts)
for(i in 1:cts){
mat_cdf[i,] <- sapply(1:cts, function(j) sum(mat[i,1:j]))
}
这个结构可能不是最优的。 list
结构更适合应用 which
等函数。 运行 也更快,因为您不必经过 do.call(rbind, ...)
.
# using a list structure to speed up the creation of cdfs
ls_cdf <- lapply(i, function(x) cumsum(dpois(prop, x)))
下面是你的实现:
# Implementation 1
GetCTMove <- function(agent_cts, ct_mat_cdf){
mat_expand <- ct_mat_cdf[agent_cts,]
s <- runif(length(agent_cts))
fin_col <- max.col(s < mat_expand, "first")
return(fin_col)
}
在我的桌面上,运行大约需要 2.68 秒。
> system.time(GetCTMove(agents, mat_cdf))
user system elapsed
2.25 0.41 2.68
使用list
结构和which
函数,运行时间可以减少1s左右
# Implementation 2
GetCTMove2 <- function(agent_cts, ls_cdf){
n <- length(agent_cts)
s <- runif(n)
out <- integer(n)
i <- 1L
while (i <= n) {
out[[i]] <- which(s[[i]] < ls_cdf[[agent_cts[[i]]]])[[1L]]
i <- i + 1L
}
out
}
> system.time(GetCTMove2(agents, ls_cdf))
user system elapsed
1.59 0.02 1.64
据我所知,只有 R 没有其他方法可以进一步加速代码。但是,您确实可以通过 re-writing C++ 中的关键函数 GetCTMove
来提高性能。使用 Rcpp
包,您可以执行以下操作:
# Implementation 3
Rcpp::cppFunction('NumericVector fast_GetCTMove(NumericVector agents, NumericVector s, List cdfs) {
int n = agents.size();
NumericVector out(n);
for (int i = 0; i < n; ++i) {
NumericVector cdf = as<NumericVector>(cdfs[agents[i] - 1]);
int m = cdf.size();
for (int j = 0; j < m; ++j) {
if (s[i] < cdf[j]) {
out[i] = j + 1;
break;
}
}
}
return out;
}')
GetCTMove3 <- function(agent_cts, ls_cdf){
s <- runif(length(agent_cts))
fast_GetCTMove(agent_cts, s, ls_cdf)
}
此实现速度快如闪电,应该可以满足您的所有需求。
> system.time(GetCTMove3(agents, ls_cdf))
user system elapsed
0.07 0.00 0.06
完整脚本如下:
# Random generation
cts <- 500
i <- rgamma(cts, 50, 1)
prop <- 1:cts
agents <- sample(1:cts, size = 500000, replace = T)
# using a list structure to speed up the creation of cdfs
ls_cdf <- lapply(i, function(x) cumsum(dpois(prop, x)))
# below is your code
mat <- do.call(rbind, lapply(i, function(i){dpois(prop, i)}))
mat_cdf <- matrix(NA, cts, cts)
for(i in 1:cts){
mat_cdf[i,] <- sapply(1:cts, function(j) sum(mat[i,1:j]))
}
# Implementation 1
GetCTMove <- function(agent_cts, ct_mat_cdf){
mat_expand <- ct_mat_cdf[agent_cts,]
s <- runif(length(agent_cts))
fin_col <- max.col(s < mat_expand, "first")
return(fin_col)
}
# Implementation 2
GetCTMove2 <- function(agent_cts, ls_cdf){
n <- length(agent_cts)
s <- runif(n)
out <- integer(n)
i <- 1L
while (i <= n) {
out[[i]] <- which(s[[i]] < ls_cdf[[agent_cts[[i]]]])[[1L]]
i <- i + 1L
}
out
}
# Implementation 3
Rcpp::cppFunction('NumericVector fast_GetCTMove(NumericVector agents, NumericVector s, List cdfs) {
int n = agents.size();
NumericVector out(n);
for (int i = 0; i < n; ++i) {
NumericVector cdf = as<NumericVector>(cdfs[agents[i] - 1]);
int m = cdf.size();
for (int j = 0; j < m; ++j) {
if (s[i] < cdf[j]) {
out[i] = j + 1;
break;
}
}
}
return out;
}')
GetCTMove3 <- function(agent_cts, ls_cdf){
s <- runif(length(agent_cts))
fast_GetCTMove(agent_cts, s, ls_cdf)
}
system.time(GetCTMove(agents, mat_cdf))
system.time(GetCTMove2(agents, ls_cdf))
system.time(GetCTMove3(agents, ls_cdf))