R中累积概率质量函数矩阵的快速随机抽样

Fast random sampling from matrix of cumulative probability mass functions in R

我有一个矩阵 (mat_cdf),表示人口普查区 i 中的个人在给定日期移动到人口普查区 j 的累积概率。给定一个由决定不“待在家里”的代理(其所在普查区)组成的向量,下面的 GetCTMove 函数会从这个矩阵中随机抽样,以确定每个代理将在哪个人口普查区度过时间。

# Random generation
cts <- 500
i <- rgamma(cts, 50, 1)
prop <- 1:cts

# Matrix of Poisson probability masses: row r holds dpois(prop, i[r]), so
# column j of a row is the mass that rate i[r] puts on the integer j.
mat <- do.call(rbind, lapply(i, function(rate) dpois(prop, rate)))

# Convert each row to a cumulative probability mass (running sum over columns).
mat_cdf <- matrix(NA_real_, cts, cts)

# The loop index is deliberately not called `i`: that name holds the rate
# vector defined above, and reusing it here would overwrite the vector.
for (ct_row in seq_len(cts)) {
  # cumsum() yields every prefix sum of the row in a single pass.
  mat_cdf[ct_row, ] <- cumsum(mat[ct_row, ])
}

# Sample a destination column (census tract) for every agent.
#   agent_cts:  row indices into ct_mat_cdf, one per agent (home tract).
#   ct_mat_cdf: matrix whose row i is the cumulative probability mass of moving
#               from tract i to each column tract.
# Returns an integer vector with one column index per agent.
GetCTMove <- function(agent_cts, ct_mat_cdf) {
  # Expand so that every agent has its own CDF row. drop = FALSE keeps the
  # result a matrix even for a single agent; without it the row collapses to a
  # vector and max.col() would treat each CDF entry as a separate row.
  mat_expand <- ct_mat_cdf[agent_cts, , drop = FALSE]

  # One uniform draw per agent. In the comparison below the draws are recycled
  # down the columns, so row k is always compared against s[k].
  s <- runif(length(agent_cts))

  # First column whose cumulative mass exceeds the draw. A row with no such
  # column is an all-FALSE tie, which "first" resolves to column 1.
  max.col(s < mat_expand, ties.method = "first")
}

# Sample of 500,000 agents' residence ct (drawn with replacement from 1..cts)
agents <- sample(seq_len(cts), size = 500000, replace = TRUE)

# Run function and report how long one sampling pass takes
system.time(GetCTMove(agents, mat_cdf))
 user  system elapsed 
   3.09    1.19    4.30 

使用 100 万个代理时,每次抽样大约需要 10 秒才能运行完;再乘以许多时间步长,每次模拟就需要数小时,而这个函数是目前模型中的速率限制因素。我想知道是否有人对更快地实现这种随机抽样有建议。我已经使用 dqrng 包来加速随机数生成,但与矩阵扩展 (mat_expand) 和 max.col 调用相比,这部分耗时相对微不足道,后两者的运行时间最长。

您首先可以优化的是以下代码:

# Per row of the logical matrix, the index of the first TRUE column
max.col(s < mat_expand, "first")

由于 s < mat_expand 返回的是逻辑矩阵,应用 max.col 函数等同于获取每行中第一个 TRUE 的位置。在这种情况下,使用 which 会更有效率。此外,如下所示,您将所有 CDF 存储在一个矩阵中。

# Poisson mass matrix: row r is dpois(prop, i[r])
mat <- do.call(rbind, lapply(i, function(rate) dpois(prop, rate)))
mat_cdf <- matrix(NA_real_, cts, cts)
# The loop index is not named `i`, so the rate vector `i` is left intact
for (ct_row in seq_len(cts)) {
  # Running sum across the row gives its cumulative mass function
  mat_cdf[ct_row, ] <- cumsum(mat[ct_row, ])
}

这个结构可能不是最优的。list 结构更适合应用 which 等函数,运行起来也更快,因为您不必经过 do.call(rbind, ...)。

# Build the CDFs as a list instead: one cumulative-mass vector per rate in `i`
ls_cdf <- lapply(i, function(rate) {
  pmf <- dpois(prop, rate)
  cumsum(pmf)
})

下面是你的实现:

# Implementation 1
# Matrix-based sampler: expands the CDF matrix to one row per agent and picks,
# per row, the first column whose cumulative mass exceeds a uniform draw.
#   agent_cts:  row indices into ct_mat_cdf, one per agent.
#   ct_mat_cdf: matrix of row-wise cumulative probability masses.
# Returns an integer vector with one column index per agent.
GetCTMove <- function(agent_cts, ct_mat_cdf) {
  # drop = FALSE keeps a matrix even for a single agent; without it the row
  # collapses to a vector and max.col() would treat each entry as its own row.
  mat_expand <- ct_mat_cdf[agent_cts, , drop = FALSE]
  # One draw per agent; recycled down the columns in the comparison below, so
  # row k is compared against s[k].
  s <- runif(length(agent_cts))
  # A row with no TRUE is an all-FALSE tie, which "first" resolves to column 1.
  max.col(s < mat_expand, ties.method = "first")
}

在我的桌面上,运行大约需要 2.68 秒。

> system.time(GetCTMove(agents, mat_cdf))
   user  system elapsed 
   2.25    0.41    2.68 

使用 list 结构和 which 函数,运行时间可以减少 1 秒左右:

# Implementation 2
# List-based sampler: draws one uniform number per agent and, agent by agent,
# finds the first position in that agent's CDF vector that exceeds the draw.
#   agent_cts: positions into ls_cdf, one per agent.
#   ls_cdf:    list of cumulative-mass vectors.
# Returns an integer vector with one position per agent.
GetCTMove2 <- function(agent_cts, ls_cdf) {
  n_agents <- length(agent_cts)
  draws <- runif(n_agents)
  picked <- integer(n_agents)
  for (k in seq_len(n_agents)) {
    agent_cdf <- ls_cdf[[agent_cts[[k]]]]
    above_draw <- which(draws[[k]] < agent_cdf)
    # [[1L]] takes the first qualifying position (and errors if there is none)
    picked[[k]] <- above_draw[[1L]]
  }
  picked
}

> system.time(GetCTMove2(agents, ls_cdf))
   user  system elapsed 
   1.59    0.02    1.64 

据我所知,仅用 R 没有其他方法可以进一步加速代码。但是,您确实可以通过用 C++ 重写关键函数 GetCTMove 来提高性能。使用 Rcpp 包,您可以执行以下操作:

# Implementation 3
# Compiles the C++ function fast_GetCTMove() with Rcpp::cppFunction().
# Arguments: agents (1-based positions into cdfs, hence the `- 1` in the C++
# code), s (one draw per agent) and cdfs (list of CDF vectors).
# For each agent the inner loop scans that agent's CDF from the left, stores
# the 1-based position (j + 1) of the first entry greater than s[i], and stops.
# NOTE(review): when no entry exceeds s[i] nothing is written to out[i], so it
# keeps the value NumericVector out(n) was created with (presumably 0) -
# confirm callers never rely on that case.
Rcpp::cppFunction('NumericVector fast_GetCTMove(NumericVector agents, NumericVector s, List cdfs) {
  int n = agents.size(); 
  NumericVector out(n); 
  for (int i = 0; i < n; ++i) {
    NumericVector cdf = as<NumericVector>(cdfs[agents[i] - 1]); 
    int m = cdf.size(); 
    for (int j = 0; j < m; ++j) {
      if (s[i] < cdf[j]) {
        out[i] = j + 1;
        break;
      }
    }
  }
  return out;
}')
# R-level wrapper: draws one uniform number per agent and hands the CDF lookup
# to the compiled fast_GetCTMove().
GetCTMove3 <- function(agent_cts, ls_cdf) {
  n_agents <- length(agent_cts)
  draws <- runif(n_agents)
  fast_GetCTMove(agent_cts, draws, ls_cdf)
}

此实现速度快如闪电,应该可以满足您的所有需求。

> system.time(GetCTMove3(agents, ls_cdf))
   user  system elapsed 
   0.07    0.00    0.06 

完整脚本如下:

# Random generation
cts <- 500
i <- rgamma(cts, 50, 1)
prop <- 1:cts
agents <- sample(seq_len(cts), size = 500000, replace = TRUE)

# using a list structure to speed up the creation of cdfs
ls_cdf <- lapply(i, function(x) cumsum(dpois(prop, x)))
# matrix version of the same CDFs: row r is the cumulative mass for rate i[r]
mat <- do.call(rbind, lapply(i, function(rate) dpois(prop, rate)))
mat_cdf <- matrix(NA_real_, cts, cts)
# The loop index is not named `i`, so the rate vector `i` is left intact
for (ct_row in seq_len(cts)) {
  # cumsum() yields every prefix sum of the row in a single pass
  mat_cdf[ct_row, ] <- cumsum(mat[ct_row, ])
}

# Implementation 1
# Matrix-based sampler: expands the CDF matrix to one row per agent and picks,
# per row, the first column whose cumulative mass exceeds a uniform draw.
#   agent_cts:  row indices into ct_mat_cdf, one per agent.
#   ct_mat_cdf: matrix of row-wise cumulative probability masses.
# Returns an integer vector with one column index per agent.
GetCTMove <- function(agent_cts, ct_mat_cdf) {
  # drop = FALSE keeps a matrix even for a single agent; without it the row
  # collapses to a vector and max.col() would treat each entry as its own row.
  mat_expand <- ct_mat_cdf[agent_cts, , drop = FALSE]
  # One draw per agent; recycled down the columns in the comparison below, so
  # row k is compared against s[k].
  s <- runif(length(agent_cts))
  # A row with no TRUE is an all-FALSE tie, which "first" resolves to column 1.
  max.col(s < mat_expand, ties.method = "first")
}


# Implementation 2
# List-based sampler: draws one uniform number per agent and, agent by agent,
# finds the first position in that agent's CDF vector that exceeds the draw.
#   agent_cts: positions into ls_cdf, one per agent.
#   ls_cdf:    list of cumulative-mass vectors.
# Returns an integer vector with one position per agent.
GetCTMove2 <- function(agent_cts, ls_cdf) {
  n_agents <- length(agent_cts)
  draws <- runif(n_agents)
  picked <- integer(n_agents)
  for (k in seq_len(n_agents)) {
    agent_cdf <- ls_cdf[[agent_cts[[k]]]]
    above_draw <- which(draws[[k]] < agent_cdf)
    # [[1L]] takes the first qualifying position (and errors if there is none)
    picked[[k]] <- above_draw[[1L]]
  }
  picked
}


# Implementation 3
# Compiles the C++ function fast_GetCTMove() with Rcpp::cppFunction().
# Arguments: agents (1-based positions into cdfs, hence the `- 1` in the C++
# code), s (one draw per agent) and cdfs (list of CDF vectors).
# For each agent the inner loop scans that agent's CDF from the left, stores
# the 1-based position (j + 1) of the first entry greater than s[i], and stops.
# NOTE(review): when no entry exceeds s[i] nothing is written to out[i], so it
# keeps the value NumericVector out(n) was created with (presumably 0) -
# confirm callers never rely on that case.
Rcpp::cppFunction('NumericVector fast_GetCTMove(NumericVector agents, NumericVector s, List cdfs) {
  int n = agents.size(); 
  NumericVector out(n); 
  for (int i = 0; i < n; ++i) {
    NumericVector cdf = as<NumericVector>(cdfs[agents[i] - 1]); 
    int m = cdf.size(); 
    for (int j = 0; j < m; ++j) {
      if (s[i] < cdf[j]) {
        out[i] = j + 1;
        break;
      }
    }
  }
  return out;
}')
# R-level wrapper: draws one uniform number per agent and hands the CDF lookup
# to the compiled fast_GetCTMove().
GetCTMove3 <- function(agent_cts, ls_cdf) {
  n_agents <- length(agent_cts)
  draws <- runif(n_agents)
  fast_GetCTMove(agent_cts, draws, ls_cdf)
}


# Time each of the three implementations on the same agent sample
system.time(GetCTMove(agents, mat_cdf))
system.time(GetCTMove2(agents, ls_cdf))
system.time(GetCTMove3(agents, ls_cdf))