如何使用 R 在基于代理的模型中管理 3D 数组?
How to manage 3D arrays in agent-based models with R?
我正在使用 R 构建基于代理的模型,但我在尝试使用大型 3D 阵列时遇到了内存问题。在三维数组中,第一维对应时间(从 1 到 3650 天),第二维定义个体或景观单元的属性,第三维代表每个个体或景观单元。在每个时间步长(1 天),每个 3D 数组都使用多个函数进行填充。理想情况下,我想 运行 我的 ABM 在包含大量个体(例如 720000)的大景观(例如,90000 个细胞)上。实际上,这是不可能的,因为内存问题。
目前,3D 数组是在初始化时定义的,因此数据在每个时间步都存储在数组中。然而,要从模型中在 t 处填充一个 3D 数组,我只需要将数据保留在 t – 1 和 t – tf – 1 处,其中 tf 是一个固定的持续时间参数(例如,tf = 320 天)。这是一个用于填充一个 3D 数组的模型函数的示例(有 3 个持续时间参数):
library(ff)
library(magrittr)
library(dplyr)
library(tidyr)
library(gtools)
## Define parameters
tf1 <- 288
tf2 <- 292
tf3 <- 150
## Define 3D array
col_array <- c(letters[seq( from = 1, to = 9 )])
s_array <- ff(-999, dim=c(3650, 9, 2500), dimnames=list(NULL, col_array, as.character(seq(1, 2500, 1))),
filename="s_array.ffd", vmode="double", overwrite = t) ## 3th dimension = patch ID
## Define initial array
initial_s_array <- matrix(sample.int(100, size = 2500*9, replace = TRUE), nrow = 2500, ncol = 9, dimnames=list(NULL, col_array))
## Loop over time
line <- 1
for(t in 1:3650){
print(t)
s_array[t,c("a", "b", "c", "d", "e", "f", "g", "h", "i"),] <- func(v_t_1 = ifelse((t - 1) >= 1, list(s_array[(t - 1),,]), NA)[[1]],
v_t_tf1_1 = ifelse((t - tf1 - 1) >= 1, list(s_array[(t - tf1 - 1),,]), NA)[[1]],
v_t_tf2_1 = ifelse((t - tf2 - 1) >= 1, list(s_array[(t - tf2 - 1),,]), NA)[[1]],
v_t_tf3_1 = ifelse((t - tf3 - 1) >= 1, list(s_array[(t - tf3 - 1),,]), NA)[[1]],
v_t_0 = initial_s_array, columns_names = col_array)
line <- line + 1
}
func <- function(v_t_1, v_t_tf1_1, v_t_tf2_1, v_t_tf3_1, v_t_0, columns_names){
## Data at t-1
dt_t_1 <- (ifelse(!(all(is.na(v_t_1))),
list(v_t_1 %>%
as.data.frame.table(stringsAsFactors = FALSE) %>%
dplyr::mutate_all(as.character)), NA))[[1]]
## Data at t-tf1-1
dt_t_tf1_1 <- (ifelse(!(all(is.na(v_t_tf1_1))),
list(v_t_tf1_1 %>%
as.data.frame.table(stringsAsFactors = FALSE) %>%
dplyr::mutate_all(as.character)), NA))[[1]]
## Data at t-tf2-1
dt_t_tf2_1 <- (ifelse(!(all(is.na(v_t_tf2_1))),
list(v_t_tf2_1 %>%
as.data.frame.table(stringsAsFactors = FALSE) %>%
dplyr::mutate_all(as.character)), NA))[[1]]
## Data at t-tf3-1
dt_t_tf3_1 <- (ifelse(!(all(is.na(v_t_tf3_1))),
list(v_t_tf3_1 %>%
as.data.frame.table(stringsAsFactors = FALSE) %>%
dplyr::mutate_all(as.character)), NA))[[1]]
## Format data at t-1
dt_t_1_reshape <- (ifelse(!(all(is.na(dt_t_1))),
list(dt_t_1 %>%
dplyr::rename(ID = Var2) %>%
tidyr::spread(Var1, Freq) %>%
dplyr::select(ID, columns_names) %>%
dplyr::arrange(match(ID, mixedsort(colnames(v_t_1))))), NA))[[1]]
## Format data at t-tf1-1
dt_t_tf1_1_reshape <- (ifelse(!(all(is.na(dt_t_tf1_1))),
list(dt_t_tf1_1 %>%
dplyr::rename(ID = Var2) %>%
tidyr::spread(Var1, Freq) %>%
dplyr::select(ID, columns_names) %>%
dplyr::arrange(match(ID, mixedsort(colnames(v_t_tf1_1))))), NA))[[1]]
## Format data at t-tf2-1
dt_t_tf2_1_reshape <- (ifelse(!(all(is.na(dt_t_tf2_1))),
list(dt_t_tf2_1 %>%
dplyr::rename(ID = Var2) %>%
tidyr::spread(Var1, Freq) %>%
dplyr::select(ID, columns_names) %>%
dplyr::arrange(match(ID, mixedsort(colnames(v_t_tf2_1))))), NA))[[1]]
## Format data at t-tf3-1
dt_t_tf3_1_reshape <- (ifelse(!(all(is.na(dt_t_tf3_1))),
list(dt_t_tf3_1 %>%
dplyr::rename(ID = Var2) %>%
tidyr::spread(Var1, Freq) %>%
dplyr::select(ID, columns_names) %>%
dplyr::arrange(match(ID, mixedsort(colnames(v_t_tf3_1))))), NA))[[1]]
## Retrieve data
a_t_1 <- (ifelse((!(all(is.na(dt_t_1_reshape)))), list(as.numeric(dt_t_1_reshape[,c("a")])), list(v_t_0[,c("a")])))[[1]]
d_t_1 <- (ifelse((!(all(is.na(dt_t_1_reshape)))), list(as.numeric(dt_t_1_reshape[,c("d")])), list(v_t_0[,c("d")])))[[1]]
g_t_1 <- (ifelse((!(all(is.na(dt_t_1_reshape)))), list(as.numeric(dt_t_1_reshape[,c("f")])), list(v_t_0[,c("f")])))[[1]]
a_t_tf1_1 <- (ifelse(!(all(is.na(dt_t_tf1_1_reshape))), list(as.numeric(dt_t_tf1_1_reshape[,c("a")])), 0))[[1]]
d_t_tf2_1 <- (ifelse(!(all(is.na(dt_t_tf2_1_reshape))), list(as.numeric(dt_t_tf2_1_reshape[,c("d")])), 0))[[1]]
g_t_tf3_1 <- (ifelse(!(all(is.na(dt_t_tf3_1_reshape))), list(as.numeric(dt_t_tf3_1_reshape[,c("f")])), 0))[[1]]
b_t_1 <- (ifelse((!(all(is.na(dt_t_1_reshape)))), list(as.numeric(dt_t_1_reshape[,c("b")])), list(v_t_0[,c("b")])))[[1]]
e_t_1 <- (ifelse((!(all(is.na(dt_t_1_reshape)))), list(as.numeric(dt_t_1_reshape[,c("e")])), list(v_t_0[,c("e")])))[[1]]
h_t_1 <- (ifelse((!(all(is.na(dt_t_1_reshape)))), list(as.numeric(dt_t_1_reshape[,c("h")])), list(v_t_0[,c("h")])))[[1]]
b_t_tf1_1 <- (ifelse(!(all(is.na(dt_t_tf1_1_reshape))), list(as.numeric(dt_t_tf1_1_reshape[,c("b")])), 0))[[1]]
e_t_tf2_1 <- (ifelse(!(all(is.na(dt_t_tf2_1_reshape))), list(as.numeric(dt_t_tf2_1_reshape[,c("e")])), 0))[[1]]
h_t_tf3_1 <- (ifelse(!(all(is.na(dt_t_tf3_1_reshape))), list(as.numeric(dt_t_tf3_1_reshape[,c("h")])), 0))[[1]]
## Define discrete equations
a_t <- round(0.4*a_t_1 + 0.5*a_t_tf1_1)
b_t <- round(0.5*b_t_1 + 0.6*b_t_tf1_1)
c_t <- a_t + b_t
d_t <- round(0.7*d_t_1 + 0.7*d_t_tf2_1)
e_t <- round(0.9*e_t_1 + 0.4*e_t_tf2_1)
f_t <- d_t + e_t
g_t <- round(0.3*g_t_1 + 0.2*g_t_tf3_1)
h_t <- round(0.5*h_t_1 + 0.1*h_t_tf3_1)
i_t <- g_t + h_t
## Update the values
dt_array <- as.matrix(cbind(a_t, b_t, c_t, d_t, e_t, f_t, g_t, h_t, i_t))
## print(dt_array)
## Build the output matrix
dt_array <- t(dt_array)
return(dt_array)
}
函数“func”将t – 1 和t – tf – 1 处的数据作为参数,提供一个3D 数组“s_array”。函数 returns 用于填充 3D 数组的数据框。
我认为我可以通过仅保留 t – 1 和 t – tf – 1 的数据(而不是在从 1 到 3650 天的每个时间步长保留数据)来减少数组的第一维。但是,我不知道如何在每个时间步在 ABM 中管理这些新的 3D 数组(即,如何初始化 3D 数组并仅在 t – 1 和 t – tf – 1 存储数据)?
编辑:
我已经用 90000 个第三维观察值测试了这个例子。每个数组中的行数(即3650)太大。
> s_array <- ff(-999, dim=c(3650, 9, 90000), dimnames=list(NULL, col_array, as.character(seq(1, 90000, 1))),
+ filename="s_array.ffd", vmode="double", overwrite = TRUE)
Error in if (length < 0 || length > .Machine$integer.max) stop("length must be between 1 and .Machine$integer.max") :
missing value where TRUE/FALSE needed
In addition: Warning message:
In ff(-999, dim = c(3650, 9, 90000), dimnames = list(NULL, col_array, :
NAs introduced by coercion to integer range
有没有办法减少行数并应用用于填充数组的函数?
我说 R 可能不理想的原因是因为它的 copy-on-modify 语义,
所以每次你在任何 array/matrix/data 帧中更改某些内容时,
必须复印一份。
我认为 R 可以通过其内存管理做一些聪明的事情,
但仍然。
使用ff
来避免所有时间片同时在 RAM 中可能确实是有利的,
但是您的代码可能过于频繁地更改存储格式,
来回摆弄数据结构。
我认为我把逻辑打包成一对R6
class
(以及一些改进),
也许它可以帮助您开始改善代码的内存使用情况:
suppressPackageStartupMessages({
library(R6)
library(ff)
})
SArray <- R6::R6Class(
"SArray",
public = list(
time_slices = NULL,
initialize = function(sdim, sdimnames) {
self$time_slices <- lapply(1L:sdim[1L], function(ignored) {
ff(NA_real_, vmode="double", dim=sdim[-1L], dimnames=sdimnames[-1L])#, FF_RETURN=FALSE)
})
names(self$time_slices) <- sdimnames[[1L]]
}
)
)
`[.SArray` <- function(s_array, i, j, ...) {
s_array$time_slices[[i]][j,]
}
`[<-.SArray` <- function(s_array, i, j, ..., value) {
s_array$time_slices[[i]][j,] <- value
s_array
}
dim.SArray <- function(x) {
c(length(x$time_slices), dim(x$time_slices[[1L]]))
}
ABM <- R6::R6Class(
"ABM",
public = list(
s_array = NULL,
tf1 = NULL,
tf2 = NULL,
tf3 = NULL,
initialize = function(sdim, sdimnames, tfs) {
self$tf1 <- tfs[1L]
self$tf2 <- tfs[2L]
self$tf3 <- tfs[3L]
self$s_array <- SArray$new(sdim, sdimnames)
},
init_abm = function(seed = NULL) {
set.seed(seed)
sdim <- dim(self$s_array)
s_init <- matrix(sample.int(100L, size = 6L * sdim[3L], replace=TRUE),
nrow=6L, ncol=sdim[3L],
dimnames=list(c("a", "b", "d", "e", "g", "h"),
as.character(1:sdim[3L])))
self$a(1L, s_init["a", ])
self$b(1L, s_init["b", ])
self$c(1L)
self$d(1L, s_init["d", ])
self$e(1L, s_init["e", ])
self$f(1L)
self$g(1L, s_init["g", ])
self$h(1L, s_init["h", ])
self$i(1L)
private$t <- 1L
invisible()
},
can_advance = function() {
private$t < dim(self$s_array)[1L]
},
advance = function(verbose = FALSE) {
t <- private$t + 1L
if (verbose) print(t)
self$a(t)
self$b(t)
self$c(t)
self$d(t)
self$e(t)
self$f(t)
self$g(t)
self$h(t)
self$i(t)
private$t <- t
invisible()
},
# get time slice at t - tf - 1 for given letter
s_tf = function(t, tf, letter) {
t_tf_1 <- t - tf - 1L
if (t_tf_1 > 0L)
self$s_array[t_tf_1, letter, ]
else
0
},
# discrete equations
a = function(t, t_0) {
if (t < 2L) {
t <- 1L
t_1 <- t_0
t_tf_1 <- 0
}
else {
t_1 <- self$s_array[t - 1L, "a", ]
t_tf_1 <- self$s_tf(t, self$tf1, "a")
}
self$s_array[t, "a", ] <- round(0.4 * t_1 + 0.5 * t_tf_1)
invisible()
},
b = function(t, t_0) {
if (t < 2L) {
t <- 1L
t_1 <- t_0
t_tf_1 <- 0
}
else {
t_1 <- self$s_array[t - 1L, "b", ]
t_tf_1 <- self$s_tf(t, self$tf1, "b")
}
self$s_array[t, "b", ] <- round(0.5 * t_1 + 0.6 * t_tf_1)
invisible()
},
c = function(t) {
if (t < 1L) stop("t must be positive")
a_t <- self$s_array[t, "a", ]
b_t <- self$s_array[t, "b", ]
self$s_array[t, "c", ] <- a_t + b_t
invisible()
},
d = function(t, t_0) {
if (t < 2L) {
t <- 1L
t_1 <- t_0
t_tf_1 <- 0
}
else {
t_1 <- self$s_array[t - 1L, "d", ]
t_tf_1 <- self$s_tf(t, self$tf2, "d")
}
self$s_array[t, "d", ] <- round(0.7 * t_1 + 0.7 * t_tf_1)
invisible()
},
e = function(t, t_0) {
if (t < 2L) {
t <- 1L
t_1 <- t_0
t_tf_1 <- 0
}
else {
t_1 <- self$s_array[t - 1L, "e", ]
t_tf_1 <- self$s_tf(t, self$tf2, "e")
}
self$s_array[t, "e", ] <- round(0.9 * t_1 + 0.4 * t_tf_1)
invisible()
},
f = function(t) {
if (t < 1L) stop("t must be positive")
d_t <- self$s_array[t, "d", ]
e_t <- self$s_array[t, "e", ]
self$s_array[t, "f", ] <- d_t + e_t
invisible()
},
g = function(t, t_0) {
if (t < 2L) {
t <- 1L
t_1 <- t_0
t_tf_1 <- 0
}
else {
t_1 <- self$s_array[t - 1L, "g", ]
t_tf_1 <- self$s_tf(t, self$tf3, "g")
}
self$s_array[t, "g", ] <- round(0.3 * t_1 + 0.2 * t_tf_1)
invisible()
},
h = function(t, t_0) {
if (t < 2L) {
t <- 1L
t_1 <- t_0
t_tf_1 <- 0
}
else {
t_1 <- self$s_array[t - 1L, "h", ]
t_tf_1 <- self$s_tf(t, self$tf3, "h")
}
self$s_array[t, "h", ] <- round(0.5 * t_1 + 0.1 * t_tf_1)
invisible()
},
i = function(t) {
if (t < 1L) stop("t must be positive")
g_t <- self$s_array[t, "g", ]
h_t <- self$s_array[t, "h", ]
self$s_array[t, "i", ] <- g_t + h_t
invisible()
}
),
private = list(
t = NULL
)
)
max_t <- 10
abm <- ABM$new(c(max_t, 9, 2500),
list(NULL, letters[1:9], as.character(1:2500)),
c(288L, 292L, 150L))
abm$init_abm()
while (abm$can_advance()) {
abm$advance(TRUE)
}
anyNA(abm$s_array[])
# FALSE
离散方程下的部分函数封装了t < 2L
时初始化的逻辑。
SArray
class 将 3D 数组分成 2D 数组列表以解决 .Machine$integer.max
限制。
我正在使用 R 构建基于代理的模型,但我在尝试使用大型 3D 阵列时遇到了内存问题。在三维数组中,第一维对应时间(从 1 到 3650 天),第二维定义个体或景观单元的属性,第三维代表每个个体或景观单元。在每个时间步长(1 天),每个 3D 数组都使用多个函数进行填充。理想情况下,我想 运行 我的 ABM 在包含大量个体(例如 720000)的大景观(例如,90000 个细胞)上。实际上,这是不可能的,因为内存问题。
目前,3D 数组是在初始化时定义的,因此数据在每个时间步都存储在数组中。然而,要从模型中在 t 处填充一个 3D 数组,我只需要将数据保留在 t – 1 和 t – tf – 1 处,其中 tf 是一个固定的持续时间参数(例如,tf = 320 天)。这是一个用于填充一个 3D 数组的模型函数的示例(有 3 个持续时间参数):
library(ff)
library(magrittr)
library(dplyr)
library(tidyr)
library(gtools)
## Define parameters
tf1 <- 288
tf2 <- 292
tf3 <- 150
## Define 3D array
col_array <- c(letters[seq( from = 1, to = 9 )])
s_array <- ff(-999, dim=c(3650, 9, 2500), dimnames=list(NULL, col_array, as.character(seq(1, 2500, 1))),
filename="s_array.ffd", vmode="double", overwrite = t) ## 3th dimension = patch ID
## Define initial array
initial_s_array <- matrix(sample.int(100, size = 2500*9, replace = TRUE), nrow = 2500, ncol = 9, dimnames=list(NULL, col_array))
## Loop over time
line <- 1
for(t in 1:3650){
print(t)
s_array[t,c("a", "b", "c", "d", "e", "f", "g", "h", "i"),] <- func(v_t_1 = ifelse((t - 1) >= 1, list(s_array[(t - 1),,]), NA)[[1]],
v_t_tf1_1 = ifelse((t - tf1 - 1) >= 1, list(s_array[(t - tf1 - 1),,]), NA)[[1]],
v_t_tf2_1 = ifelse((t - tf2 - 1) >= 1, list(s_array[(t - tf2 - 1),,]), NA)[[1]],
v_t_tf3_1 = ifelse((t - tf3 - 1) >= 1, list(s_array[(t - tf3 - 1),,]), NA)[[1]],
v_t_0 = initial_s_array, columns_names = col_array)
line <- line + 1
}
func <- function(v_t_1, v_t_tf1_1, v_t_tf2_1, v_t_tf3_1, v_t_0, columns_names){
## Data at t-1
dt_t_1 <- (ifelse(!(all(is.na(v_t_1))),
list(v_t_1 %>%
as.data.frame.table(stringsAsFactors = FALSE) %>%
dplyr::mutate_all(as.character)), NA))[[1]]
## Data at t-tf1-1
dt_t_tf1_1 <- (ifelse(!(all(is.na(v_t_tf1_1))),
list(v_t_tf1_1 %>%
as.data.frame.table(stringsAsFactors = FALSE) %>%
dplyr::mutate_all(as.character)), NA))[[1]]
## Data at t-tf2-1
dt_t_tf2_1 <- (ifelse(!(all(is.na(v_t_tf2_1))),
list(v_t_tf2_1 %>%
as.data.frame.table(stringsAsFactors = FALSE) %>%
dplyr::mutate_all(as.character)), NA))[[1]]
## Data at t-tf3-1
dt_t_tf3_1 <- (ifelse(!(all(is.na(v_t_tf3_1))),
list(v_t_tf3_1 %>%
as.data.frame.table(stringsAsFactors = FALSE) %>%
dplyr::mutate_all(as.character)), NA))[[1]]
## Format data at t-1
dt_t_1_reshape <- (ifelse(!(all(is.na(dt_t_1))),
list(dt_t_1 %>%
dplyr::rename(ID = Var2) %>%
tidyr::spread(Var1, Freq) %>%
dplyr::select(ID, columns_names) %>%
dplyr::arrange(match(ID, mixedsort(colnames(v_t_1))))), NA))[[1]]
## Format data at t-tf1-1
dt_t_tf1_1_reshape <- (ifelse(!(all(is.na(dt_t_tf1_1))),
list(dt_t_tf1_1 %>%
dplyr::rename(ID = Var2) %>%
tidyr::spread(Var1, Freq) %>%
dplyr::select(ID, columns_names) %>%
dplyr::arrange(match(ID, mixedsort(colnames(v_t_tf1_1))))), NA))[[1]]
## Format data at t-tf2-1
dt_t_tf2_1_reshape <- (ifelse(!(all(is.na(dt_t_tf2_1))),
list(dt_t_tf2_1 %>%
dplyr::rename(ID = Var2) %>%
tidyr::spread(Var1, Freq) %>%
dplyr::select(ID, columns_names) %>%
dplyr::arrange(match(ID, mixedsort(colnames(v_t_tf2_1))))), NA))[[1]]
## Format data at t-tf3-1
dt_t_tf3_1_reshape <- (ifelse(!(all(is.na(dt_t_tf3_1))),
list(dt_t_tf3_1 %>%
dplyr::rename(ID = Var2) %>%
tidyr::spread(Var1, Freq) %>%
dplyr::select(ID, columns_names) %>%
dplyr::arrange(match(ID, mixedsort(colnames(v_t_tf3_1))))), NA))[[1]]
## Retrieve data
a_t_1 <- (ifelse((!(all(is.na(dt_t_1_reshape)))), list(as.numeric(dt_t_1_reshape[,c("a")])), list(v_t_0[,c("a")])))[[1]]
d_t_1 <- (ifelse((!(all(is.na(dt_t_1_reshape)))), list(as.numeric(dt_t_1_reshape[,c("d")])), list(v_t_0[,c("d")])))[[1]]
g_t_1 <- (ifelse((!(all(is.na(dt_t_1_reshape)))), list(as.numeric(dt_t_1_reshape[,c("f")])), list(v_t_0[,c("f")])))[[1]]
a_t_tf1_1 <- (ifelse(!(all(is.na(dt_t_tf1_1_reshape))), list(as.numeric(dt_t_tf1_1_reshape[,c("a")])), 0))[[1]]
d_t_tf2_1 <- (ifelse(!(all(is.na(dt_t_tf2_1_reshape))), list(as.numeric(dt_t_tf2_1_reshape[,c("d")])), 0))[[1]]
g_t_tf3_1 <- (ifelse(!(all(is.na(dt_t_tf3_1_reshape))), list(as.numeric(dt_t_tf3_1_reshape[,c("f")])), 0))[[1]]
b_t_1 <- (ifelse((!(all(is.na(dt_t_1_reshape)))), list(as.numeric(dt_t_1_reshape[,c("b")])), list(v_t_0[,c("b")])))[[1]]
e_t_1 <- (ifelse((!(all(is.na(dt_t_1_reshape)))), list(as.numeric(dt_t_1_reshape[,c("e")])), list(v_t_0[,c("e")])))[[1]]
h_t_1 <- (ifelse((!(all(is.na(dt_t_1_reshape)))), list(as.numeric(dt_t_1_reshape[,c("h")])), list(v_t_0[,c("h")])))[[1]]
b_t_tf1_1 <- (ifelse(!(all(is.na(dt_t_tf1_1_reshape))), list(as.numeric(dt_t_tf1_1_reshape[,c("b")])), 0))[[1]]
e_t_tf2_1 <- (ifelse(!(all(is.na(dt_t_tf2_1_reshape))), list(as.numeric(dt_t_tf2_1_reshape[,c("e")])), 0))[[1]]
h_t_tf3_1 <- (ifelse(!(all(is.na(dt_t_tf3_1_reshape))), list(as.numeric(dt_t_tf3_1_reshape[,c("h")])), 0))[[1]]
## Define discrete equations
a_t <- round(0.4*a_t_1 + 0.5*a_t_tf1_1)
b_t <- round(0.5*b_t_1 + 0.6*b_t_tf1_1)
c_t <- a_t + b_t
d_t <- round(0.7*d_t_1 + 0.7*d_t_tf2_1)
e_t <- round(0.9*e_t_1 + 0.4*e_t_tf2_1)
f_t <- d_t + e_t
g_t <- round(0.3*g_t_1 + 0.2*g_t_tf3_1)
h_t <- round(0.5*h_t_1 + 0.1*h_t_tf3_1)
i_t <- g_t + h_t
## Update the values
dt_array <- as.matrix(cbind(a_t, b_t, c_t, d_t, e_t, f_t, g_t, h_t, i_t))
## print(dt_array)
## Build the output matrix
dt_array <- t(dt_array)
return(dt_array)
}
函数“func”将t – 1 和t – tf – 1 处的数据作为参数,提供一个3D 数组“s_array”。函数 returns 用于填充 3D 数组的数据框。
我认为我可以通过仅保留 t – 1 和 t – tf – 1 的数据(而不是在从 1 到 3650 天的每个时间步长保留数据)来减少数组的第一维。但是,我不知道如何在每个时间步在 ABM 中管理这些新的 3D 数组(即,如何初始化 3D 数组并仅在 t – 1 和 t – tf – 1 存储数据)?
编辑: 我已经用 90000 个第三维观察值测试了这个例子。每个数组中的行数(即3650)太大。
> s_array <- ff(-999, dim=c(3650, 9, 90000), dimnames=list(NULL, col_array, as.character(seq(1, 90000, 1))),
+ filename="s_array.ffd", vmode="double", overwrite = TRUE)
Error in if (length < 0 || length > .Machine$integer.max) stop("length must be between 1 and .Machine$integer.max") :
missing value where TRUE/FALSE needed
In addition: Warning message:
In ff(-999, dim = c(3650, 9, 90000), dimnames = list(NULL, col_array, :
NAs introduced by coercion to integer range
有没有办法减少行数并应用用于填充数组的函数?
我说 R 可能不理想的原因是因为它的 copy-on-modify 语义, 所以每次你在任何 array/matrix/data 帧中更改某些内容时, 必须复印一份。 我认为 R 可以通过其内存管理做一些聪明的事情, 但仍然。
使用ff
来避免所有时间片同时在 RAM 中可能确实是有利的,
但是您的代码可能过于频繁地更改存储格式,
来回摆弄数据结构。
我认为我把逻辑打包成一对R6
class
(以及一些改进),
也许它可以帮助您开始改善代码的内存使用情况:
suppressPackageStartupMessages({
library(R6)
library(ff)
})
SArray <- R6::R6Class(
"SArray",
public = list(
time_slices = NULL,
initialize = function(sdim, sdimnames) {
self$time_slices <- lapply(1L:sdim[1L], function(ignored) {
ff(NA_real_, vmode="double", dim=sdim[-1L], dimnames=sdimnames[-1L])#, FF_RETURN=FALSE)
})
names(self$time_slices) <- sdimnames[[1L]]
}
)
)
`[.SArray` <- function(s_array, i, j, ...) {
s_array$time_slices[[i]][j,]
}
`[<-.SArray` <- function(s_array, i, j, ..., value) {
s_array$time_slices[[i]][j,] <- value
s_array
}
dim.SArray <- function(x) {
c(length(x$time_slices), dim(x$time_slices[[1L]]))
}
ABM <- R6::R6Class(
"ABM",
public = list(
s_array = NULL,
tf1 = NULL,
tf2 = NULL,
tf3 = NULL,
initialize = function(sdim, sdimnames, tfs) {
self$tf1 <- tfs[1L]
self$tf2 <- tfs[2L]
self$tf3 <- tfs[3L]
self$s_array <- SArray$new(sdim, sdimnames)
},
init_abm = function(seed = NULL) {
set.seed(seed)
sdim <- dim(self$s_array)
s_init <- matrix(sample.int(100L, size = 6L * sdim[3L], replace=TRUE),
nrow=6L, ncol=sdim[3L],
dimnames=list(c("a", "b", "d", "e", "g", "h"),
as.character(1:sdim[3L])))
self$a(1L, s_init["a", ])
self$b(1L, s_init["b", ])
self$c(1L)
self$d(1L, s_init["d", ])
self$e(1L, s_init["e", ])
self$f(1L)
self$g(1L, s_init["g", ])
self$h(1L, s_init["h", ])
self$i(1L)
private$t <- 1L
invisible()
},
can_advance = function() {
private$t < dim(self$s_array)[1L]
},
advance = function(verbose = FALSE) {
t <- private$t + 1L
if (verbose) print(t)
self$a(t)
self$b(t)
self$c(t)
self$d(t)
self$e(t)
self$f(t)
self$g(t)
self$h(t)
self$i(t)
private$t <- t
invisible()
},
# get time slice at t - tf - 1 for given letter
s_tf = function(t, tf, letter) {
t_tf_1 <- t - tf - 1L
if (t_tf_1 > 0L)
self$s_array[t_tf_1, letter, ]
else
0
},
# discrete equations
a = function(t, t_0) {
if (t < 2L) {
t <- 1L
t_1 <- t_0
t_tf_1 <- 0
}
else {
t_1 <- self$s_array[t - 1L, "a", ]
t_tf_1 <- self$s_tf(t, self$tf1, "a")
}
self$s_array[t, "a", ] <- round(0.4 * t_1 + 0.5 * t_tf_1)
invisible()
},
b = function(t, t_0) {
if (t < 2L) {
t <- 1L
t_1 <- t_0
t_tf_1 <- 0
}
else {
t_1 <- self$s_array[t - 1L, "b", ]
t_tf_1 <- self$s_tf(t, self$tf1, "b")
}
self$s_array[t, "b", ] <- round(0.5 * t_1 + 0.6 * t_tf_1)
invisible()
},
c = function(t) {
if (t < 1L) stop("t must be positive")
a_t <- self$s_array[t, "a", ]
b_t <- self$s_array[t, "b", ]
self$s_array[t, "c", ] <- a_t + b_t
invisible()
},
d = function(t, t_0) {
if (t < 2L) {
t <- 1L
t_1 <- t_0
t_tf_1 <- 0
}
else {
t_1 <- self$s_array[t - 1L, "d", ]
t_tf_1 <- self$s_tf(t, self$tf2, "d")
}
self$s_array[t, "d", ] <- round(0.7 * t_1 + 0.7 * t_tf_1)
invisible()
},
e = function(t, t_0) {
if (t < 2L) {
t <- 1L
t_1 <- t_0
t_tf_1 <- 0
}
else {
t_1 <- self$s_array[t - 1L, "e", ]
t_tf_1 <- self$s_tf(t, self$tf2, "e")
}
self$s_array[t, "e", ] <- round(0.9 * t_1 + 0.4 * t_tf_1)
invisible()
},
f = function(t) {
if (t < 1L) stop("t must be positive")
d_t <- self$s_array[t, "d", ]
e_t <- self$s_array[t, "e", ]
self$s_array[t, "f", ] <- d_t + e_t
invisible()
},
g = function(t, t_0) {
if (t < 2L) {
t <- 1L
t_1 <- t_0
t_tf_1 <- 0
}
else {
t_1 <- self$s_array[t - 1L, "g", ]
t_tf_1 <- self$s_tf(t, self$tf3, "g")
}
self$s_array[t, "g", ] <- round(0.3 * t_1 + 0.2 * t_tf_1)
invisible()
},
h = function(t, t_0) {
if (t < 2L) {
t <- 1L
t_1 <- t_0
t_tf_1 <- 0
}
else {
t_1 <- self$s_array[t - 1L, "h", ]
t_tf_1 <- self$s_tf(t, self$tf3, "h")
}
self$s_array[t, "h", ] <- round(0.5 * t_1 + 0.1 * t_tf_1)
invisible()
},
i = function(t) {
if (t < 1L) stop("t must be positive")
g_t <- self$s_array[t, "g", ]
h_t <- self$s_array[t, "h", ]
self$s_array[t, "i", ] <- g_t + h_t
invisible()
}
),
private = list(
t = NULL
)
)
max_t <- 10
abm <- ABM$new(c(max_t, 9, 2500),
list(NULL, letters[1:9], as.character(1:2500)),
c(288L, 292L, 150L))
abm$init_abm()
while (abm$can_advance()) {
abm$advance(TRUE)
}
anyNA(abm$s_array[])
# FALSE
离散方程下的部分函数封装了t < 2L
时初始化的逻辑。
SArray
class 将 3D 数组分成 2D 数组列表以解决 .Machine$integer.max
限制。