当模型中包含因子变量时,光栅预测不会在会话之间重现
Raster predictions do not reproduce session to session when factor variable is included in model
这个问题与我一年半前发布的这个问题有关:Reproducibility of results from predict() function - raster package。但由于它没有示例,我也创建了一个包含更新信息的新问题。
在将我的预测重现为栅格时,我遇到了一个有点模糊的问题。我正在创建一个带有数值变量和单因子变量的 gbm 模型。然后,我使用栅格包使用我训练的模型预测栅格。预测因会话而异,但会在单个 R 会话中重现。如果我删除因子变量,结果将在会话之间重现。此外,在我下面的示例中,如果我在训练数据中的因子水平多于栅格变量版本中的因子水平,我可以让它在会话之间重现。是什么原因造成的?如何在包含因子变量的同时将我的结果重现到会话中?
# This code will not reproduce session to session, but does if I leave many many factor levels in newwine with the
# commented out code
library(breakDown)
library(gbm)
library(dplyr)
library(raster)
# leave in many levels and code will reproduce session to session
#newwine <- wine[1:500,c(1:3,6)]
# specify only levels which are in the below raster and code will not reproduce session to session
newwine <- wine[,c(1:3,6)] %>%
filter(free.sulfur.dioxide == 3 | free.sulfur.dioxide == 10 | free.sulfur.dioxide == 15 |
free.sulfur.dioxide == 37 | free.sulfur.dioxide == 76)
head(newwine)
# make free.sulfur.dioxide as factor variable
newwine$free.sulfur.dioxide <- as.factor(newwine$free.sulfur.dioxide)
levels(newwine$free.sulfur.dioxide)
set.seed(123)
model <- gbm(fixed.acidity ~ ., data = newwine,
distribution = "gaussian",
bag.fraction = 0.50,
n.trees = 1000,
interaction.depth = 16,
shrinkage = 0.016,
n.minobsinnode = 10, verbose = FALSE)
summary(model)
plot(model, i.var = 3, n.trees = 1000)
# make some rasters for the predictor variables
free.sulfur.dioxide <- c(rep(3,times=10), rep(10, times = 10),
rep(15, times = 10), rep(37, times = 10),
rep(76, times = 10))
free.sulfur.dioxide.r <- raster(ext = extent(-10, 5, -10, 5), nrows = 5, ncols = 10)
values(free.sulfur.dioxide.r) <- free.sulfur.dioxide
set.seed(123)
volatile.acidity <- newwine %>%
dplyr::select(volatile.acidity) %>%
sample_n(50)
volatile.acidity <- as.vector(volatile.acidity)[,1]
volatile.acidity.r <- raster(ext = extent(-10, 5, -10, 5), nrows = 5, ncols = 10)
values(volatile.acidity.r) <- volatile.acidity
set.seed(123)
citric.acid <- newwine %>%
dplyr::select(citric.acid) %>%
sample_n(50)
citric.acid <- as.vector(citric.acid)[,1]
citric.acid.r <- raster(ext = extent(-10, 5, -10, 5), nrows = 5, ncols = 10)
values(citric.acid.r) <- citric.acid
# create a raster stack
r <- stack(free.sulfur.dioxide.r, volatile.acidity.r, citric.acid.r)
names(r) <- c("free.sulfur.dioxide", "volatile.acidity", "citric.acid")
###########################################################################################################################
# predict to a raster with raster predict
pred <- predict(r, model, n.trees = model$n.trees, format="GTiff")
writeRaster(pred, "prediction1.tif", overwrite = TRUE)
###########################################################################################################################
# close the session and reopen, run until line 61, then run below to make a new prediction, called prediction 2
pred <- predict(r, model, n.trees = model$n.trees, format="GTiff")
writeRaster(pred, "prediction2.tif", overwrite = TRUE)
# read in the previous prediction
prediction1 <- raster("prediction1.tif")
prediction2 <- raster("prediction2.tif")
# compare rasters built across sessions
compareRaster(prediction1, prediction2, values = TRUE)
summary(prediction1-prediction2)
# compare rasters built within same session
pred2 <- predict(r, model, n.trees = model$n.trees, format="GTiff")
compareRaster(pred, pred2, values = TRUE)
但是,下面的代码不使用 factor 变量,并且会在会话之间重现会话。
### Same exercise but without setting the free sulfur dioxide to factor
## this code will reproduce session to session
library(breakDown)
library(gbm)
library(dplyr)
library(raster)
newwine <- wine[1:500,c(1:3)]
head(newwine)
set.seed(123)
model <- gbm(fixed.acidity ~ ., data = newwine,
distribution = "gaussian",
bag.fraction = 0.50,
n.trees = 1000,
interaction.depth = 16,
shrinkage = 0.016,
n.minobsinnode = 10, verbose = FALSE)
summary(model)
set.seed(123)
volatile.acidity <- newwine %>%
dplyr::select(volatile.acidity) %>%
sample_n(50)
volatile.acidity <- as.vector(volatile.acidity)[,1]
volatile.acidity.r <- raster(ext = extent(-10, 5, -10, 5), nrows = 5, ncols = 10)
values(volatile.acidity.r) <- volatile.acidity
set.seed(123)
citric.acid <- newwine %>%
dplyr::select(citric.acid) %>%
sample_n(50)
citric.acid <- as.vector(citric.acid)[,1]
citric.acid.r <- raster(ext = extent(-10, 5, -10, 5), nrows = 5, ncols = 10)
values(citric.acid.r) <- citric.acid
# create a raster stack
r <- stack( volatile.acidity.r, citric.acid.r)
names(r) <- c( "volatile.acidity", "citric.acid")
#######################################################################################################################
# predict to a raster with raster predict
pred <- predict(r, model, n.trees = model$n.trees, format="GTiff")
writeRaster(pred, "prediction1.tif", overwrite = TRUE)
#######################################################################################################################
# close the session and reopen to make a new prediction, called prediction 2
pred <- predict(r, model, n.trees = model$n.trees, format="GTiff")
writeRaster(pred, "prediction2.tif", overwrite = TRUE)
# read in the previous prediction
prediction1 <- raster("prediction1.tif")
prediction2 <- raster("prediction2.tif")
# compare rasters built across sessions
compareRaster(prediction1, prediction2, values = TRUE)
summary(prediction1-prediction2)
# compare rasters built within same session
pred2 <- predict(r, model, n.trees = model$n.trees, format="GTiff")
compareRaster(pred, pred2, values = TRUE)
summary(pred-pred2)
这不是解决方案,而是尝试解决问题。在我看来,这与raster
无关。
当我这样做时:
v <- values(r)
pred <- predict(model, data.frame(v), n.trees = model$n.trees)
rpred <- predict(r, model, n.trees = model$n.trees)
退出,保存会话,开始新会话并执行:
library(gbm)
library(raster)
pred2 <- predict(model, data.frame(v), n.trees = model$n.trees )
rpred2 <- predict(r, model, n.trees = model$n.trees)
我看到 pred
和 pred2
的值不太一样。 (参见plot(pred, pred2)
。但是,pred2
和rpred2
的值是相同的:plot(values(rpred2), pred2)
。
或者,当我保存 pred
(saveRDS(pred, 'pred.rds')
,然后将其加载到新会话 pred1 <- readRDS(pred.rds)
,结果并不完全相同。
它向我暗示 gbm
某处正在进行一些不受 set.seed
控制的随机化。
看来这个问题不是由于 raster
软件包引起的,而是由于 gbm
软件包引起的。经过一番挖掘,我发现 gbm
包在 2017 年 3 月被孤立了,并且有一个新的 gbm 包,在 github 上称为 gbm3
(在 CRAN 上尚不可用)https://github.com/gbm-developers/gbm3 .当您预测栅格时,您可以使用您的模型类型要求的任何预测方法(例如 predict.gbm()
用于 gbm
和 predict.GBMFit()
用于 gbm3
。似乎 predict.gbm()
只是没有正确处理来自模型中栅格的因素。它可能是也可能不是错误,但无论哪种情况,这个包都不再被维护。gbm3
可以解决问题并且可以重现。
# This code will reproduce session to session for the gbm3 model, but not for old gbm model
library(breakDown)
# install gbm3 from github
library(gbm3)
library(dplyr)
library(raster)
# specify only levels which are in the below raster
newwine <- wine[,c(1:3,6)] %>%
filter(free.sulfur.dioxide == 3 | free.sulfur.dioxide == 10 | free.sulfur.dioxide == 15 |
free.sulfur.dioxide == 37 | free.sulfur.dioxide == 76)
head(newwine)
# make free.sulfur.dioxide as factor variable
newwine$free.sulfur.dioxide <- as.factor(newwine$free.sulfur.dioxide)
levels(newwine$free.sulfur.dioxide)
#set.seed(123)
# model <- gbm(fixed.acidity ~ ., data = newwine, #gbm.fit(x = newwine[,2:4], y = newwine[,1],
# distribution = "gaussian",
# bag.fraction = 0.50,
# n.trees = 1000,
# interaction.depth = 16,
# shrinkage = 0.016,
# n.minobsinnode = 10, verbose = FALSE)
set.seed(123)
model <- gbmt(fixed.acidity ~ ., data = newwine, distribution = gbm_dist("Gaussian"))
summary(model)
plot(model, var_index = 3, num_trees = 1000)
# make some rasters for the predictor variables
free.sulfur.dioxide <- c(rep(3,times=10), rep(10, times = 10),
rep(15, times = 10), rep(37, times = 10),
rep(76, times = 10))
free.sulfur.dioxide.r <- raster(ext = extent(-10, 5, -10, 5), nrows = 5, ncols = 10)
values(free.sulfur.dioxide.r) <- free.sulfur.dioxide
set.seed(123)
volatile.acidity <- newwine %>%
dplyr::select(volatile.acidity) %>%
sample_n(50)
volatile.acidity <- as.vector(volatile.acidity)[,1]
volatile.acidity.r <- raster(ext = extent(-10, 5, -10, 5), nrows = 5, ncols = 10)
values(volatile.acidity.r) <- volatile.acidity
set.seed(123)
citric.acid <- newwine %>%
dplyr::select(citric.acid) %>%
sample_n(50)
citric.acid <- as.vector(citric.acid)[,1]
citric.acid.r <- raster(ext = extent(-10, 5, -10, 5), nrows = 5, ncols = 10)
values(citric.acid.r) <- citric.acid
# create a raster stack
r <- stack(free.sulfur.dioxide.r, volatile.acidity.r, citric.acid.r)
names(r) <- c("free.sulfur.dioxide", "volatile.acidity", "citric.acid")
###########################################################################################################################
# predict to a raster with raster predict
pred <- raster::predict(r, model, n.trees = 2000, format="GTiff")
writeRaster(pred, "prediction1.tif", overwrite = TRUE)
# predict to a vector with predict
v <- values(r)
v <- data.frame(v)
v$free.sulfur.dioxide <- as.factor(v$free.sulfur.dioxide)
vpred <- predict(model, v, n.trees = 2000)
write.table(vpred, "vector_predict.txt", row.names = FALSE, col.names = TRUE)
###########################################################################################################################
# close the session and reopen, run until #### line, then run below to make a new prediction, called prediction 2
pred <- raster::predict(r, model, n.trees = 2000, format="GTiff")
writeRaster(pred, "prediction2.tif", overwrite = TRUE)
# predict to a vector with predict
v <- values(r)
v <- data.frame(v)
v$free.sulfur.dioxide <- as.factor(v$free.sulfur.dioxide)
vpred <- predict(model, v, n.trees = 2000)
write.table(vpred, "vector_predict2.txt", row.names = FALSE, col.names = TRUE)
# read in the previous prediction
prediction1 <- raster("prediction1.tif")
prediction2 <- raster("prediction2.tif")
# compare rasters built across sessions
compareRaster(prediction1, prediction2, values = TRUE)
summary(prediction1-prediction2)
# compare rasters built within same session
pred2 <- raster::predict(r, model, n.trees = 2000, format="GTiff", factors = f)
compareRaster(pred, pred2, values = TRUE)
# compare the vector predictions
p1 <- read.delim("vector_predict.txt")
p2 <- read.delim("vector_predict2.txt")
plot(p1$x,p2$x)
summary(p1$x - p2$x)
这个问题与我一年半前发布的这个问题有关:Reproducibility of results from predict() function - raster package。但由于它没有示例,我也创建了一个包含更新信息的新问题。
在将我的预测重现为栅格时,我遇到了一个有点模糊的问题。我正在创建一个带有数值变量和单因子变量的 gbm 模型。然后,我使用栅格包使用我训练的模型预测栅格。预测因会话而异,但会在单个 R 会话中重现。如果我删除因子变量,结果将在会话之间重现。此外,在我下面的示例中,如果我在训练数据中的因子水平多于栅格变量版本中的因子水平,我可以让它在会话之间重现。是什么原因造成的?如何在包含因子变量的同时将我的结果重现到会话中?
# This code will not reproduce session to session, but does if I leave many many factor levels in newwine with the
# commented out code
library(breakDown)
library(gbm)
library(dplyr)
library(raster)
# leave in many levels and code will reproduce session to session
#newwine <- wine[1:500,c(1:3,6)]
# specify only levels which are in the below raster and code will not reproduce session to session
newwine <- wine[,c(1:3,6)] %>%
filter(free.sulfur.dioxide == 3 | free.sulfur.dioxide == 10 | free.sulfur.dioxide == 15 |
free.sulfur.dioxide == 37 | free.sulfur.dioxide == 76)
head(newwine)
# make free.sulfur.dioxide as factor variable
newwine$free.sulfur.dioxide <- as.factor(newwine$free.sulfur.dioxide)
levels(newwine$free.sulfur.dioxide)
set.seed(123)
model <- gbm(fixed.acidity ~ ., data = newwine,
distribution = "gaussian",
bag.fraction = 0.50,
n.trees = 1000,
interaction.depth = 16,
shrinkage = 0.016,
n.minobsinnode = 10, verbose = FALSE)
summary(model)
plot(model, i.var = 3, n.trees = 1000)
# make some rasters for the predictor variables
free.sulfur.dioxide <- c(rep(3,times=10), rep(10, times = 10),
rep(15, times = 10), rep(37, times = 10),
rep(76, times = 10))
free.sulfur.dioxide.r <- raster(ext = extent(-10, 5, -10, 5), nrows = 5, ncols = 10)
values(free.sulfur.dioxide.r) <- free.sulfur.dioxide
set.seed(123)
volatile.acidity <- newwine %>%
dplyr::select(volatile.acidity) %>%
sample_n(50)
volatile.acidity <- as.vector(volatile.acidity)[,1]
volatile.acidity.r <- raster(ext = extent(-10, 5, -10, 5), nrows = 5, ncols = 10)
values(volatile.acidity.r) <- volatile.acidity
set.seed(123)
citric.acid <- newwine %>%
dplyr::select(citric.acid) %>%
sample_n(50)
citric.acid <- as.vector(citric.acid)[,1]
citric.acid.r <- raster(ext = extent(-10, 5, -10, 5), nrows = 5, ncols = 10)
values(citric.acid.r) <- citric.acid
# create a raster stack
r <- stack(free.sulfur.dioxide.r, volatile.acidity.r, citric.acid.r)
names(r) <- c("free.sulfur.dioxide", "volatile.acidity", "citric.acid")
###########################################################################################################################
# predict to a raster with raster predict
pred <- predict(r, model, n.trees = model$n.trees, format="GTiff")
writeRaster(pred, "prediction1.tif", overwrite = TRUE)
###########################################################################################################################
# close the session and reopen, run until line 61, then run below to make a new prediction, called prediction 2
pred <- predict(r, model, n.trees = model$n.trees, format="GTiff")
writeRaster(pred, "prediction2.tif", overwrite = TRUE)
# read in the previous prediction
prediction1 <- raster("prediction1.tif")
prediction2 <- raster("prediction2.tif")
# compare rasters built across sessions
compareRaster(prediction1, prediction2, values = TRUE)
summary(prediction1-prediction2)
# compare rasters built within same session
pred2 <- predict(r, model, n.trees = model$n.trees, format="GTiff")
compareRaster(pred, pred2, values = TRUE)
但是,下面的代码不使用 factor 变量,并且会在会话之间重现会话。
### Same exercise but without setting the free sulfur dioxide to factor
## this code will reproduce session to session
library(breakDown)
library(gbm)
library(dplyr)
library(raster)
newwine <- wine[1:500,c(1:3)]
head(newwine)
set.seed(123)
model <- gbm(fixed.acidity ~ ., data = newwine,
distribution = "gaussian",
bag.fraction = 0.50,
n.trees = 1000,
interaction.depth = 16,
shrinkage = 0.016,
n.minobsinnode = 10, verbose = FALSE)
summary(model)
set.seed(123)
volatile.acidity <- newwine %>%
dplyr::select(volatile.acidity) %>%
sample_n(50)
volatile.acidity <- as.vector(volatile.acidity)[,1]
volatile.acidity.r <- raster(ext = extent(-10, 5, -10, 5), nrows = 5, ncols = 10)
values(volatile.acidity.r) <- volatile.acidity
set.seed(123)
citric.acid <- newwine %>%
dplyr::select(citric.acid) %>%
sample_n(50)
citric.acid <- as.vector(citric.acid)[,1]
citric.acid.r <- raster(ext = extent(-10, 5, -10, 5), nrows = 5, ncols = 10)
values(citric.acid.r) <- citric.acid
# create a raster stack
r <- stack( volatile.acidity.r, citric.acid.r)
names(r) <- c( "volatile.acidity", "citric.acid")
#######################################################################################################################
# predict to a raster with raster predict
pred <- predict(r, model, n.trees = model$n.trees, format="GTiff")
writeRaster(pred, "prediction1.tif", overwrite = TRUE)
#######################################################################################################################
# close the session and reopen to make a new prediction, called prediction 2
pred <- predict(r, model, n.trees = model$n.trees, format="GTiff")
writeRaster(pred, "prediction2.tif", overwrite = TRUE)
# read in the previous prediction
prediction1 <- raster("prediction1.tif")
prediction2 <- raster("prediction2.tif")
# compare rasters built across sessions
compareRaster(prediction1, prediction2, values = TRUE)
summary(prediction1-prediction2)
# compare rasters built within same session
pred2 <- predict(r, model, n.trees = model$n.trees, format="GTiff")
compareRaster(pred, pred2, values = TRUE)
summary(pred-pred2)
这不是解决方案,而是尝试解决问题。在我看来,这与raster
无关。
当我这样做时:
v <- values(r)
pred <- predict(model, data.frame(v), n.trees = model$n.trees)
rpred <- predict(r, model, n.trees = model$n.trees)
退出,保存会话,开始新会话并执行:
library(gbm)
library(raster)
pred2 <- predict(model, data.frame(v), n.trees = model$n.trees )
rpred2 <- predict(r, model, n.trees = model$n.trees)
我看到 pred
和 pred2
的值不太一样。 (参见plot(pred, pred2)
。但是,pred2
和rpred2
的值是相同的:plot(values(rpred2), pred2)
。
或者,当我保存 pred
(saveRDS(pred, 'pred.rds')
,然后将其加载到新会话 pred1 <- readRDS(pred.rds)
,结果并不完全相同。
它向我暗示 gbm
某处正在进行一些不受 set.seed
控制的随机化。
看来这个问题不是由于 raster
软件包引起的,而是由于 gbm
软件包引起的。经过一番挖掘,我发现 gbm
包在 2017 年 3 月被孤立了,并且有一个新的 gbm 包,在 github 上称为 gbm3
(在 CRAN 上尚不可用)https://github.com/gbm-developers/gbm3 .当您预测栅格时,您可以使用您的模型类型要求的任何预测方法(例如 predict.gbm()
用于 gbm
和 predict.GBMFit()
用于 gbm3
。似乎 predict.gbm()
只是没有正确处理来自模型中栅格的因素。它可能是也可能不是错误,但无论哪种情况,这个包都不再被维护。gbm3
可以解决问题并且可以重现。
# This code will reproduce session to session for the gbm3 model, but not for old gbm model
library(breakDown)
# install gbm3 from github
library(gbm3)
library(dplyr)
library(raster)
# specify only levels which are in the below raster
newwine <- wine[,c(1:3,6)] %>%
filter(free.sulfur.dioxide == 3 | free.sulfur.dioxide == 10 | free.sulfur.dioxide == 15 |
free.sulfur.dioxide == 37 | free.sulfur.dioxide == 76)
head(newwine)
# make free.sulfur.dioxide as factor variable
newwine$free.sulfur.dioxide <- as.factor(newwine$free.sulfur.dioxide)
levels(newwine$free.sulfur.dioxide)
#set.seed(123)
# model <- gbm(fixed.acidity ~ ., data = newwine, #gbm.fit(x = newwine[,2:4], y = newwine[,1],
# distribution = "gaussian",
# bag.fraction = 0.50,
# n.trees = 1000,
# interaction.depth = 16,
# shrinkage = 0.016,
# n.minobsinnode = 10, verbose = FALSE)
set.seed(123)
model <- gbmt(fixed.acidity ~ ., data = newwine, distribution = gbm_dist("Gaussian"))
summary(model)
plot(model, var_index = 3, num_trees = 1000)
# make some rasters for the predictor variables
free.sulfur.dioxide <- c(rep(3,times=10), rep(10, times = 10),
rep(15, times = 10), rep(37, times = 10),
rep(76, times = 10))
free.sulfur.dioxide.r <- raster(ext = extent(-10, 5, -10, 5), nrows = 5, ncols = 10)
values(free.sulfur.dioxide.r) <- free.sulfur.dioxide
set.seed(123)
volatile.acidity <- newwine %>%
dplyr::select(volatile.acidity) %>%
sample_n(50)
volatile.acidity <- as.vector(volatile.acidity)[,1]
volatile.acidity.r <- raster(ext = extent(-10, 5, -10, 5), nrows = 5, ncols = 10)
values(volatile.acidity.r) <- volatile.acidity
set.seed(123)
citric.acid <- newwine %>%
dplyr::select(citric.acid) %>%
sample_n(50)
citric.acid <- as.vector(citric.acid)[,1]
citric.acid.r <- raster(ext = extent(-10, 5, -10, 5), nrows = 5, ncols = 10)
values(citric.acid.r) <- citric.acid
# create a raster stack
r <- stack(free.sulfur.dioxide.r, volatile.acidity.r, citric.acid.r)
names(r) <- c("free.sulfur.dioxide", "volatile.acidity", "citric.acid")
###########################################################################################################################
# predict to a raster with raster predict
pred <- raster::predict(r, model, n.trees = 2000, format="GTiff")
writeRaster(pred, "prediction1.tif", overwrite = TRUE)
# predict to a vector with predict
v <- values(r)
v <- data.frame(v)
v$free.sulfur.dioxide <- as.factor(v$free.sulfur.dioxide)
vpred <- predict(model, v, n.trees = 2000)
write.table(vpred, "vector_predict.txt", row.names = FALSE, col.names = TRUE)
###########################################################################################################################
# close the session and reopen, run until #### line, then run below to make a new prediction, called prediction 2
pred <- raster::predict(r, model, n.trees = 2000, format="GTiff")
writeRaster(pred, "prediction2.tif", overwrite = TRUE)
# predict to a vector with predict
v <- values(r)
v <- data.frame(v)
v$free.sulfur.dioxide <- as.factor(v$free.sulfur.dioxide)
vpred <- predict(model, v, n.trees = 2000)
write.table(vpred, "vector_predict2.txt", row.names = FALSE, col.names = TRUE)
# read in the previous prediction
prediction1 <- raster("prediction1.tif")
prediction2 <- raster("prediction2.tif")
# compare rasters built across sessions
compareRaster(prediction1, prediction2, values = TRUE)
summary(prediction1-prediction2)
# compare rasters built within same session
pred2 <- raster::predict(r, model, n.trees = 2000, format="GTiff", factors = f)
compareRaster(pred, pred2, values = TRUE)
# compare the vector predictions
p1 <- read.delim("vector_predict.txt")
p2 <- read.delim("vector_predict2.txt")
plot(p1$x,p2$x)
summary(p1$x - p2$x)