mlr:为什么使用并行化时超参数调优的再现性会失败?
mlr: why does reproducibility of hyperparameter tuning fail using parallelization?
我使用的代码基于 mlr
cheatsheet 中的快速入门示例。我添加了并行化并尝试多次调整参数。
问题:为什么每次调优前都设置set.seed()
,但重现性会失败(为什么结果不一样)?我的代码中缺少什么?我应该如何修改代码以实现重现性?
代码(在我的电脑上最多运行 1 分钟):
# NOTE(review): tuning runs on socket workers, but only the workers are seeded
# (clusterSetRNGStream below); the master process RNG is never reset between
# the two runs, so randomness drawn on the master (e.g. resampling splits)
# differs — which is why tr1 and tr2 are not reproducible.
library(mlr)
#> Loading required package: ParamHelpers
library(parallel)
library(parallelMap)
# Load data
data(Soybean, package = "mlbench")
# Initialize parallelization
parallelStartSocket(cpus = 2)
#> Starting parallelization in mode=socket with cpus=2.
# Prepare data, task, learner
soy = createDummyFeatures(Soybean, target = "Class")
tsk = makeClassifTask(data = soy, target = "Class")
ho = makeResampleInstance("Holdout", tsk)
tsk.train = subsetTask(tsk, ho$train.inds[[1]])
lrn = makeLearner("classif.xgboost", nrounds = 10)
#> Warning in makeParam(id = id, type = "numeric", learner.param = TRUE, lower = lower, : NA used as a default value for learner parameter missing.
#> ParamHelpers uses NA as a special value for dependent parameters.
# Prepare for hyperparameter tuning
ps = makeParamSet(makeNumericParam("eta", 0, 1))
tc = makeTuneControlMBO(budget = 1)
# Turn off excessive output
configureMlr(show.info = FALSE, show.learner.output = FALSE)
# Tune parameters
suppressMessages({
# set.seed(123456, "L'Ecuyer-CMRG")
clusterSetRNGStream(iseed = 123456)  # seeds the socket workers only, not the master
tr1 = tuneParams(lrn, tsk.train, cv2, acc, ps, tc)
# set.seed(123456, "L'Ecuyer-CMRG")
clusterSetRNGStream(iseed = 123456)
tr2 = tuneParams(lrn, tsk.train, cv2, acc, ps, tc)
})
# Stop parallelization
parallelStop()
#> Stopped parallelization. All cleaned up.
结果不一样:
# The two tuning results differ in substance (tuned eta, performance y, and
# even the CV train/test index lengths), not just in timestamps — confirming
# the runs were not reproducible.
all.equal(tr1, tr2)
#> [1] "Component \"x\": Component \"eta\": Mean relative difference: 0.1849302"
#> [2] "Component \"y\": Mean relative difference: 1.074668e-05"
#> [3] "Component \"resampling\": Component \"train.inds\": Component 1: Numeric: lengths (228, 227) differ"
#> [4] "Component \"resampling\": Component \"train.inds\": Component 2: Numeric: lengths (227, 228) differ"
#> [5] "Component \"resampling\": Component \"test.inds\": Component 1: Numeric: lengths (227, 228) differ"
#> [6] "Component \"resampling\": Component \"test.inds\": Component 2: Numeric: lengths (228, 227) differ"
#> [7] "Component \"mbo.result\": Component \"x\": Component \"eta\": Mean relative difference: 0.1849302"
#> [8] "Component \"mbo.result\": Component \"y\": Mean relative difference: 1.074668e-05"
#> [9] "Component \"mbo.result\": Component \"opt.path\": Component \"env\": Component \"exec.time\": Mean relative difference: 0.1548913"
#> [10] "Component \"mbo.result\": Component \"opt.path\": Component \"env\": Component \"path\": Component \"eta\": Mean relative difference: 0.773126"
#> [11] "Component \"mbo.result\": Component \"opt.path\": Component \"env\": Component \"path\": Component \"y\": Mean relative difference: 0.03411588"
#> [12] "Component \"mbo.result\": Component \"final.opt.state\": Component \"loop.starttime\": Mean absolute difference: 1.810968"
#> [13] "Component \"mbo.result\": Component \"final.opt.state\": Component \"opt.path\": Component \"env\": Component \"exec.time\": Mean relative difference: 0.1548913"
#> [14] "Component \"mbo.result\": Component \"final.opt.state\": Component \"opt.path\": Component \"env\": Component \"path\": Component \"eta\": Mean relative difference: 0.773126"
#> [15] "Component \"mbo.result\": Component \"final.opt.state\": Component \"opt.path\": Component \"env\": Component \"path\": Component \"y\": Mean relative difference: 0.03411588"
#> [16] "Component \"mbo.result\": Component \"final.opt.state\": Component \"opt.problem\": Component \"design\": Component \"eta\": Mean relative difference: 0.773126"
#> [17] "Component \"mbo.result\": Component \"final.opt.state\": Component \"opt.result\": Component \"mbo.result\": Component \"x\": Component \"eta\": Mean relative difference: 0.1849302"
#> [18] "Component \"mbo.result\": Component \"final.opt.state\": Component \"opt.result\": Component \"mbo.result\": Component \"y\": Mean relative difference: 1.074668e-05"
#> [19] "Component \"mbo.result\": Component \"final.opt.state\": Component \"random.seed\": Mean relative difference: 1.28965"
#> [20] "Component \"mbo.result\": Component \"final.opt.state\": Component \"time.created\": Mean absolute difference: 5.489337"
#> [21] "Component \"mbo.result\": Component \"final.opt.state\": Component \"time.last.saved\": Mean absolute difference: 5.489337"
#> [22] "Component \"mbo.result\": Component \"final.opt.state\": Component \"time.used\": Mean relative difference: 0.6841712"
我也试过了
set.seed(123456, "L'Ecuyer-CMRG")
而不是
parallel::clusterSetRNGStream(iseed = 123456)
但这也没有实现可重现性。
但是当关闭并行化时(使用 set.seed(123456, "L'Ecuyer-CMRG")),两次结果是相同的(开始/结束时间和持续时间除外)。
使用基于分叉进程(多核,在 Windows 上不可用)的并行性还是基于具有套接字通信的单独进程确实有所不同。对于多核并行性,在 parallelStart()
之前设置种子和 RNG 种类就足够了,以便在每次调用时获得相同的随机数:
# Fork-based (multicore) parallelism: per the surrounding discussion, setting
# the seed and RNG kind on the master BEFORE parallelStart() is sufficient for
# identical random numbers on each call (presumably because forked workers
# inherit the master's RNG state — not available on Windows).
library(parallelMap)
suppressMessages({
set.seed(123456, "L'Ecuyer-CMRG")  # must precede parallelStartMulticore()
parallelStartMulticore(cpus = 2)
r1 <- parallelMap(runif, rep(3, 2))
parallelStop()
set.seed(123456, "L'Ecuyer-CMRG")
parallelStartMulticore(cpus = 2)
r2 <- parallelMap(runif, rep(3, 2))
parallelStop()
})
# Both parallel draws are identical:
all.equal(r1, r2)
#> [1] TRUE
对于基于套接字的并行性,我们可以在 parallelStart() 之后调用 parallel::clusterSetRNGStream(),如该 GitHub issue 中所述:
# Socket-based parallelism: workers are separate R processes, so they must be
# seeded explicitly with parallel::clusterSetRNGStream() AFTER parallelStart()
# has created the cluster.
library(parallelMap)
suppressMessages({
parallelStartSocket(cpus = 2)
parallel::clusterSetRNGStream(iseed = 123456)  # seeds the already-running workers
r1 <- parallelMap(runif, rep(3, 2))
parallelStop()
parallelStartSocket(cpus = 2)
parallel::clusterSetRNGStream(iseed = 123456)
r2 <- parallelMap(runif, rep(3, 2))
parallelStop()
})
# Both parallel draws are identical:
all.equal(r1, r2)
#> [1] TRUE
这应该也适用于你的实际问题,虽然我没有测试过。
以下代码创建相同的可重现结果(计时除外)
# Fixed version: seed BOTH the master process (set.seed) and the socket
# workers (clusterSetRNGStream) before each tuning run. Seeding the master
# matters because some randomness (e.g. resampling splits) is drawn there.
library(mlr)
library(parallel)
library(parallelMap)
# Load data
data(Soybean, package = "mlbench")
# Initialize parallelization
parallelStartSocket(cpus = 2)
# Prepare data, task, learner
soy = createDummyFeatures(Soybean, target = "Class")
tsk = makeClassifTask(data = soy, target = "Class")
ho = makeResampleInstance("Holdout", tsk)
tsk.train = subsetTask(tsk, ho$train.inds[[1]])
lrn = makeLearner("classif.xgboost", nrounds = 10)
# Prepare for hyperparameter tuning
ps = makeParamSet(makeNumericParam("eta", 0, 1))
tc = makeTuneControlMBO(budget = 1)
# Turn off excessive output
configureMlr(show.info = FALSE, show.learner.output = FALSE)
# Tune parameters
suppressMessages({
set.seed(123456, "L'Ecuyer-CMRG")       # seed the master (resampling etc.)
clusterSetRNGStream(iseed = 123456)     # seed the socket workers
tr1 = tuneParams(lrn, tsk.train, cv2, acc, ps, tc)
set.seed(123456, "L'Ecuyer-CMRG")
clusterSetRNGStream(iseed = 123456)
tr2 = tuneParams(lrn, tsk.train, cv2, acc, ps, tc)
})
parallelStop()
我改变了什么?
我还在主进程(本地)上设置了种子。
为什么?
因为这不仅仅是给并行工作进程设置种子的问题。主进程上的种子设置同样重要,因为它会影响例如重采样(抽样是在主进程上进行的)。
我使用的代码基于 mlr
cheatsheet 中的快速入门示例。我添加了并行化并尝试多次调整参数。
问题:为什么每次调优前都设置set.seed()
,但重现性会失败(为什么结果不一样)?我的代码中缺少什么?我应该如何修改代码以实现重现性?
代码(在我的电脑上最多运行 1 分钟):
# NOTE(review): duplicated copy of the question code. Only the socket workers
# are seeded (clusterSetRNGStream); the master process RNG is never reset
# between the two runs, so randomness drawn on the master (e.g. resampling
# splits) differs — which is why tr1 and tr2 are not reproducible.
library(mlr)
#> Loading required package: ParamHelpers
library(parallel)
library(parallelMap)
# Load data
data(Soybean, package = "mlbench")
# Initialize parallelization
parallelStartSocket(cpus = 2)
#> Starting parallelization in mode=socket with cpus=2.
# Prepare data, task, learner
soy = createDummyFeatures(Soybean, target = "Class")
tsk = makeClassifTask(data = soy, target = "Class")
ho = makeResampleInstance("Holdout", tsk)
tsk.train = subsetTask(tsk, ho$train.inds[[1]])
lrn = makeLearner("classif.xgboost", nrounds = 10)
#> Warning in makeParam(id = id, type = "numeric", learner.param = TRUE, lower = lower, : NA used as a default value for learner parameter missing.
#> ParamHelpers uses NA as a special value for dependent parameters.
# Prepare for hyperparameter tuning
ps = makeParamSet(makeNumericParam("eta", 0, 1))
tc = makeTuneControlMBO(budget = 1)
# Turn off excessive output
configureMlr(show.info = FALSE, show.learner.output = FALSE)
# Tune parameters
suppressMessages({
# set.seed(123456, "L'Ecuyer-CMRG")
clusterSetRNGStream(iseed = 123456)  # seeds the socket workers only, not the master
tr1 = tuneParams(lrn, tsk.train, cv2, acc, ps, tc)
# set.seed(123456, "L'Ecuyer-CMRG")
clusterSetRNGStream(iseed = 123456)
tr2 = tuneParams(lrn, tsk.train, cv2, acc, ps, tc)
})
# Stop parallelization
parallelStop()
#> Stopped parallelization. All cleaned up.
结果不一样:
# The two tuning results differ in substance (tuned eta, performance y, and
# even the CV train/test index lengths), not just in timestamps — confirming
# the runs were not reproducible.
all.equal(tr1, tr2)
#> [1] "Component \"x\": Component \"eta\": Mean relative difference: 0.1849302"
#> [2] "Component \"y\": Mean relative difference: 1.074668e-05"
#> [3] "Component \"resampling\": Component \"train.inds\": Component 1: Numeric: lengths (228, 227) differ"
#> [4] "Component \"resampling\": Component \"train.inds\": Component 2: Numeric: lengths (227, 228) differ"
#> [5] "Component \"resampling\": Component \"test.inds\": Component 1: Numeric: lengths (227, 228) differ"
#> [6] "Component \"resampling\": Component \"test.inds\": Component 2: Numeric: lengths (228, 227) differ"
#> [7] "Component \"mbo.result\": Component \"x\": Component \"eta\": Mean relative difference: 0.1849302"
#> [8] "Component \"mbo.result\": Component \"y\": Mean relative difference: 1.074668e-05"
#> [9] "Component \"mbo.result\": Component \"opt.path\": Component \"env\": Component \"exec.time\": Mean relative difference: 0.1548913"
#> [10] "Component \"mbo.result\": Component \"opt.path\": Component \"env\": Component \"path\": Component \"eta\": Mean relative difference: 0.773126"
#> [11] "Component \"mbo.result\": Component \"opt.path\": Component \"env\": Component \"path\": Component \"y\": Mean relative difference: 0.03411588"
#> [12] "Component \"mbo.result\": Component \"final.opt.state\": Component \"loop.starttime\": Mean absolute difference: 1.810968"
#> [13] "Component \"mbo.result\": Component \"final.opt.state\": Component \"opt.path\": Component \"env\": Component \"exec.time\": Mean relative difference: 0.1548913"
#> [14] "Component \"mbo.result\": Component \"final.opt.state\": Component \"opt.path\": Component \"env\": Component \"path\": Component \"eta\": Mean relative difference: 0.773126"
#> [15] "Component \"mbo.result\": Component \"final.opt.state\": Component \"opt.path\": Component \"env\": Component \"path\": Component \"y\": Mean relative difference: 0.03411588"
#> [16] "Component \"mbo.result\": Component \"final.opt.state\": Component \"opt.problem\": Component \"design\": Component \"eta\": Mean relative difference: 0.773126"
#> [17] "Component \"mbo.result\": Component \"final.opt.state\": Component \"opt.result\": Component \"mbo.result\": Component \"x\": Component \"eta\": Mean relative difference: 0.1849302"
#> [18] "Component \"mbo.result\": Component \"final.opt.state\": Component \"opt.result\": Component \"mbo.result\": Component \"y\": Mean relative difference: 1.074668e-05"
#> [19] "Component \"mbo.result\": Component \"final.opt.state\": Component \"random.seed\": Mean relative difference: 1.28965"
#> [20] "Component \"mbo.result\": Component \"final.opt.state\": Component \"time.created\": Mean absolute difference: 5.489337"
#> [21] "Component \"mbo.result\": Component \"final.opt.state\": Component \"time.last.saved\": Mean absolute difference: 5.489337"
#> [22] "Component \"mbo.result\": Component \"final.opt.state\": Component \"time.used\": Mean relative difference: 0.6841712"
我也试过了
set.seed(123456, "L'Ecuyer-CMRG")
而不是
parallel::clusterSetRNGStream(iseed = 123456)
但这也没有实现可重现性。
但是当关闭并行化时(使用 set.seed(123456, "L'Ecuyer-CMRG")),两次结果是相同的(开始/结束时间和持续时间除外)。
使用基于分叉进程(多核,在 Windows 上不可用)的并行性还是基于具有套接字通信的单独进程确实有所不同。对于多核并行性,在 parallelStart()
之前设置种子和 RNG 种类就足够了,以便在每次调用时获得相同的随机数:
# Fork-based (multicore) parallelism: per the surrounding discussion, setting
# the seed and RNG kind on the master BEFORE parallelStart() is sufficient for
# identical random numbers on each call (presumably because forked workers
# inherit the master's RNG state — not available on Windows).
library(parallelMap)
suppressMessages({
set.seed(123456, "L'Ecuyer-CMRG")  # must precede parallelStartMulticore()
parallelStartMulticore(cpus = 2)
r1 <- parallelMap(runif, rep(3, 2))
parallelStop()
set.seed(123456, "L'Ecuyer-CMRG")
parallelStartMulticore(cpus = 2)
r2 <- parallelMap(runif, rep(3, 2))
parallelStop()
})
# Both parallel draws are identical:
all.equal(r1, r2)
#> [1] TRUE
对于基于套接字的并行性,我们可以在 parallelStart() 之后调用 parallel::clusterSetRNGStream(),如该 GitHub issue 中所述:
# Socket-based parallelism: workers are separate R processes, so they must be
# seeded explicitly with parallel::clusterSetRNGStream() AFTER parallelStart()
# has created the cluster.
library(parallelMap)
suppressMessages({
parallelStartSocket(cpus = 2)
parallel::clusterSetRNGStream(iseed = 123456)  # seeds the already-running workers
r1 <- parallelMap(runif, rep(3, 2))
parallelStop()
parallelStartSocket(cpus = 2)
parallel::clusterSetRNGStream(iseed = 123456)
r2 <- parallelMap(runif, rep(3, 2))
parallelStop()
})
# Both parallel draws are identical:
all.equal(r1, r2)
#> [1] TRUE
这应该也适用于你的实际问题,虽然我没有测试过。
以下代码创建相同的可重现结果(计时除外)
# Fixed version: seed BOTH the master process (set.seed) and the socket
# workers (clusterSetRNGStream) before each tuning run. Seeding the master
# matters because some randomness (e.g. resampling splits) is drawn there.
library(mlr)
library(parallel)
library(parallelMap)
# Load data
data(Soybean, package = "mlbench")
# Initialize parallelization
parallelStartSocket(cpus = 2)
# Prepare data, task, learner
soy = createDummyFeatures(Soybean, target = "Class")
tsk = makeClassifTask(data = soy, target = "Class")
ho = makeResampleInstance("Holdout", tsk)
tsk.train = subsetTask(tsk, ho$train.inds[[1]])
lrn = makeLearner("classif.xgboost", nrounds = 10)
# Prepare for hyperparameter tuning
ps = makeParamSet(makeNumericParam("eta", 0, 1))
tc = makeTuneControlMBO(budget = 1)
# Turn off excessive output
configureMlr(show.info = FALSE, show.learner.output = FALSE)
# Tune parameters
suppressMessages({
set.seed(123456, "L'Ecuyer-CMRG")       # seed the master (resampling etc.)
clusterSetRNGStream(iseed = 123456)     # seed the socket workers
tr1 = tuneParams(lrn, tsk.train, cv2, acc, ps, tc)
set.seed(123456, "L'Ecuyer-CMRG")
clusterSetRNGStream(iseed = 123456)
tr2 = tuneParams(lrn, tsk.train, cv2, acc, ps, tc)
})
parallelStop()
我改变了什么?我还在主进程(本地)上设置了种子。为什么?因为这不仅仅是给并行工作进程设置种子的问题。主进程上的种子设置同样重要,因为它会影响例如重采样(抽样是在主进程上进行的)。