如何将一个非 R 对象与一个 R 对象一起"序列化"(serialize)

how to "serialize" a non-R object together with an R object

R 中的一些对象实际上是指向更底层结构(不确定这个术语是否准确)的指针,需要专门的函数才能保存到磁盘。例如,仅用 saveRDS 不足以保存 lightgbm 的提升树模型:

## Train a one-round, binary-objective booster on the agaricus.train data
## that ships with lightgbm
library(lightgbm)
data(agaricus.train, package = "lightgbm")
train <- agaricus.train
bst <- lightgbm(
  data = train$data,
  label = train$label,
  nrounds = 1,
  objective = "binary"
)

## In practice bst is just one piece of a larger set of results
results <- list(bst = bst, metadata = "other stuff")

## Ideally this save/load round trip would work, but the final predict()
## call crashes R
# saveRDS(results, file = 'so_post_temp')
# rm(results)
# rm(bst)
# lgb.unloader(wipe = TRUE)
# results <- readRDS('so_post_temp')
# predict(results$bst, train$data)

标准的解决办法并不糟糕,但足以让我觉得麻烦。它需要使用 lightgbm 专用的保存函数,并且要为我想保存的每一个分析另外创建一个单独的"配套"('companion')文件:

# Keep only the path to the model file inside the R-side results; the booster
# itself is written to that path separately by lgb.save().
results <- list(lgbpath = "bst.lightgbm", metadata = "other stuff")
saveRDS(results, file = "so_post_temp")
lgb.save(bst, results$lgbpath)

# destruct: drop both objects and unload lightgbm
rm(results, bst)
lgb.unloader(wipe = TRUE)

# reconstruct: read the results back, then load the booster from the stored
# path
results <- readRDS("so_post_temp")
bst <- lgb.load(results$lgbpath)
predict(bst, train$data)

有没有办法把这个流程整理得更简洁,以某种方式把 R 对象和非 R 对象打包进同一个文件?例如:

# Pseudo-code sketch of the desired workflow; the bracketed placeholder on the
# next line is not valid R.
fake_pointer_to_disk = [points to some kind of R object instead]
# Have lgb.save() write to that R object rather than to a path on disk
fake_file_object = lgb.save(bst, file = fake_pointer_to_disk)
# ...so the saved model can sit inside the ordinary results list
results = list(bst = fake_file_object, metadata = 'other stuff')
# later loaded as
bst = lgb.load(results$bst)

我认为 readBin 应该足够了:

tf <- tempfile()

# With lightgbm available, write the real model to the temp file
# (the argument is spelled out as `filename` rather than relying on partial
# matching of `file`):
lgb.save(bst, filename = tf)
# since I don't have lightgbm loaded, this is my fake model/save
# (a stand-in for the line above; if it is run after a real lgb.save() it
# overwrites the model file with the fake data, so run only one of the two)
bst <- 100:150 # my fake data
# writeBin()'s second argument is `con`; it has no `file` argument, so the
# destination path is passed positionally
writeBin(bst, tf) # poor man's lgb.save :-)

现在将其作为 blob 读入:

# Read the whole temp file back as a raw vector (n is the file's size in
# bytes), then delete the temp file.
rawbst <- readBin(con = tf, what = "raw", n = file.size(tf))
file.remove(tf)

并按照您想要的方式保存:

# Bundle the raw model bytes with the other results in a single RDS file.
saveRDS(
  object = list(bst = rawbst, metadata = "other stuff"),
  file = "so_post_temp"
)

当您准备好还原(rehydrate)结果和模型时:

# Read the bundle back, write the stored raw bytes to a scratch file, and let
# lgb.load() rebuild the booster from that file before deleting it.
tf2 <- tempfile()
results <- readRDS("so_post_temp")
writeBin(results[["bst"]], con = tf2)
bst <- lgb.load(tf2)
file.remove(tf2)

(注意:测试不充分:它在假数据上可以正常工作,但我没有用真正的 bst 这类对象试过。)

这是@r2evans 解决方案的一个实现,它有效。

library(R6)
library(lightgbm)

# Fit a one-round, binary-objective booster on lightgbm's bundled
# agaricus.train data
data(agaricus.train, package = "lightgbm")
train <- agaricus.train
bst <- lightgbm(
  data = train$data,
  label = train$label,
  nrounds = 1,
  objective = "binary"
)

# R6 wrapper that carries a lightgbm booster through a saveRDS()/readRDS()
# round trip by storing a raw-byte copy of the model file next to the live
# booster.
#
# Fields:
#   bst         the booster given to `$new()`; replaced by `$refresh()`.
#   bst_binary  raw vector with the bytes of the file written by lgb.save();
#               NULL until `$save()` has been called.
#
# Methods:
#   $new(bst)    store `bst` in the `bst` field.
#   $save(file)  snapshot the booster into `bst_binary`, then saveRDS() the
#                whole object to `file`. Returns TRUE invisibly.
#   $refresh()   rebuild `bst` from `bst_binary` via lgb.load(). Errors if
#                `$save()` was never called. Returns TRUE invisibly.
ClassWithBst = R6::R6Class(
    classname = "ClassFoo",
    public = list(
        bst = NULL,
        bst_binary = NULL,
        initialize = function(bst){
            self$bst <- bst
        },
        save = function(file){
            tf <- tempfile()
            # Remove the scratch file even if one of the steps below errors.
            on.exit(unlink(tf), add = TRUE)
            lgb.save(self$bst, filename = tf)
            self$bst_binary <- readBin(tf, raw(), n = file.size(tf))
            # NOTE(review): the `bst` field is serialized as well; the raw
            # copy in `bst_binary` is what `$refresh()` restores from.
            saveRDS(self, file)
            invisible(TRUE)
        },
        refresh = function(){
            # Without this guard writeBin() would fail on NULL with an
            # unhelpful message.
            if (is.null(self$bst_binary)) {
                stop("no serialized booster stored; call $save() before $refresh()",
                     call. = FALSE)
            }
            tf <- tempfile()
            # Remove the scratch file even if lgb.load() errors.
            on.exit(unlink(tf), add = TRUE)
            writeBin(self$bst_binary, tf)
            self$bst <- lgb.load(tf)
            invisible(TRUE)
        }
    )
)

# Wrap the booster, write the wrapper to disk, then drop both objects and
# unload lightgbm.
cwb <- ClassWithBst$new(bst = bst)
cwb$save(file = "test_class_with_bst")
rm(cwb, bst)
lgb.unloader(wipe = TRUE)

# Reload the wrapper and rebuild its booster from the stored bytes before
# predicting.
cwb <- readRDS("test_class_with_bst")
cwb$refresh()
predict(cwb$bst, train$data)