如何save/load张量来加速训练?
How to save/load tensors to speed up training?
我想知道是否可以在 tensorflow.js 中保存和加载张量以避免为每个批次重新计算它们?问题是我的 gpu 几乎没有使用,因为它必须等待 cpu 在训练之前将我的数组转换为张量。
我的工作流程现在是这样的:
- 正在加载数据集(从硬盘读取到阵列)(1-2 秒)
2.cpu 将数组转换为张量(需要很长时间)
3.gpu 训练(用时不超过 1 秒)
卸载/整理(5秒,也有点长)
重复
编辑:
这是一些有问题的代码(意味着长时间的繁重计算)和没有问题的行注释:
async function learn_on(ep){
for (var learn_ep = ep+1; learn_ep <= 1200; learn_ep++) {
var batch_start = 0;
var mini_batch_in = [];
var mini_batch_out = [];
var shuffle_arr=[];
for(var i=0;i<in_tensor_sum.length;i++){
shuffle_arr.push(i); // needs no time
}
shuffle_arr=F_shuffle_array(shuffle_arr); // needs no time
// in_tensor_sum / out_tensor_sum is just an 2 dimensional array = data_set number , data points
for (var batch_num = batch_start; batch_num < in_tensor_sum.length; batch_num++) {
mini_batch_in.push(in_tensor_sum[shuffle_arr[batch_num]]); // very fast also
mini_batch_out.push(out_tensor_sum[shuffle_arr[batch_num]]);// very fast also
if (batch_num + 1 == batch_start + 250 || batch_num == in_tensor_sum.length - 1) {
//possible to import/export xs/ys?????
var xs = tf.tensor(mini_batch_in); //here CPU heavy computation!!!!!!!!!!!!!!!! TAKES LONG TIME 9600 input units here
var ys = tf.tensor(mini_batch_out); // and here CPU heavy computation!!!!!!!! TAKES not so Long time, but this is because of small output size just 400
// GPU ACCELARATION starts here Super fast only one second! This rocks!!!
await model.fit(xs, ys, {
epochs: 1, shuffle: true,
callbacks: {
onEpochEnd: async (epoch, log) => {
console.log(`${batch_num}:|Epoch ${learn_ep}: | set: ${batch_num / in_tensor_sum.length} | loss = ${log.loss}`);
},
onTrainEnd: async () => {
}
}
});
//avoid memory leaks START (ALSO TAKES a little time!!!!)
await tf.tidy(() => {
tf.tensor([xs, ys]);
console.log('numTensors (inside tidy): ' + tf.memory().numTensors);
});
console.log('numTensors (outside tidy): ' + tf.memory().numTensors);
xs.dispose();
ys.dispose();
console.log('numTensors (after dispose): ' + tf.memory().numTensors);
batch_start = batch_num + 1;
mini_batch_in = [];
mini_batch_out = [];
//avoid memory leaks END
}
}
}
}
编辑 2:
我现在尝试使用 'tfjs-npy' 来保存和加载 tensor.But 我得到一个错误:
.
.
.
var xs = await tf.tensor(mini_batch_in);
var ys = await tf.tensor(mini_batch_out);
var fs = require('fs');
var tf_parser= require ('tfjs-npy');
var writeTO=await tf_parser.serialize(ys);
await fs.writeFileSync('/home/test/NetBeansProjects/ispeed_tensload/save_tensors/test.js',new Buffer(writeTO));
var tensor_data =await fs.readFileSync("/home/test/NetBeansProjects/ispeed_tensload/save_tensors/test.js");
var my_arrayBuffer = new Uint8Array(tensor_data).buffer;
var ys2=await tf_parser.parse(my_arrayBuffer);
await model.fit(xs, ys2, {....
错误:
(node:26576) UnhandledPromiseRejectionWarning: TypeError: Cannot read property 'values' of undefined
at NodeJSKernelBackend.getInputTensorIds (/home/test/NetBeansProjects/ispeed_tensload/node_modules/@tensorflow/tfjs-node/dist/nodejs_kernel_backend.js:142:26)
at NodeJSKernelBackend.executeSingleOutput (/home/test/NetBeansProjects/ispeed_tensload/node_modules/@tensorflow/tfjs-node/dist/nodejs_kernel_backend.js:186:73)
at NodeJSKernelBackend.gather (/home/test/NetBeansProjects/ispeed_tensload/node_modules/@tensorflow/tfjs-node/dist/nodejs_kernel_backend.js:965:21)
at environment_1.ENV.engine.runKernel.$x (/home/test/NetBeansProjects/ispeed_tensload/node_modules/@tensorflow/tfjs-core/dist/ops/segment_ops.js:56:84)
at /home/test/NetBeansProjects/ispeed_tensload/node_modules/@tensorflow/tfjs-core/dist/engine.js:129:26
at Engine.scopedRun (/home/test/NetBeansProjects/ispeed_tensload/node_modules/@tensorflow/tfjs-core/dist/engine.js:101:23)
at Engine.runKernel (/home/test/NetBeansProjects/ispeed_tensload/node_modules/@tensorflow/tfjs-core/dist/engine.js:127:14)
at gather_ (/home/test/NetBeansProjects/ispeed_tensload/node_modules/@tensorflow/tfjs-core/dist/ops/segment_ops.js:56:38)
at Object.gather (/home/test/NetBeansProjects/ispeed_tensload/node_modules/@tensorflow/tfjs-core/dist/ops/operation.js:23:29)
at /home/test/NetBeansProjects/ispeed_tensload/node_modules/@tensorflow/tfjs-layers/dist/backend/tfjs_backend.js:275:20
我想 'tfjs-npy' 生成的格式不匹配。但我不知道。另一个可接受的解决方案是在 GPU 训练时让张量创建过程 运行 在多个线程上(c++ 后端优化),以将空闲时间减少到最低限度。但我不知道这是否可能。创建进程现在 运行 仅在 node.js 进程中是单线程的,性能非常差。
nodejs 使用的内存可以通过标志 --max-old-space-size
增加,如 所示。 nodejs
和 tensorflow.js
都没有问题。唯一的问题可能是您的内存容量。这可能是来回读取数据的唯一原因。
话虽如此,不清楚这里是做什么的:
await tf.tidy(() => {
tf.tensor([xs, ys]);
console.log('numTensors (inside tidy): ' + tf.memory().numTensors);
});
没用因为:
创建张量并销毁。
xs
和 ys
不是类似数组的 tf.tensor([xs, ys])
将创建一个包含 2 个 NaN 值的张量。对代码的性能没有任何影响。
张量xs
和ys
分别用xs.dispose()
和ys.dispose()
有效处理掉
我想知道是否可以在 tensorflow.js 中保存和加载张量以避免为每个批次重新计算它们?问题是我的 gpu 几乎没有使用,因为它必须等待 cpu 在训练之前将我的数组转换为张量。
我的工作流程现在是这样的:
- 正在加载数据集(从硬盘读取到阵列)(1-2 秒)
2.cpu 将数组转换为张量(需要很长时间)
3.gpu 训练(用时不超过 1 秒)
卸载/整理(5秒,也有点长)
重复
编辑: 这是一些有问题的代码(意味着长时间的繁重计算)和没有问题的行注释:
async function learn_on(ep){
for (var learn_ep = ep+1; learn_ep <= 1200; learn_ep++) {
var batch_start = 0;
var mini_batch_in = [];
var mini_batch_out = [];
var shuffle_arr=[];
for(var i=0;i<in_tensor_sum.length;i++){
shuffle_arr.push(i); // needs no time
}
shuffle_arr=F_shuffle_array(shuffle_arr); // needs no time
// in_tensor_sum / out_tensor_sum is just an 2 dimensional array = data_set number , data points
for (var batch_num = batch_start; batch_num < in_tensor_sum.length; batch_num++) {
mini_batch_in.push(in_tensor_sum[shuffle_arr[batch_num]]); // very fast also
mini_batch_out.push(out_tensor_sum[shuffle_arr[batch_num]]);// very fast also
if (batch_num + 1 == batch_start + 250 || batch_num == in_tensor_sum.length - 1) {
//possible to import/export xs/ys?????
var xs = tf.tensor(mini_batch_in); //here CPU heavy computation!!!!!!!!!!!!!!!! TAKES LONG TIME 9600 input units here
var ys = tf.tensor(mini_batch_out); // and here CPU heavy computation!!!!!!!! TAKES not so Long time, but this is because of small output size just 400
// GPU ACCELARATION starts here Super fast only one second! This rocks!!!
await model.fit(xs, ys, {
epochs: 1, shuffle: true,
callbacks: {
onEpochEnd: async (epoch, log) => {
console.log(`${batch_num}:|Epoch ${learn_ep}: | set: ${batch_num / in_tensor_sum.length} | loss = ${log.loss}`);
},
onTrainEnd: async () => {
}
}
});
//avoid memory leaks START (ALSO TAKES a little time!!!!)
await tf.tidy(() => {
tf.tensor([xs, ys]);
console.log('numTensors (inside tidy): ' + tf.memory().numTensors);
});
console.log('numTensors (outside tidy): ' + tf.memory().numTensors);
xs.dispose();
ys.dispose();
console.log('numTensors (after dispose): ' + tf.memory().numTensors);
batch_start = batch_num + 1;
mini_batch_in = [];
mini_batch_out = [];
//avoid memory leaks END
}
}
}
}
编辑 2:
我现在尝试使用 'tfjs-npy' 来保存和加载 tensor.But 我得到一个错误:
.
.
.
var xs = await tf.tensor(mini_batch_in);
var ys = await tf.tensor(mini_batch_out);
var fs = require('fs');
var tf_parser= require ('tfjs-npy');
var writeTO=await tf_parser.serialize(ys);
await fs.writeFileSync('/home/test/NetBeansProjects/ispeed_tensload/save_tensors/test.js',new Buffer(writeTO));
var tensor_data =await fs.readFileSync("/home/test/NetBeansProjects/ispeed_tensload/save_tensors/test.js");
var my_arrayBuffer = new Uint8Array(tensor_data).buffer;
var ys2=await tf_parser.parse(my_arrayBuffer);
await model.fit(xs, ys2, {....
错误:
(node:26576) UnhandledPromiseRejectionWarning: TypeError: Cannot read property 'values' of undefined
at NodeJSKernelBackend.getInputTensorIds (/home/test/NetBeansProjects/ispeed_tensload/node_modules/@tensorflow/tfjs-node/dist/nodejs_kernel_backend.js:142:26)
at NodeJSKernelBackend.executeSingleOutput (/home/test/NetBeansProjects/ispeed_tensload/node_modules/@tensorflow/tfjs-node/dist/nodejs_kernel_backend.js:186:73)
at NodeJSKernelBackend.gather (/home/test/NetBeansProjects/ispeed_tensload/node_modules/@tensorflow/tfjs-node/dist/nodejs_kernel_backend.js:965:21)
at environment_1.ENV.engine.runKernel.$x (/home/test/NetBeansProjects/ispeed_tensload/node_modules/@tensorflow/tfjs-core/dist/ops/segment_ops.js:56:84)
at /home/test/NetBeansProjects/ispeed_tensload/node_modules/@tensorflow/tfjs-core/dist/engine.js:129:26
at Engine.scopedRun (/home/test/NetBeansProjects/ispeed_tensload/node_modules/@tensorflow/tfjs-core/dist/engine.js:101:23)
at Engine.runKernel (/home/test/NetBeansProjects/ispeed_tensload/node_modules/@tensorflow/tfjs-core/dist/engine.js:127:14)
at gather_ (/home/test/NetBeansProjects/ispeed_tensload/node_modules/@tensorflow/tfjs-core/dist/ops/segment_ops.js:56:38)
at Object.gather (/home/test/NetBeansProjects/ispeed_tensload/node_modules/@tensorflow/tfjs-core/dist/ops/operation.js:23:29)
at /home/test/NetBeansProjects/ispeed_tensload/node_modules/@tensorflow/tfjs-layers/dist/backend/tfjs_backend.js:275:20
我想 'tfjs-npy' 生成的格式不匹配。但我不知道。另一个可接受的解决方案是在 GPU 训练时让张量创建过程 运行 在多个线程上(c++ 后端优化),以将空闲时间减少到最低限度。但我不知道这是否可能。创建进程现在 运行 仅在 node.js 进程中是单线程的,性能非常差。
nodejs 使用的内存可以通过标志 --max-old-space-size
增加,如 nodejs
和 tensorflow.js
都没有问题。唯一的问题可能是您的内存容量。这可能是来回读取数据的唯一原因。
话虽如此,不清楚这里是做什么的:
await tf.tidy(() => {
tf.tensor([xs, ys]);
console.log('numTensors (inside tidy): ' + tf.memory().numTensors);
});
没用因为:
创建张量并销毁。
xs
和ys
不是类似数组的tf.tensor([xs, ys])
将创建一个包含 2 个 NaN 值的张量。对代码的性能没有任何影响。
张量xs
和ys
分别用xs.dispose()
和ys.dispose()