Tensorflow.js cubic polynomial approximation
So I'm trying to get into Tensorflow and wanted to try the fit-curve tutorial in TypeScript.
I can't spot any difference from the documentation, but in my case the SGD optimizer seems to oscillate out of numeric range instead of minimizing. Maybe someone can see the problem right away...
- Trying to lower the learning rate sure didn't help much
- Using a different optimizer works (e.g. replacing .sgd with .adam; the one-line swap is sketched right below)
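For reference, a minimal sketch of that swap: it only replaces the optimizer line in the code further down, since tf.train.adam takes the learning rate the same way tf.train.sgd does.
const optimizer = tf.train.adam(learningRate); // instead of tf.train.sgd(learningRate)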
Overview of the steps (see the bottom of the code):
- Generate 10 points for f(x)=1x³+2x²+3x+4 with x=rand(0..100)
- Set the rate to .0000001 (would like .5 or higher)
- Approximate the curve with 1, 5, and 10 steps
Console output (x: x, y: y <=> yTrained ==> diff):
f(x)=ax³+bx²+cx+d: 1 in 832ms: 23966.115234375, 298.5378112792969, 3.8008997440338135, 0.05001421645283699
x: 30.47792458272469, y: 30264.304156120106 <=> 678783617.7128912 ==> 678753353.4087352
x: 80.28933091786573, y: 530712.8901260403 <=> 12406193327.27859 ==> 12405662614.388464
x: 83.20465291191101, y: 590126.6416137978 <=> 13807196536.914392 ==> 13806606410.27278
x: 4.705203030318961, y: 166.5616668725893 <=> 2503134.008839548 ==> 2502967.4471726753
x: 51.6146399264698, y: 142992.0542581485 <=> 3296257832.6580544 ==> 3296114840.603796
x: 63.04112413466787, y: 258678.4748135199 <=> 6005584931.290034 ==> 6005326252.815221
x: 87.3460615554209, y: 681917.0253005795 <=> 15973113065.600096 ==> 15972431148.574795
x: 37.19785928759356, y: 54352.916350066305 <=> 1233948246.1033068 ==> 1233893893.1869566
x: 58.41298898556424, y: 206313.02623606965 <=> 4777696480.876462 ==> 4777490167.850225
x: 16.60852306193672, y: 5186.8571088452845 <=> 109879466.05047359 ==> 109874279.19336474
f(x)=ax³+bx²+cx+d: 5 in 740ms: 7.155174409828215e+21, 89080444165388500000, 1133177711745826800, 14889370641235968
x: 30.47792458272469, y: 30264.304156120106 <=> 2.0265337241494757e+26 ==> 2.0265337241494757e+26
x: 80.28933091786573, y: 530712.8901260403 <=> 3.7039156556767173e+27 ==> 3.7039156556767173e+27
x: 83.20465291191101, y: 590126.6416137978 <=> 4.1221904429790747e+27 ==> 4.1221904429790747e+27
x: 4.705203030318961, y: 166.5616668725893 <=> 7.473190365025798e+23 ==> 7.473190365025798e+23
x: 51.6146399264698, y: 142992.0542581485 <=> 9.841101205493355e+26 ==> 9.841101205493355e+26
x: 63.04112413466787, y: 258678.4748135199 <=> 1.7929899156585234e+27 ==> 1.7929899156585234e+27
x: 87.3460615554209, y: 681917.0253005795 <=> 4.76883298542593e+27 ==> 4.76883298542593e+27
x: 37.19785928759356, y: 54352.916350066305 <=> 3.683998510372637e+26 ==> 3.683998510372637e+26
x: 58.41298898556424, y: 206313.02623606965 <=> 1.4263991991276407e+27 ==> 1.4263991991276407e+27
x: 16.60852306193672, y: 5186.8571088452845 <=> 3.2804916875550836e+25 ==> 3.2804916875550836e+25
f(x)=ax³+bx²+cx+d: 10 in 819ms: NaN, NaN, NaN, NaN
x: 30.47792458272469, y: 30264.304156120106 <=> NaN ==> NaN
x: 80.28933091786573, y: 530712.8901260403 <=> NaN ==> NaN
x: 83.20465291191101, y: 590126.6416137978 <=> NaN ==> NaN
x: 4.705203030318961, y: 166.5616668725893 <=> NaN ==> NaN
x: 51.6146399264698, y: 142992.0542581485 <=> NaN ==> NaN
x: 63.04112413466787, y: 258678.4748135199 <=> NaN ==> NaN
x: 87.3460615554209, y: 681917.0253005795 <=> NaN ==> NaN
x: 37.19785928759356, y: 54352.916350066305 <=> NaN ==> NaN
x: 58.41298898556424, y: 206313.02623606965 <=> NaN ==> NaN
x: 16.60852306193672, y: 5186.8571088452845 <=> NaN ==> NaN
Source code:
import * as tf from '@tensorflow/tfjs';
const arrayFrom = (len: number) => Array.from(Array(Math.max(len || 0, 0)).keys());
/** Detect a b c d for ax³+bx²+cx+d */
async function detectCubicPolynom({ xyFlatData = <number[]>[], loops = 100, learningRate = .01 }) {
  // VARIABLES: init with 0
  const [aa, bb, cc, dd] = arrayFrom(4).map(ii => tf.variable(tf.scalar(0)));
  // MODEL: f(x)=ax³+bx²+cx+d
  const doPredict = (xs: tf.Tensor) => tf.tidy(() =>
    aa.mul(xs.pow(tf.scalar(3)))
      .add(bb.mul(xs.square()))
      .add(cc.mul(xs))
      .add(dd)
  );
  // LOSS FUNCTION: MSE (mean squared error) i.e. mean of square of diff
  const doLoss = (predictions: tf.Tensor, labels: tf.Tensor) => tf.tidy(() => predictions.sub(labels).square().mean());
  // OPTIMIZER - SGD (stochastic gradient descent)
  const optimizer = tf.train.sgd(learningRate);
  // TRAIN
  const doTrain = (xs: tf.Tensor, ys: tf.Tensor) => tf.tidy(() =>
    optimizer.minimize(() => <tf.Tensor<tf.Rank.R0>>doLoss(doPredict(xs), ys)));
  const doTrainTimes = (xs: tf.Tensor, ys: tf.Tensor, times: number) =>
    arrayFrom(times).map(ii => doTrain(xs, ys)).filter(ii => !!ii).forEach(ii => ii.dispose());
  // EXECUTE
  const xData = tf.tensor1d(xyFlatData.filter((ii, index) => index % 2 === 0));
  const yData = tf.tensor1d(xyFlatData.filter((ii, index) => index % 2 !== 0));
  if (xyFlatData.length > 0 && loops > 0) {
    doTrainTimes(xData, yData, loops);
  }
  // RESULT
  const result = (await Promise.all([aa, bb, cc, dd].map(ii => ii.data())))
    .reduce((acc, ii) => acc.concat([...ii]), <number[]>[]);
  // CLEANUP
  [aa, bb, cc, dd, xData, yData].forEach(ii => ii.dispose());
  return result;
}
const tryDetectCubicPolynom = (xyFlatData: number[], loops: number, learningRate: number) => {
  const now = Date.now();
  detectCubicPolynom({ xyFlatData: xyFlatData || [], loops: Math.max(loops || 0, 1), learningRate: learningRate || .5 })
    .then(abcd => {
      console.log(`f(x)=ax³+bx²+cx+d: ${loops} in ${Date.now() - now}ms: ${abcd.join(', ')}`);
      for (let ii = 0; ii < xyFlatData.length; ii += 2) {
        const x = xyFlatData[ii];
        const y = xyFlatData[ii + 1];
        const yTrained = abcd[0] * x ** 3 + abcd[1] * x ** 2 + abcd[2] * x + abcd[3];
        console.log(`\tx: ${x}, y: ${y} <=> ${yTrained} ==> ${Math.abs(yTrained - y)}`);
      }
    });
}
const generatePolynomialPoints = (weights: number[], points: number, xUntil = 10) => {
  const flatPoints = Array<number>(points * 2);
  for (let ii = 0; ii < points; ++ii) {
    const xx = Math.random() * xUntil; // use xUntil as the upper bound of X
    const yy = weights.reduce((acc, val, index) => acc + val * xx ** (weights.length - 1 - index), 0);
    flatPoints[2 * ii] = xx;
    flatPoints[2 * ii + 1] = yy;
  }
  return flatPoints;
}
const generatedPoints = generatePolynomialPoints([1, 2, 3, 4], 10, 100);
const rate = .0000001;
tryDetectCubicPolynom(generatedPoints, 1, rate);
tryDetectCubicPolynom(generatedPoints, 5, rate);
tryDetectCubicPolynom(generatedPoints, 10, rate);
You should normalize your X values to between 0 and 1 before training the model, for numeric stability. (For details, see: https://www.coursera.org/learn/deep-neural-network/lecture/lXv6U/normalizing-inputs)
For this example, you can simply limit the range of the randomly generated X and it should work. Notice how the X values in the tutorial are between 0 and 1.
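A minimal sketch of that suggestion, reusing the question's own helpers (this assumes generatePolynomialPoints uses xUntil as the upper bound of X, as above): with X in [0, 1] the Y values also stay small (between 4 and 10 for these weights), and the tutorial's rate of .5 should no longer blow up.
const smallRangePoints = generatePolynomialPoints([1, 2, 3, 4], 10, 1); // x = rand(0..1)
tryDetectCubicPolynom(smallRangePoints, 200, .5); // same code, just a smaller X range and the tutorial's rate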
I was trying to do the same thing and wondered why it would go completely haywire with the SGD optimizer and meanSquaredError as the loss function. To debug it, I lowered the learning rate to a really small value, and guess what? That worked... I'm now using a learning rate of 0.000000000001. Why do they use 0.5 in their example, and why does it seem to work for them? I have no idea. So just try adding more zeros until it works. If anyone knows why it has to be this small when the official example uses 0.5, please let me know.
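A quick sketch of that brute-force approach, reusing tryDetectCubicPolynom and generatedPoints from the question's code (the rates below are just examples):
// Keep shrinking the rate until the coefficients stop coming back as NaN.
for (const rate of [.5, .01, .0001, .000001, 1e-9, 1e-12]) {
  tryDetectCubicPolynom(generatedPoints, 100, rate);
}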
The answers from @TaVirot and @Hans each make a valid point, and either can solve the problem.
One cause of the problem is that some data points affect the gradient far more than others. For example, with x=100 and a=1, b=2, c=3, d=4:
f(100) = 1*100^3 + 2*100^2 + 3*100 + 4 = 1020304
If a changes to 1.1, then f(100) = 1120304, a difference of 100,000.
On the other hand, with x=1 and a=1, b=2, c=3, d=4:
f(1) = 1*1^3 + 2*1^2 + 3*1 + 4 = 10
If a changes to 1.1, then f(1) = 10.1, a difference of only 0.1.
So the same change in a weight produces wildly different changes in f(x) depending on the value of x. If you use plain mean squared error as the cost function, the x=100 point generates a huge gradient during backpropagation. In short, to keep the model from diverging, consider a low learning rate, data normalization, or a different cost function. (There may be other approaches as well.)
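A quick numeric sketch of that effect: the derivative of a single point's squared error with respect to a is 2 * (prediction - y) * x^3, so it scales with x cubed.
// Gradient of (a*x³ + b*x² + c*x + d - y)² with respect to a.
const gradWrtA = (x: number, residual: number) => 2 * residual * x ** 3;
console.log(gradWrtA(100, 1)); // 2000000: a residual of only 1 at x=100
console.log(gradWrtA(1, 1));   // 2: the same residual at x=1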