Why is my gradient descent for regression in R failing?
I have adapted the following gradient descent algorithm to regress the y variable stored in data[,4] against the x variable stored in data[,1]. However, the gradient descent appears to be diverging. I would appreciate some help in figuring out where I am going wrong.
# define the sum of squared residuals
ssquares <- function(x)
{
  t = 0
  for(i in 1:200)
  {
    t <- t + (data[i,4] - x[1] - x[2]*data[i,1])^2
  }
  t/200
}
# define the derivatives
derivative <- function(x)
{
  t1 = 0
  for(i in 1:200)
  {
    t1 <- t1 - 2*(data[i,4] - x[1] - x[2]*data[i,1])
  }
  t2 = 0
  for(i in 1:200)
  {
    t2 <- t2 - 2*data[i,1]*(data[i,4] - x[1] - x[2]*data[i,1])
  }
  c(t1/200, t2/200)
}
# definition of the gradient descent method in 2D
gradient_descent <- function(func, derv, start, step=0.05, tol=1e-8) {
  pt1 <- start
  grdnt <- derv(pt1)
  pt2 <- c(pt1[1] - step*grdnt[1], pt1[2] - step*grdnt[2])
  while (abs(func(pt1) - func(pt2)) > tol) {
    pt1 <- pt2
    grdnt <- derv(pt1)
    pt2 <- c(pt1[1] - step*grdnt[1], pt1[2] - step*grdnt[2])
    print(func(pt2)) # print progress
  }
  pt2 # return the last point
}
# locate the minimum of the function using the Gradient Descent method
result <- gradient_descent(
  ssquares,   # the function to optimize
  derivative, # the gradient of the function
  c(1,1),     # start point of the search
  0.05,       # step size (alpha)
  1e-8)       # relative tolerance for one step
# display a summary of the results
print(result)           # coordinates of the function minimum
print(ssquares(result)) # value of the function at the minimum
You can vectorize your objective / gradient functions for a faster implementation. As you can see below, it actually converges on randomly generated data, and the coefficients obtained are very close to those from lm() in R:
ssquares <- function(x) {
  n <- nrow(data) # 200
  sum((data[,4] - cbind(1, data[,1]) %*% x)^2) / n
}
# define the derivatives
derivative <- function(x) {
  n <- nrow(data) # 200
  c(sum(-2*(data[,4] - cbind(1, data[,1]) %*% x)),
    sum(-2*(data[,1])*(data[,4] - cbind(1, data[,1]) %*% x))) / n
}
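For what it's worth, the same gradient can be written even more compactly with a single design matrix. This is just a sketch (the name derivative_mat is mine, and it assumes the same data layout as above); it is algebraically equivalent to the element-wise version and can be passed to gradient_descent() in place of derivative:
# compact matrix form of the same gradient (sketch, assumes data[,1] = x, data[,4] = y)
derivative_mat <- function(x) {
  X <- cbind(1, data[,1])          # design matrix with an intercept column
  r <- data[,4] - X %*% x          # residual vector
  as.vector(-2 * t(X) %*% r) / nrow(data)
}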
set.seed(1)
data <- matrix(rnorm(800), nrow=200) # randomly generated data used for the results below
# locate the minimum of the function using the Gradient Descent method
result <- gradient_descent(
  ssquares,   # the function to optimize
  derivative, # the gradient of the function
  c(1,1),     # start point of the search
  0.05,       # step size (alpha)
  1e-8)       # relative tolerance for one step
# [1] 2.511904
# [1] 2.263448
# [1] 2.061456
# [1] 1.89721
# [1] 1.763634
# [1] 1.654984
# [1] 1.566592
# [1] 1.494668
# ...
# display a summary of the results
print(result) # coefficients obtained with gradient descent
#[1] -0.10248356 0.08068382
lm(data[,4]~data[,1])$coef # coefficients from R lm()
# (Intercept) data[, 1]
# -0.10252181 0.08045722
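As an extra sanity check, the least-squares solution can also be computed in closed form from the normal equations; a quick sketch, which on the same simulated data should agree with both gradient descent and lm() up to numerical error:
# closed-form least-squares fit via the normal equations (sketch)
X <- cbind(1, data[,1])
y <- data[,4]
beta_hat <- solve(t(X) %*% X, t(X) %*% y)
print(as.vector(beta_hat)) # should be close to the coefficients above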
# use a new dataset; this time it takes quite some time to converge, but the
# values GD converges to are pretty accurate, as you can see below.
data <- read.csv('Advertising.csv')[, -1] # advertising data, dropping the first rownames column
# locate the minimum of the function using the Gradient Descent method
result <- gradient_descent(
  ssquares,   # the function to optimize
  derivative, # the gradient of the function
  c(1,1),     # start point of the search
  0.00001,    # step size (alpha), decreasing the learning rate
  1e-8)       # relative tolerance for one step
# ...
# [1] 10.51364
# [1] 10.51364
# [1] 10.51364
print(result) # coordinates of the function minimum
# [1] 6.97016852 0.04785365
lm(data[,4]~data[,1])$coef
# (Intercept)   data[, 1]
#  7.03259355  0.04753664
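Instead of shrinking the step size this much, another option is to standardize the predictor before running gradient descent, which usually allows a much larger step size and far fewer iterations. A rough sketch under the same column layout as above (the helper names here are mine, and the coefficients are mapped back to the original scale at the end):
# sketch: gradient descent on the standardized predictor, then back-transform
x_raw <- data[,1]
mu <- mean(x_raw); s <- sd(x_raw)
data_scaled <- data
data_scaled[,1] <- (x_raw - mu) / s   # standardized predictor
ssq_scaled <- function(b) sum((data_scaled[,4] - cbind(1, data_scaled[,1]) %*% b)^2) / nrow(data_scaled)
grad_scaled <- function(b) {
  X <- cbind(1, data_scaled[,1])
  as.vector(-2 * t(X) %*% (data_scaled[,4] - X %*% b)) / nrow(data_scaled)
}
res_scaled <- gradient_descent(ssq_scaled, grad_scaled, c(1,1), 0.05, 1e-8)
# map back to the original scale: intercept = b0 - b1*mu/s, slope = b1/s
c(res_scaled[1] - res_scaled[2]*mu/s, res_scaled[2]/s)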