线性回归中测试数据集的负 R 平方?
negative R-squared for a test dataset in linear regression?
我正在使用人工数据模拟线性回归,然后手动计算 RSE 和 R 平方。我对训练模型的样本数据集执行此操作,然后在样本外数据集上测试模型。样本外和样本内数据来自相同的正态分布,但种子不同。但是,当涉及到样本外数据集时,我的数字没有意义。你能帮我找到错误吗?
set.seed(1)
z1 <- rnorm(100)
z2 <- z1 ^ 2
error <- rnorm(100, sd = 0.25)
y1 <- 1 + 2 * z1 + error
data1 <- data.table(y1, z1, z2)
model_quad <- lm(y1 ~ z1 + z2, data1)
model_lin <- lm(y1 ~ z1, data1)
confint(model_lin)
confint(model_quad)
summary(model_lin)
summary(model_quad)
ggplot(data1) +
geom_point(aes(x = z1, y = y1), color = "blue", size = 3) +
geom_point(aes(x = z2, y = y1), color = "red", size = 3) +
geom_line(stat = "smooth", method = lm, aes(x = z1, y = y1), color = "blue", size = 2, alpha = 0.5) +
geom_line(stat = "smooth", method = lm, aes(x = z2, y = y1), color = "red", size = 2, alpha = 0.5) +
geom_ribbon(stat = "smooth", method = lm, aes(x = z1, y = y1), fill = "blue", alpha = 0.1) +
geom_ribbon(stat = "smooth", method = lm, aes(x = z2, y = y1), fill = "red", alpha = 0.1)
set.seed(100)
z12 <- rnorm(100)
z22 <- z12 ^ 2
error2 <- rnorm(100, sd = 0.25)
y2 <- 1 + 2 * z12 + error2
data2 <- data.table(y2, z12, z22)
summary(model_lin)
summary(model_quad)
ggplot(data2) +
geom_point(aes(x = z12, y = y2), color = "blue", size = 3) +
geom_point(aes(x = z22, y = y2), color = "red", size = 3) +
geom_line(stat = "smooth", method = lm, aes(x = z12, y = y2), color = "blue", size = 2, alpha = 0.5) +
geom_line(stat = "smooth", method = lm, aes(x = z22, y = y2), color = "red", size = 2, alpha = 0.5) +
geom_ribbon(stat = "smooth", method = lm, aes(x = z12, y = y2), fill = "blue", alpha = 0.1) +
geom_ribbon(stat = "smooth", method = lm, aes(x = z22, y = y2), fill = "red", alpha = 0.1) +
geom_abline(intercept = 0.99, slope = 1.999, size = 2, color = "yellow", alpha = 0.3)
predictions_in_sample_linear <- predict(model_lin, data1)
predictions_in_sample_quadratic <- predict(model_quad, data1)
predictions_out_of_sample_linear <- predict(model_lin, data2)
predictions_out_of_sample_quadratic <- predict(model_quad, data2)
TSE_in_sample <- (y1 - mean(y1)) %*% (y1 - mean(y1))
RSE_in_sample_linear <- (predictions_in_sample_linear - y1) %*% (predictions_in_sample_linear - y1)
RSE_in_sample_quadratic <- (predictions_in_sample_quadratic - y1) %*% (predictions_in_sample_quadratic - y1)
R_Square_in_sample_linear <- (TSE_in_sample - RSE_in_sample_linear) / TSE_in_sample
R_Square_in_sample_quadratic<- (TSE_in_sample - RSE_in_sample_quadratic) / TSE_in_sample
TSE_out_of_sample <- (y2 - mean(y2)) %*% (y2 - mean(y2))
RSE_out_of_sample_linear <- (predictions_out_of_sample_linear - y2) %*% (predictions_out_of_sample_linear - y2)
RSE_out_of_sample_quadratic <- (predictions_out_of_sample_quadratic - y2) %*% (predictions_out_of_sample_quadratic - y2)
R_Square_out_of_sample_linear <- (TSE_out_of_sample - RSE_out_of_sample_linear) / TSE_out_of_sample
R_Square_out_of_sample_quadratic<- (TSE_out_of_sample - RSE_out_of_sample_quadratic) / TSE_out_of_sample
predictions_in_sample_linear
predictions_in_sample_quadratic
predictions_out_of_sample_linear
predictions_out_of_sample_quadratic
TSE_in_sample
RSE_in_sample_linear
RSE_in_sample_quadratic
R_Square_in_sample_linear
R_Square_in_sample_quadratic
TSE_out_of_sample
RSE_out_of_sample_linear
RSE_out_of_sample_quadratic
R_Square_out_of_sample_linear
R_Square_out_of_sample_quadratic
这段代码returnsR_square在Out of Sample数据中是负的,这是荒谬的。
我们将不胜感激您的建议。
长问短答。你应该使用
data2 <- data.frame(y1 = y2, z1 = z12, z2 = z22)
这给出了
RSE_out_of_sample_linear
# 0.9902969
RSE_out_of_sample_quadratic
# 0.989241
我正在使用人工数据模拟线性回归,然后手动计算 RSE 和 R 平方。我对训练模型的样本数据集执行此操作,然后在样本外数据集上测试模型。样本外和样本内数据来自相同的正态分布,但种子不同。但是,当涉及到样本外数据集时,我的数字没有意义。你能帮我找到错误吗?
set.seed(1)
z1 <- rnorm(100)
z2 <- z1 ^ 2
error <- rnorm(100, sd = 0.25)
y1 <- 1 + 2 * z1 + error
data1 <- data.table(y1, z1, z2)
model_quad <- lm(y1 ~ z1 + z2, data1)
model_lin <- lm(y1 ~ z1, data1)
confint(model_lin)
confint(model_quad)
summary(model_lin)
summary(model_quad)
ggplot(data1) +
geom_point(aes(x = z1, y = y1), color = "blue", size = 3) +
geom_point(aes(x = z2, y = y1), color = "red", size = 3) +
geom_line(stat = "smooth", method = lm, aes(x = z1, y = y1), color = "blue", size = 2, alpha = 0.5) +
geom_line(stat = "smooth", method = lm, aes(x = z2, y = y1), color = "red", size = 2, alpha = 0.5) +
geom_ribbon(stat = "smooth", method = lm, aes(x = z1, y = y1), fill = "blue", alpha = 0.1) +
geom_ribbon(stat = "smooth", method = lm, aes(x = z2, y = y1), fill = "red", alpha = 0.1)
set.seed(100)
z12 <- rnorm(100)
z22 <- z12 ^ 2
error2 <- rnorm(100, sd = 0.25)
y2 <- 1 + 2 * z12 + error2
data2 <- data.table(y2, z12, z22)
summary(model_lin)
summary(model_quad)
ggplot(data2) +
geom_point(aes(x = z12, y = y2), color = "blue", size = 3) +
geom_point(aes(x = z22, y = y2), color = "red", size = 3) +
geom_line(stat = "smooth", method = lm, aes(x = z12, y = y2), color = "blue", size = 2, alpha = 0.5) +
geom_line(stat = "smooth", method = lm, aes(x = z22, y = y2), color = "red", size = 2, alpha = 0.5) +
geom_ribbon(stat = "smooth", method = lm, aes(x = z12, y = y2), fill = "blue", alpha = 0.1) +
geom_ribbon(stat = "smooth", method = lm, aes(x = z22, y = y2), fill = "red", alpha = 0.1) +
geom_abline(intercept = 0.99, slope = 1.999, size = 2, color = "yellow", alpha = 0.3)
predictions_in_sample_linear <- predict(model_lin, data1)
predictions_in_sample_quadratic <- predict(model_quad, data1)
predictions_out_of_sample_linear <- predict(model_lin, data2)
predictions_out_of_sample_quadratic <- predict(model_quad, data2)
TSE_in_sample <- (y1 - mean(y1)) %*% (y1 - mean(y1))
RSE_in_sample_linear <- (predictions_in_sample_linear - y1) %*% (predictions_in_sample_linear - y1)
RSE_in_sample_quadratic <- (predictions_in_sample_quadratic - y1) %*% (predictions_in_sample_quadratic - y1)
R_Square_in_sample_linear <- (TSE_in_sample - RSE_in_sample_linear) / TSE_in_sample
R_Square_in_sample_quadratic<- (TSE_in_sample - RSE_in_sample_quadratic) / TSE_in_sample
TSE_out_of_sample <- (y2 - mean(y2)) %*% (y2 - mean(y2))
RSE_out_of_sample_linear <- (predictions_out_of_sample_linear - y2) %*% (predictions_out_of_sample_linear - y2)
RSE_out_of_sample_quadratic <- (predictions_out_of_sample_quadratic - y2) %*% (predictions_out_of_sample_quadratic - y2)
R_Square_out_of_sample_linear <- (TSE_out_of_sample - RSE_out_of_sample_linear) / TSE_out_of_sample
R_Square_out_of_sample_quadratic<- (TSE_out_of_sample - RSE_out_of_sample_quadratic) / TSE_out_of_sample
predictions_in_sample_linear
predictions_in_sample_quadratic
predictions_out_of_sample_linear
predictions_out_of_sample_quadratic
TSE_in_sample
RSE_in_sample_linear
RSE_in_sample_quadratic
R_Square_in_sample_linear
R_Square_in_sample_quadratic
TSE_out_of_sample
RSE_out_of_sample_linear
RSE_out_of_sample_quadratic
R_Square_out_of_sample_linear
R_Square_out_of_sample_quadratic
这段代码returnsR_square在Out of Sample数据中是负的,这是荒谬的。
我们将不胜感激您的建议。
长问短答。你应该使用
data2 <- data.frame(y1 = y2, z1 = z12, z2 = z22)
这给出了
RSE_out_of_sample_linear
# 0.9902969
RSE_out_of_sample_quadratic
# 0.989241