基于线性回归逼近曲线族的函数

Approximate the Function for a Family of Curves based on Linear Regression

R: df 看起来像这样 (x = a, y = b, group = c):

  a      b      c
-------------------
-2.1    1203    5
 1.4    1103    10
-2.1    1203    5
..       ..    ..

我从这个 df 中为 c 中的每个组(例如 5、10)创建了一个包含大约 10 条线性回归线 (ggplot2/geom_smooth) 的散点图。这些都有不同的 slopesy-intersects

有什么方法可以近似 R 中这些线性回归线的曲线族 的函数并用新绘图中的自定义参数 (ggplot2)?


编辑:

这是dput(df):(稍后会再次删除)

structure(list(X = 1:102, a = c("0", "0", "0", "0", "0", "219.399.914.550.781", 
"987", "0", "0", "0", "0", "1320", "0", "0", "0", "0", "0", "0", 
"0", "0", "0", "0", "0", "0", "0", "0", "0", "144.595.013.427.734", 
"176.470.013.427.734", "167.919.995.117.188", "125.239.242.553.711", 
"247.582.397.460.938", "173.550.708.007.812", "149.010.693.359.375", 
"908", "638.5", "81.173.999.023.438", "1472", "120.632.000.732.422", 
"2028", "154.019.999.694.824", "785.5", "118.188.000.488.281", 
"149.930.010.986.328", "-712", "-2100", "1628", "925", "1161", 
"516", "64.840.002.441.406", "426.5", "106.810.998.535.156", 
"92.175.994.873.047", "648.5", "125.379.998.779.297", "1120", 
"821", "795", "129.179.998.779.297", "137.460.000.610.352", "127.231.660.461.426", 
"148.579.998.779.297", "115.440.997.314.453", "4.482.857.905.469", 
"1163", "97.440.002.441.406", "130.298.852.539.062", "195.956.491.088.867", 
"504.998.779.296.989", "1043", "228.998.406.982.422", "128.374.566.650.391", 
"153.219.995.117.188", "111.604.742.431.641", "108.100.006.103.516", 
"1364.5", "102.669.999.694.824", "141.820.001.220.703", "83.124.743.652.344", 
"93.209.649.658.203", "149.629.656.982.422", "150.215.002.441.406", 
"161.379.998.779.297", "41.129.998.779.297", "91.320.001.220.703", 
"83.047.998.046.875", "1144.5", "104.020.001.220.703", "171.528.002.929.688", 
"1519", "123.510.003.662.109", "106.240.002.441.406", "145.934.997.558.594", 
"177.939.999.389.648", "195.360.003.662.109", "164.140.002.441.406", 
"113.640.002.441.406", "146.676.000.976.562", "1.769.916.015.625", 
"53.389.654.541.016", "685.018.981.933.594"), c = c(88L, 88L, 
88L, 88L, NA, 88L, 88L, 88L, NA, NA, NA, 86L, 86L, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 90L, 88L, 88L, 88L, 
88L, 88L, 88L, 88L, 88L, 88L, 88L, 88L, 86L, 88L, 88L, 88L, 84L, 
84L, 84L, 84L, 86L, 84L, 84L, 84L, 84L, 84L, 84L, 84L, 84L, 84L, 
82L, 82L, 84L, 82L, 82L, 84L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 
82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 80L, 80L, 
80L, 80L, 80L, 82L, 84L, 82L, 82L, 80L, 80L, 80L, 80L, 80L, 80L, 
80L, 80L, 80L, 80L, 80L, 80L), c_null = c(88L, 88L, 88L, 88L, 
0L, 88L, 88L, 88L, 0L, 0L, 0L, 86L, 86L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 90L, 88L, 88L, 88L, 88L, 
88L, 88L, 88L, 88L, 88L, 88L, 88L, 86L, 88L, 88L, 88L, 84L, 84L, 
84L, 84L, 86L, 84L, 84L, 84L, 84L, 84L, 84L, 84L, 84L, 84L, 82L, 
82L, 84L, 82L, 82L, 84L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 
82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 80L, 80L, 80L, 
80L, 80L, 82L, 84L, 82L, 82L, 80L, 80L, 80L, 80L, 80L, 80L, 80L, 
80L, 80L, 80L, 80L, 80L), c_orig = c("88.000.096.643.065", "88.000.096.643.065", 
"88.000.096.643.065", "0", "874.979.654.044.919", "873.618.932.305.081", 
"869.990.179.502.541", "0", "0", "0", "861.825.503", "861.825.503", 
"0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", 
"0", "899.000.015.258.789", "87.5", "880.999.984.741.211", "88", 
"879.000.015.258.789", "87", "869.000.015.258.789", "87", "868.000.030.517.578", 
"878.000.030.517.578", "876.999.969.482.422", "865.999.984.741.211", 
"861.999.969.482.422", "870.999.984.741.211", "869.000.015.258.789", 
"865.999.984.741.211", "871.999.969.482.422", "841.999.969.482.422", 
"84.5", "840.999.984.741.211", "845.999.984.741.211", "843.000.030.517.578", 
"841.999.969.482.422", "83", "834.000.015.258.789", "83", "825.999.984.741.211", 
"834.000.015.258.789", "831.999.969.482.422", "826.999.969.482.422", 
"823.000.030.517.578", "821.999.969.482.422", "825.999.984.741.211", 
"821.999.969.482.422", "82", "825.999.984.741.211", "816.999.969.482.422", 
"814.000.015.258.789", "81", "819.000.015.258.789", "816.999.969.482.422", 
"81.5", "821.999.969.482.422", "811.999.969.482.422", "814.000.015.258.789", 
"813.000.030.517.578", "808.000.030.517.578", "815.999.984.741.211", 
"818.000.030.517.578", "814.000.015.258.789", "814.000.015.258.789", 
"809.000.015.258.789", "809.000.015.258.789", "805.999.984.741.211", 
"801.999.969.482.422", "796.999.969.482.422", "801.999.969.482.422", 
"803.000.030.517.578", "804.000.015.258.789", "811.999.969.482.422", 
"825.999.984.741.211", "82.5", "819.000.015.258.789", "804.000.015.258.789", 
"795.999.984.741.211", "804.000.015.258.789", "80", "801.999.969.482.422", 
"798.000.030.517.578", "80", "80", "795.999.984.741.211", "800.999.984.741.211", 
"799.000.015.258.789", "791.999.969.482.422", "791.999.969.482.422"
), b = c("0", "0", "0", NA, NA, "-0.136072173983791", "-0.362875280254002", 
NA, NA, NA, NA, "0", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, "-240.000.152.587.891", "0.599998474121094", 
"-0.0999984741210938", "-0.0999984741210938", "-0.900001525878906", 
"-0.0999984741210938", "0.0999984741210938", "-0.199996948242202", 
"1", "-0.100006103515597", "-109.999.847.412.111", "-0.400001525878892", 
"0.900001525878892", "-0.199996948242188", "-0.300003051757812", 
"0.599998474121108", "0.5", "0.300003051757798", "-0.400001525878906", 
"0.5", "-0.299995422363295", "-0.100006103515597", "-11.999.969.482.422", 
"0.400001525878906", "-0.400001525878906", "-0.400001525878906", 
"0.800003051757812", "-0.200004577636705", "-0.5", "-0.399993896484403", 
"-0.100006103515597", "0.400001525878892", "-0.400001525878892", 
"-0.199996948242202", "0.599998474121094", "-0.900001525878892", 
"-0.299995422363295", "-0.400001525878906", "0.900001525878906", 
"-0.200004577636705", "-0.199996948242202", "0.699996948242202", 
"-1", "0.200004577636705", "-0.099998474121108", "-0.5", "0.799995422363295", 
"0.200004577636705", "-0.400001525878892", "0", "-0.5", "0", 
"-0.300003051757812", "-0.400001525878892", "-0.5", "0.5", "0.100006103515597", 
"0.099998474121108", "0.799995422363295", "140.000.152.587.889", 
"-0.0999984741210938", "-0.599998474121094", "-1.5", "-0.800003051757812", 
"0.800003051757812", "-0.400001525878906", "0.199996948242202", 
"-0.399993896484403", "0.199996948242202", "0", "-0.400001525878906", 
"0.5", "-0.199996948242188", "-0.700004577636705", NA), b_null = c("0", 
"0", "0", "0", "0", "-0.136072173983791", "-0.362875280254002", 
"0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", 
"0", "0", "0", "0", "0", "0", "0", "-240.000.152.587.891", "0.599998474121094", 
"-0.0999984741210938", "-0.0999984741210938", "-0.900001525878906", 
"-0.0999984741210938", "0.0999984741210938", "-0.199996948242202", 
"1", "-0.100006103515597", "-109.999.847.412.111", "-0.400001525878892", 
"0.900001525878892", "-0.199996948242188", "-0.300003051757812", 
"0.599998474121108", "0.5", "0.300003051757798", "-0.400001525878906", 
"0.5", "-0.299995422363295", "-0.100006103515597", "-11.999.969.482.422", 
"0.400001525878906", "-0.400001525878906", "-0.400001525878906", 
"0.800003051757812", "-0.200004577636705", "-0.5", "-0.399993896484403", 
"-0.100006103515597", "0.400001525878892", "-0.400001525878892", 
"-0.199996948242202", "0.599998474121094", "-0.900001525878892", 
"-0.299995422363295", "-0.400001525878906", "0.900001525878906", 
"-0.200004577636705", "-0.199996948242202", "0.699996948242202", 
"-1", "0.200004577636705", "-0.099998474121108", "-0.5", "0.799995422363295", 
"0.200004577636705", "-0.400001525878892", "0", "-0.5", "0", 
"-0.300003051757812", "-0.400001525878892", "-0.5", "0.5", "0.100006103515597", 
"0.099998474121108", "0.799995422363295", "140.000.152.587.889", 
"-0.0999984741210938", "-0.599998474121094", "-1.5", "-0.800003051757812", 
"0.800003051757812", "-0.400001525878906", "0.199996948242202", 
"-0.399993896484403", "0.199996948242202", "0", "-0.400001525878906", 
"0.5", "-0.199996948242188", "-0.700004577636705", "0")), .Names = c("X", 
"a", "c", "c_null", "c_orig", "b", "b_null"), class = "data.frame", row.names = c(NA, 
-102L))

我刚刚用 ggplot2 绘制了它:

ggplot(aes(x = a, y = b, group = c), data = Health, na.rm = TRUE) + 
  geom_point(aes(color = c, size = factor(c)), alpha=0.3) +
  scale_color_distiller(palette="YlGnBu", na.value="white") +
  geom_smooth(method = "lm", aes(group = factor(c), color = c), se = F)

现在我想对所有 geom_smooth 线的曲线族进行近似,以在另一个图中用其他参数预测不同的场景!

这是一个如何创建和绘制预测模型的简单示例。

首先,让我们创建一些假数据:

## Fake data
set.seed(595)
a = runif(50, 0, 100)
c = runif(50, 0, 100)

# Add equation for b in terms of a and c
dat = data.frame(a,c, b = 2*a + 3*c + 10 + rnorm(50, 0, 20))

现在,要根据 ac 预测 b,我们需要一个回归模型:

## Regression model
m1 = lm(b ~ a + c, data=dat)

以下是该模型的摘要:

summary(m1)    
Call:
  lm(formula = b ~ a + c, data = dat)

Residuals:
  Min      1Q  Median      3Q     Max 
-63.169 -10.364   0.385  12.959  53.623 

Coefficients:
  Estimate Std. Error t value Pr(>|t|)    
(Intercept) -3.63897    9.07948  -0.401     0.69    
a            2.19203    0.09701  22.595   <2e-16 ***
c            3.00188    0.11356  26.435   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 20.58 on 47 degrees of freedom
Multiple R-squared:  0.9565,  Adjusted R-squared:  0.9547 
F-statistic:   517 on 2 and 47 DF,  p-value: < 2.2e-16

要绘制模型预测,让我们创建这些预测的数据框。我们将为 c 的四个值预测 b

## Predict b for various values of c and the entire range of a values
newdat = expand.grid(a=0:100, c=c(5,20,80,95))
newdat = data.frame(newdat, b_pred=predict(m1, newdata=newdat))

现在绘制数据和模型的四个预测:

# Plot points for b vs. a and then show prediction lines for various values of c.
ggplot(dat, aes(a, b)) +
  geom_point() +
  geom_line(data=newdat, aes(a, b_pred, color=factor(c))) +
  guides(colour=guide_legend(reverse=TRUE))

更新: 跟进我的最后一条评论,也许这有助于可视化模型的回归表面。它是一个曲面,因为 b = f(a,c) 是两个变量的函数。函数为:b = f(a,c) = -3.64 + 2.19*a + 3.00*c,如上回归总结所示。这是平面方程。这是我们绘制它时的样子:

# Set up data grid for plotting
a=seq(1, 100, length.out=20)
c=seq(1, 100, length.out=20)
z = outer(a,c, FUN=function(a,c) -3.64 + 2.19*a + 3.00*c)

# Plot regression surface: b = f(a,c) = -3.64 + 2.19*a + 3.00*c
mat=persp(a, c, z, ylim=c(0,100), theta=35, phi=20, zlab="b", border="grey30")

# Add red lines of constant c
lapply(c(5,20,80,95), function(c_val) {
  lines(trans3d(x=a, y=c_val, z=-3.64 + 2.19*a + 3.00*c_val, pmat=mat), 
        col="red", lwd=2, lty="12")

请注意下图中每条红线的常数值为 c。只有 a 不同。这就是我们用 ggplot 绘制的图在您将其扩展到第三维时的样子。

事实上,如果我们旋转 3D 图,让我们在 c 轴的方向(并垂直于 a 轴)和稍微降低透视效果。将下面的图与我们用 ggplot 制作的图进行比较:

# Plot regression surface: b = f(a,c) = -3.64 + 2.19*a + 3.00*c
mat=persp(a, c, z, ylim=c(0,100), theta=0, phi=0, zlab="b", border="grey30", d=5)

# Add red lines of constant c
lapply(c(5,20,80,95), function(c_val) {
  lines(trans3d(x=a, y=c_val, z=-3.64 + 2.19*a + 3.00*c_val, pmat=mat), 
        col="red", lwd=2, lty="12")
})