R中带有mgcv的粗薄板样条拟合(薄板样条插值)
Rough thin-plate spline fitting (thin-plate spline interpolation) in R with mgcv
背景
我正在尝试复制书中的 图 2.6 An Introduction to Statistical Learning:
A rough thin-plate spline fit to the Income data from Figure 2.3. This
fit makes zero errors on the training data.
到目前为止我尝试了什么?
试过复制之前的图2.5,平滑的薄板样条拟合,不确定是否成功
income_2 <- read.csv("http://www-bcf.usc.edu/~gareth/ISL/Income2.csv")
library(mgcv)
model1 <- gam(Income ~ te(Education, Seniority, bs=c("tp", "tp")), data = income_2)
x <- range(income_2$Education)
x <- seq(x[1], x[2], length.out=30)
y <- range(income_2$Seniority)
y <- seq(y[1], y[2], length.out=30)
z <- outer(x,y,
function(Education,Seniority)
predict(model1, data.frame(Education,Seniority)))
p <- persp(x,y,z, theta=30, phi=30,
col="yellow",expand = 0.5,shade = 0.2,
xlab="Education", ylab="Seniority", zlab="Income")
obs <- trans3d(income_2$Education, income_2$Seniority,income_2$Income,p)
pred <- trans3d(income_2$Education, income_2$Seniority,fitted(model1),p)
points(obs, col="red",pch=16)
segments(obs$x, obs$y, pred$x, pred$y)
双重问题
- 我是否用 gam 创建了一个合适的光滑薄板?我使用的是 smooth.terms
bs="tp"
,文档说 'They are reduced rank versions of the thin plate splines and use the thin plate spline penalty.'
- 我怎样才能创建一个粗略的薄板样条拟合,使训练数据出现零误差? (上图第一张)
income_2 <- structure(list(Education = c(21.5862068965517, 18.2758620689655,
12.0689655172414, 17.0344827586207, 19.9310344827586, 18.2758620689655,
19.9310344827586, 21.1724137931034, 20.3448275862069, 10, 13.7241379310345,
18.6896551724138, 11.6551724137931, 16.6206896551724, 10, 20.3448275862069,
14.1379310344828, 16.6206896551724, 16.6206896551724, 20.3448275862069,
18.2758620689655, 14.551724137931, 17.448275862069, 10.4137931034483,
21.5862068965517, 11.2413793103448, 19.9310344827586, 11.6551724137931,
12.0689655172414, 17.0344827586207), Seniority = c(113.103448275862,
119.310344827586, 100.689655172414, 187.586206896552, 20, 26.2068965517241,
150.344827586207, 82.0689655172414, 88.2758620689655, 113.103448275862,
51.0344827586207, 144.137931034483, 20, 94.4827586206897, 187.586206896552,
94.4827586206897, 20, 44.8275862068966, 175.172413793103, 187.586206896552,
100.689655172414, 137.931034482759, 94.4827586206897, 32.4137931034483,
20, 44.8275862068966, 168.965517241379, 57.2413793103448, 32.4137931034483,
106.896551724138), Income = c(99.9171726114381, 92.579134855529,
34.6787271520874, 78.7028062353695, 68.0099216471551, 71.5044853814318,
87.9704669939115, 79.8110298331255, 90.00632710858, 45.6555294997364,
31.9138079371295, 96.2829968022869, 27.9825049000603, 66.601792415137,
41.5319924201478, 89.00070081522, 28.8163007592387, 57.6816942573605,
70.1050960424457, 98.8340115435447, 74.7046991976891, 53.5321056283034,
72.0789236655191, 18.5706650327685, 78.8057842852386, 21.388561306174,
90.8140351180409, 22.6361626208955, 17.613593041445, 74.6109601985289
)), .Names = c("Education", "Seniority", "Income"), row.names = c(NA,
-30L), class = "data.frame")
library(mgcv)
首先,您可以只对双变量薄板样条使用 s(Education, Seniority, bs = 'tp')
而不是使用张量积构造。薄板样条在任何维度上都是定义明确的。
其次,mgcv
做的是回归而不是插值,所以如果不进行调整,您就无法通过所有点得到拟合样条 运行。薄板样条的调整包括:
- 通过将
k
设置为唯一采样点的确切数量(如果您有超过 2000 个唯一数据位置,也可以设置 xt
来禁用 bs = 'tp'
后面的低秩近似);
- 设置
sp = 0
以禁用样条曲线的惩罚。
您的数据集中唯一采样位置的数量 income_2
是
xt <- unique(income_2[c("Education", "Seniority")])
nrow(xt)
#[1] 30
因为30比2000小,我们直接设置k = 30
即可,不需要将xt
传给s(, bs = 'tp')
。
interpolation_model <- gam(Income ~ s(Education, Seniority, k = 30, sp = 0),
data = income_2)
interpolation_model$residuals
# [1] 2.131628e-13 2.728484e-12 4.561684e-12 1.264766e-12 3.495870e-12
# [6] 4.177991e-12 -1.023182e-12 1.193712e-12 2.231104e-12 6.878054e-12
#[11] 6.309619e-12 6.679102e-13 7.574386e-12 3.637979e-12 4.227729e-12
#[16] 1.790568e-12 4.376943e-12 5.130119e-12 8.242296e-13 -6.536993e-13
#[21] 2.771117e-12 1.811884e-12 3.495870e-12 9.141132e-12 2.117417e-12
#[26] 7.243983e-12 -3.979039e-13 6.352252e-12 6.203038e-12 3.652190e-12
现在您看到所有残差都为零。
你也可以找其他直接做薄板样条插值的包。
薄板样条是各向同性的/径向的,变量尺度不同要小心!
Thank you for the explanation and solving my question. Do you know why the spline surface looks more undulating with less ridges?
因为你的两个变量在尺度上相差很大。你想先标准化你的两个变量,然后拟合薄板样条。
## this is how your original data look like on the 2D domain
with(income_2, plot(Education, Seniority, asp = 1))
## let's scale it
xt_scaled <- scale(xt)
dat <- data.frame(xt_scaled, Income = income_2$Income)
with(dat, plot(Education, Seniority, asp = 1))
## fit a model on scaled data
interpolation_model <- gam(Income ~ s(Education, Seniority, k = 30, sp = 0),
data = dat)
## grid on the transformed space
x <- range(dat$Education)
x <- seq(x[1], x[2], length.out=30)
y <- range(dat$Seniority)
y <- seq(y[1], y[2], length.out=30)
## prediction on the transformed space
newdat <- expand.grid(Education = x, Seniority = y)
z <- matrix(predict(interpolation_model, newdat), nrow = length(x))
现在要制作绘图,我们要将网格反向变换到其原始比例。请注意,这不需要转换预测值。
## back transform the grid
scaled_center <- attr(xt_scaled, "scaled:center")
#Education Seniority
# 16.38621 93.86207
scaled_scale <- attr(xt_scaled, "scaled:scale")
#Education Seniority
# 3.810622 55.715623
xx <- x * scaled_scale[1] + scaled_center[1]
yy <- y * scaled_scale[2] + scaled_center[2]
## use `xx`, `yy` and `z`
p <- persp(xx, yy, z, theta = 30, phi = 30,
col = "yellow",expand = 0.5, shade = 0.2,
xlab = "Education", ylab = "Seniority", zlab = "Income")
obs <- trans3d(income_2$Education, income_2$Seniority, income_2$Income, p)
pred <- trans3d(income_2$Education, income_2$Seniority, fitted(interpolation_model), p)
points(obs, col="red",pch=16)
segments(obs$x, obs$y, pred$x, pred$y)
背景
我正在尝试复制书中的 图 2.6 An Introduction to Statistical Learning:
A rough thin-plate spline fit to the Income data from Figure 2.3. This fit makes zero errors on the training data.
到目前为止我尝试了什么?
试过复制之前的图2.5,平滑的薄板样条拟合,不确定是否成功
income_2 <- read.csv("http://www-bcf.usc.edu/~gareth/ISL/Income2.csv")
library(mgcv)
model1 <- gam(Income ~ te(Education, Seniority, bs=c("tp", "tp")), data = income_2)
x <- range(income_2$Education)
x <- seq(x[1], x[2], length.out=30)
y <- range(income_2$Seniority)
y <- seq(y[1], y[2], length.out=30)
z <- outer(x,y,
function(Education,Seniority)
predict(model1, data.frame(Education,Seniority)))
p <- persp(x,y,z, theta=30, phi=30,
col="yellow",expand = 0.5,shade = 0.2,
xlab="Education", ylab="Seniority", zlab="Income")
obs <- trans3d(income_2$Education, income_2$Seniority,income_2$Income,p)
pred <- trans3d(income_2$Education, income_2$Seniority,fitted(model1),p)
points(obs, col="red",pch=16)
segments(obs$x, obs$y, pred$x, pred$y)
双重问题
- 我是否用 gam 创建了一个合适的光滑薄板?我使用的是 smooth.terms
bs="tp"
,文档说 'They are reduced rank versions of the thin plate splines and use the thin plate spline penalty.' - 我怎样才能创建一个粗略的薄板样条拟合,使训练数据出现零误差? (上图第一张)
income_2 <- structure(list(Education = c(21.5862068965517, 18.2758620689655,
12.0689655172414, 17.0344827586207, 19.9310344827586, 18.2758620689655,
19.9310344827586, 21.1724137931034, 20.3448275862069, 10, 13.7241379310345,
18.6896551724138, 11.6551724137931, 16.6206896551724, 10, 20.3448275862069,
14.1379310344828, 16.6206896551724, 16.6206896551724, 20.3448275862069,
18.2758620689655, 14.551724137931, 17.448275862069, 10.4137931034483,
21.5862068965517, 11.2413793103448, 19.9310344827586, 11.6551724137931,
12.0689655172414, 17.0344827586207), Seniority = c(113.103448275862,
119.310344827586, 100.689655172414, 187.586206896552, 20, 26.2068965517241,
150.344827586207, 82.0689655172414, 88.2758620689655, 113.103448275862,
51.0344827586207, 144.137931034483, 20, 94.4827586206897, 187.586206896552,
94.4827586206897, 20, 44.8275862068966, 175.172413793103, 187.586206896552,
100.689655172414, 137.931034482759, 94.4827586206897, 32.4137931034483,
20, 44.8275862068966, 168.965517241379, 57.2413793103448, 32.4137931034483,
106.896551724138), Income = c(99.9171726114381, 92.579134855529,
34.6787271520874, 78.7028062353695, 68.0099216471551, 71.5044853814318,
87.9704669939115, 79.8110298331255, 90.00632710858, 45.6555294997364,
31.9138079371295, 96.2829968022869, 27.9825049000603, 66.601792415137,
41.5319924201478, 89.00070081522, 28.8163007592387, 57.6816942573605,
70.1050960424457, 98.8340115435447, 74.7046991976891, 53.5321056283034,
72.0789236655191, 18.5706650327685, 78.8057842852386, 21.388561306174,
90.8140351180409, 22.6361626208955, 17.613593041445, 74.6109601985289
)), .Names = c("Education", "Seniority", "Income"), row.names = c(NA,
-30L), class = "data.frame")
library(mgcv)
首先,您可以只对双变量薄板样条使用 s(Education, Seniority, bs = 'tp')
而不是使用张量积构造。薄板样条在任何维度上都是定义明确的。
其次,mgcv
做的是回归而不是插值,所以如果不进行调整,您就无法通过所有点得到拟合样条 运行。薄板样条的调整包括:
- 通过将
k
设置为唯一采样点的确切数量(如果您有超过 2000 个唯一数据位置,也可以设置xt
来禁用bs = 'tp'
后面的低秩近似); - 设置
sp = 0
以禁用样条曲线的惩罚。
您的数据集中唯一采样位置的数量 income_2
是
xt <- unique(income_2[c("Education", "Seniority")])
nrow(xt)
#[1] 30
因为30比2000小,我们直接设置k = 30
即可,不需要将xt
传给s(, bs = 'tp')
。
interpolation_model <- gam(Income ~ s(Education, Seniority, k = 30, sp = 0),
data = income_2)
interpolation_model$residuals
# [1] 2.131628e-13 2.728484e-12 4.561684e-12 1.264766e-12 3.495870e-12
# [6] 4.177991e-12 -1.023182e-12 1.193712e-12 2.231104e-12 6.878054e-12
#[11] 6.309619e-12 6.679102e-13 7.574386e-12 3.637979e-12 4.227729e-12
#[16] 1.790568e-12 4.376943e-12 5.130119e-12 8.242296e-13 -6.536993e-13
#[21] 2.771117e-12 1.811884e-12 3.495870e-12 9.141132e-12 2.117417e-12
#[26] 7.243983e-12 -3.979039e-13 6.352252e-12 6.203038e-12 3.652190e-12
现在您看到所有残差都为零。
你也可以找其他直接做薄板样条插值的包。
薄板样条是各向同性的/径向的,变量尺度不同要小心!
Thank you for the explanation and solving my question. Do you know why the spline surface looks more undulating with less ridges?
因为你的两个变量在尺度上相差很大。你想先标准化你的两个变量,然后拟合薄板样条。
## this is how your original data look like on the 2D domain
with(income_2, plot(Education, Seniority, asp = 1))
## let's scale it
xt_scaled <- scale(xt)
dat <- data.frame(xt_scaled, Income = income_2$Income)
with(dat, plot(Education, Seniority, asp = 1))
## fit a model on scaled data
interpolation_model <- gam(Income ~ s(Education, Seniority, k = 30, sp = 0),
data = dat)
## grid on the transformed space
x <- range(dat$Education)
x <- seq(x[1], x[2], length.out=30)
y <- range(dat$Seniority)
y <- seq(y[1], y[2], length.out=30)
## prediction on the transformed space
newdat <- expand.grid(Education = x, Seniority = y)
z <- matrix(predict(interpolation_model, newdat), nrow = length(x))
现在要制作绘图,我们要将网格反向变换到其原始比例。请注意,这不需要转换预测值。
## back transform the grid
scaled_center <- attr(xt_scaled, "scaled:center")
#Education Seniority
# 16.38621 93.86207
scaled_scale <- attr(xt_scaled, "scaled:scale")
#Education Seniority
# 3.810622 55.715623
xx <- x * scaled_scale[1] + scaled_center[1]
yy <- y * scaled_scale[2] + scaled_center[2]
## use `xx`, `yy` and `z`
p <- persp(xx, yy, z, theta = 30, phi = 30,
col = "yellow",expand = 0.5, shade = 0.2,
xlab = "Education", ylab = "Seniority", zlab = "Income")
obs <- trans3d(income_2$Education, income_2$Seniority, income_2$Income, p)
pred <- trans3d(income_2$Education, income_2$Seniority, fitted(interpolation_model), p)
points(obs, col="red",pch=16)
segments(obs$x, obs$y, pred$x, pred$y)