优化样条回归中的自由度
Optimizing degrees of freedom in spline regression
我有两个基因表达时程数据集:
首先,在 4 个组的 14 个时间点测量基因表达:
df1 <- structure(list(val = c(-0.1, -0.13, -0.4, -0.3, -0.3, -0.2, -0.24,
0.1, 0.2, 0.13, 0, 0.63, 0.83, 0.85, -0.07, -0.07, -0.27, -0.2,
-0.2, -0.1, 0.2, 0.1, 0.07, 0.17, 0.6, 0.75, 1.1, 1.1, -0.13,
-0.15, -0.26, -0.25, -0.14, 0.04, 0.2, 0.24, 0.23, 0.2, 0.1,
0.73, 1, 1.3, 0, 0.06, -0.24, -0.17, -0.17, -0.04, 0.16, 0.1,
0.14, 0.27, 0.34, 0.9, 0.97, 1.04),
time = c(-1, 0, 1, 1.58,2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17, 7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17, 7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17,7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58,6.17, 7.39),
group = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,3L, 3L, 3L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,4L),
.Label = c("a", "b", "c", "d"), class = "factor")), .Names = c("val","time", "group"),
row.names = c(NA, -56L), class = "data.frame")
df1$group <- factor(df1$group,levels=c("a","b","c","d"))
看起来像这样(添加 loess
平滑趋势线):
library(ggplot2)
ggplot(df1,aes(x=time,y=val,color=group))+geom_point()+theme_minimal()+geom_smooth(se=F)+theme(legend.position="top",legend.title=element_blank())
其次,在相似的 14 个时间点测量了基因表达,但现在来自 2 个不同的组,每个组由两种性别代表:
df2 <- structure(list(val = c(-0.23, -0.01, -0.14, -0.01, -0.21, -0.16,
-0.24, -0.11, 0.02, -0.11, -0.01, -0.25, -0.47, -1.25, 0.02,
-0.3, -0.02, 0.14, 0.25, -0.05, 0.15, 0.11, -0.24, -0.18, -0.39,
-0.49, -0.5, -0.65, -0.06, 0.09, 0.1, 0.15, 0.08, 0.15, 0.4,
0.24, 0.07, 0.08, -0.18, -0.35, -0.19, -0.81, -0.16, 0.29, -0.05,
0.14, 0.14, 0.48, 0.34, 0.11, -0.07, -0.13, -0.41, -0.22, -0.54,
-0.76, 0.35, 0.34, -0.06, 0.21, 0.14, 0.14, 0.25, 0.22, 0.25,
0.16, 0.3, 0.44, 0.08, 0.48, 0.1, 0.16, -0.03, -0.22, 0.2, 0.01,
-0.09, -0.02, -0.01, 0.06, -0.13, 0.19, 0.11, -0.04, -0.39, 0.03,
-0.01, 0.09, 0.1, -0.14, -0.12, -0.1, 0.36, 0.08, 0.09, 0.09,
0.42, 0.37, -0.14, 0.12, 0.09, 0.03, 0.06, -0.25, 0.2, -0.06,
-0.44, 0.23, 0.03, 0.16, 0.81, 0.83),
time = c(-1, 0, 1, 1.58,2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17, 7.39,
-1, 0,1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17, 7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17,7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58,6.17, 7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58,5.58, 6.17, 7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17,4.58, 5.58, 6.17, 7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17, 7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17, 7.39),
sex = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
.Label = c("F", "M"), class = "factor"), group = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L),
.Label = c("a", "b"), class = "factor")), .Names = c("val", "time", "sex", "group"), row.names = c(NA, -112L), class = "data.frame")
df2$sex <- ordered(df2$sex,levels=c("M","F"))
df2$group <- ordered(df2$group,levels=c("a","b"))
df2$col <- factor(paste0(df2$group,":",df2$sex))
看起来像这样(添加黄土平滑趋势线):
ggplot(df2,aes(x=time,y=val,color=col))+geom_point()+theme_minimal()+geom_smooth(se=F)+theme(legend.position="top",legend.title=element_blank())
对于 df1
,我想估计 time
对 val
的影响,调整 group
。
对于df2
,我想估计time:group
对val
的影响,调整sex
。
查看数据我认为使用 spline
regression
s 是合适的,所以我使用了 mgcv
包中的 gam
函数,据我所知理解优化了适合数据的 spline
的自由度。
这是我适合的 df1
:
mgcv1.fit <- mgcv::gam(val ~ group+s(time),data=df1)
给出:
Family: gaussian
Link function: identity
Formula:
val ~ group + s(time)
Estimated degrees of freedom:
7.18 total = 11.18
GCV score: 0.01258176
但是 7.18 的自由度对于这些数据来说似乎太多了。
对于df2
:
mgcv2.fit <- mgcv::gam(val ~ sex+s(time,by=group),data=df2)
给出:
Family: gaussian
Link function: identity
Formula:
val ~ sex + s(time, by = group)
Estimated degrees of freedom:
1.72 total = 3.72
GCV score: 0.08522094
我想在这种情况下,我认为自由度会稍微高一些。
还有一点。绘制这两个数据集的拟合值:
df1$mgcv <- mgcv1.fit$fitted.values
ggplot(df1,aes(x=time,y=mgcv,color=group))+geom_point()+theme_minimal()+geom_smooth(se=F)+theme(legend.position="top",legend.title=element_blank())
看起来不错。
但是 df2
df2$mgcv <- mgcv2.fit$fitted.values
ggplot(df2,aes(x=time,y=mgcv,color=col))+geom_point()+theme_minimal()+geom_smooth(se=F)+theme(legend.position="top",legend.title=element_blank())
看起来它翻转了组标签。
所以我的问题是:
- 我是否正确使用
mgcv::gam
来优化我的问题的样条自由度?
mgcv
是否重新排序其 fitted.values
中的样本?
首先,mgcv
在因素水平上做对了。如果你检查str(df2$sex)
,你会看到"M"(男)是第一级,"F"(女)是第二级。但是从str(df2$col)
看来,"F"是第一个,所以你在制作情节的时候会出现一些错误的标签。
其次,您的第二个型号没有正确指定。
- 当没有"by"变量时,样条
s(time)
处于居中约束,或者"by"是一个因素。因此,您将 "by" 变量 group
作为模型公式中的单独项提供,以捕捉其边际效应;
- 由于 "by" 变量
group
是有序变量,mgcv
对其应用对比,在构造 s(time, by = group)
时删除第一级 "a" .所以你需要单独提供一个s(time)
作为baseline smooth。
您当前的 mgcv2.fit
是一个相当差的模型(不足为奇),给出的解释偏差为 9%。但是,如果您执行以下操作,您将获得 64%。
gam(val ~ sex + s(time) + group + s(time, by = group), data = df2, method = "REML")
ggplot
现在看起来正确了(我没有更改 df2$col
所以颜色可能仍然是相反的)。
gam
默认使用"GCV.Cp"作为平滑参数选择方式。但建议使用"REML",不易过拟合。
备注1
如果 "by" 变量 group
是一个(无序)因子,则它不受对比影响。所以模型公式应该是:
val ~ sex + group + s(time, by = group)
以下引用自?gam.models
'by'变量部分:
If a ‘by’ variable is a ‘factor’ then it generates an indicator
vector for each level of the factor, unless it is an ‘ordered’
factor. In the non-ordered case, the model matrix for the smooth
term is then replicated for each factor level, and each copy has
its rows multiplied by the corresponding rows of its indicator
variable. The smoothness penalties are also duplicated for each
factor level. In short a different smooth is generated for each
factor level (the ‘id’ argument to ‘s’ and ‘te’ can be used to
force all such smooths to have the same smoothing parameter).
‘ordered’ ‘by’ variables are handled in the same way, except that
no smooth is generated for the first level of the ordered factor
(see ‘b3’ example below). This is useful for setting up
identifiable models when the same smooth occurs more than once in
a model, with different factor ‘by’ variables.
备注2
我不是要评判你的模型,但 "F" 和 "M" 之间似乎存在明显的组内差异。从您的数据我们看到 "F" 和 "M" 在组 "b" 中的差异比在组 "a" 中的差异更大。目前 sex
的效果在两组中是相同的,只是垂直移动。您可以在这个答案的上面 ggplot
中观察到这一点。最终由您决定模型,但以防万一您想要对这种 sex-group
交互进行建模,您可以
df2$sex_group <- with(df2, interaction(sex, group)) ## the new variable is unordered
test <- gam(val ~ sex + group + s(time, by = sex_group), data = df2, method = "REML")
请注意我是如何向 by
提供两个因子变量的。创建了一个辅助变量sex_group
。
我有两个基因表达时程数据集:
首先,在 4 个组的 14 个时间点测量基因表达:
df1 <- structure(list(val = c(-0.1, -0.13, -0.4, -0.3, -0.3, -0.2, -0.24,
0.1, 0.2, 0.13, 0, 0.63, 0.83, 0.85, -0.07, -0.07, -0.27, -0.2,
-0.2, -0.1, 0.2, 0.1, 0.07, 0.17, 0.6, 0.75, 1.1, 1.1, -0.13,
-0.15, -0.26, -0.25, -0.14, 0.04, 0.2, 0.24, 0.23, 0.2, 0.1,
0.73, 1, 1.3, 0, 0.06, -0.24, -0.17, -0.17, -0.04, 0.16, 0.1,
0.14, 0.27, 0.34, 0.9, 0.97, 1.04),
time = c(-1, 0, 1, 1.58,2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17, 7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17, 7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17,7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58,6.17, 7.39),
group = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,3L, 3L, 3L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,4L),
.Label = c("a", "b", "c", "d"), class = "factor")), .Names = c("val","time", "group"),
row.names = c(NA, -56L), class = "data.frame")
df1$group <- factor(df1$group,levels=c("a","b","c","d"))
看起来像这样(添加 loess
平滑趋势线):
library(ggplot2)
ggplot(df1,aes(x=time,y=val,color=group))+geom_point()+theme_minimal()+geom_smooth(se=F)+theme(legend.position="top",legend.title=element_blank())
其次,在相似的 14 个时间点测量了基因表达,但现在来自 2 个不同的组,每个组由两种性别代表:
df2 <- structure(list(val = c(-0.23, -0.01, -0.14, -0.01, -0.21, -0.16,
-0.24, -0.11, 0.02, -0.11, -0.01, -0.25, -0.47, -1.25, 0.02,
-0.3, -0.02, 0.14, 0.25, -0.05, 0.15, 0.11, -0.24, -0.18, -0.39,
-0.49, -0.5, -0.65, -0.06, 0.09, 0.1, 0.15, 0.08, 0.15, 0.4,
0.24, 0.07, 0.08, -0.18, -0.35, -0.19, -0.81, -0.16, 0.29, -0.05,
0.14, 0.14, 0.48, 0.34, 0.11, -0.07, -0.13, -0.41, -0.22, -0.54,
-0.76, 0.35, 0.34, -0.06, 0.21, 0.14, 0.14, 0.25, 0.22, 0.25,
0.16, 0.3, 0.44, 0.08, 0.48, 0.1, 0.16, -0.03, -0.22, 0.2, 0.01,
-0.09, -0.02, -0.01, 0.06, -0.13, 0.19, 0.11, -0.04, -0.39, 0.03,
-0.01, 0.09, 0.1, -0.14, -0.12, -0.1, 0.36, 0.08, 0.09, 0.09,
0.42, 0.37, -0.14, 0.12, 0.09, 0.03, 0.06, -0.25, 0.2, -0.06,
-0.44, 0.23, 0.03, 0.16, 0.81, 0.83),
time = c(-1, 0, 1, 1.58,2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17, 7.39,
-1, 0,1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17, 7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17,7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58,6.17, 7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58,5.58, 6.17, 7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17,4.58, 5.58, 6.17, 7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17, 7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17, 7.39),
sex = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
.Label = c("F", "M"), class = "factor"), group = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L),
.Label = c("a", "b"), class = "factor")), .Names = c("val", "time", "sex", "group"), row.names = c(NA, -112L), class = "data.frame")
df2$sex <- ordered(df2$sex,levels=c("M","F"))
df2$group <- ordered(df2$group,levels=c("a","b"))
df2$col <- factor(paste0(df2$group,":",df2$sex))
看起来像这样(添加黄土平滑趋势线):
ggplot(df2,aes(x=time,y=val,color=col))+geom_point()+theme_minimal()+geom_smooth(se=F)+theme(legend.position="top",legend.title=element_blank())
对于 df1
,我想估计 time
对 val
的影响,调整 group
。
对于df2
,我想估计time:group
对val
的影响,调整sex
。
查看数据我认为使用 spline
regression
s 是合适的,所以我使用了 mgcv
包中的 gam
函数,据我所知理解优化了适合数据的 spline
的自由度。
这是我适合的 df1
:
mgcv1.fit <- mgcv::gam(val ~ group+s(time),data=df1)
给出:
Family: gaussian
Link function: identity
Formula:
val ~ group + s(time)
Estimated degrees of freedom:
7.18 total = 11.18
GCV score: 0.01258176
但是 7.18 的自由度对于这些数据来说似乎太多了。
对于df2
:
mgcv2.fit <- mgcv::gam(val ~ sex+s(time,by=group),data=df2)
给出:
Family: gaussian
Link function: identity
Formula:
val ~ sex + s(time, by = group)
Estimated degrees of freedom:
1.72 total = 3.72
GCV score: 0.08522094
我想在这种情况下,我认为自由度会稍微高一些。
还有一点。绘制这两个数据集的拟合值:
df1$mgcv <- mgcv1.fit$fitted.values
ggplot(df1,aes(x=time,y=mgcv,color=group))+geom_point()+theme_minimal()+geom_smooth(se=F)+theme(legend.position="top",legend.title=element_blank())
看起来不错。
但是 df2
df2$mgcv <- mgcv2.fit$fitted.values
ggplot(df2,aes(x=time,y=mgcv,color=col))+geom_point()+theme_minimal()+geom_smooth(se=F)+theme(legend.position="top",legend.title=element_blank())
看起来它翻转了组标签。
所以我的问题是:
- 我是否正确使用
mgcv::gam
来优化我的问题的样条自由度? mgcv
是否重新排序其fitted.values
中的样本?
首先,mgcv
在因素水平上做对了。如果你检查str(df2$sex)
,你会看到"M"(男)是第一级,"F"(女)是第二级。但是从str(df2$col)
看来,"F"是第一个,所以你在制作情节的时候会出现一些错误的标签。
其次,您的第二个型号没有正确指定。
- 当没有"by"变量时,样条
s(time)
处于居中约束,或者"by"是一个因素。因此,您将 "by" 变量group
作为模型公式中的单独项提供,以捕捉其边际效应; - 由于 "by" 变量
group
是有序变量,mgcv
对其应用对比,在构造s(time, by = group)
时删除第一级 "a" .所以你需要单独提供一个s(time)
作为baseline smooth。
您当前的 mgcv2.fit
是一个相当差的模型(不足为奇),给出的解释偏差为 9%。但是,如果您执行以下操作,您将获得 64%。
gam(val ~ sex + s(time) + group + s(time, by = group), data = df2, method = "REML")
ggplot
现在看起来正确了(我没有更改 df2$col
所以颜色可能仍然是相反的)。
gam
默认使用"GCV.Cp"作为平滑参数选择方式。但建议使用"REML",不易过拟合。
备注1
如果 "by" 变量 group
是一个(无序)因子,则它不受对比影响。所以模型公式应该是:
val ~ sex + group + s(time, by = group)
以下引用自?gam.models
'by'变量部分:
If a ‘by’ variable is a ‘factor’ then it generates an indicator
vector for each level of the factor, unless it is an ‘ordered’
factor. In the non-ordered case, the model matrix for the smooth
term is then replicated for each factor level, and each copy has
its rows multiplied by the corresponding rows of its indicator
variable. The smoothness penalties are also duplicated for each
factor level. In short a different smooth is generated for each
factor level (the ‘id’ argument to ‘s’ and ‘te’ can be used to
force all such smooths to have the same smoothing parameter).
‘ordered’ ‘by’ variables are handled in the same way, except that
no smooth is generated for the first level of the ordered factor
(see ‘b3’ example below). This is useful for setting up
identifiable models when the same smooth occurs more than once in
a model, with different factor ‘by’ variables.
备注2
我不是要评判你的模型,但 "F" 和 "M" 之间似乎存在明显的组内差异。从您的数据我们看到 "F" 和 "M" 在组 "b" 中的差异比在组 "a" 中的差异更大。目前 sex
的效果在两组中是相同的,只是垂直移动。您可以在这个答案的上面 ggplot
中观察到这一点。最终由您决定模型,但以防万一您想要对这种 sex-group
交互进行建模,您可以
df2$sex_group <- with(df2, interaction(sex, group)) ## the new variable is unordered
test <- gam(val ~ sex + group + s(time, by = sex_group), data = df2, method = "REML")
请注意我是如何向 by
提供两个因子变量的。创建了一个辅助变量sex_group
。