使用 broom::augment 处理拟合后的模型时如何保留原始列
How to preserve original columns after fitting a model and calling broom::augment
我正在拟合一个 GAM,并使用 broom::augment 来获取模型系数。但是,在对模型执行 augment 之后,我丢失了数据集中的一些原始列。我需要用这些列来做 pivot_longer 并绘制一些图(日期作为 X 轴,地区作为 Y 轴)。有没有办法保留这些列以供后续使用?
这里有一个数据子集来解释我的意思:
#Make a model from the following data
library(tidyverse)
library(broom)
library(mgcv)
test <- structure(list(year = c(2020, 2020, 2020, 2020, 2020, 2020, 2020,
2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020,
2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020,
2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020,
2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020,
2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020,
2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020,
2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020,
2020, 2020, 2020), period = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24, 1, 2, 3,
4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
21, 22, 23, 24, 2, 3, 4, 5, 6, 7, 8, 12, 7, 8, 9, 10, 11, 12,
13, 15, 19, 20, 21, 22, 3, 7, 11, 12, 13, 14, 15, 16, 19, 21,
23, 24, 25, 28, 29, 2, 4, 5, 6, 7), district = c(221, 221, 221,
221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221,
221, 221, 221, 221, 221, 221, 221, 222, 222, 222, 222, 222, 222,
222, 222, 222, 222, 222, 222, 222, 222, 223, 223, 223, 223, 223,
223, 223, 223, 223, 223, 224, 224, 224, 224, 224, 224, 224, 224,
226, 226, 226, 226, 226, 226, 226, 226, 226, 226, 226, 226, 227,
227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227,
227, 228, 228, 228, 228, 228), date = structure(c(18454, 18458,
18459, 18462, 18466, 18467, 18468, 18471, 18475, 18476, 18477,
18479, 18482, 18485, 18487, 18489, 18491, 18493, 18495, 18498,
18499, 18500, 18501, 18466, 18471, 18475, 18479, 18482, 18485,
18487, 18489, 18491, 18493, 18495, 18497, 18499, 18501, 18463,
18466, 18471, 18475, 18479, 18482, 18485, 18487, 18489, 18491,
18459, 18462, 18466, 18471, 18475, 18479, 18482, 18491, 18435,
18438, 18442, 18445, 18447, 18449, 18452, 18456, 18485, 18487,
18489, 18497, 18421, 18431, 18440, 18442, 18445, 18447, 18449,
18452, 18459, 18463, 18475, 18479, 18482, 18489, 18491, 18462,
18471, 18475, 18479, 18482), class = "Date"), pinkcount = c(2153732,
2074853, 526934, 721057, 728884, 116370, 231951, 750551, 803772,
76330, 105696, 386568, 156750, 8243, 26856, 16036, 40262, 24827,
6404, 2183, 1206, 1438, 2888, 177054, 392059, 179505, 472985,
489168, 639106, 341042, 169540, 153864, 86039, 126288, 146441,
36259, 15654, 100867, 51319, 78043, 51462, 57584, 33914, 470795,
197139, 47556, 17062, 93562, 123367, 55249, 175281, 190319, 188759,
64537, 1871, 5, 158, 171, 224, 973, 1547, 1395, 262, 1145554,
829993, 634609, 104113, 3, 8, 8, 134, 868, 452, 14899, 1848,
2449, 10365, 64608, 64016, 56468, 16191, 17130, 37070, 127510,
139808, 29274, 9985), airtemp_f = c(56.675652173913, 57.7475,
57.1775, 57.035, 56.5025, 56.765, 54.92, 57.572, 58.22, 57.2975,
56.585, 56.6075, 55.7375, 56.12, 56.63, 59.8025, 58.2330434782609,
57.11, 61.82, 56.6825, 55.97, 55.7825, 54.875, 56.5025, 57.572,
58.22, 56.6075, 55.7375, 56.12, 56.63, 59.8025, 58.2330434782609,
57.11, 61.82, 57.59, 55.97, 54.875, 56.7725, 56.5025, 57.572,
58.22, 56.6075, 55.7375, 56.12, 56.63, 59.8025, 58.2330434782609,
57.1775, 57.035, 56.5025, 57.572, 58.22, 56.6075, 55.7375, 58.2330434782609,
52.2275, 51.11, 52.205, 62.39, 59.18, 55.0475, 55.58, 55.37,
56.12, 56.63, 59.8025, 57.59, 47.9225, 51.3425, 54.6575, 52.205,
62.39, 59.18, 55.0475, 55.58, 57.1775, 56.7725, 58.22, 56.6075,
55.7375, 59.8025, 58.2330434782609, 57.035, 57.572, 58.22, 56.6075,
55.7375), watertemp_f = c(58.2408695652174, 58.325, 59.405, 59.405,
57.875, 58.2575, 57.3575, 58.52, 57.935, 57.995, 57.2825, 57.29,
56.915, 56.78, 57.785, 58.445, 59.0547826086957, 58.7675, 60.3575,
58.835, 58.625, 58.16, 57.77, 57.875, 58.52, 57.935, 57.29, 56.915,
56.78, 57.785, 58.445, 59.0547826086957, 58.7675, 60.3575, 59.2025,
58.625, 57.77, 58.2125, 57.875, 58.52, 57.935, 57.29, 56.915,
56.78, 57.785, 58.445, 59.0547826086957, 59.405, 59.405, 57.875,
58.52, 57.935, 57.29, 56.915, 59.0547826086957, 53.8475, 53.7275,
54.3875, 55.0775, 56.405, 55.475, 55.958, 57.2225, 56.78, 57.785,
58.445, 59.2025, 47.7875, 52.67, 54.755, 54.3875, 55.0775, 56.405,
55.475, 55.958, 59.405, 58.2125, 57.935, 57.29, 56.915, 58.445,
59.0547826086957, 59.405, 58.52, 57.935, 57.29, 56.915), rainfall_inch = c(0.0393700787401575,
0, 0.031496062992126, 1.68897637795276, 0.570866141732283, 1.54330708661417,
5.36220472440945, 0, 1.8503937007874, 4.09055118110236, 2.85826771653543,
0.110236220472441, 1.44094488188976, 1.49212598425197, 0.0118110236220472,
0, 0, 0.0708661417322835, 0, 0.570866141732283, 3.1496062992126,
2.85826771653543, 1.94881889763779, 0.570866141732283, 0, 1.8503937007874,
0.110236220472441, 1.44094488188976, 1.49212598425197, 0.0118110236220472,
0, 0, 0.0708661417322835, 0, 0.551181102362205, 3.1496062992126,
1.94881889763779, 1.62204724409449, 0.570866141732283, 0, 1.8503937007874,
0.110236220472441, 1.44094488188976, 1.49212598425197, 0.0118110236220472,
0, 0, 0.031496062992126, 1.68897637795276, 0.570866141732283,
0, 1.8503937007874, 0.110236220472441, 1.44094488188976, 0, 0.598425196850394,
0.811023622047244, 0.0984251968503937, 0, 0, 0.0590551181102362,
1.08267716535433, 0.299212598425197, 1.49212598425197, 0.0118110236220472,
0, 0.551181102362205, 1.5, 0.389763779527559, 0, 0.0984251968503937,
0, 0, 0.0590551181102362, 1.08267716535433, 0.031496062992126,
1.62204724409449, 1.8503937007874, 0.110236220472441, 1.44094488188976,
0, 0, 1.68897637795276, 0, 1.8503937007874, 0.110236220472441,
1.44094488188976)), row.names = c(NA, -87L), class = c("tbl_df",
"tbl", "data.frame"))
# A tibble: 6 x 8
year period district date pinkcount airtemp_f watertemp_f rainfall_inch
<dbl> <dbl> <dbl> <date> <dbl> <dbl> <dbl> <dbl>
1 2020 1 221 2020-07-11 2153732 56.7 58.2 0.0394
2 2020 2 221 2020-07-15 2074853 57.7 58.3 0
3 2020 3 221 2020-07-16 526934 57.2 59.4 0.0315
4 2020 4 221 2020-07-19 721057 57.0 59.4 1.69
5 2020 5 221 2020-07-23 728884 56.5 57.9 0.571
6 2020 6 221 2020-07-24 116370 56.8 58.3 1.54
# Fit a GAM of pinkcount on one smooth term per weather covariate.
mod1 <- gam(
  formula = pinkcount ~ s(airtemp_f) + s(watertemp_f) + s(rainfall_inch),
  data = test
)
# Augment the fitted model with per-row diagnostics (.fitted, .resid, ...).
aug <- mod1 %>% augment()
# Show the augmented tibble.
aug
After 'augment', I lost the year, district, period, and date columns.
# A tibble: 6 x 10
pinkcount airtemp_f watertemp_f rainfall_inch .fitted .se.fit .resid .hat .sigma .cooksd
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl> <dbl>
1 2153732 56.7 58.2 0.0394 295858. 64667. 1857874. 0.0296 NA 0.137
2 2074853 57.7 58.3 0 280398. 59418. 1794455. 0.0250 NA 0.107
3 526934 57.2 59.4 0.0315 256397. 88107. 270537. 0.0550 NA 0.00567
4 721057 57.0 59.4 1.69 182417. 90747. 538640. 0.0584 NA 0.0240
5 728884 56.5 57.9 0.571 270947. 53998. 457937. 0.0207 NA 0.00569
6 116370 56.8 58.3 1.54 225246. 55145. -108876. 0.0216 NA 0.000336
有谁知道如何保留这些原始列并将它们附加到 'aug'?
原始数据集的行数和 aug 相同,为什么不直接把它们按列绑定在一起呢?
(另一种做法是调用 augment(mod1, data = test),通过 data 参数传入原始数据,从而保留所有原始列。)
library(dplyr)
# Put the identifier columns missing from `aug` in front of the augmented
# columns. This assumes `aug` has one row per row of `test`, in the same order.
bind_cols(
  select(test, year, district, period, date),
  aug
)
# year district period date pinkcount airtemp_f watertemp_f rainfall_inch .fitted
# <dbl> <dbl> <dbl> <date> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 2020 221 1 2020-07-11 2153732 56.7 58.2 0.0394 295858.
# 2 2020 221 2 2020-07-15 2074853 57.7 58.3 0 280398.
# 3 2020 221 3 2020-07-16 526934 57.2 59.4 0.0315 256397.
# 4 2020 221 4 2020-07-19 721057 57.0 59.4 1.69 182417.
# 5 2020 221 5 2020-07-23 728884 56.5 57.9 0.571 270947.
# 6 2020 221 6 2020-07-24 116370 56.8 58.3 1.54 225246.
# 7 2020 221 7 2020-07-25 231951 54.9 57.4 5.36 57257.
# 8 2020 221 8 2020-07-28 750551 57.6 58.5 0 280472.
# 9 2020 221 9 2020-08-01 803772 58.2 57.9 1.85 186288.
#10 2020 221 10 2020-08-02 76330 57.3 58.0 4.09 98621.
# … with 77 more rows, and 5 more variables: .se.fit <dbl>, .resid <dbl>, .hat <dbl>,
# .sigma <lgl>, .cooksd <dbl>
我正在拟合一个 GAM,并使用 broom::augment 来获取模型系数。但是,在对模型执行 augment 之后,我丢失了数据集中的一些原始列。我需要用这些列来做 pivot_longer 并绘制一些图(日期作为 X 轴,地区作为 Y 轴)。有没有办法保留这些列以供后续使用?这里有一个数据子集来解释我的意思:
#Make a model from the following data
library(tidyverse)
library(broom)
library(mgcv)
test <- structure(list(year = c(2020, 2020, 2020, 2020, 2020, 2020, 2020,
2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020,
2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020,
2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020,
2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020,
2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020,
2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020,
2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020,
2020, 2020, 2020), period = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24, 1, 2, 3,
4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
21, 22, 23, 24, 2, 3, 4, 5, 6, 7, 8, 12, 7, 8, 9, 10, 11, 12,
13, 15, 19, 20, 21, 22, 3, 7, 11, 12, 13, 14, 15, 16, 19, 21,
23, 24, 25, 28, 29, 2, 4, 5, 6, 7), district = c(221, 221, 221,
221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221,
221, 221, 221, 221, 221, 221, 221, 222, 222, 222, 222, 222, 222,
222, 222, 222, 222, 222, 222, 222, 222, 223, 223, 223, 223, 223,
223, 223, 223, 223, 223, 224, 224, 224, 224, 224, 224, 224, 224,
226, 226, 226, 226, 226, 226, 226, 226, 226, 226, 226, 226, 227,
227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227,
227, 228, 228, 228, 228, 228), date = structure(c(18454, 18458,
18459, 18462, 18466, 18467, 18468, 18471, 18475, 18476, 18477,
18479, 18482, 18485, 18487, 18489, 18491, 18493, 18495, 18498,
18499, 18500, 18501, 18466, 18471, 18475, 18479, 18482, 18485,
18487, 18489, 18491, 18493, 18495, 18497, 18499, 18501, 18463,
18466, 18471, 18475, 18479, 18482, 18485, 18487, 18489, 18491,
18459, 18462, 18466, 18471, 18475, 18479, 18482, 18491, 18435,
18438, 18442, 18445, 18447, 18449, 18452, 18456, 18485, 18487,
18489, 18497, 18421, 18431, 18440, 18442, 18445, 18447, 18449,
18452, 18459, 18463, 18475, 18479, 18482, 18489, 18491, 18462,
18471, 18475, 18479, 18482), class = "Date"), pinkcount = c(2153732,
2074853, 526934, 721057, 728884, 116370, 231951, 750551, 803772,
76330, 105696, 386568, 156750, 8243, 26856, 16036, 40262, 24827,
6404, 2183, 1206, 1438, 2888, 177054, 392059, 179505, 472985,
489168, 639106, 341042, 169540, 153864, 86039, 126288, 146441,
36259, 15654, 100867, 51319, 78043, 51462, 57584, 33914, 470795,
197139, 47556, 17062, 93562, 123367, 55249, 175281, 190319, 188759,
64537, 1871, 5, 158, 171, 224, 973, 1547, 1395, 262, 1145554,
829993, 634609, 104113, 3, 8, 8, 134, 868, 452, 14899, 1848,
2449, 10365, 64608, 64016, 56468, 16191, 17130, 37070, 127510,
139808, 29274, 9985), airtemp_f = c(56.675652173913, 57.7475,
57.1775, 57.035, 56.5025, 56.765, 54.92, 57.572, 58.22, 57.2975,
56.585, 56.6075, 55.7375, 56.12, 56.63, 59.8025, 58.2330434782609,
57.11, 61.82, 56.6825, 55.97, 55.7825, 54.875, 56.5025, 57.572,
58.22, 56.6075, 55.7375, 56.12, 56.63, 59.8025, 58.2330434782609,
57.11, 61.82, 57.59, 55.97, 54.875, 56.7725, 56.5025, 57.572,
58.22, 56.6075, 55.7375, 56.12, 56.63, 59.8025, 58.2330434782609,
57.1775, 57.035, 56.5025, 57.572, 58.22, 56.6075, 55.7375, 58.2330434782609,
52.2275, 51.11, 52.205, 62.39, 59.18, 55.0475, 55.58, 55.37,
56.12, 56.63, 59.8025, 57.59, 47.9225, 51.3425, 54.6575, 52.205,
62.39, 59.18, 55.0475, 55.58, 57.1775, 56.7725, 58.22, 56.6075,
55.7375, 59.8025, 58.2330434782609, 57.035, 57.572, 58.22, 56.6075,
55.7375), watertemp_f = c(58.2408695652174, 58.325, 59.405, 59.405,
57.875, 58.2575, 57.3575, 58.52, 57.935, 57.995, 57.2825, 57.29,
56.915, 56.78, 57.785, 58.445, 59.0547826086957, 58.7675, 60.3575,
58.835, 58.625, 58.16, 57.77, 57.875, 58.52, 57.935, 57.29, 56.915,
56.78, 57.785, 58.445, 59.0547826086957, 58.7675, 60.3575, 59.2025,
58.625, 57.77, 58.2125, 57.875, 58.52, 57.935, 57.29, 56.915,
56.78, 57.785, 58.445, 59.0547826086957, 59.405, 59.405, 57.875,
58.52, 57.935, 57.29, 56.915, 59.0547826086957, 53.8475, 53.7275,
54.3875, 55.0775, 56.405, 55.475, 55.958, 57.2225, 56.78, 57.785,
58.445, 59.2025, 47.7875, 52.67, 54.755, 54.3875, 55.0775, 56.405,
55.475, 55.958, 59.405, 58.2125, 57.935, 57.29, 56.915, 58.445,
59.0547826086957, 59.405, 58.52, 57.935, 57.29, 56.915), rainfall_inch = c(0.0393700787401575,
0, 0.031496062992126, 1.68897637795276, 0.570866141732283, 1.54330708661417,
5.36220472440945, 0, 1.8503937007874, 4.09055118110236, 2.85826771653543,
0.110236220472441, 1.44094488188976, 1.49212598425197, 0.0118110236220472,
0, 0, 0.0708661417322835, 0, 0.570866141732283, 3.1496062992126,
2.85826771653543, 1.94881889763779, 0.570866141732283, 0, 1.8503937007874,
0.110236220472441, 1.44094488188976, 1.49212598425197, 0.0118110236220472,
0, 0, 0.0708661417322835, 0, 0.551181102362205, 3.1496062992126,
1.94881889763779, 1.62204724409449, 0.570866141732283, 0, 1.8503937007874,
0.110236220472441, 1.44094488188976, 1.49212598425197, 0.0118110236220472,
0, 0, 0.031496062992126, 1.68897637795276, 0.570866141732283,
0, 1.8503937007874, 0.110236220472441, 1.44094488188976, 0, 0.598425196850394,
0.811023622047244, 0.0984251968503937, 0, 0, 0.0590551181102362,
1.08267716535433, 0.299212598425197, 1.49212598425197, 0.0118110236220472,
0, 0.551181102362205, 1.5, 0.389763779527559, 0, 0.0984251968503937,
0, 0, 0.0590551181102362, 1.08267716535433, 0.031496062992126,
1.62204724409449, 1.8503937007874, 0.110236220472441, 1.44094488188976,
0, 0, 1.68897637795276, 0, 1.8503937007874, 0.110236220472441,
1.44094488188976)), row.names = c(NA, -87L), class = c("tbl_df",
"tbl", "data.frame"))
# A tibble: 6 x 8
year period district date pinkcount airtemp_f watertemp_f rainfall_inch
<dbl> <dbl> <dbl> <date> <dbl> <dbl> <dbl> <dbl>
1 2020 1 221 2020-07-11 2153732 56.7 58.2 0.0394
2 2020 2 221 2020-07-15 2074853 57.7 58.3 0
3 2020 3 221 2020-07-16 526934 57.2 59.4 0.0315
4 2020 4 221 2020-07-19 721057 57.0 59.4 1.69
5 2020 5 221 2020-07-23 728884 56.5 57.9 0.571
6 2020 6 221 2020-07-24 116370 56.8 58.3 1.54
# Fit a GAM of pinkcount on one smooth term per weather covariate.
mod1 <- gam(
  formula = pinkcount ~ s(airtemp_f) + s(watertemp_f) + s(rainfall_inch),
  data = test
)
# Augment the fitted model with per-row diagnostics (.fitted, .resid, ...).
aug <- mod1 %>% augment()
# Show the augmented tibble.
aug
After 'augment', I lost the year, district, period, and date columns.
# A tibble: 6 x 10
pinkcount airtemp_f watertemp_f rainfall_inch .fitted .se.fit .resid .hat .sigma .cooksd
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl> <dbl>
1 2153732 56.7 58.2 0.0394 295858. 64667. 1857874. 0.0296 NA 0.137
2 2074853 57.7 58.3 0 280398. 59418. 1794455. 0.0250 NA 0.107
3 526934 57.2 59.4 0.0315 256397. 88107. 270537. 0.0550 NA 0.00567
4 721057 57.0 59.4 1.69 182417. 90747. 538640. 0.0584 NA 0.0240
5 728884 56.5 57.9 0.571 270947. 53998. 457937. 0.0207 NA 0.00569
6 116370 56.8 58.3 1.54 225246. 55145. -108876. 0.0216 NA 0.000336
有谁知道如何保留这些原始列并将它们附加到 'aug'?
原始数据集的行数和 aug 相同,为什么不直接把它们按列绑定在一起呢?
(另一种做法是调用 augment(mod1, data = test),通过 data 参数传入原始数据,从而保留所有原始列。)
library(dplyr)
# Put the identifier columns missing from `aug` in front of the augmented
# columns. This assumes `aug` has one row per row of `test`, in the same order.
bind_cols(
  select(test, year, district, period, date),
  aug
)
# year district period date pinkcount airtemp_f watertemp_f rainfall_inch .fitted
# <dbl> <dbl> <dbl> <date> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 2020 221 1 2020-07-11 2153732 56.7 58.2 0.0394 295858.
# 2 2020 221 2 2020-07-15 2074853 57.7 58.3 0 280398.
# 3 2020 221 3 2020-07-16 526934 57.2 59.4 0.0315 256397.
# 4 2020 221 4 2020-07-19 721057 57.0 59.4 1.69 182417.
# 5 2020 221 5 2020-07-23 728884 56.5 57.9 0.571 270947.
# 6 2020 221 6 2020-07-24 116370 56.8 58.3 1.54 225246.
# 7 2020 221 7 2020-07-25 231951 54.9 57.4 5.36 57257.
# 8 2020 221 8 2020-07-28 750551 57.6 58.5 0 280472.
# 9 2020 221 9 2020-08-01 803772 58.2 57.9 1.85 186288.
#10 2020 221 10 2020-08-02 76330 57.3 58.0 4.09 98621.
# … with 77 more rows, and 5 more variables: .se.fit <dbl>, .resid <dbl>, .hat <dbl>,
# .sigma <lgl>, .cooksd <dbl>