将线性回归模型应用于成对列

Question

我有一些数据如下：

                      BNBBTC    NULSBTC   NEOBTC    LINKBTC    IOTABTC    ETCBTC
2021-05-15 14:49:00 0.012036 0.00001983 0.001972 0.00089782 0.00003891 0.0021133
2021-05-15 14:50:00 0.012039 0.00001983 0.001968 0.00089699 0.00003889 0.0021148
2021-05-15 14:51:00 0.012032 0.00001975 0.001972 0.00089625 0.00003890 0.0021139
2021-05-15 14:52:00 0.012017 0.00001974 0.001969 0.00089484 0.00003885 0.0021152
2021-05-15 14:53:00 0.012026 0.00001975 0.001967 0.00089538 0.00003885 0.0021140
2021-05-15 14:54:00 0.012012 0.00001973 0.001966 0.00089511 0.00003885 0.0021104
2021-05-15 14:55:00 0.012019 0.00001973 0.001965 0.00089463 0.00003882 0.0021097
2021-05-15 14:56:00 0.012022 0.00001977 0.001966 0.00089541 0.00003879 0.0021131
2021-05-15 14:57:00 0.012022 0.00001977 0.001967 0.00089546 0.00003879 0.0021113
2021-05-15 14:58:00 0.012005 0.00001977 0.001967 0.00089500 0.00003886 0.0021110
2021-05-15 14:59:00 0.012015 0.00001984 0.001965 0.00089540 0.00003887 0.0021107
2021-05-15 15:00:00 0.012016 0.00001984 0.001967 0.00089574 0.00003882 0.0021088
2021-05-15 15:01:00 0.012032 0.00001980 0.001966 0.00089636 0.00003896 0.0021155
2021-05-15 15:02:00 0.012029 0.00001969 0.001966 0.00089600 0.00003894 0.0021191
2021-05-15 15:03:00 0.012014 0.00001974 0.001966 0.00089401 0.00003891 0.0021166
2021-05-15 15:04:00 0.012017 0.00001975 0.001966 0.00089410 0.00003890 0.0021187
2021-05-15 15:05:00 0.012017 0.00001975 0.001968 0.00089554 0.00003889 0.0021109
2021-05-15 15:06:00 0.012021 0.00001980 0.001967 0.00089600 0.00003898 0.0021052
2021-05-15 15:07:00 0.012035 0.00001980 0.001969 0.00089589 0.00003897 0.0020955
2021-05-15 15:08:00 0.012026 0.00001983 0.001970 0.00089409 0.00003881 0.0020878

我想对所有不同的列组合分别运行线性回归。例如：

# Desired result: a separate simple regression for each pair of columns,
# shown here with BNBBTC as the response and each other column as predictor.
lm(dat$BNBBTC ~ dat$NULSBTC)
lm(dat$BNBBTC ~ dat$NEOBTC)
lm(dat$BNBBTC ~ dat$LINKBTC)
...

我尝试先展开网格并创建公式。

# Attempt: enumerate every (y, x) pairing of column names (self-pairs such as
# y == x are included), build a formula string for each, then fit per group.
expand.grid(y = colnames(dat), x = colnames(dat), KEEP.OUT.ATTRS = FALSE) %>% 
  mutate(
    # Text of the form "BNBBTC ~ NULSBTC".
    formula = paste(y, "~", x, sep = " ")
  ) %>% 
  group_by(formula) %>% 
  mutate(
    # NOTE(review): as parenthesised, `$r.squared` is taken from the lm()
    # result and summary() wraps that value; it is not
    # summary(lm(...))$r.squared.
    mod = summary(lm(formula, data = dat)$r.squared)
  )

但这会产生错误，因为 mutate 函数无法直接处理线性模型。我想将线性回归模型存储在嵌套列（list 列）中，以便之后访问系数、p 值并执行一些检验。

# Reproducible sample data: 120 numeric values laid out as a 20 x 6 matrix
# (.Dim) with column names BNBBTC, NULSBTC, NEOBTC, LINKBTC, IOTABTC and
# ETCBTC (.Dimnames). The object is given class c("xts", "zoo") and an `index`
# attribute of 20 numeric timestamps tagged as POSIXct via `tclass`.
dat <- structure(c(0.012036, 0.012039, 0.012032, 0.012017, 0.012026, 
0.012012, 0.012019, 0.012022, 0.012022, 0.012005, 0.012015, 0.012016, 
0.012032, 0.012029, 0.012014, 0.012017, 0.012017, 0.012021, 0.012035, 
0.012026, 0.00001983, 0.00001983, 0.00001975, 0.00001974, 0.00001975, 
0.00001973, 0.00001973, 0.00001977, 0.00001977, 0.00001977, 0.00001984, 
0.00001984, 0.0000198, 0.00001969, 0.00001974, 0.00001975, 0.00001975, 
0.0000198, 0.0000198, 0.00001983, 0.001972, 0.001968, 0.001972, 
0.001969, 0.001967, 0.001966, 0.001965, 0.001966, 0.001967, 0.001967, 
0.001965, 0.001967, 0.001966, 0.001966, 0.001966, 0.001966, 0.001968, 
0.001967, 0.001969, 0.00197, 0.00089782, 0.00089699, 0.00089625, 
0.00089484, 0.00089538, 0.00089511, 0.00089463, 0.00089541, 0.00089546, 
0.000895, 0.0008954, 0.00089574, 0.00089636, 0.000896, 0.00089401, 
0.0008941, 0.00089554, 0.000896, 0.00089589, 0.00089409, 0.00003891, 
0.00003889, 0.0000389, 0.00003885, 0.00003885, 0.00003885, 0.00003882, 
0.00003879, 0.00003879, 0.00003886, 0.00003887, 0.00003882, 0.00003896, 
0.00003894, 0.00003891, 0.0000389, 0.00003889, 0.00003898, 0.00003897, 
0.00003881, 0.0021133, 0.0021148, 0.0021139, 0.0021152, 0.002114, 
0.0021104, 0.0021097, 0.0021131, 0.0021113, 0.002111, 0.0021107, 
0.0021088, 0.0021155, 0.0021191, 0.0021166, 0.0021187, 0.0021109, 
0.0021052, 0.0020955, 0.0020878), class = c("xts", "zoo"), index = structure(c(1621082940, 
1621083000, 1621083060, 1621083120, 1621083180, 1621083240, 1621083300, 
1621083360, 1621083420, 1621083480, 1621083540, 1621083600, 1621083660, 
1621083720, 1621083780, 1621083840, 1621083900, 1621083960, 1621084020, 
1621084080), tzone = "", tclass = c("POSIXct", "POSIXt")), .Dim = c(20L, 
6L), .Dimnames = list(NULL, c("BNBBTC", "NULSBTC", "NEOBTC", 
"LINKBTC", "IOTABTC", "ETCBTC")))

编辑：

这可能会得到我想要的

# Formula strings ("y ~ x") for every pairing of column names in `dat`.
myFormulas <- expand.grid(
  y = colnames(dat),
  x = colnames(dat),
  KEEP.OUT.ATTRS = FALSE
) %>%
  mutate(formula = paste(y, "~", x)) %>%
  pull(formula)

# Fit one linear model per formula string; the result is a list of lm objects.
map(myFormulas, ~lm(.x, data = dat))

Answer 1

我们可以使用 rowwise 或 map 来遍历公式并创建带有模型摘要输出的 list 列

# One row per (y, x) pairing of column names. rowwise() makes the second
# mutate() run once per row, so each formula string is fitted on its own and
# the model summary is wrapped in list() to form the list column `mod`.
out <- expand.grid(
  y = colnames(dat),
  x = colnames(dat),
  KEEP.OUT.ATTRS = FALSE
) %>%
  mutate(formula = paste(y, "~", x)) %>%
  rowwise() %>%
  mutate(mod = list(summary(lm(as.formula(formula), data = dat)))) %>%
  ungroup()

-输出

out
# A tibble: 36 x 4
   y       x       formula           mod       
   <fct>   <fct>   <chr>             <list>    
 1 BNBBTC  BNBBTC  BNBBTC ~ BNBBTC   <smmry.lm>
 2 NULSBTC BNBBTC  NULSBTC ~ BNBBTC  <smmry.lm>
 3 NEOBTC  BNBBTC  NEOBTC ~ BNBBTC   <smmry.lm>
 4 LINKBTC BNBBTC  LINKBTC ~ BNBBTC  <smmry.lm>
 5 IOTABTC BNBBTC  IOTABTC ~ BNBBTC  <smmry.lm>
 6 ETCBTC  BNBBTC  ETCBTC ~ BNBBTC   <smmry.lm>
 7 BNBBTC  NULSBTC BNBBTC ~ NULSBTC  <smmry.lm>
 8 NULSBTC NULSBTC NULSBTC ~ NULSBTC <smmry.lm>
 9 NEOBTC  NULSBTC NEOBTC ~ NULSBTC  <smmry.lm>
10 LINKBTC NULSBTC LINKBTC ~ NULSBTC <smmry.lm>
# … with 26 more rows

或使用map2

library(dplyr) # mutate() is exported by dplyr, not by purrr or tidyr
library(purrr)
library(tidyr)

# crossing() enumerates every (y, x) pairing of column names. map2() walks the
# x and y columns in parallel, builds the formula y ~ x with reformulate(), and
# stores each model summary in the list column `mod`.
out <- crossing(y = colnames(dat), x = colnames(dat)) %>%
  mutate(
    mod = map2(
      x, y,
      ~ summary(lm(reformulate(.x, response = .y), data = dat))
    )
  )

'mod' 是一个 list 列，可以用 `$` 或 pull 提取出来

# Extract the list column of model summaries by name.
out[["mod"]]

或

# Same extraction using dplyr's pull(), called directly instead of piped.
pull(out, mod)

将线性回归模型应用于成对列

Apply linear regression model to pairwise columns

r

lm