在 R 中逐行循环逻辑回归

looping logistic regression row wise in R

我正在尝试 运行 针对数据框“dat”的 C1 行从 R1 到 R6 的每一行进行逻辑回归分析

Input data frame

dat <- data.frame(list(X1 = c(1, 1, 0, 1, 1, 1, 0), X2 = c(1, 1, 0, 1, 1, 1, 0), X3 = c(1, 1, 1, 0, 1, 1, 0), X4 = c(1, 1, 0, 1, 1, 1, 0), X5 = c(0, 1, 0, 0, 1, 1, 0), X6 = c(1, 1, 1, 0, 1, 1, 0), X7 = c(0, 1, 0, 1, 1, 1, 0), X8 = c(0, 1, 1, 0, 1, 0, 0), X9 = c(0, 1, 1, 0, 1, 0, 0), X10 = c(1, 1, 1, 0, 1, 0, 0), X11 = c(1, 1, 1, 1, 1, 0, 0), X12 = c(0, 1, 0, 0, 1, 0, 0 ), X13 = c(0, 1, 0, 1, 1, 0, 0), X14 = c(1, 1, 0, 1, 1, 0, 0), X15 = c(1, 1, 1, 1, 1, 0, 0), X16 = c(0, 1, 1, 1, 1, 0, 0), X17 = c(0, 1, 1, 0, 1, 0, 0), X18 = c(0, 1, 1, 0, 1, 0, 0), X19 = c(1, 1, 0, 0, 1, 0, 0), X20 = c(1, 1, 1, 0, 1, 0, 0), X21 = c(0, 1, 0, 1, 1, 0, 0), X22 = c(1, 1, 0, 0, 1, 0, 0), X23 = c(1, 1, 1, 0, 1, 0, 0), X24 = c(1, 1, 0, 0, 1, 0, 0), X25 = c(1, 1, 0, 0, 1, 0, 0), X26 = c(0, 1, 1, 1, 1, 0, 0), X27 = c(0, 1, 0, 0, 1, 0, 0), X28 = c(1, 1, 0, 0, 1, 0, 0), X29 = c(1, 1, 1, 1, 0, 0, 0), X30 = c(1, 1, 0, 0, 0, 1, 0)),  row.names = c("r1", "r2", "r3", "r4", "r5", "r6", "C1"))

logistic regression analyses

    R1 <- glm(r1 ~ C1, data=dat, family=binomial); coef(summary(R1))[,2] 
    R2 <- glm(r2 ~ C1, data=dat, family=binomial); coef(summary(R2))[,2] 
    R3 <- glm(r3 ~ C1, data=dat, family=binomial); coef(summary(R3))[,2] 
    R4 <- glm(r4 ~ C1, data=dat, family=binomial); coef(summary(R4))[,2] 
    R5 <- glm(r5 ~ C1, data=dat, family=binomial); coef(summary(R5))[,2] 
    R6 <- glm(r6 ~ C1, data=dat, family=binomial); coef(summary(R6))[,2] 

实际数据有 6000 行,因此无法针对 C1 逐行执行。

有没有办法在循环中针对 C1 从 R1 到 R6 的每一行计算 glm,并将输出提取到新列中?

library(tidyverse)

independent_var <- 'C1'
dependent_vars <- setdiff(rownames(dat),independent_var)
compare <- NULL

for(i in dependent_vars){
    modelname <- toupper(i)
    
    filtered_data <- dat %>%
    t %>% 
    data.frame %>% 
    select(all_of(c(i,'C1')))
    
    eval(parse(text=sprintf('%s <- glm(%s ~ %s,data=filtered_data,family=binomial)',modelname,i,independent_var)))
    eval(parse(text=sprintf('newrow <- data.frame(model="%s",coef=as.numeric(%s$coefficients[-2]))',modelname,modelname)))
    compare <- rbind(compare,newrow)
}

compare

输出;

  model   coef
  <chr>  <dbl>
1 R1     0.405
2 R2    25.6  
3 R3    -0.134
4 R4    -0.405
5 R5     2.64 
6 R6    -1.01 

我想这就是您要找的。

library(tidyverse)

dat %>%
  rownames_to_column("var") %>%
  pivot_longer(cols = -var) %>%
  mutate(grp = ifelse(grepl("r", var), "x", "y")) %>%
  group_split(grp) %>%
  reduce(full_join, by = "name") %>%
  nest(data = -var.x) %>%
  mutate(reg = map_dbl(data, ~glm(value.x ~ value.y, data=.x, family=binomial) %>%
                     summary() %>%
                     coef() %>%
                     .[,2])) %>%
  select(-data)
#> # A tibble: 6 x 2
#>   var.x       reg
#>   <chr>     <dbl>
#> 1 r1        0.373
#> 2 r2    39436.   
#> 3 r3        0.366
#> 4 r4        0.373
#> 5 r5        0.732
#> 6 r6        0.413

首先我们使用 rownames_to_column 将您的行名称变成一个变量名称,然后我们使用 pivot_longer 将数据帧从宽到长。然后我们根据您的“x”变量和“y”变量拆分数据框以进行逻辑回归。然后我将拆分数据框与其自身连接起来,以获得 x 和 y 的每个组合。最后,我 nest 数据框和 运行 每个“行”的所有回归。

如果有任何不清楚的地方,请告诉我。

可以对任意数量的行执行此操作。但是,首先要注意一点。在您的数据中,行 C1 包含所有值 0。这当然不适合 glm 函数。所以我不得不稍微编辑一下 C1 行。

library(tidyverse)

dat <- data.frame(list(X1 = c(1, 1, 0, 1, 1, 1, 1), X2 = c(1, 1, 0, 1, 1, 1, 1), X3 = c(1, 1, 1, 0, 1, 1, 1), X4 = c(1, 1, 0, 1, 1, 1, 0), X5 = c(0, 1, 0, 0, 1, 1, 0), X6 = c(1, 1, 1, 0, 1, 1, 0), X7 = c(0, 1, 0, 1, 1, 1, 0), X8 = c(0, 1, 1, 0, 1, 0, 0), X9 = c(0, 1, 1, 0, 1, 0, 0), X10 = c(1, 1, 1, 0, 1, 0, 0), X11 = c(1, 1, 1, 1, 1, 0, 0), X12 = c(0, 1, 0, 0, 1, 0, 0 ), X13 = c(0, 1, 0, 1, 1, 0, 1), X14 = c(1, 1, 0, 1, 1, 0, 1), X15 = c(1, 1, 1, 1, 1, 0, 0), X16 = c(0, 1, 1, 1, 1, 0, 0), X17 = c(0, 1, 1, 0, 1, 0, 0), X18 = c(0, 1, 1, 0, 1, 0, 0), X19 = c(1, 1, 0, 0, 1, 0, 0), X20 = c(1, 1, 1, 0, 1, 0, 0), X21 = c(0, 1, 0, 1, 1, 0, 0), X22 = c(1, 1, 0, 0, 1, 0, 0), X23 = c(1, 1, 1, 0, 1, 0, 0), X24 = c(1, 1, 0, 0, 1, 0, 0), X25 = c(1, 1, 0, 0, 1, 0, 0), X26 = c(0, 1, 1, 1, 1, 0, 0), X27 = c(0, 1, 0, 0, 1, 0, 0), X28 = c(1, 1, 0, 0, 1, 0, 0), X29 = c(1, 1, 1, 1, 0, 0, 0), X30 = c(1, 1, 0, 0, 0, 1, 0)),  row.names = c("r1", "r2", "r3", "r4", "r5", "r6", "C1"))

输出

   X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30
r1  1  1  1  1  0  1  0  0  0   1   1   0   0   1   1   0   0   0   1   1   0   1   1   1   1   0   0   1   1   1
r2  1  1  1  1  1  1  1  1  1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
r3  0  0  1  0  0  1  0  1  1   1   1   0   0   0   1   1   1   1   0   1   0   0   1   0   0   1   0   0   1   0
r4  1  1  0  1  0  0  1  0  0   0   1   0   1   1   1   1   0   0   0   0   1   0   0   0   0   1   0   0   1   0
r5  1  1  1  1  1  1  1  1  1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   0   0
r6  1  1  1  1  1  1  1  0  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1
C1  1  1  1  0  0  0  0  0  0   0   0   0   1   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0

然后我将你的数据分成 r 行和 C1 行。

dfR = dat %>% as_tibble() %>% slice_head(n=nrow(.)-1) 
dfC = dat %>% as_tibble() %>% slice_tail()

输出dfR

# A tibble: 6 x 30
     X1    X2    X3    X4    X5    X6    X7    X8    X9   X10   X11   X12   X13   X14   X15   X16   X17   X18   X19   X20   X21
  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1     1     1     1     1     0     1     0     0     0     1     1     0     0     1     1     0     0     0     1     1     0
2     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1
3     0     0     1     0     0     1     0     1     1     1     1     0     0     0     1     1     1     1     0     1     0
4     1     1     0     1     0     0     1     0     0     0     1     0     1     1     1     1     0     0     0     0     1
5     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1
6     1     1     1     1     1     1     1     0     0     0     0     0     0     0     0     0     0     0     0     0     0
# ... with 9 more variables: X22 <dbl>, X23 <dbl>, X24 <dbl>, X25 <dbl>, X26 <dbl>, X27 <dbl>, X28 <dbl>, X29 <dbl>, X30 <dbl>

输出dfC

# A tibble: 1 x 30
     X1    X2    X3    X4    X5    X6    X7    X8    X9   X10   X11   X12   X13   X14   X15   X16   X17   X18   X19   X20   X21
  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1     1     1     1     0     0     0     0     0     0     0     0     0     1     1     0     0     0     0     0     0     0
# ... with 9 more variables: X22 <dbl>, X23 <dbl>, X24 <dbl>, X25 <dbl>, X26 <dbl>, X27 <dbl>, X28 <dbl>, X29 <dbl>, X30 <dbl>

接下来我准备了四个简单的函数。 fGetRowValue returns 具有一行值的向量。 fGetData 使用两个变量 xC 创建一个 tibble,其中 x 是单行的值,C 当然是值来自您的 C1 行。 fglm returns 我的模型和 fCoef returns tibble 包含模型的系数。

fGetRowValue = function(data) data %>% 
  pivot_longer(everything()) %>% pull(value)

fGetData = function(dataR, dataC) tibble(
  x = fGetRowValue(dataR),
  C = fGetRowValue(dataC)
)

fglm = function(data) glm(x~C, binomial, data)

fCoef = function(model) tibble(
  Intercept = coef(model)[1],
  C = coef(model)[2]
)

现在是正确计算的时候了。

dfR %>%  mutate(row = 1:nrow(.)) %>% 
  group_by(row) %>% 
  nest() %>%   #1
  group_modify(~fGetData(.x$data[[1]], dfC)) %>% #2
  nest() %>% #3
  mutate(model = map(data, ~fglm(.x))) %>% #4
  mutate(Ceof = map(model, ~fCoef(.x))) %>% #5
  unnest(Ceof)

输出

# A tibble: 6 x 5
# Groups:   row [6]
    row data              model  Intercept        C
  <int> <list>            <list>     <dbl>    <dbl>
1     1 <tibble [30 x 2]> <glm>     0.241   1.15e+0
2     2 <tibble [30 x 2]> <glm>    25.6    -1.98e-9
3     3 <tibble [30 x 2]> <glm>     0.0800 -1.47e+0
4     4 <tibble [30 x 2]> <glm>    -0.754   2.14e+0
5     5 <tibble [30 x 2]> <glm>     2.44    1.71e+1
6     6 <tibble [30 x 2]> <glm>    -1.39    1.79e+0

由于我的计算方式可能有点混乱,让我逐年描述一下(见评论编号)。

在第一步之后,我们收到了这种形式的数据

# A tibble: 6 x 2
# Groups:   row [6]
    row data             
  <int> <list>           
1     1 <tibble [1 x 30]>
2     2 <tibble [1 x 30]>
3     3 <tibble [1 x 30]>
4     4 <tibble [1 x 30]>
5     5 <tibble [1 x 30]>
6     6 <tibble [1 x 30]>

变量数据包含一个 tibble 列表,其中一行包含值。

第二步之后,我们就这样了

# A tibble: 180 x 3
# Groups:   row [6]
     row     x     C
   <int> <dbl> <dbl>
 1     1     1     1
 2     1     1     1
 3     1     1     1
 4     1     1     0
 5     1     0     0
 6     1     1     0
 7     1     0     0
 8     1     0     0
 9     1     0     0
10     1     1     0
# ... with 170 more rows

我们在第三步回滚它

# A tibble: 6 x 2
# Groups:   row [6]
    row data             
  <int> <list>           
1     1 <tibble [30 x 2]>
2     2 <tibble [30 x 2]>
3     3 <tibble [30 x 2]>
4     4 <tibble [30 x 2]>
5     5 <tibble [30 x 2]>
6     6 <tibble [30 x 2]>

然后在第四步中,我们使用包含 glm 个模型列表的变量完成我们的 tibble

# A tibble: 6 x 3
# Groups:   row [6]
    row data              model 
  <int> <list>            <list>
1     1 <tibble [30 x 2]> <glm> 
2     2 <tibble [30 x 2]> <glm> 
3     3 <tibble [30 x 2]> <glm> 
4     4 <tibble [30 x 2]> <glm> 
5     5 <tibble [30 x 2]> <glm> 
6     6 <tibble [30 x 2]> <glm> 

我们已经完成了最后一步,即从我们的模型中下载系数。

# A tibble: 6 x 4
# Groups:   row [6]
    row data              model  Ceof            
  <int> <list>            <list> <list>          
1     1 <tibble [30 x 2]> <glm>  <tibble [1 x 2]>
2     2 <tibble [30 x 2]> <glm>  <tibble [1 x 2]>
3     3 <tibble [30 x 2]> <glm>  <tibble [1 x 2]>
4     4 <tibble [30 x 2]> <glm>  <tibble [1 x 2]>
5     5 <tibble [30 x 2]> <glm>  <tibble [1 x 2]>
6     6 <tibble [30 x 2]> <glm>  <tibble [1 x 2]>

现在您需要做的就是unnest,我们有您想要的结果。

# A tibble: 6 x 5
# Groups:   row [6]
    row data              model  Intercept        C
  <int> <list>            <list>     <dbl>    <dbl>
1     1 <tibble [30 x 2]> <glm>     0.241   1.15e+0
2     2 <tibble [30 x 2]> <glm>    25.6    -1.98e-9
3     3 <tibble [30 x 2]> <glm>     0.0800 -1.47e+0
4     4 <tibble [30 x 2]> <glm>    -0.754   2.14e+0
5     5 <tibble [30 x 2]> <glm>     2.44    1.71e+1
6     6 <tibble [30 x 2]> <glm>    -1.39    1.79e+0

请注意,顺便说一句,所有用于构建它们的模型和数据都已保留。您可以自由使用它进行进一步的计算。