在 R 中使用数据屏蔽创建函数

create a function using data masking in R

为了进行配对分析,我需要编写一个 函数 对整数计数求和。需要求和的总数在“df”中相应的“Yrs_Before”和“Yrs_After”列中指定。有没有一种方法可以避免只用一个函数写“fm_after”?这是“Yrs_Before”、“Yrs_After”、“Before.Yr_1...n+1”和“After.Yr_1...n+1”的变量掩码吗" 列?

数据框

set.seed(123)
(df=data.frame(
  Yrs_Before=sample(1:8, 3),
  Yrs_After=sample(1:8, 3),
  Before.Yr_1=sample(1:8, 3),
  Before.Yr_2=sample(1:8, 3),
  Before.Yr_3=sample(1:8, 3),
  Before.Yr_4=sample(1:8, 3),
  Before.Yr_5=sample(1:8, 3),
  Before.Yr_6=sample(1:8, 3),
  Before.Yr_7=sample(1:8, 3),
  Before.Yr_8=sample(1:8, 3),
  After.Yr_1=sample(1:8, 3),
  After.Yr_2=sample(1:8, 3),
  After.Yr_3=sample(1:8, 3),
  After.Yr_4=sample(1:8, 3),
  After.Yr_5=sample(1:8, 3),
  After.Yr_6=sample(1:8, 3),
  After.Yr_7=sample(1:8, 3),
  After.Yr_8=sample(1:8, 3)
  
))

函数根据前期的年数对相应的行求和。

fm=function(data,Yrs_Before){
  data |> dplyr::mutate(sums=
  ifelse(
  Yrs_Before == 1, rowSums(across(Before.Yr_1)),
  ifelse(
    Yrs_Before == 2, rowSums(across(Before.Yr_1:Before.Yr_2)),
    ifelse(
      Yrs_Before == 3, rowSums(across(Before.Yr_1:Before.Yr_3)),
      ifelse(
        Yrs_Before == 4, rowSums(across(Before.Yr_1:Before.Yr_4)),
        ifelse(
          Yrs_Before == 5, rowSums(across(Before.Yr_1:Before.Yr_5)),
          ifelse(
            Yrs_Before == 6, rowSums(across(Before.Yr_1:Before.Yr_6)),
            ifelse(
              Yrs_Before == 7, rowSums(across(Before.Yr_1:Before.Yr_7)),
              ifelse(
                Yrs_Before == 8, rowSums(across(Before.Yr_1:Before.Yr_8)),"")))))))))
}

输出


fm(df,Yrs_Before)
Yrs_Before Yrs_After Before.Yr_1 Before.Yr_2 Before.Yr_3 Before.Yr_4 Before.Yr_5 Before.Yr_6 Before.Yr_7 Before.Yr_8 After.Yr_1 After.Yr_2 After.Yr_3 After.Yr_4 After.Yr_5 After.Yr_6 After.Yr_7 After.Yr_8 sums
1          7         6           2           5           6           3           3           1           3           1          4          3          4          5          1          4          3          2   23
2          8         3           6           4           1           5           1           8           2           6          6          7          7          7          2          5          6          5   33
3          3         2           3           6           2           8           4           5           7           3          1          5          2          1          3          7          1          7   11

变量名更改为“After”的重复函数


fm_after=function(data,Yrs_After){
  data |> dplyr::mutate(sums=
                   ifelse(
                     Yrs_After == 1, rowSums(across(After.Yr_1)),
                     ifelse(
                       Yrs_After == 2, rowSums(across(After.Yr_1:After.Yr_2)),
                       ifelse(
                         Yrs_After == 3, rowSums(across(After.Yr_1:After.Yr_3)),
                         ifelse(
                           Yrs_After == 4, rowSums(across(After.Yr_1:After.Yr_4)),
                           ifelse(
                             Yrs_After == 5, rowSums(across(After.Yr_1:After.Yr_5)),
                             ifelse(
                               Yrs_After == 6, rowSums(across(After.Yr_1:After.Yr_6)),
                               ifelse(
                                 Yrs_After == 7, rowSums(across(After.Yr_1:After.Yr_7)),
                                 ifelse(
                                   Yrs_After == 8, rowSums(across(After.Yr_1:After.Yr_8)),"")))))))))
}

输出

fm_after(df,Yrs_After)
Yrs_Before Yrs_After Before.Yr_1 Before.Yr_2 Before.Yr_3 Before.Yr_4 Before.Yr_5 Before.Yr_6 Before.Yr_7 Before.Yr_8 After.Yr_1 After.Yr_2 After.Yr_3 After.Yr_4 After.Yr_5 After.Yr_6 After.Yr_7 After.Yr_8 sums
1          7         6           2           5           6           3           3           1           3           1          4          3          4          5          1          4          3          2   21
2          8         3           6           4           1           5           1           8           2           6          6          7          7          7          2          5          6          5   20
3          3         2           3           6           2           8           4           5           7           3          1          5          2          1          3          7          1          7    6

更新:

按照建议,使用以下 tidyverse 函数计算 Before.Yr 和 After.Yr 崩溃次数。

sums=function(data,crashes,yrs){
data %>%
  dplyr::rowwise() %>%
    dplyr::transmute(sum = cumsum(c_across(matches(.data[[crashes]])))[.data[[yrs]]])
}

但是返回了一个错误。

> sums(df,"Before.Yr","Yrs_Before")
Error: Problem with `mutate()` column `sum`.
ℹ `sum = cumsum(c_across(matches(.data[["Before.Yr"]])))[.data[["Yrs_Before"]]]`.
x Column `Before.Yr` not found in `.data`
ℹ The error occurred in row 1.
Run `rlang::last_error()` to see where the error occurred.

修复错误的建议?

我们可以通过转向长格式来做到这一点

library(dplyr)
library(tidyr)
library(stringr)
df %>%  
  mutate(rn = row_number()) %>% 
  pivot_longer(cols = -c(rn, Yrs_Before, Yrs_After)) %>%
   mutate(yrs = as.numeric(str_extract(name, "\d+$"))) %>% 
  group_by(rn, grp = str_extract(name, "\w+")) %>%
  summarise(Sum = if(cur_group()$grp == 'Before')  
   sum(value[yrs <= Yrs_Before], na.rm = TRUE)  else 
   sum(value[yrs <= Yrs_After], na.rm = TRUE), .groups = 'drop') %>% 
  pivot_wider(names_from = grp, values_from = Sum) %>%
  select(-rn) %>%
  bind_cols(df, .)

-输出

 Yrs_Before Yrs_After Before.Yr_1 Before.Yr_2 Before.Yr_3 Before.Yr_4 Before.Yr_5 Before.Yr_6 Before.Yr_7 Before.Yr_8 After.Yr_1
1          7         6           2           5           6           3           3           1           3           1          4
2          8         3           6           4           1           5           1           8           2           6          6
3          3         2           3           6           2           8           4           5           7           3          1
  After.Yr_2 After.Yr_3 After.Yr_4 After.Yr_5 After.Yr_6 After.Yr_7 After.Yr_8 After Before
1          3          4          5          1          4          3          2    21     23
2          7          7          7          2          5          6          5    20     33
3          5          2          1          3          7          1          7     6     11

它可以包装在一个只有输入数据的函数中

fm <- function(data) {
  data %>%  
  mutate(rn = row_number()) %>% 
  pivot_longer(cols = -c(rn, Yrs_Before, Yrs_After)) %>%
   mutate(yrs = as.numeric(str_extract(name, "\d+$"))) %>% 
  group_by(rn, grp = str_extract(name, "\w+")) %>%
  summarise(Sum = if(cur_group()$grp == 'Before')  
   sum(value[yrs <= Yrs_Before], na.rm = TRUE)  else 
   sum(value[yrs <= Yrs_After], na.rm = TRUE), .groups = 'drop') %>% 
  pivot_wider(names_from = grp, values_from = Sum) %>%
  select(-rn) %>%
  bind_cols(df, .)
  }

-测试

> fm(df)
  Yrs_Before Yrs_After Before.Yr_1 Before.Yr_2 Before.Yr_3 Before.Yr_4 Before.Yr_5 Before.Yr_6 Before.Yr_7 Before.Yr_8 After.Yr_1
1          7         6           2           5           6           3           3           1           3           1          4
2          8         3           6           4           1           5           1           8           2           6          6
3          3         2           3           6           2           8           4           5           7           3          1
  After.Yr_2 After.Yr_3 After.Yr_4 After.Yr_5 After.Yr_6 After.Yr_7 After.Yr_8 After Before
1          3          4          5          1          4          3          2    21     23
2          7          7          7          2          5          6          5    20     33
3          5          2          1          3          7          1          7     6     11

编辑:

在 base R 中你可以这样做:

A <- split.default(df, sub('.*(After|Before).*', '\1', names(df)))  
sapply(A, \(x) rowSums(x[-1] * (col(x[-1]) <= x[[1]])))
    After Before
[1,]    21     23
[2,]    20     33
[3,]     6     11

然后您可以cbind将其添加到原始数据帧

整洁宇宙:

使用 mutate 而不是 transmute 以保留原始数据帧。

df %>%
  rowwise() %>%
  transmute(After = cumsum(c_across(matches('Before.Yr')))[Yrs_Before],
            Before = cumsum(c_across(matches('After.Yr')))[Yrs_After]) 

# A tibble: 3 x 2
# Rowwise: 
  After Before
  <int>  <int>
1    23     21
2    33     20
3    11      6

要避免太多代码,您可以使用 paste

f <- \(x, dat=df) sapply(seq_len(nrow(dat)), \(i, ...) 
                      sum(dat[i, paste0(x, '.Yr_', 1:dat[i, paste0('Yrs_', x)])]))

res <- transform(df, sums_bef=f('Before'), sums_aft=f('After'))
res
#   Yrs_Before Yrs_After Before.Yr_1 Before.Yr_2 Before.Yr_3
# 1          7         6           2           5           6
# 2          8         3           6           4           1
# 3          3         2           3           6           2
#   Before.Yr_4 Before.Yr_5 Before.Yr_6 Before.Yr_7 Before.Yr_8
# 1           3           3           1           3           1
# 2           5           1           8           2           6
# 3           8           4           5           7           3
#   After.Yr_1 After.Yr_2 After.Yr_3 After.Yr_4 After.Yr_5
# 1          4          3          4          5          1
# 2          6          7          7          7          2
# 3          1          5          2          1          3
#   After.Yr_6 After.Yr_7 After.Yr_8 sums_bef sums_aft
# 1          4          3          2       23       21
# 2          5          6          5       33       20
# 3          7          1          7       11        6

数据:

df <- structure(list(Yrs_Before = c(7L, 8L, 3L), Yrs_After = c(6L, 
3L, 2L), Before.Yr_1 = c(2L, 6L, 3L), Before.Yr_2 = c(5L, 4L, 
6L), Before.Yr_3 = c(6L, 1L, 2L), Before.Yr_4 = c(3L, 5L, 8L), 
    Before.Yr_5 = c(3L, 1L, 4L), Before.Yr_6 = c(1L, 8L, 5L), 
    Before.Yr_7 = c(3L, 2L, 7L), Before.Yr_8 = c(1L, 6L, 3L), 
    After.Yr_1 = c(4L, 6L, 1L), After.Yr_2 = c(3L, 7L, 5L), After.Yr_3 = c(4L, 
    7L, 2L), After.Yr_4 = c(5L, 7L, 1L), After.Yr_5 = 1:3, After.Yr_6 = c(4L, 
    5L, 7L), After.Yr_7 = c(3L, 6L, 1L), After.Yr_8 = c(2L, 5L, 
    7L)), class = "data.frame", row.names = c(NA, -3L))