在 R 中使用数据屏蔽创建函数
create a function using data masking in R
为了进行配对分析,我需要编写一个 函数 对整数计数求和。需要求和的总数在“df”中相应的“Yrs_Before”和“Yrs_After”列中指定。有没有一种方法可以避免只用一个函数写“fm_after”?这是“Yrs_Before”、“Yrs_After”、“Before.Yr_1...n+1”和“After.Yr_1...n+1”的变量掩码吗" 列?
数据框
set.seed(123)
(df=data.frame(
Yrs_Before=sample(1:8, 3),
Yrs_After=sample(1:8, 3),
Before.Yr_1=sample(1:8, 3),
Before.Yr_2=sample(1:8, 3),
Before.Yr_3=sample(1:8, 3),
Before.Yr_4=sample(1:8, 3),
Before.Yr_5=sample(1:8, 3),
Before.Yr_6=sample(1:8, 3),
Before.Yr_7=sample(1:8, 3),
Before.Yr_8=sample(1:8, 3),
After.Yr_1=sample(1:8, 3),
After.Yr_2=sample(1:8, 3),
After.Yr_3=sample(1:8, 3),
After.Yr_4=sample(1:8, 3),
After.Yr_5=sample(1:8, 3),
After.Yr_6=sample(1:8, 3),
After.Yr_7=sample(1:8, 3),
After.Yr_8=sample(1:8, 3)
))
函数根据前期的年数对相应的行求和。
fm=function(data,Yrs_Before){
data |> dplyr::mutate(sums=
ifelse(
Yrs_Before == 1, rowSums(across(Before.Yr_1)),
ifelse(
Yrs_Before == 2, rowSums(across(Before.Yr_1:Before.Yr_2)),
ifelse(
Yrs_Before == 3, rowSums(across(Before.Yr_1:Before.Yr_3)),
ifelse(
Yrs_Before == 4, rowSums(across(Before.Yr_1:Before.Yr_4)),
ifelse(
Yrs_Before == 5, rowSums(across(Before.Yr_1:Before.Yr_5)),
ifelse(
Yrs_Before == 6, rowSums(across(Before.Yr_1:Before.Yr_6)),
ifelse(
Yrs_Before == 7, rowSums(across(Before.Yr_1:Before.Yr_7)),
ifelse(
Yrs_Before == 8, rowSums(across(Before.Yr_1:Before.Yr_8)),"")))))))))
}
输出
fm(df,Yrs_Before)
Yrs_Before Yrs_After Before.Yr_1 Before.Yr_2 Before.Yr_3 Before.Yr_4 Before.Yr_5 Before.Yr_6 Before.Yr_7 Before.Yr_8 After.Yr_1 After.Yr_2 After.Yr_3 After.Yr_4 After.Yr_5 After.Yr_6 After.Yr_7 After.Yr_8 sums
1 7 6 2 5 6 3 3 1 3 1 4 3 4 5 1 4 3 2 23
2 8 3 6 4 1 5 1 8 2 6 6 7 7 7 2 5 6 5 33
3 3 2 3 6 2 8 4 5 7 3 1 5 2 1 3 7 1 7 11
变量名更改为“After”的重复函数
fm_after=function(data,Yrs_After){
data |> dplyr::mutate(sums=
ifelse(
Yrs_After == 1, rowSums(across(After.Yr_1)),
ifelse(
Yrs_After == 2, rowSums(across(After.Yr_1:After.Yr_2)),
ifelse(
Yrs_After == 3, rowSums(across(After.Yr_1:After.Yr_3)),
ifelse(
Yrs_After == 4, rowSums(across(After.Yr_1:After.Yr_4)),
ifelse(
Yrs_After == 5, rowSums(across(After.Yr_1:After.Yr_5)),
ifelse(
Yrs_After == 6, rowSums(across(After.Yr_1:After.Yr_6)),
ifelse(
Yrs_After == 7, rowSums(across(After.Yr_1:After.Yr_7)),
ifelse(
Yrs_After == 8, rowSums(across(After.Yr_1:After.Yr_8)),"")))))))))
}
输出
fm_after(df,Yrs_After)
Yrs_Before Yrs_After Before.Yr_1 Before.Yr_2 Before.Yr_3 Before.Yr_4 Before.Yr_5 Before.Yr_6 Before.Yr_7 Before.Yr_8 After.Yr_1 After.Yr_2 After.Yr_3 After.Yr_4 After.Yr_5 After.Yr_6 After.Yr_7 After.Yr_8 sums
1 7 6 2 5 6 3 3 1 3 1 4 3 4 5 1 4 3 2 21
2 8 3 6 4 1 5 1 8 2 6 6 7 7 7 2 5 6 5 20
3 3 2 3 6 2 8 4 5 7 3 1 5 2 1 3 7 1 7 6
更新:
按照建议,使用以下 tidyverse 函数计算 Before.Yr 和 After.Yr 崩溃次数。
sums=function(data,crashes,yrs){
data %>%
dplyr::rowwise() %>%
dplyr::transmute(sum = cumsum(c_across(matches(.data[[crashes]])))[.data[[yrs]]])
}
但是返回了一个错误。
> sums(df,"Before.Yr","Yrs_Before")
Error: Problem with `mutate()` column `sum`.
ℹ `sum = cumsum(c_across(matches(.data[["Before.Yr"]])))[.data[["Yrs_Before"]]]`.
x Column `Before.Yr` not found in `.data`
ℹ The error occurred in row 1.
Run `rlang::last_error()` to see where the error occurred.
修复错误的建议?
我们可以通过转向长格式来做到这一点
library(dplyr)
library(tidyr)
library(stringr)
df %>%
mutate(rn = row_number()) %>%
pivot_longer(cols = -c(rn, Yrs_Before, Yrs_After)) %>%
mutate(yrs = as.numeric(str_extract(name, "\d+$"))) %>%
group_by(rn, grp = str_extract(name, "\w+")) %>%
summarise(Sum = if(cur_group()$grp == 'Before')
sum(value[yrs <= Yrs_Before], na.rm = TRUE) else
sum(value[yrs <= Yrs_After], na.rm = TRUE), .groups = 'drop') %>%
pivot_wider(names_from = grp, values_from = Sum) %>%
select(-rn) %>%
bind_cols(df, .)
-输出
Yrs_Before Yrs_After Before.Yr_1 Before.Yr_2 Before.Yr_3 Before.Yr_4 Before.Yr_5 Before.Yr_6 Before.Yr_7 Before.Yr_8 After.Yr_1
1 7 6 2 5 6 3 3 1 3 1 4
2 8 3 6 4 1 5 1 8 2 6 6
3 3 2 3 6 2 8 4 5 7 3 1
After.Yr_2 After.Yr_3 After.Yr_4 After.Yr_5 After.Yr_6 After.Yr_7 After.Yr_8 After Before
1 3 4 5 1 4 3 2 21 23
2 7 7 7 2 5 6 5 20 33
3 5 2 1 3 7 1 7 6 11
它可以包装在一个只有输入数据的函数中
fm <- function(data) {
data %>%
mutate(rn = row_number()) %>%
pivot_longer(cols = -c(rn, Yrs_Before, Yrs_After)) %>%
mutate(yrs = as.numeric(str_extract(name, "\d+$"))) %>%
group_by(rn, grp = str_extract(name, "\w+")) %>%
summarise(Sum = if(cur_group()$grp == 'Before')
sum(value[yrs <= Yrs_Before], na.rm = TRUE) else
sum(value[yrs <= Yrs_After], na.rm = TRUE), .groups = 'drop') %>%
pivot_wider(names_from = grp, values_from = Sum) %>%
select(-rn) %>%
bind_cols(df, .)
}
-测试
> fm(df)
Yrs_Before Yrs_After Before.Yr_1 Before.Yr_2 Before.Yr_3 Before.Yr_4 Before.Yr_5 Before.Yr_6 Before.Yr_7 Before.Yr_8 After.Yr_1
1 7 6 2 5 6 3 3 1 3 1 4
2 8 3 6 4 1 5 1 8 2 6 6
3 3 2 3 6 2 8 4 5 7 3 1
After.Yr_2 After.Yr_3 After.Yr_4 After.Yr_5 After.Yr_6 After.Yr_7 After.Yr_8 After Before
1 3 4 5 1 4 3 2 21 23
2 7 7 7 2 5 6 5 20 33
3 5 2 1 3 7 1 7 6 11
编辑:
在 base R 中你可以这样做:
A <- split.default(df, sub('.*(After|Before).*', '\1', names(df)))
sapply(A, \(x) rowSums(x[-1] * (col(x[-1]) <= x[[1]])))
After Before
[1,] 21 23
[2,] 20 33
[3,] 6 11
然后您可以cbind
将其添加到原始数据帧
整洁宇宙:
使用 mutate
而不是 transmute
以保留原始数据帧。
df %>%
rowwise() %>%
transmute(After = cumsum(c_across(matches('Before.Yr')))[Yrs_Before],
Before = cumsum(c_across(matches('After.Yr')))[Yrs_After])
# A tibble: 3 x 2
# Rowwise:
After Before
<int> <int>
1 23 21
2 33 20
3 11 6
要避免太多代码,您可以使用 paste
。
f <- \(x, dat=df) sapply(seq_len(nrow(dat)), \(i, ...)
sum(dat[i, paste0(x, '.Yr_', 1:dat[i, paste0('Yrs_', x)])]))
res <- transform(df, sums_bef=f('Before'), sums_aft=f('After'))
res
# Yrs_Before Yrs_After Before.Yr_1 Before.Yr_2 Before.Yr_3
# 1 7 6 2 5 6
# 2 8 3 6 4 1
# 3 3 2 3 6 2
# Before.Yr_4 Before.Yr_5 Before.Yr_6 Before.Yr_7 Before.Yr_8
# 1 3 3 1 3 1
# 2 5 1 8 2 6
# 3 8 4 5 7 3
# After.Yr_1 After.Yr_2 After.Yr_3 After.Yr_4 After.Yr_5
# 1 4 3 4 5 1
# 2 6 7 7 7 2
# 3 1 5 2 1 3
# After.Yr_6 After.Yr_7 After.Yr_8 sums_bef sums_aft
# 1 4 3 2 23 21
# 2 5 6 5 33 20
# 3 7 1 7 11 6
数据:
df <- structure(list(Yrs_Before = c(7L, 8L, 3L), Yrs_After = c(6L,
3L, 2L), Before.Yr_1 = c(2L, 6L, 3L), Before.Yr_2 = c(5L, 4L,
6L), Before.Yr_3 = c(6L, 1L, 2L), Before.Yr_4 = c(3L, 5L, 8L),
Before.Yr_5 = c(3L, 1L, 4L), Before.Yr_6 = c(1L, 8L, 5L),
Before.Yr_7 = c(3L, 2L, 7L), Before.Yr_8 = c(1L, 6L, 3L),
After.Yr_1 = c(4L, 6L, 1L), After.Yr_2 = c(3L, 7L, 5L), After.Yr_3 = c(4L,
7L, 2L), After.Yr_4 = c(5L, 7L, 1L), After.Yr_5 = 1:3, After.Yr_6 = c(4L,
5L, 7L), After.Yr_7 = c(3L, 6L, 1L), After.Yr_8 = c(2L, 5L,
7L)), class = "data.frame", row.names = c(NA, -3L))
为了进行配对分析,我需要编写一个 函数 对整数计数求和。需要求和的总数在“df”中相应的“Yrs_Before”和“Yrs_After”列中指定。有没有一种方法可以避免只用一个函数写“fm_after”?这是“Yrs_Before”、“Yrs_After”、“Before.Yr_1...n+1”和“After.Yr_1...n+1”的变量掩码吗" 列?
数据框
set.seed(123)
(df=data.frame(
Yrs_Before=sample(1:8, 3),
Yrs_After=sample(1:8, 3),
Before.Yr_1=sample(1:8, 3),
Before.Yr_2=sample(1:8, 3),
Before.Yr_3=sample(1:8, 3),
Before.Yr_4=sample(1:8, 3),
Before.Yr_5=sample(1:8, 3),
Before.Yr_6=sample(1:8, 3),
Before.Yr_7=sample(1:8, 3),
Before.Yr_8=sample(1:8, 3),
After.Yr_1=sample(1:8, 3),
After.Yr_2=sample(1:8, 3),
After.Yr_3=sample(1:8, 3),
After.Yr_4=sample(1:8, 3),
After.Yr_5=sample(1:8, 3),
After.Yr_6=sample(1:8, 3),
After.Yr_7=sample(1:8, 3),
After.Yr_8=sample(1:8, 3)
))
函数根据前期的年数对相应的行求和。
fm=function(data,Yrs_Before){
data |> dplyr::mutate(sums=
ifelse(
Yrs_Before == 1, rowSums(across(Before.Yr_1)),
ifelse(
Yrs_Before == 2, rowSums(across(Before.Yr_1:Before.Yr_2)),
ifelse(
Yrs_Before == 3, rowSums(across(Before.Yr_1:Before.Yr_3)),
ifelse(
Yrs_Before == 4, rowSums(across(Before.Yr_1:Before.Yr_4)),
ifelse(
Yrs_Before == 5, rowSums(across(Before.Yr_1:Before.Yr_5)),
ifelse(
Yrs_Before == 6, rowSums(across(Before.Yr_1:Before.Yr_6)),
ifelse(
Yrs_Before == 7, rowSums(across(Before.Yr_1:Before.Yr_7)),
ifelse(
Yrs_Before == 8, rowSums(across(Before.Yr_1:Before.Yr_8)),"")))))))))
}
输出
fm(df,Yrs_Before)
Yrs_Before Yrs_After Before.Yr_1 Before.Yr_2 Before.Yr_3 Before.Yr_4 Before.Yr_5 Before.Yr_6 Before.Yr_7 Before.Yr_8 After.Yr_1 After.Yr_2 After.Yr_3 After.Yr_4 After.Yr_5 After.Yr_6 After.Yr_7 After.Yr_8 sums
1 7 6 2 5 6 3 3 1 3 1 4 3 4 5 1 4 3 2 23
2 8 3 6 4 1 5 1 8 2 6 6 7 7 7 2 5 6 5 33
3 3 2 3 6 2 8 4 5 7 3 1 5 2 1 3 7 1 7 11
变量名更改为“After”的重复函数
fm_after=function(data,Yrs_After){
data |> dplyr::mutate(sums=
ifelse(
Yrs_After == 1, rowSums(across(After.Yr_1)),
ifelse(
Yrs_After == 2, rowSums(across(After.Yr_1:After.Yr_2)),
ifelse(
Yrs_After == 3, rowSums(across(After.Yr_1:After.Yr_3)),
ifelse(
Yrs_After == 4, rowSums(across(After.Yr_1:After.Yr_4)),
ifelse(
Yrs_After == 5, rowSums(across(After.Yr_1:After.Yr_5)),
ifelse(
Yrs_After == 6, rowSums(across(After.Yr_1:After.Yr_6)),
ifelse(
Yrs_After == 7, rowSums(across(After.Yr_1:After.Yr_7)),
ifelse(
Yrs_After == 8, rowSums(across(After.Yr_1:After.Yr_8)),"")))))))))
}
输出
fm_after(df,Yrs_After)
Yrs_Before Yrs_After Before.Yr_1 Before.Yr_2 Before.Yr_3 Before.Yr_4 Before.Yr_5 Before.Yr_6 Before.Yr_7 Before.Yr_8 After.Yr_1 After.Yr_2 After.Yr_3 After.Yr_4 After.Yr_5 After.Yr_6 After.Yr_7 After.Yr_8 sums
1 7 6 2 5 6 3 3 1 3 1 4 3 4 5 1 4 3 2 21
2 8 3 6 4 1 5 1 8 2 6 6 7 7 7 2 5 6 5 20
3 3 2 3 6 2 8 4 5 7 3 1 5 2 1 3 7 1 7 6
更新:
按照建议,使用以下 tidyverse 函数计算 Before.Yr 和 After.Yr 崩溃次数。
sums=function(data,crashes,yrs){
data %>%
dplyr::rowwise() %>%
dplyr::transmute(sum = cumsum(c_across(matches(.data[[crashes]])))[.data[[yrs]]])
}
但是返回了一个错误。
> sums(df,"Before.Yr","Yrs_Before")
Error: Problem with `mutate()` column `sum`.
ℹ `sum = cumsum(c_across(matches(.data[["Before.Yr"]])))[.data[["Yrs_Before"]]]`.
x Column `Before.Yr` not found in `.data`
ℹ The error occurred in row 1.
Run `rlang::last_error()` to see where the error occurred.
修复错误的建议?
我们可以通过转向长格式来做到这一点
library(dplyr)
library(tidyr)
library(stringr)
df %>%
mutate(rn = row_number()) %>%
pivot_longer(cols = -c(rn, Yrs_Before, Yrs_After)) %>%
mutate(yrs = as.numeric(str_extract(name, "\d+$"))) %>%
group_by(rn, grp = str_extract(name, "\w+")) %>%
summarise(Sum = if(cur_group()$grp == 'Before')
sum(value[yrs <= Yrs_Before], na.rm = TRUE) else
sum(value[yrs <= Yrs_After], na.rm = TRUE), .groups = 'drop') %>%
pivot_wider(names_from = grp, values_from = Sum) %>%
select(-rn) %>%
bind_cols(df, .)
-输出
Yrs_Before Yrs_After Before.Yr_1 Before.Yr_2 Before.Yr_3 Before.Yr_4 Before.Yr_5 Before.Yr_6 Before.Yr_7 Before.Yr_8 After.Yr_1
1 7 6 2 5 6 3 3 1 3 1 4
2 8 3 6 4 1 5 1 8 2 6 6
3 3 2 3 6 2 8 4 5 7 3 1
After.Yr_2 After.Yr_3 After.Yr_4 After.Yr_5 After.Yr_6 After.Yr_7 After.Yr_8 After Before
1 3 4 5 1 4 3 2 21 23
2 7 7 7 2 5 6 5 20 33
3 5 2 1 3 7 1 7 6 11
它可以包装在一个只有输入数据的函数中
fm <- function(data) {
data %>%
mutate(rn = row_number()) %>%
pivot_longer(cols = -c(rn, Yrs_Before, Yrs_After)) %>%
mutate(yrs = as.numeric(str_extract(name, "\d+$"))) %>%
group_by(rn, grp = str_extract(name, "\w+")) %>%
summarise(Sum = if(cur_group()$grp == 'Before')
sum(value[yrs <= Yrs_Before], na.rm = TRUE) else
sum(value[yrs <= Yrs_After], na.rm = TRUE), .groups = 'drop') %>%
pivot_wider(names_from = grp, values_from = Sum) %>%
select(-rn) %>%
bind_cols(df, .)
}
-测试
> fm(df)
Yrs_Before Yrs_After Before.Yr_1 Before.Yr_2 Before.Yr_3 Before.Yr_4 Before.Yr_5 Before.Yr_6 Before.Yr_7 Before.Yr_8 After.Yr_1
1 7 6 2 5 6 3 3 1 3 1 4
2 8 3 6 4 1 5 1 8 2 6 6
3 3 2 3 6 2 8 4 5 7 3 1
After.Yr_2 After.Yr_3 After.Yr_4 After.Yr_5 After.Yr_6 After.Yr_7 After.Yr_8 After Before
1 3 4 5 1 4 3 2 21 23
2 7 7 7 2 5 6 5 20 33
3 5 2 1 3 7 1 7 6 11
编辑:
在 base R 中你可以这样做:
A <- split.default(df, sub('.*(After|Before).*', '\1', names(df)))
sapply(A, \(x) rowSums(x[-1] * (col(x[-1]) <= x[[1]])))
After Before
[1,] 21 23
[2,] 20 33
[3,] 6 11
然后您可以cbind
将其添加到原始数据帧
整洁宇宙:
使用 mutate
而不是 transmute
以保留原始数据帧。
df %>%
rowwise() %>%
transmute(After = cumsum(c_across(matches('Before.Yr')))[Yrs_Before],
Before = cumsum(c_across(matches('After.Yr')))[Yrs_After])
# A tibble: 3 x 2
# Rowwise:
After Before
<int> <int>
1 23 21
2 33 20
3 11 6
要避免太多代码,您可以使用 paste
。
f <- \(x, dat=df) sapply(seq_len(nrow(dat)), \(i, ...)
sum(dat[i, paste0(x, '.Yr_', 1:dat[i, paste0('Yrs_', x)])]))
res <- transform(df, sums_bef=f('Before'), sums_aft=f('After'))
res
# Yrs_Before Yrs_After Before.Yr_1 Before.Yr_2 Before.Yr_3
# 1 7 6 2 5 6
# 2 8 3 6 4 1
# 3 3 2 3 6 2
# Before.Yr_4 Before.Yr_5 Before.Yr_6 Before.Yr_7 Before.Yr_8
# 1 3 3 1 3 1
# 2 5 1 8 2 6
# 3 8 4 5 7 3
# After.Yr_1 After.Yr_2 After.Yr_3 After.Yr_4 After.Yr_5
# 1 4 3 4 5 1
# 2 6 7 7 7 2
# 3 1 5 2 1 3
# After.Yr_6 After.Yr_7 After.Yr_8 sums_bef sums_aft
# 1 4 3 2 23 21
# 2 5 6 5 33 20
# 3 7 1 7 11 6
数据:
df <- structure(list(Yrs_Before = c(7L, 8L, 3L), Yrs_After = c(6L,
3L, 2L), Before.Yr_1 = c(2L, 6L, 3L), Before.Yr_2 = c(5L, 4L,
6L), Before.Yr_3 = c(6L, 1L, 2L), Before.Yr_4 = c(3L, 5L, 8L),
Before.Yr_5 = c(3L, 1L, 4L), Before.Yr_6 = c(1L, 8L, 5L),
Before.Yr_7 = c(3L, 2L, 7L), Before.Yr_8 = c(1L, 6L, 3L),
After.Yr_1 = c(4L, 6L, 1L), After.Yr_2 = c(3L, 7L, 5L), After.Yr_3 = c(4L,
7L, 2L), After.Yr_4 = c(5L, 7L, 1L), After.Yr_5 = 1:3, After.Yr_6 = c(4L,
5L, 7L), After.Yr_7 = c(3L, 6L, 1L), After.Yr_8 = c(2L, 5L,
7L)), class = "data.frame", row.names = c(NA, -3L))