将 dplyr summarise_if() 与谓词一起使用
Using dplyr summarise_if() with a predicate
我想按天计算 x1 和 x2 的平均值:在 sum(is.na(x)) 与观测总数之比 >= 0.5 的日期计算平均值,否则返回 NA。
数据:
library(lubridate)
library(dplyr)
# Eleven half-day observations: the values 1..10, with NA written at
# positions 1, 6 and 11 (writing to position 11 extends the vector to 11).
x <- seq_len(10)
x[c(1, 6, 11)] <- NA

data <- data.frame(
  tseq = seq(from = Sys.time(), by = "12 hours", length.out = 11),
  x1 = x,
  x2 = x
)

# Mean of every value column per floored day, ignoring missing values.
means <- data %>%
  group_by(tseq = floor_date(tseq, "days")) %>%
  summarise_all(list(mean = ~ mean(., na.rm = TRUE)))

# Share of missing values in every value column per floored day.
ratio <- data %>%
  group_by(tseq = floor_date(tseq, "days")) %>%
  summarise_all(list(ratio = ~ sum(is.na(.)) / n()))
> ratio
tseq x1_ratio x2_ratio
1 2019-08-26 00:00:00 1 1
2 2019-08-27 00:00:00 0 0
3 2019-08-28 00:00:00 0 0
4 2019-08-29 00:00:00 0.5 0.5
5 2019-08-30 00:00:00 0 0
6 2019-08-31 00:00:00 0.5 0.5
所以这里 2019-08-26、2019-08-29、2019-08-31 日期会很有意义。
对于单个向量,我可以通过下面的函数完成此操作:
# Check a vector against the missing-value threshold.
#
# x: vector to inspect.
# Returns TRUE when the share of NA entries in `x` is at least 0.5 and
# FALSE when it is below 0.5.
# NOTE(review): the name reads as "enough values to compute a mean", yet TRUE
# is returned when at least half of the values are missing -- confirm that
# this direction is the intended one.
isEnough <- function(x) {
  na_share <- sum(is.na(x)) / length(x)
  if (na_share >= 0.5) TRUE else FALSE
}
对于数据框,我找不到解决方案。到目前为止我已经试过了
# Attempt 1: isEnough() is called right here with two formula objects, so
# .predicate receives the result of that call rather than a function.
# NOTE(review): the error text mentions naCount/xLength, which do not appear
# in the isEnough shown above -- presumably a different two-argument version
# was in use when the error was produced.
data %>% group_by(tseq=floor_date(tseq, "days")) %>%
summarise_if(.predicate = isEnough(~ sum(is.na(.)), ~n()),
.funs = list( mean = ~ mean(., na.rm = TRUE)))
Error in naCount/xLength : non-numeric argument to binary operator
# Attempt 2: the predicate formula calls n(); the reported error says n() may
# only be called in a data context.
# NOTE(review): the argument is spelled `.func` here but `.funs` in attempt 1
# -- likely a typo, confirm.
data %>% group_by(tseq=floor_date(tseq, "days")) %>%
summarise_if(.predicate = list( ~ sum(is.na(.)) / n() > 0.5),
.func = list( mean = ~ mean(., na.rm = TRUE)))
Error: n() should only be called in a data context
# Attempt 3: `~` binds more loosely than `/` and `>`, so the inner `~` creates
# a formula object that ends up as the divisor of sum(is.na(.)); the error
# below reports that division failing on a non-numeric argument.
data %>% group_by(tseq=floor_date(tseq, "days")) %>%
summarise_if(.predicate = (~ sum(is.na(.)) / ~n() > 0.5),
.func = list( mean = ~ mean(., na.rm = TRUE)))
Error in sum(is.na(.))/~n() > 0.5 :
non-numeric argument to binary operator
summarise_if 用于选择(select)列。可以将其视为 summarise_at 的变体,您可以在其中指定要对哪些列使用某些函数。
看来您想在相同条件下分别计算 x1 和 x2 的平均值,因此我会先使用 tidyr 的 gather 将这两列的值合并为一列:
library(tidyr)
# Stack x1 and x2 into key/value columns, then summarise per floored day and
# per original column: the share of missing values, and the NA-ignoring mean,
# which is multiplied by NA when at least half of the values are missing.
data %>%
  gather(key = "x", value = "value", x1, x2) %>%
  group_by(tseqs = floor_date(tseq, "days"), x) %>%
  summarise(
    ratio = sum(is.na(value)) / n(),
    mean = mean(value, na.rm = TRUE) * ifelse(ratio >= 0.5, NA, 1)
  )
# A tibble: 12 x 4
# Groups: tseqs [?]
tseqs x ratio mean
<dttm> <chr> <dbl> <dbl>
1 2019-08-26 00:00:00 x1 1 NaN
2 2019-08-26 00:00:00 x2 1 NaN
3 2019-08-27 00:00:00 x1 0 2.5
4 2019-08-27 00:00:00 x2 0 2.5
5 2019-08-28 00:00:00 x1 0 4.5
6 2019-08-28 00:00:00 x2 0 4.5
7 2019-08-29 00:00:00 x1 0.5 NA
8 2019-08-29 00:00:00 x2 0.5 NA
9 2019-08-30 00:00:00 x1 0 8.5
10 2019-08-30 00:00:00 x2 0 8.5
11 2019-08-31 00:00:00 x1 0.5 NA
12 2019-08-31 00:00:00 x2 0.5 NA
最后一步是整理结果,然后将其还原为原来的宽格式:
# Same per-day, per-column summary as before, followed by dropping the helper
# ratio column and spreading the means back out to one column per variable.
data %>%
  gather(key = "x", value = "value", x1, x2) %>%
  group_by(tseqs = floor_date(tseq, "days"), x) %>%
  summarise(
    ratio = sum(is.na(value)) / n(),
    mean = mean(value, na.rm = TRUE) * ifelse(ratio >= 0.5, NA, 1)
  ) %>%
  select(-ratio) %>%
  tidyr::spread(key = x, value = mean)
# A tibble: 6 x 3
# Groups: tseqs [6]
tseqs x1 x2
<dttm> <dbl> <dbl>
1 2019-08-26 00:00:00 NaN NaN
2 2019-08-27 00:00:00 2.5 2.5
3 2019-08-28 00:00:00 4.5 4.5
4 2019-08-29 00:00:00 NA NA
5 2019-08-30 00:00:00 8.5 8.5
6 2019-08-31 00:00:00 NA NA
我想按天计算 x1 和 x2 的平均值:在 sum(is.na(x)) 与观测总数之比 >= 0.5 的日期计算平均值,否则返回 NA。
数据:
library(lubridate)
library(dplyr)
# Eleven half-day observations: the values 1..10, with NA written at
# positions 1, 6 and 11 (writing to position 11 extends the vector to 11).
x <- seq_len(10)
x[c(1, 6, 11)] <- NA

data <- data.frame(
  tseq = seq(from = Sys.time(), by = "12 hours", length.out = 11),
  x1 = x,
  x2 = x
)

# Mean of every value column per floored day, ignoring missing values.
means <- data %>%
  group_by(tseq = floor_date(tseq, "days")) %>%
  summarise_all(list(mean = ~ mean(., na.rm = TRUE)))

# Share of missing values in every value column per floored day.
ratio <- data %>%
  group_by(tseq = floor_date(tseq, "days")) %>%
  summarise_all(list(ratio = ~ sum(is.na(.)) / n()))
> ratio
tseq x1_ratio x2_ratio
1 2019-08-26 00:00:00 1 1
2 2019-08-27 00:00:00 0 0
3 2019-08-28 00:00:00 0 0
4 2019-08-29 00:00:00 0.5 0.5
5 2019-08-30 00:00:00 0 0
6 2019-08-31 00:00:00 0.5 0.5
所以这里 2019-08-26、2019-08-29、2019-08-31 日期会很有意义。
对于单个向量,我可以通过下面的函数完成此操作:
isEnough = function(x){
# is there enough values to calculate mean
if (sum(is.na(x)) / length(x) < 0.5){
return(FALSE)
}
else return(TRUE)
}
对于数据框,我找不到解决方案。到目前为止我已经试过了
# Attempt 1: isEnough() is called right here with two formula objects, so
# .predicate receives the result of that call rather than a function.
# NOTE(review): the error text mentions naCount/xLength, which do not appear
# in the isEnough shown above -- presumably a different two-argument version
# was in use when the error was produced.
data %>% group_by(tseq=floor_date(tseq, "days")) %>%
summarise_if(.predicate = isEnough(~ sum(is.na(.)), ~n()),
.funs = list( mean = ~ mean(., na.rm = TRUE)))
Error in naCount/xLength : non-numeric argument to binary operator
# Attempt 2: the predicate formula calls n(); the reported error says n() may
# only be called in a data context.
# NOTE(review): the argument is spelled `.func` here but `.funs` in attempt 1
# -- likely a typo, confirm.
data %>% group_by(tseq=floor_date(tseq, "days")) %>%
summarise_if(.predicate = list( ~ sum(is.na(.)) / n() > 0.5),
.func = list( mean = ~ mean(., na.rm = TRUE)))
Error: n() should only be called in a data context
# Attempt 3: `~` binds more loosely than `/` and `>`, so the inner `~` creates
# a formula object that ends up as the divisor of sum(is.na(.)); the error
# below reports that division failing on a non-numeric argument.
data %>% group_by(tseq=floor_date(tseq, "days")) %>%
summarise_if(.predicate = (~ sum(is.na(.)) / ~n() > 0.5),
.func = list( mean = ~ mean(., na.rm = TRUE)))
Error in sum(is.na(.))/~n() > 0.5 :
non-numeric argument to binary operator
summarise_if 用于选择(select)列。可以将其视为 summarise_at 的变体,您可以在其中指定要对哪些列使用某些函数。
看来您想在相同条件下分别计算 x1 和 x2 的平均值,因此我会先使用 tidyr 的 gather 将这两列的值合并为一列:
library(tidyr)
# Stack x1 and x2 into key/value columns, then summarise per floored day and
# per original column: the share of missing values, and the NA-ignoring mean,
# which is multiplied by NA when at least half of the values are missing.
data %>%
  gather(key = "x", value = "value", x1, x2) %>%
  group_by(tseqs = floor_date(tseq, "days"), x) %>%
  summarise(
    ratio = sum(is.na(value)) / n(),
    mean = mean(value, na.rm = TRUE) * ifelse(ratio >= 0.5, NA, 1)
  )
# A tibble: 12 x 4
# Groups: tseqs [?]
tseqs x ratio mean
<dttm> <chr> <dbl> <dbl>
1 2019-08-26 00:00:00 x1 1 NaN
2 2019-08-26 00:00:00 x2 1 NaN
3 2019-08-27 00:00:00 x1 0 2.5
4 2019-08-27 00:00:00 x2 0 2.5
5 2019-08-28 00:00:00 x1 0 4.5
6 2019-08-28 00:00:00 x2 0 4.5
7 2019-08-29 00:00:00 x1 0.5 NA
8 2019-08-29 00:00:00 x2 0.5 NA
9 2019-08-30 00:00:00 x1 0 8.5
10 2019-08-30 00:00:00 x2 0 8.5
11 2019-08-31 00:00:00 x1 0.5 NA
12 2019-08-31 00:00:00 x2 0.5 NA
最后一步是整理结果,然后将其还原为原来的宽格式:
# Same per-day, per-column summary as before, followed by dropping the helper
# ratio column and spreading the means back out to one column per variable.
data %>%
  gather(key = "x", value = "value", x1, x2) %>%
  group_by(tseqs = floor_date(tseq, "days"), x) %>%
  summarise(
    ratio = sum(is.na(value)) / n(),
    mean = mean(value, na.rm = TRUE) * ifelse(ratio >= 0.5, NA, 1)
  ) %>%
  select(-ratio) %>%
  tidyr::spread(key = x, value = mean)
# A tibble: 6 x 3
# Groups: tseqs [6]
tseqs x1 x2
<dttm> <dbl> <dbl>
1 2019-08-26 00:00:00 NaN NaN
2 2019-08-27 00:00:00 2.5 2.5
3 2019-08-28 00:00:00 4.5 4.5
4 2019-08-29 00:00:00 NA NA
5 2019-08-30 00:00:00 8.5 8.5
6 2019-08-31 00:00:00 NA NA