按组汇总 wtd.quantile
summarise wtd.quantile by group
我想使用 Hmisc::wtd.quantile 为具有许多重复日期的数据框创建一个新的 df。我按日期分组,使用 summarize()
聚合日期,并尝试在每个日期(带权重)使用 wtd.quantile()
。这也是一个相当大的数据集。下面是一些示例代码:
# sample data
# grouping_var = dt_time
require(Hmisc)
require(plyr)
require(dplyr)
df <- data.frame(type = sample(letters[1:2], 10e6, replace = TRUE),
score = sample(500:899, 10e6, replace = TRUE),
dt_time = sample(seq(as.Date('2010/01/01'),
as.Date('2018/01/01'),
by="day"), 10e6, replace = TRUE),
weight = sample(1.0:2.0, 10e6, replace = TRUE))
# my attempt:
ptiles <- df %>%
group_by(dt_time) %>%
plyr::ddply(~dt_time, dplyr::summarize,
ptile10 = Hmisc::wtd.quantile(., .$score, weights = .$weight,
probs = .1, na.rm = TRUE),
ptile50 = Hmisc::wtd.quantile(., .$score, weights = .$weight,
probs = .5, na.rm = TRUE),
ptile90 = Hmisc::wtd.quantile(., .$score, weights = .$weight,
probs = .9, na.rm = TRUE))
# desired df,
# where each new variable would be created using the
# wtd.quantile function:
desired_ptiles <- data.frame(dt_time = seq(as.Date('2010/01/01'),
as.Date('2010/01/06'),
by = "day"),
# only 6 because lol 10e6
ptile10 = sample(500:899, 6, replace = TRUE),
ptile50 = sample(500:899, 6, replace = TRUE),
ptile90 = sample(500:899, 6, replace = TRUE))
到目前为止,我的努力导致了这个错误:
Error in summarise_impl(.data, dots) :
Evaluation error: 'arg' must be NULL or a character vector.
并且在使用公式符号时:
ptiles <- df %>%
group_by(dt_time) %>%
plyr::ddply(~dt_time, dplyr::summarize,
ptile10 = Hmisc::wtd.quantile(., .$score, weights = .$weight,
probs = .1, na.rm = TRUE),
ptile50 = Hmisc::wtd.quantile(., .$score, weights = .$weight,
probs = .5, na.rm = TRUE),
ptile90 = Hmisc::wtd.quantile(., .$score, weights = .$weight,
probs = .9, na.rm = TRUE))
# error message:
Error in summarise_impl(.data, dots) :
Evaluation error: 'arg' must be NULL or a character vector.
我是不是用错了方法?我见过使用 split()
的方法,但这似乎很烦人。有没有data.table
的方法可以让wtd.quantile()
这样总结?
谢谢!
使用 group_by
时不需要 ddply
,因为数据已按分组拆分。另外,分组后不需要在summarize
内定义数据。
这个有效:
ptiles <- df %>%
group_by(dt_time) %>%
summarize(ptile10 = wtd.quantile(score, weights = weight,
probs = .1, na.rm = TRUE),
ptile50 = wtd.quantile(score, weights = weight,
probs = .5, na.rm = TRUE),
ptile90 = wtd.quantile(score, weights = weight,
probs = .9, na.rm = TRUE))
> ptiles
# A tibble: 2,923 x 4
dt_time ptile10 ptile50 ptile90
<date> <dbl> <dbl> <dbl>
1 2010-01-01 539.0 697 859.0
2 2010-01-02 538.0 704 861.7
3 2010-01-03 541.0 706 862.0
4 2010-01-04 541.0 702 859.0
5 2010-01-05 540.0 706 860.0
6 2010-01-06 537.0 695 859.0
7 2010-01-07 539.0 696 859.0
8 2010-01-08 536.0 700 857.0
9 2010-01-09 538.0 694 861.0
10 2010-01-10 538.4 701 859.0
# ... with 2,913 more rows
这是一个替代解决方案,它使用一个函数(基于@LAP 的解决方案)来检查您提供的一组分位数值(即 c(.1, .5, .9)
)并自动创建适当的列:
require(Hmisc)
require(plyr)
require(tidyverse)
# for reproducibility
set.seed(5)
# example dataset
df <- data.frame(type = sample(letters[1:2], 50000, replace = TRUE),
score = sample(500:899, 50000, replace = TRUE),
dt_time = sample(seq(as.Date('2010/01/01'),
as.Date('2018/01/01'),
by="day"), 50000, replace = TRUE),
weight = sample(1.0:2.0, 50000, replace = TRUE))
# function to get quantiles per date
f = function(x) {
df %>%
group_by(dt_time) %>%
dplyr::summarize(value = wtd.quantile(score, weights = weight, probs = x, na.rm = TRUE)) %>%
mutate(ptile = x) }
# input quantiles of interest
q = c(.1, .5, .9)
# apply function to each quantile of interest
# and reshape
map_df(q, f) %>%
spread(ptile, value, sep = "_")
# # A tibble: 2,923 x 4
# dt_time ptile_0.1 ptile_0.5 ptile_0.9
# <date> <dbl> <dbl> <dbl>
# 1 2010-01-01 543 670. 857
# 2 2010-01-02 549 800 868.
# 3 2010-01-03 529. 705 829.
# 4 2010-01-04 543 690 850.
# 5 2010-01-05 528. 642. 870.
# 6 2010-01-06 534. 649 810.
# 7 2010-01-07 512. 668 801.
# 8 2010-01-08 554 664 869.
# 9 2010-01-09 533. 650 765
#10 2010-01-10 535. 696 843.
# # ... with 2,913 more rows
我想使用 Hmisc::wtd.quantile 为具有许多重复日期的数据框创建一个新的 df。我按日期分组,使用 summarize()
聚合日期,并尝试在每个日期(带权重)使用 wtd.quantile()
。这也是一个相当大的数据集。下面是一些示例代码:
# sample data
# grouping_var = dt_time
require(Hmisc)
require(plyr)
require(dplyr)
df <- data.frame(type = sample(letters[1:2], 10e6, replace = TRUE),
score = sample(500:899, 10e6, replace = TRUE),
dt_time = sample(seq(as.Date('2010/01/01'),
as.Date('2018/01/01'),
by="day"), 10e6, replace = TRUE),
weight = sample(1.0:2.0, 10e6, replace = TRUE))
# my attempt:
ptiles <- df %>%
group_by(dt_time) %>%
plyr::ddply(~dt_time, dplyr::summarize,
ptile10 = Hmisc::wtd.quantile(., .$score, weights = .$weight,
probs = .1, na.rm = TRUE),
ptile50 = Hmisc::wtd.quantile(., .$score, weights = .$weight,
probs = .5, na.rm = TRUE),
ptile90 = Hmisc::wtd.quantile(., .$score, weights = .$weight,
probs = .9, na.rm = TRUE))
# desired df,
# where each new variable would be created using the
# wtd.quantile function:
desired_ptiles <- data.frame(dt_time = seq(as.Date('2010/01/01'),
as.Date('2010/01/06'),
by = "day"),
# only 6 because lol 10e6
ptile10 = sample(500:899, 6, replace = TRUE),
ptile50 = sample(500:899, 6, replace = TRUE),
ptile90 = sample(500:899, 6, replace = TRUE))
到目前为止,我的努力导致了这个错误:
Error in summarise_impl(.data, dots) :
Evaluation error: 'arg' must be NULL or a character vector.
并且在使用公式符号时:
ptiles <- df %>%
group_by(dt_time) %>%
plyr::ddply(~dt_time, dplyr::summarize,
ptile10 = Hmisc::wtd.quantile(., .$score, weights = .$weight,
probs = .1, na.rm = TRUE),
ptile50 = Hmisc::wtd.quantile(., .$score, weights = .$weight,
probs = .5, na.rm = TRUE),
ptile90 = Hmisc::wtd.quantile(., .$score, weights = .$weight,
probs = .9, na.rm = TRUE))
# error message:
Error in summarise_impl(.data, dots) :
Evaluation error: 'arg' must be NULL or a character vector.
我是不是用错了方法?我见过使用 split()
的方法,但这似乎很烦人。有没有data.table
的方法可以让wtd.quantile()
这样总结?
谢谢!
使用 group_by
时不需要 ddply
,因为数据已按分组拆分。另外,分组后不需要在summarize
内定义数据。
这个有效:
ptiles <- df %>%
group_by(dt_time) %>%
summarize(ptile10 = wtd.quantile(score, weights = weight,
probs = .1, na.rm = TRUE),
ptile50 = wtd.quantile(score, weights = weight,
probs = .5, na.rm = TRUE),
ptile90 = wtd.quantile(score, weights = weight,
probs = .9, na.rm = TRUE))
> ptiles
# A tibble: 2,923 x 4
dt_time ptile10 ptile50 ptile90
<date> <dbl> <dbl> <dbl>
1 2010-01-01 539.0 697 859.0
2 2010-01-02 538.0 704 861.7
3 2010-01-03 541.0 706 862.0
4 2010-01-04 541.0 702 859.0
5 2010-01-05 540.0 706 860.0
6 2010-01-06 537.0 695 859.0
7 2010-01-07 539.0 696 859.0
8 2010-01-08 536.0 700 857.0
9 2010-01-09 538.0 694 861.0
10 2010-01-10 538.4 701 859.0
# ... with 2,913 more rows
这是一个替代解决方案,它使用一个函数(基于@LAP 的解决方案)来检查您提供的一组分位数值(即 c(.1, .5, .9)
)并自动创建适当的列:
require(Hmisc)
require(plyr)
require(tidyverse)
# for reproducibility
set.seed(5)
# example dataset
df <- data.frame(type = sample(letters[1:2], 50000, replace = TRUE),
score = sample(500:899, 50000, replace = TRUE),
dt_time = sample(seq(as.Date('2010/01/01'),
as.Date('2018/01/01'),
by="day"), 50000, replace = TRUE),
weight = sample(1.0:2.0, 50000, replace = TRUE))
# function to get quantiles per date
f = function(x) {
df %>%
group_by(dt_time) %>%
dplyr::summarize(value = wtd.quantile(score, weights = weight, probs = x, na.rm = TRUE)) %>%
mutate(ptile = x) }
# input quantiles of interest
q = c(.1, .5, .9)
# apply function to each quantile of interest
# and reshape
map_df(q, f) %>%
spread(ptile, value, sep = "_")
# # A tibble: 2,923 x 4
# dt_time ptile_0.1 ptile_0.5 ptile_0.9
# <date> <dbl> <dbl> <dbl>
# 1 2010-01-01 543 670. 857
# 2 2010-01-02 549 800 868.
# 3 2010-01-03 529. 705 829.
# 4 2010-01-04 543 690 850.
# 5 2010-01-05 528. 642. 870.
# 6 2010-01-06 534. 649 810.
# 7 2010-01-07 512. 668 801.
# 8 2010-01-08 554 664 869.
# 9 2010-01-09 533. 650 765
#10 2010-01-10 535. 696 843.
# # ... with 2,913 more rows