以最有效的方式确保 dplyr::summarise() 中的唯一值
Ensuring unique values in dplyr::summarise() in most efficient way
我通常有很多 character
类型的列(在 20 到 30 之间)和只有 3-4 个 numeric
类型的列。
对数字列进行分组和汇总非常快,但我在汇总字符列同时确保每个分组 var 值的唯一值的方法总体上非常慢。
只是想知道是否有比使用 paste()
更快的方法。
library(magrittr)
make_unique <- function(x, sep = "-") {
ifelse(length(x_unique <- unique(x)) == 1, x_unique,
paste(sort(x_unique), collapse = sep))
}
make_unique_2 <- function(x, sep = "-") {
paste(sort(x), collapse = sep)
}
df <- tibble::tribble(
~id, ~country, ~value,
"a", "A", 10,
"a", "B", 20,
"b", "A", 5,
"c", "A", 100,
"c", "B", 1,
"c", "C", 25
)
df %>%
dplyr::group_by(id) %>%
dplyr::summarise_if(is.character, make_unique) %>%
dplyr::ungroup()
#> # A tibble: 3 x 2
#> id country
#> <chr> <chr>
#> 1 a A-B
#> 2 b A
#> 3 c A-B-C
microbenchmark::microbenchmark(
"numeric" = df %>%
dplyr::group_by(id) %>%
dplyr::summarise_if(is.numeric, sum) %>%
dplyr::ungroup(),
"character_1" = df %>%
dplyr::group_by(id) %>%
dplyr::summarise_if(is.character, make_unique) %>%
dplyr::ungroup(),
"character_2" = df %>%
dplyr::group_by(id) %>%
dplyr::summarise_if(is.character, make_unique_2) %>%
dplyr::ungroup()
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> numeric 1.0554 1.24160 1.918480 1.43135 1.90180 8.7733 100
#> character_1 1.1907 1.37530 2.093501 1.60895 2.04235 7.7648 100
#> character_2 1.2255 1.44185 2.474062 1.69260 2.38540 9.4851 100
由 reprex 包 (v0.2.1) 创建于 2019-04-05
在更大的数据集上,我们将看到关于 make_unique_2
的一些基准变化
-新功能
make_unique_3 <- function(x, sep="-") {
x_unique <- unique(x)
if(length(x_unique) == 1) x_unique else paste(sort(x_unique), collapse= sep)
}
make_unique_4 <- function(x, sep="-") {
x_unique <- unique(x)
if(n_distinct(x_unique) == 1) x_unique else str_c(sort(x_unique), collapse=sep)
}
-数据
df <- df[rep(1:nrow(df), 1e5), ]
-基准
library(microbenchmark)
microbenchmark::microbenchmark(
"numeric" = df %>%
dplyr::group_by(id) %>%
dplyr::summarise_if(is.numeric, sum) %>%
dplyr::ungroup(),
"character_1" = df %>%
dplyr::group_by(id) %>%
dplyr::summarise_if(is.character, make_unique) %>%
dplyr::ungroup(),
"character_2" = df %>%
dplyr::group_by(id) %>%
dplyr::summarise_if(is.character, make_unique_2) %>%
dplyr::ungroup(),
"character_3" = df %>%
dplyr::group_by(id) %>%
dplyr::summarise_if(is.character, make_unique_3) %>%
dplyr::ungroup(),
"character_4" = df %>%
dplyr::group_by(id) %>%
dplyr::summarise_if(is.character, make_unique_4) %>%
dplyr::ungroup(),
unit = "relative", times = 10L
)
-输出
#Unit: relative
# expr min lq mean median uq max neval cld
# numeric 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10 a
# character_1 1.681810 1.614818 1.625383 1.636651 1.616881 1.489384 10 a
# character_2 7.668509 7.207077 7.117084 6.992513 6.102214 9.102668 10 b
# character_3 1.671742 1.618976 1.632336 1.710828 1.587933 1.501431 10 a
# character_4 1.444589 1.435881 1.504313 1.562996 1.515468 1.479626 10 a
-评论
从 paste
更改为 str_c
将效率从 1.68 提高到 1.44(make_unique
对比 make_unique_4
)
我通常有很多 character
类型的列(在 20 到 30 之间)和只有 3-4 个 numeric
类型的列。
对数字列进行分组和汇总非常快,但我在汇总字符列同时确保每个分组 var 值的唯一值的方法总体上非常慢。
只是想知道是否有比使用 paste()
更快的方法。
library(magrittr)
make_unique <- function(x, sep = "-") {
ifelse(length(x_unique <- unique(x)) == 1, x_unique,
paste(sort(x_unique), collapse = sep))
}
make_unique_2 <- function(x, sep = "-") {
paste(sort(x), collapse = sep)
}
df <- tibble::tribble(
~id, ~country, ~value,
"a", "A", 10,
"a", "B", 20,
"b", "A", 5,
"c", "A", 100,
"c", "B", 1,
"c", "C", 25
)
df %>%
dplyr::group_by(id) %>%
dplyr::summarise_if(is.character, make_unique) %>%
dplyr::ungroup()
#> # A tibble: 3 x 2
#> id country
#> <chr> <chr>
#> 1 a A-B
#> 2 b A
#> 3 c A-B-C
microbenchmark::microbenchmark(
"numeric" = df %>%
dplyr::group_by(id) %>%
dplyr::summarise_if(is.numeric, sum) %>%
dplyr::ungroup(),
"character_1" = df %>%
dplyr::group_by(id) %>%
dplyr::summarise_if(is.character, make_unique) %>%
dplyr::ungroup(),
"character_2" = df %>%
dplyr::group_by(id) %>%
dplyr::summarise_if(is.character, make_unique_2) %>%
dplyr::ungroup()
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> numeric 1.0554 1.24160 1.918480 1.43135 1.90180 8.7733 100
#> character_1 1.1907 1.37530 2.093501 1.60895 2.04235 7.7648 100
#> character_2 1.2255 1.44185 2.474062 1.69260 2.38540 9.4851 100
由 reprex 包 (v0.2.1) 创建于 2019-04-05
在更大的数据集上,我们将看到关于 make_unique_2
-新功能
make_unique_3 <- function(x, sep="-") {
x_unique <- unique(x)
if(length(x_unique) == 1) x_unique else paste(sort(x_unique), collapse= sep)
}
make_unique_4 <- function(x, sep="-") {
x_unique <- unique(x)
if(n_distinct(x_unique) == 1) x_unique else str_c(sort(x_unique), collapse=sep)
}
-数据
df <- df[rep(1:nrow(df), 1e5), ]
-基准
library(microbenchmark)
microbenchmark::microbenchmark(
"numeric" = df %>%
dplyr::group_by(id) %>%
dplyr::summarise_if(is.numeric, sum) %>%
dplyr::ungroup(),
"character_1" = df %>%
dplyr::group_by(id) %>%
dplyr::summarise_if(is.character, make_unique) %>%
dplyr::ungroup(),
"character_2" = df %>%
dplyr::group_by(id) %>%
dplyr::summarise_if(is.character, make_unique_2) %>%
dplyr::ungroup(),
"character_3" = df %>%
dplyr::group_by(id) %>%
dplyr::summarise_if(is.character, make_unique_3) %>%
dplyr::ungroup(),
"character_4" = df %>%
dplyr::group_by(id) %>%
dplyr::summarise_if(is.character, make_unique_4) %>%
dplyr::ungroup(),
unit = "relative", times = 10L
)
-输出
#Unit: relative
# expr min lq mean median uq max neval cld
# numeric 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10 a
# character_1 1.681810 1.614818 1.625383 1.636651 1.616881 1.489384 10 a
# character_2 7.668509 7.207077 7.117084 6.992513 6.102214 9.102668 10 b
# character_3 1.671742 1.618976 1.632336 1.710828 1.587933 1.501431 10 a
# character_4 1.444589 1.435881 1.504313 1.562996 1.515468 1.479626 10 a
-评论
从 paste
更改为 str_c
将效率从 1.68 提高到 1.44(make_unique
对比 make_unique_4
)