如何创建一个对重复行求和的列,然后仅删除 R 中的一个重复项?
How to create a column that sums duplicated rows and then delete only one of the duplicates in R?
我有一个非常庞大的数据集,我正在寻找最简单(也是最快)的方法来创建一个列,该列对某个特定列的值按重复项分组求和,然后在每组重复行中只保留一行。
我的数据集如下所示:
data <- data.frame(DATE_INTER = c("2015-05-29", "2013-12-13", "2009-09-08"),
DATE_SAIDA = c("2015-06-10", "2013-12-15", "2009-10-20"),
GRUPO_AIH = c("09081997", "13122006", "13122006"),
DIAS_PERMANENCIA = c(12, 2, 42))
我需要使用列“GRUPO_AIH”来检查重复项。我的最终输出将是这样的:
我已经尝试过下面的代码,但它耗时太长,而且运行结束后,我甚至无法再用 dplyr 做任何筛选,R 会直接停止响应。
data <- data %>%
group_by(GRUPO_AIH) %>%
mutate(DIAS_PERMANENCIA2 = sum(DIAS_PERMANENCIA))
有什么建议吗?
答案在这里。只是一个观察,在您提供的数据集示例中,GRUPO_AIH 变量中实际上没有任何重复值,因此我更改为 GRUPO_AIH = c("09081997", "13122006", "13122006"),
# Replace DIAS_PERMANENCIA with its total per GRUPO_AIH, keep the first row of
# each group, then drop the grouping so later dplyr verbs on the result do not
# run group by group.
data %>%
  group_by(GRUPO_AIH) %>%
  mutate(DIAS_PERMANENCIA = sum(DIAS_PERMANENCIA)) %>%
  filter(!duplicated(GRUPO_AIH)) %>%
  ungroup()
DATE_INTER DATE_SAIDA GRUPO_AIH DIAS_PERMANENCIA
<chr> <chr> <chr> <dbl>
1 2015-05-29 2015-06-10 09081997 12
2 2013-12-13 2013-12-15 13122006 44
OBS:também sou sanitarista atuando com dados na vigilância kk
如果您有一个“非常大的数据集”(数百万行),也许这会是最快的:
library(data.table)

# Example input: the last two rows share the same GRUPO_AIH.
data <- data.frame(
  DATE_INTER = c("2015-05-29", "2013-12-13", "2009-09-08"),
  DATE_SAIDA = c("2015-06-10", "2013-12-15", "2009-10-20"),
  GRUPO_AIH = c("09081997", "13122206", "13122206"),
  DIAS_PERMANENCIA = c(12, 2, 42)
)

# setDT() converts `data` by reference instead of copying it.
data_dt <- setDT(data)
setkeyv(data_dt, "GRUPO_AIH")

# Overwrite every row's DIAS_PERMANENCIA with its group total.
data_dt[, DIAS_PERMANENCIA := sum(DIAS_PERMANENCIA), by = "GRUPO_AIH"]

# Keep the first row per GRUPO_AIH and hand back a plain data frame.
data_final <- as.data.frame(unique(data_dt, by = "GRUPO_AIH"))
data_final
#> DATE_INTER DATE_SAIDA GRUPO_AIH DIAS_PERMANENCIA
#> 1 2015-05-29 2015-06-10 09081997 12
#> 2 2013-12-13 2013-12-15 13122206 44
由 reprex 包 (v2.0.1) 创建于 2022-05-31
基准测试:
#install.packages("data.table")
#install.packages("dplyr")
#install.packages("microbenchmark")
library(microbenchmark)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(ggplot2)
data <- data.frame(DATE_INTER = rep(c("2015-05-29", "2013-12-13", "2009-09-08"), times = 10e5),
DATE_SAIDA = rep(c("2015-06-10", "2013-12-15", "2009-10-20"), times = 10e5),
GRUPO_AIH = rep(c("09081997", "13122206", "13122206"), times = 10e5),
DIAS_PERMANENCIA = rep(c(12, 2, 42), times = 10e5))
# Collapse `data` to one row per GRUPO_AIH, replacing DIAS_PERMANENCIA with the
# group total (dplyr implementation used in the benchmark).
#
# data: a data frame with at least the columns GRUPO_AIH and DIAS_PERMANENCIA.
# Returns an ungrouped tibble holding the first row of every GRUPO_AIH group.
dplyr_func <- function(data){
  data %>%
    dplyr::group_by(GRUPO_AIH) %>%
    dplyr::mutate(DIAS_PERMANENCIA = sum(DIAS_PERMANENCIA)) %>%
    # Inside a group every GRUPO_AIH is identical, so this keeps the first row.
    dplyr::filter(!duplicated(GRUPO_AIH)) %>%
    # Drop the grouping so later verbs on the result are not run per group.
    dplyr::ungroup()
}
# Collapse `data` to one row per GRUPO_AIH, replacing DIAS_PERMANENCIA with the
# group total (data.table implementation used in the benchmark).
#
# data: a data frame with at least the columns GRUPO_AIH and DIAS_PERMANENCIA.
# Returns a plain data frame, sorted by GRUPO_AIH, with the first row of each
# group. The caller's `data` is left untouched.
dt_func <- function(data){
  # as.data.table() works on a copy. setDT() would convert the caller's object
  # by reference and `:=` below would overwrite its DIAS_PERMANENCIA column, so
  # repeated benchmark runs would operate on already-summed data.
  data_dt <- data.table::as.data.table(data)
  data.table::setkey(data_dt, GRUPO_AIH)
  data_dt[, DIAS_PERMANENCIA := sum(DIAS_PERMANENCIA), by = "GRUPO_AIH"]
  as.data.frame(data_dt[!duplicated(data_dt, by = "GRUPO_AIH")])
}
dplyr::all_equal(dplyr_func(data), dt_func(data))
#> [1] TRUE
res <- microbenchmark(dplyr_func(data), dt_func(data), times = 4)
autoplot(res)
#> Coordinate system already present. Adding new coordinate system, which will replace the existing one.
# The difference in speed will likely become more pronounced as the size of the data increases
data <- data.frame(DATE_INTER = rep(c("2015-05-29", "2013-12-13", "2009-09-08"), times = 10e6),
DATE_SAIDA = rep(c("2015-06-10", "2013-12-15", "2009-10-20"), times = 10e6),
GRUPO_AIH = rep(c("09081997", "13122206", "13122206"), times = 10e6),
DIAS_PERMANENCIA = rep(c(12, 2, 42), times = 10e6))
# Collapse `data` to one row per GRUPO_AIH, replacing DIAS_PERMANENCIA with the
# group total (dplyr implementation used in the benchmark).
#
# data: a data frame with at least the columns GRUPO_AIH and DIAS_PERMANENCIA.
# Returns an ungrouped tibble holding the first row of every GRUPO_AIH group.
dplyr_func <- function(data){
  data %>%
    dplyr::group_by(GRUPO_AIH) %>%
    dplyr::mutate(DIAS_PERMANENCIA = sum(DIAS_PERMANENCIA)) %>%
    # Inside a group every GRUPO_AIH is identical, so this keeps the first row.
    dplyr::filter(!duplicated(GRUPO_AIH)) %>%
    # Drop the grouping so later verbs on the result are not run per group.
    dplyr::ungroup()
}
# Collapse `data` to one row per GRUPO_AIH, replacing DIAS_PERMANENCIA with the
# group total (data.table implementation used in the benchmark).
#
# data: a data frame with at least the columns GRUPO_AIH and DIAS_PERMANENCIA.
# Returns a plain data frame, sorted by GRUPO_AIH, with the first row of each
# group. The caller's `data` is left untouched.
dt_func <- function(data){
  # as.data.table() works on a copy. setDT() would convert the caller's object
  # by reference and `:=` below would overwrite its DIAS_PERMANENCIA column, so
  # repeated benchmark runs would operate on already-summed data.
  data_dt <- data.table::as.data.table(data)
  data.table::setkey(data_dt, GRUPO_AIH)
  data_dt[, DIAS_PERMANENCIA := sum(DIAS_PERMANENCIA), by = "GRUPO_AIH"]
  as.data.frame(data_dt[!duplicated(data_dt, by = "GRUPO_AIH")])
}
res <- microbenchmark(dplyr_func(data), dt_func(data), times = 2)
autoplot(res)
#> Coordinate system already present. Adding new coordinate system, which will replace the existing one.
由 reprex 包 (v2.0.1) 创建于 2022-05-31
我有一个非常庞大的数据集,我正在寻找最简单(也是最快)的方法来创建一个列,该列对某个特定列的值按重复项分组求和,然后在每组重复行中只保留一行。
我的数据集如下所示:
data <- data.frame(DATE_INTER = c("2015-05-29", "2013-12-13", "2009-09-08"),
DATE_SAIDA = c("2015-06-10", "2013-12-15", "2009-10-20"),
GRUPO_AIH = c("09081997", "13122006", "13122006"),
DIAS_PERMANENCIA = c(12, 2, 42))
我需要使用列“GRUPO_AIH”来检查重复项。我的最终输出将是这样的:
我已经尝试过下面的代码,但它耗时太长,而且运行结束后,我甚至无法再用 dplyr 做任何筛选,R 会直接停止响应。
data <- data %>%
group_by(GRUPO_AIH) %>%
mutate(DIAS_PERMANENCIA2 = sum(DIAS_PERMANENCIA))
有什么建议吗?
答案在这里。只是一个观察,在您提供的数据集示例中,GRUPO_AIH 变量中实际上没有任何重复值,因此我更改为 GRUPO_AIH = c("09081997", "13122006", "13122006"),
# Replace DIAS_PERMANENCIA with its total per GRUPO_AIH, keep the first row of
# each group, then drop the grouping so later dplyr verbs on the result do not
# run group by group.
data %>%
  group_by(GRUPO_AIH) %>%
  mutate(DIAS_PERMANENCIA = sum(DIAS_PERMANENCIA)) %>%
  filter(!duplicated(GRUPO_AIH)) %>%
  ungroup()
DATE_INTER DATE_SAIDA GRUPO_AIH DIAS_PERMANENCIA
<chr> <chr> <chr> <dbl>
1 2015-05-29 2015-06-10 09081997 12
2 2013-12-13 2013-12-15 13122006 44
OBS:também sou sanitarista atuando com dados na vigilância kk
如果您有一个“非常大的数据集”(数百万行),也许这会是最快的:
library(data.table)

# Example input: the last two rows share the same GRUPO_AIH.
data <- data.frame(
  DATE_INTER = c("2015-05-29", "2013-12-13", "2009-09-08"),
  DATE_SAIDA = c("2015-06-10", "2013-12-15", "2009-10-20"),
  GRUPO_AIH = c("09081997", "13122206", "13122206"),
  DIAS_PERMANENCIA = c(12, 2, 42)
)

# setDT() converts `data` by reference instead of copying it.
data_dt <- setDT(data)
setkeyv(data_dt, "GRUPO_AIH")

# Overwrite every row's DIAS_PERMANENCIA with its group total.
data_dt[, DIAS_PERMANENCIA := sum(DIAS_PERMANENCIA), by = "GRUPO_AIH"]

# Keep the first row per GRUPO_AIH and hand back a plain data frame.
data_final <- as.data.frame(unique(data_dt, by = "GRUPO_AIH"))
data_final
#> DATE_INTER DATE_SAIDA GRUPO_AIH DIAS_PERMANENCIA
#> 1 2015-05-29 2015-06-10 09081997 12
#> 2 2013-12-13 2013-12-15 13122206 44
由 reprex 包 (v2.0.1) 创建于 2022-05-31
基准测试:
#install.packages("data.table")
#install.packages("dplyr")
#install.packages("microbenchmark")
library(microbenchmark)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(ggplot2)
data <- data.frame(DATE_INTER = rep(c("2015-05-29", "2013-12-13", "2009-09-08"), times = 10e5),
DATE_SAIDA = rep(c("2015-06-10", "2013-12-15", "2009-10-20"), times = 10e5),
GRUPO_AIH = rep(c("09081997", "13122206", "13122206"), times = 10e5),
DIAS_PERMANENCIA = rep(c(12, 2, 42), times = 10e5))
# Collapse `data` to one row per GRUPO_AIH, replacing DIAS_PERMANENCIA with the
# group total (dplyr implementation used in the benchmark).
#
# data: a data frame with at least the columns GRUPO_AIH and DIAS_PERMANENCIA.
# Returns an ungrouped tibble holding the first row of every GRUPO_AIH group.
dplyr_func <- function(data){
  data %>%
    dplyr::group_by(GRUPO_AIH) %>%
    dplyr::mutate(DIAS_PERMANENCIA = sum(DIAS_PERMANENCIA)) %>%
    # Inside a group every GRUPO_AIH is identical, so this keeps the first row.
    dplyr::filter(!duplicated(GRUPO_AIH)) %>%
    # Drop the grouping so later verbs on the result are not run per group.
    dplyr::ungroup()
}
# Collapse `data` to one row per GRUPO_AIH, replacing DIAS_PERMANENCIA with the
# group total (data.table implementation used in the benchmark).
#
# data: a data frame with at least the columns GRUPO_AIH and DIAS_PERMANENCIA.
# Returns a plain data frame, sorted by GRUPO_AIH, with the first row of each
# group. The caller's `data` is left untouched.
dt_func <- function(data){
  # as.data.table() works on a copy. setDT() would convert the caller's object
  # by reference and `:=` below would overwrite its DIAS_PERMANENCIA column, so
  # repeated benchmark runs would operate on already-summed data.
  data_dt <- data.table::as.data.table(data)
  data.table::setkey(data_dt, GRUPO_AIH)
  data_dt[, DIAS_PERMANENCIA := sum(DIAS_PERMANENCIA), by = "GRUPO_AIH"]
  as.data.frame(data_dt[!duplicated(data_dt, by = "GRUPO_AIH")])
}
dplyr::all_equal(dplyr_func(data), dt_func(data))
#> [1] TRUE
res <- microbenchmark(dplyr_func(data), dt_func(data), times = 4)
autoplot(res)
#> Coordinate system already present. Adding new coordinate system, which will replace the existing one.
# The difference in speed will likely become more pronounced as the size of the data increases
data <- data.frame(DATE_INTER = rep(c("2015-05-29", "2013-12-13", "2009-09-08"), times = 10e6),
DATE_SAIDA = rep(c("2015-06-10", "2013-12-15", "2009-10-20"), times = 10e6),
GRUPO_AIH = rep(c("09081997", "13122206", "13122206"), times = 10e6),
DIAS_PERMANENCIA = rep(c(12, 2, 42), times = 10e6))
# Collapse `data` to one row per GRUPO_AIH, replacing DIAS_PERMANENCIA with the
# group total (dplyr implementation used in the benchmark).
#
# data: a data frame with at least the columns GRUPO_AIH and DIAS_PERMANENCIA.
# Returns an ungrouped tibble holding the first row of every GRUPO_AIH group.
dplyr_func <- function(data){
  data %>%
    dplyr::group_by(GRUPO_AIH) %>%
    dplyr::mutate(DIAS_PERMANENCIA = sum(DIAS_PERMANENCIA)) %>%
    # Inside a group every GRUPO_AIH is identical, so this keeps the first row.
    dplyr::filter(!duplicated(GRUPO_AIH)) %>%
    # Drop the grouping so later verbs on the result are not run per group.
    dplyr::ungroup()
}
# Collapse `data` to one row per GRUPO_AIH, replacing DIAS_PERMANENCIA with the
# group total (data.table implementation used in the benchmark).
#
# data: a data frame with at least the columns GRUPO_AIH and DIAS_PERMANENCIA.
# Returns a plain data frame, sorted by GRUPO_AIH, with the first row of each
# group. The caller's `data` is left untouched.
dt_func <- function(data){
  # as.data.table() works on a copy. setDT() would convert the caller's object
  # by reference and `:=` below would overwrite its DIAS_PERMANENCIA column, so
  # repeated benchmark runs would operate on already-summed data.
  data_dt <- data.table::as.data.table(data)
  data.table::setkey(data_dt, GRUPO_AIH)
  data_dt[, DIAS_PERMANENCIA := sum(DIAS_PERMANENCIA), by = "GRUPO_AIH"]
  as.data.frame(data_dt[!duplicated(data_dt, by = "GRUPO_AIH")])
}
res <- microbenchmark(dplyr_func(data), dt_func(data), times = 2)
autoplot(res)
#> Coordinate system already present. Adding new coordinate system, which will replace the existing one.
由 reprex 包 (v2.0.1) 创建于 2022-05-31