如何计算 R dplyr 中不同观察值的总和
How to calculate the sum of distinct observations in R dplyr
我很纳闷。虽然我知道如何使用 n_distinct() 计算每组不同值(distinct values)的个数,但目前我似乎找不到对每组唯一观察值求和的方法。
我想按 id 分组,然后对每个唯一值求和......
library(tidyverse)

# Toy data: two string columns, one count column per string column, and a
# grouping id.
df <- tibble(
  col1 = c("apple", "apple", "pple", "banana", "banana", "bananna"),
  col2 = c("pple", "app", "app", "bananna", "banan", "banan"),
  counts_col1 = c(100, 100, 2, 200, 200, 2),
  counts_col2 = c(2, 50, 50, 2, 20, 20),
  id = c(1, 1, 1, 2, 2, 2)
)

# Stack the two count columns into long form: the source column name goes to
# `strings`, the count to `value`. The result is then grouped by `id` only;
# a group_by()/ungroup() pair before the final group_by() would be a no-op,
# so it is omitted.
df1 <- df %>%
  pivot_longer(
    counts_col1:counts_col2,
    names_to = "strings",
    values_to = "value"
  ) %>%
  group_by(id)

df1
#> # A tibble: 12 × 5
#> # Groups: id [2]
#> col1 col2 id strings value
#> <chr> <chr> <dbl> <chr> <dbl>
#> 1 apple pple 1 counts_col1 100
#> 2 apple pple 1 counts_col2 2
#> 3 apple app 1 counts_col1 100
#> 4 apple app 1 counts_col2 50
#> 5 pple app 1 counts_col1 2
#> 6 pple app 1 counts_col2 50
#> 7 banana bananna 2 counts_col1 200
#> 8 banana bananna 2 counts_col2 2
#> 9 banana banan 2 counts_col1 200
#> 10 banana banan 2 counts_col2 20
#> 11 bananna banan 2 counts_col1 2
#> 12 bananna banan 2 counts_col2 20
由 reprex 包 (v2.0.1) 于 2022-03-16 创建
...最终会变成这样
#> col1 col2 id strings value sum_distinct
#> <chr> <chr> <dbl> <chr> <dbl>
#> 1 apple pple 1 counts_col1 100 152
#> 2 apple pple 1 counts_col2 2 NA
#> 3 apple app 1 counts_col1 100 NA
#> 4 apple app 1 counts_col2 50 NA
#> 5 pple app 1 counts_col1 2 NA
#> 6 pple app 1 counts_col2 50 NA
#> 7 banana bananna 2 counts_col1 200 222
#> 8 banana bananna 2 counts_col2 2 NA
#> 9 banana banan 2 counts_col1 200 NA
#> 10 banana banan 2 counts_col2 20 NA
#> 11 bananna banan 2 counts_col1 2 NA
#> 12 bananna banan 2 counts_col2 20 NA
我们可以使用 replace 和 unique:
library(dplyr)
library(tidyr)
# For each id, write the sum of the distinct `value`s on the group's first
# row and NA on every other row.
df %>%
  pivot_longer(
    counts_col1:counts_col2,
    names_to = "strings",
    values_to = "value"
  ) %>%
  # Only the grouping by `id` matters: group_by() replaces any existing
  # grouping by default, so a preceding group_by() would have no effect.
  group_by(id) %>%
  mutate(
    # Start from an all-NA double vector of the group's length, then overwrite
    # position 1 with the sum of the unique values in the group.
    sum_distinct = replace(rep(NA_real_, n()), 1, sum(unique(value)))
  ) %>%
  ungroup()
-输出
# A tibble: 12 × 6
col1 col2 id strings value sum_distinct
<chr> <chr> <dbl> <chr> <dbl> <dbl>
1 apple pple 1 counts_col1 100 152
2 apple pple 1 counts_col2 2 NA
3 apple app 1 counts_col1 100 NA
4 apple app 1 counts_col2 50 NA
5 pple app 1 counts_col1 2 NA
6 pple app 1 counts_col2 50 NA
7 banana bananna 2 counts_col1 200 222
8 banana bananna 2 counts_col2 2 NA
9 banana banan 2 counts_col1 200 NA
10 banana banan 2 counts_col2 20 NA
11 bananna banan 2 counts_col1 2 NA
12 bananna banan 2 counts_col2 20 NA
使用 data.table,您可以这样做:
Reprex
- 代码
library(tidyverse) # to read your tibble
library(data.table)
# 1 - Build the starting data.table
# setDT() turns the tibble into a data.table in place, without copying.
setDT(df)

# Reshape to long form: one row per (col1, col2, id) and count column. The
# count column's name goes to `string`; the count itself goes to melt()'s
# default `value` column.
df <- melt(
  df,
  id.vars = c("col1", "col2", "id"),
  measure.vars = c("counts_col1", "counts_col2"),
  variable.name = "string"
)

# Sort by id and col1 ascending and col2 descending; this fixes which row
# comes first within each id.
df <- df[order(id, col1, -col2)]
df
#> col1 col2 id string value
#> 1: apple pple 1 counts_col1 100
#> 2: apple pple 1 counts_col2 2
#> 3: apple app 1 counts_col1 100
#> 4: apple app 1 counts_col2 50
#> 5: pple app 1 counts_col1 2
#> 6: pple app 1 counts_col2 50
#> 7: banana bananna 2 counts_col1 200
#> 8: banana bananna 2 counts_col2 2
#> 9: banana banan 2 counts_col1 200
#> 10: banana banan 2 counts_col2 20
#> 11: bananna banan 2 counts_col1 2
#> 12: bananna banan 2 counts_col2 20
# 2 - Compute the 'sum_distinct' column
# In a single grouped pass, build an all-NA vector of the group's length
# (.N) and overwrite its first element with the sum of the distinct values in
# that id. NA_real_ keeps the new column a double, matching `value`, instead of
# relying on coercion from NA_integer_. The trailing [] prints the result.
df[, sum_distinct := replace(rep(NA_real_, .N), 1L, sum(unique(value))),
   by = id][]
- 输出
#> col1 col2 id string value sum_distinct
#> <char> <char> <num> <fctr> <num> <num>
#> 1: apple pple 1 counts_col1 100 152
#> 2: apple pple 1 counts_col2 2 NA
#> 3: apple app 1 counts_col1 100 NA
#> 4: apple app 1 counts_col2 50 NA
#> 5: pple app 1 counts_col1 2 NA
#> 6: pple app 1 counts_col2 50 NA
#> 7: banana bananna 2 counts_col1 200 222
#> 8: banana bananna 2 counts_col2 2 NA
#> 9: banana banan 2 counts_col1 200 NA
#> 10: banana banan 2 counts_col2 20 NA
#> 11: bananna banan 2 counts_col1 2 NA
#> 12: bananna banan 2 counts_col2 20 NA
由 reprex 包 (v2.0.1) 于 2022-03-16 创建
我很纳闷。虽然我知道如何使用 n_distinct() 计算每组不同值(distinct values)的个数,但目前我似乎找不到对每组唯一观察值求和的方法。
我想按 id 分组,然后对每个唯一值求和......
library(tidyverse)

# Toy data: two string columns, one count column per string column, and a
# grouping id.
df <- tibble(
  col1 = c("apple", "apple", "pple", "banana", "banana", "bananna"),
  col2 = c("pple", "app", "app", "bananna", "banan", "banan"),
  counts_col1 = c(100, 100, 2, 200, 200, 2),
  counts_col2 = c(2, 50, 50, 2, 20, 20),
  id = c(1, 1, 1, 2, 2, 2)
)

# Stack the two count columns into long form: the source column name goes to
# `strings`, the count to `value`. The result is then grouped by `id` only;
# a group_by()/ungroup() pair before the final group_by() would be a no-op,
# so it is omitted.
df1 <- df %>%
  pivot_longer(
    counts_col1:counts_col2,
    names_to = "strings",
    values_to = "value"
  ) %>%
  group_by(id)

df1
#> # A tibble: 12 × 5
#> # Groups: id [2]
#> col1 col2 id strings value
#> <chr> <chr> <dbl> <chr> <dbl>
#> 1 apple pple 1 counts_col1 100
#> 2 apple pple 1 counts_col2 2
#> 3 apple app 1 counts_col1 100
#> 4 apple app 1 counts_col2 50
#> 5 pple app 1 counts_col1 2
#> 6 pple app 1 counts_col2 50
#> 7 banana bananna 2 counts_col1 200
#> 8 banana bananna 2 counts_col2 2
#> 9 banana banan 2 counts_col1 200
#> 10 banana banan 2 counts_col2 20
#> 11 bananna banan 2 counts_col1 2
#> 12 bananna banan 2 counts_col2 20
由 reprex 包 (v2.0.1) 于 2022-03-16 创建
...最终会变成这样
#> col1 col2 id strings value sum_distinct
#> <chr> <chr> <dbl> <chr> <dbl>
#> 1 apple pple 1 counts_col1 100 152
#> 2 apple pple 1 counts_col2 2 NA
#> 3 apple app 1 counts_col1 100 NA
#> 4 apple app 1 counts_col2 50 NA
#> 5 pple app 1 counts_col1 2 NA
#> 6 pple app 1 counts_col2 50 NA
#> 7 banana bananna 2 counts_col1 200 222
#> 8 banana bananna 2 counts_col2 2 NA
#> 9 banana banan 2 counts_col1 200 NA
#> 10 banana banan 2 counts_col2 20 NA
#> 11 bananna banan 2 counts_col1 2 NA
#> 12 bananna banan 2 counts_col2 20 NA
我们可以使用 replace 和 unique:
library(dplyr)
library(tidyr)
# For each id, write the sum of the distinct `value`s on the group's first
# row and NA on every other row.
df %>%
  pivot_longer(
    counts_col1:counts_col2,
    names_to = "strings",
    values_to = "value"
  ) %>%
  # Only the grouping by `id` matters: group_by() replaces any existing
  # grouping by default, so a preceding group_by() would have no effect.
  group_by(id) %>%
  mutate(
    # Start from an all-NA double vector of the group's length, then overwrite
    # position 1 with the sum of the unique values in the group.
    sum_distinct = replace(rep(NA_real_, n()), 1, sum(unique(value)))
  ) %>%
  ungroup()
-输出
# A tibble: 12 × 6
col1 col2 id strings value sum_distinct
<chr> <chr> <dbl> <chr> <dbl> <dbl>
1 apple pple 1 counts_col1 100 152
2 apple pple 1 counts_col2 2 NA
3 apple app 1 counts_col1 100 NA
4 apple app 1 counts_col2 50 NA
5 pple app 1 counts_col1 2 NA
6 pple app 1 counts_col2 50 NA
7 banana bananna 2 counts_col1 200 222
8 banana bananna 2 counts_col2 2 NA
9 banana banan 2 counts_col1 200 NA
10 banana banan 2 counts_col2 20 NA
11 bananna banan 2 counts_col1 2 NA
12 bananna banan 2 counts_col2 20 NA
使用 data.table,您可以这样做:
Reprex
- 代码
library(tidyverse) # to read your tibble
library(data.table)
# 1 - Build the starting data.table
# setDT() turns the tibble into a data.table in place, without copying.
setDT(df)

# Reshape to long form: one row per (col1, col2, id) and count column. The
# count column's name goes to `string`; the count itself goes to melt()'s
# default `value` column.
df <- melt(
  df,
  id.vars = c("col1", "col2", "id"),
  measure.vars = c("counts_col1", "counts_col2"),
  variable.name = "string"
)

# Sort by id and col1 ascending and col2 descending; this fixes which row
# comes first within each id.
df <- df[order(id, col1, -col2)]
df
#> col1 col2 id string value
#> 1: apple pple 1 counts_col1 100
#> 2: apple pple 1 counts_col2 2
#> 3: apple app 1 counts_col1 100
#> 4: apple app 1 counts_col2 50
#> 5: pple app 1 counts_col1 2
#> 6: pple app 1 counts_col2 50
#> 7: banana bananna 2 counts_col1 200
#> 8: banana bananna 2 counts_col2 2
#> 9: banana banan 2 counts_col1 200
#> 10: banana banan 2 counts_col2 20
#> 11: bananna banan 2 counts_col1 2
#> 12: bananna banan 2 counts_col2 20
# 2 - Compute the 'sum_distinct' column
# In a single grouped pass, build an all-NA vector of the group's length
# (.N) and overwrite its first element with the sum of the distinct values in
# that id. NA_real_ keeps the new column a double, matching `value`, instead of
# relying on coercion from NA_integer_. The trailing [] prints the result.
df[, sum_distinct := replace(rep(NA_real_, .N), 1L, sum(unique(value))),
   by = id][]
- 输出
#> col1 col2 id string value sum_distinct
#> <char> <char> <num> <fctr> <num> <num>
#> 1: apple pple 1 counts_col1 100 152
#> 2: apple pple 1 counts_col2 2 NA
#> 3: apple app 1 counts_col1 100 NA
#> 4: apple app 1 counts_col2 50 NA
#> 5: pple app 1 counts_col1 2 NA
#> 6: pple app 1 counts_col2 50 NA
#> 7: banana bananna 2 counts_col1 200 222
#> 8: banana bananna 2 counts_col2 2 NA
#> 9: banana banan 2 counts_col1 200 NA
#> 10: banana banan 2 counts_col2 20 NA
#> 11: bananna banan 2 counts_col1 2 NA
#> 12: bananna banan 2 counts_col2 20 NA
由 reprex 包 (v2.0.1) 于 2022-03-16 创建