如何计算 R dplyr 中不同观察值的总和

How to calculate the sum of distinct observations in R dplyr

我很纳闷。虽然我知道如何使用 n_distinct() 计算每组 distinct_values 的总和,但目前我似乎很难找到唯一观察值的总和。

我想按 id 分组,然后对每个唯一值求和......

library(tidyverse)

df <- tibble(col1 = c("apple","apple","pple", "banana", "banana","bananna"),
             col2 = c("pple","app","app", "bananna", "banan", "banan"), 
             counts_col1 = c(100,100,2,200,200,2),
             counts_col2 = c(2,50,50,2,20,20),
             id=c(1,1,1,2,2,2))

df1 <- df %>% 
  pivot_longer(c(counts_col1:counts_col2),names_to ="strings",values_to = "value") %>% 
  group_by(id,col1,col2) %>% 
  ungroup() %>% 
  group_by(id) 

df1
#> # A tibble: 12 × 5
#> # Groups:   id [2]
#>    col1    col2       id strings     value
#>    <chr>   <chr>   <dbl> <chr>       <dbl>
#>  1 apple   pple        1 counts_col1   100
#>  2 apple   pple        1 counts_col2     2
#>  3 apple   app         1 counts_col1   100
#>  4 apple   app         1 counts_col2    50
#>  5 pple    app         1 counts_col1     2
#>  6 pple    app         1 counts_col2    50
#>  7 banana  bananna     2 counts_col1   200
#>  8 banana  bananna     2 counts_col2     2
#>  9 banana  banan       2 counts_col1   200
#> 10 banana  banan       2 counts_col2    20
#> 11 bananna banan       2 counts_col1     2
#> 12 bananna banan       2 counts_col2    20

reprex package (v2.0.1)

于 2022-03-16 创建

...最终会变成这样


#>    col1    col2       id strings     value    sum_distinct
#>    <chr>   <chr>   <dbl> <chr>       <dbl>      
#>  1 apple   pple        1 counts_col1   100    152
#>  2 apple   pple        1 counts_col2     2    NA
#>  3 apple   app         1 counts_col1   100    NA
#>  4 apple   app         1 counts_col2    50    NA
#>  5 pple    app         1 counts_col1     2    NA
#>  6 pple    app         1 counts_col2    50    NA
#>  7 banana  bananna     2 counts_col1   200    222
#>  8 banana  bananna     2 counts_col2     2    NA
#>  9 banana  banan       2 counts_col1   200    NA
#> 10 banana  banan       2 counts_col2    20    NA
#> 11 bananna banan       2 counts_col1     2    NA
#> 12 bananna banan       2 counts_col2    20    NA

我们可以使用 replaceunique

library(dplyr)
library(tidyr)
df %>% 
  pivot_longer(c(counts_col1:counts_col2), 
      names_to ="strings",values_to = "value") %>% 
  group_by(id,col1,col2) %>%    
  group_by(id) %>%
  mutate(sum_distinct = replace(rep(NA_real_, n()), 1, sum(unique(value)))) %>%
  ungroup

-输出

# A tibble: 12 × 6
   col1    col2       id strings     value sum_distinct
   <chr>   <chr>   <dbl> <chr>       <dbl>        <dbl>
 1 apple   pple        1 counts_col1   100          152
 2 apple   pple        1 counts_col2     2           NA
 3 apple   app         1 counts_col1   100           NA
 4 apple   app         1 counts_col2    50           NA
 5 pple    app         1 counts_col1     2           NA
 6 pple    app         1 counts_col2    50           NA
 7 banana  bananna     2 counts_col1   200          222
 8 banana  bananna     2 counts_col2     2           NA
 9 banana  banan       2 counts_col1   200           NA
10 banana  banan       2 counts_col2    20           NA
11 bananna banan       2 counts_col1     2           NA
12 bananna banan       2 counts_col2    20           NA

使用 data.table,您可以:

Reprex

  • 代码
library(tidyverse) # to read your tibble
library(data.table)

# 1 - Building your starting data.table
df <- melt(setDT(df),
  id.vars = c("col1", "col2", "id"),
  measure.vars = c("counts_col1", "counts_col2"),
  variable.name = "string")[order(id, col1, -col2)]

df
#>        col1    col2 id      string value
#>  1:   apple    pple  1 counts_col1   100
#>  2:   apple    pple  1 counts_col2     2
#>  3:   apple     app  1 counts_col1   100
#>  4:   apple     app  1 counts_col2    50
#>  5:    pple     app  1 counts_col1     2
#>  6:    pple     app  1 counts_col2    50
#>  7:  banana bananna  2 counts_col1   200
#>  8:  banana bananna  2 counts_col2     2
#>  9:  banana   banan  2 counts_col1   200
#> 10:  banana   banan  2 counts_col2    20
#> 11: bananna   banan  2 counts_col1     2
#> 12: bananna   banan  2 counts_col2    20

# 2 - Computing the 'sum_distinct' column
df[, sum_distinct := sum(unique(value)), by = id
   ][!df[, .I[1], by = id]$V1, sum_distinct := NA_integer_][]
  • 输出
#>        col1    col2    id      string value sum_distinct
#>      <char>  <char> <num>      <fctr> <num>        <num>
#>  1:   apple    pple     1 counts_col1   100          152
#>  2:   apple    pple     1 counts_col2     2           NA
#>  3:   apple     app     1 counts_col1   100           NA
#>  4:   apple     app     1 counts_col2    50           NA
#>  5:    pple     app     1 counts_col1     2           NA
#>  6:    pple     app     1 counts_col2    50           NA
#>  7:  banana bananna     2 counts_col1   200          222
#>  8:  banana bananna     2 counts_col2     2           NA
#>  9:  banana   banan     2 counts_col1   200           NA
#> 10:  banana   banan     2 counts_col2    20           NA
#> 11: bananna   banan     2 counts_col1     2           NA
#> 12: bananna   banan     2 counts_col2    20           NA

reprex package (v2.0.1)

于 2022-03-16 创建