我如何分组和汇总,同时按组保留唯一值并计算它们的出现次数?

How can I group by and summarize while keeping unique values by group and count their occurence?

我有一个看起来像这样的数据框:

df <- data.frame(resource = c("gold", "bronze", "gold", "silver", "silver", "gold", "gold", "silver"), price = (c(10, 15, 20, 12, 12, 10, 10, 15)), extraction = c(100, 200, 50, 200, 250, 100, 50, 50))


    r       p   e
1   gold    10  100
2   bronze  15  200
3   gold    20  50
4   silver  12  200
5   silver  12  250
6   gold    10  100
7   gold    10  50
8   silver  15  50

我想按资源折叠此数据集,以便我有一个变量来计算总提取量和与资源的独特价格一样多的额外变量。此外,我想有另一个变量,对于每个唯一价格,计算有多少观察值以该价格计价。

这看起来像:

ID r       total_extr. price1 n_price1 price2 n_price2
1  gold    300         10     3        20     1
2  silver  500         12     2        15     1
3  bronze  200         15     1        NA     NA

理想情况下,价格会上升或下降(在我的数据集中,每组有两个以上的不同价格)。

我的原始数据集的前 50 行是:

structure(list(extraction = c(NA, NA, 3800, 5000, 3800, 3800, 3800, 3800, 3800, 3800, 3800, 3800, 3800, 3800, 660, 3800, 125, 
3800, 3800, 660, 100, 3800, 40950, 250, 250, 150000, 35000, NA, 
1e+05, 53000, NA, 225000, NA, 260000, 260000, NA, 260000, NA, 
260000, NA, NA, 260000, 260000, 260000, 260000, 40, 523, NA, 
NA, 523), price = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, 27226012, NA, 21677.578125, NA, NA, 21047.84765625, 
15398.7431640625, NA, 12181.1640625, 11378.0888671875, 11137.2998046875, 
0.326765239238739, 0.326765239238739, 0.326765239238739, 0.352094233036041, 
0.352094233036041, 0.307463765144348, 0.307463765144348, 0.280774921178818, 
0.280774921178818, 0.240696549415588, 0.240696549415588, 0.168027445673943, 
0.168027445673943, 0.144999995827675, 0.144999995827675, 0.131485313177109, 
0.131485313177109, 0.129491910338402, 0.103749454021454, 0.14696241915226, 
473.7353515625, NA, NA, NA, NA), resource = c("salt", "salt", 
"natural gas", "natural gas", "natural gas", "natural gas", "natural gas", 
"natural gas", "natural gas", "natural gas", "natural gas", "natural gas", 
"natural gas", "natural gas", "tin", "natural gas", "tin", "natural gas", 
"natural gas", "tin", "tin", "natural gas", "gold", "gold", "gold", 
"diamond", "diamond", "diamond", "diamond", "diamond", "diamond", 
"diamond", "diamond", "diamond", "diamond", "diamond", "diamond", 
"diamond", "diamond", "diamond", "diamond", "diamond", "diamond", 
"diamond", "diamond", "diamond", "natural gas", "natural gas", 
"natural gas", "natural gas")), row.names = c(NA, 50L), class = "data.frame")

你可以这样做:

library(tidyverse)
df |> 
  group_by(resource) |> 
  add_count(price) |> 
  mutate(extraction = sum(extraction)) |> 
  distinct() |> 
  mutate(id = 1:n()) |> 
  ungroup() |> 
  pivot_wider(names_from = id,
              values_from = c(price, n),
              names_vary = "slowest") |> 
  mutate(order = c(1, 3, 2)) |> 
  arrange(order) |> 
  select(-order)

# A tibble: 3 × 6
  resource extraction price_1   n_1 price_2   n_2
  <chr>         <dbl>   <dbl> <int>   <dbl> <int>
1 gold            300      10     3      20     1
2 silver          500      12     2      15     1
3 bronze          200      15     1      NA    NA

使用 TO 更新的正则表达式的新解决方案。

基本上,只需将 na.rm = TRUE 参数添加到求和函数即可:

df |> 
  group_by(resource) |> 
  add_count(price) |> 
  mutate(extraction = sum(extraction, na.rm = TRUE)) |> 
  distinct() |> 
  mutate(id = 1:n()) |> 
  ungroup() |> 
  pivot_wider(names_from = id,
              values_from = c(price, n),
              names_vary = "slowest")

# A tibble: 5 × 26
  extraction resource         price_1   n_1   price_2   n_2   price_3   n_3   price_4   n_4 price_5   n_5 price_6   n_6 price_7   n_7 price_8   n_8 price_9   n_9 price_10  n_10 price_11  n_11 price_12  n_12
       <dbl> <chr>              <dbl> <int>     <dbl> <int>     <dbl> <int>     <dbl> <int>   <dbl> <int>   <dbl> <int>   <dbl> <int>   <dbl> <int>   <dbl> <int>    <dbl> <int>    <dbl> <int>    <dbl> <int>
1          0 salt              NA         2    NA        NA    NA        NA    NA        NA  NA        NA  NA        NA  NA        NA  NA        NA  NA        NA   NA        NA   NA        NA      NA     NA
2      63046 natural gas       NA        20    NA        NA    NA        NA    NA        NA  NA        NA  NA        NA  NA        NA  NA        NA  NA        NA   NA        NA   NA        NA      NA     NA
3       1545 tin         27226012         1 21678.        1 21048.        1 15399.        1  NA        NA  NA        NA  NA        NA  NA        NA  NA        NA   NA        NA   NA        NA      NA     NA
4      41450 gold           12181.        1 11378.        1 11137.        1    NA        NA  NA        NA  NA        NA  NA        NA  NA        NA  NA        NA   NA        NA   NA        NA      NA     NA
5    2643040 diamond            0.327     3     0.352     2     0.307     2     0.281     2   0.241     2   0.168     2   0.145     2   0.131     2   0.129     1    0.104     1    0.147     1     474.     1