我如何分组和汇总,同时按组保留唯一值并计算它们的出现次数?
How can I group by and summarize while keeping unique values by group and counting their occurrences?
我有一个看起来像这样的数据框:
# Toy data: one row per observation, giving the resource, its price and the
# extracted amount.
df <- data.frame(
  resource = c(
    "gold", "bronze", "gold", "silver",
    "silver", "gold", "gold", "silver"
  ),
  price = c(10, 15, 20, 12, 12, 10, 10, 15),
  extraction = c(100, 200, 50, 200, 250, 100, 50, 50)
)
r p e
1 gold 10 100
2 bronze 15 200
3 gold 20 50
4 silver 12 200
5 silver 12 250
6 gold 10 100
7 gold 10 50
8 silver 15 50
我想按资源折叠(汇总)此数据集:用一个变量给出总提取量,并为该资源的每个不同价格各生成一个额外变量。此外,对于每个不同的价格,还要有一个变量统计有多少条观测值采用该价格。
这看起来像:
ID r total_extr. price1 n_price1 price2 n_price2
1 gold 300 10 3 20 1
2 silver 500 12 2 15 1
3 bronze 200 15 1 NA NA
理想情况下,各价格列应按升序或降序排列(在我的数据集中,每组有两个以上的不同价格)。
我的原始数据集的前 50 行是:
# Sample of the original data: a data.frame with 50 rows and the columns
# extraction, price and resource; both numeric columns contain NA values.
# NOTE(review): the expression is not assigned here - presumably it has to be
# stored as `df` before running the pipelines below; confirm.
structure(list(extraction = c(NA, NA, 3800, 5000, 3800, 3800, 3800, 3800, 3800, 3800, 3800, 3800, 3800, 3800, 660, 3800, 125,
3800, 3800, 660, 100, 3800, 40950, 250, 250, 150000, 35000, NA,
1e+05, 53000, NA, 225000, NA, 260000, 260000, NA, 260000, NA,
260000, NA, NA, 260000, 260000, 260000, 260000, 40, 523, NA,
NA, 523), price = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, 27226012, NA, 21677.578125, NA, NA, 21047.84765625,
15398.7431640625, NA, 12181.1640625, 11378.0888671875, 11137.2998046875,
0.326765239238739, 0.326765239238739, 0.326765239238739, 0.352094233036041,
0.352094233036041, 0.307463765144348, 0.307463765144348, 0.280774921178818,
0.280774921178818, 0.240696549415588, 0.240696549415588, 0.168027445673943,
0.168027445673943, 0.144999995827675, 0.144999995827675, 0.131485313177109,
0.131485313177109, 0.129491910338402, 0.103749454021454, 0.14696241915226,
473.7353515625, NA, NA, NA, NA), resource = c("salt", "salt",
"natural gas", "natural gas", "natural gas", "natural gas", "natural gas",
"natural gas", "natural gas", "natural gas", "natural gas", "natural gas",
"natural gas", "natural gas", "tin", "natural gas", "tin", "natural gas",
"natural gas", "tin", "tin", "natural gas", "gold", "gold", "gold",
"diamond", "diamond", "diamond", "diamond", "diamond", "diamond",
"diamond", "diamond", "diamond", "diamond", "diamond", "diamond",
"diamond", "diamond", "diamond", "diamond", "diamond", "diamond",
"diamond", "diamond", "diamond", "natural gas", "natural gas",
"natural gas", "natural gas")), row.names = c(NA, 50L), class = "data.frame")
你可以这样做:
library(tidyverse)
# Collapse `df` to one row per resource: the total extraction plus one
# (price_k, n_k) column pair for every distinct price within the resource.
df |>
  group_by(resource) |>
  # n = number of rows sharing this price within the resource
  add_count(price) |>
  # Overwrite extraction with the per-resource total so that rows differ only
  # by price/n and distinct() can collapse the repeated prices.
  mutate(extraction = sum(extraction)) |>
  distinct() |>
  # Index the distinct prices within each resource. row_number() stays valid
  # for empty input, whereas 1:n() evaluates to c(1, 0) when n() is 0.
  mutate(id = row_number()) |>
  ungroup() |>
  pivot_wider(
    names_from = id,
    values_from = c(price, n),
    names_vary = "slowest" # keep each price_k next to its n_k
  ) |>
  # Order the rows by resource name instead of by a positional vector, which
  # only fits a result of exactly three rows in one fixed order.
  arrange(match(resource, c("gold", "silver", "bronze")))
# A tibble: 3 × 6
resource extraction price_1 n_1 price_2 n_2
<chr> <dbl> <dbl> <int> <dbl> <int>
1 gold 300 10 3 20 1
2 silver 500 12 2 15 1
3 bronze 200 15 1 NA NA
针对提问者更新后的可重现示例(reprex)的新解决方案。
基本上,只需将 na.rm = TRUE
参数添加到求和函数即可:
# Same collapse as before, but NA extraction values are ignored when summing,
# so a resource whose extraction values are all NA gets a total of 0.
df |>
  group_by(resource) |>
  # n = number of rows sharing this price within the resource (NA prices are
  # counted together as one value)
  add_count(price) |>
  mutate(extraction = sum(extraction, na.rm = TRUE)) |>
  distinct() |>
  # Index the distinct prices within each resource. row_number() stays valid
  # for empty input, whereas 1:n() evaluates to c(1, 0) when n() is 0.
  mutate(id = row_number()) |>
  ungroup() |>
  pivot_wider(
    names_from = id,
    values_from = c(price, n),
    names_vary = "slowest" # keep each price_k next to its n_k
  )
# A tibble: 5 × 26
extraction resource price_1 n_1 price_2 n_2 price_3 n_3 price_4 n_4 price_5 n_5 price_6 n_6 price_7 n_7 price_8 n_8 price_9 n_9 price_10 n_10 price_11 n_11 price_12 n_12
<dbl> <chr> <dbl> <int> <dbl> <int> <dbl> <int> <dbl> <int> <dbl> <int> <dbl> <int> <dbl> <int> <dbl> <int> <dbl> <int> <dbl> <int> <dbl> <int> <dbl> <int>
1 0 salt NA 2 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
2 63046 natural gas NA 20 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
3 1545 tin 27226012 1 21678. 1 21048. 1 15399. 1 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
4 41450 gold 12181. 1 11378. 1 11137. 1 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
5 2643040 diamond 0.327 3 0.352 2 0.307 2 0.281 2 0.241 2 0.168 2 0.145 2 0.131 2 0.129 1 0.104 1 0.147 1 474. 1
我有一个看起来像这样的数据框:
# Toy data: one row per observation, giving the resource, its price and the
# extracted amount.
df <- data.frame(
  resource = c(
    "gold", "bronze", "gold", "silver",
    "silver", "gold", "gold", "silver"
  ),
  price = c(10, 15, 20, 12, 12, 10, 10, 15),
  extraction = c(100, 200, 50, 200, 250, 100, 50, 50)
)
r p e
1 gold 10 100
2 bronze 15 200
3 gold 20 50
4 silver 12 200
5 silver 12 250
6 gold 10 100
7 gold 10 50
8 silver 15 50
我想按资源折叠(汇总)此数据集:用一个变量给出总提取量,并为该资源的每个不同价格各生成一个额外变量。此外,对于每个不同的价格,还要有一个变量统计有多少条观测值采用该价格。
这看起来像:
ID r total_extr. price1 n_price1 price2 n_price2
1 gold 300 10 3 20 1
2 silver 500 12 2 15 1
3 bronze 200 15 1 NA NA
理想情况下,各价格列应按升序或降序排列(在我的数据集中,每组有两个以上的不同价格)。
我的原始数据集的前 50 行是:
# Sample of the original data: a data.frame with 50 rows and the columns
# extraction, price and resource; both numeric columns contain NA values.
# NOTE(review): the expression is not assigned here - presumably it has to be
# stored as `df` before running the pipelines below; confirm.
structure(list(extraction = c(NA, NA, 3800, 5000, 3800, 3800, 3800, 3800, 3800, 3800, 3800, 3800, 3800, 3800, 660, 3800, 125,
3800, 3800, 660, 100, 3800, 40950, 250, 250, 150000, 35000, NA,
1e+05, 53000, NA, 225000, NA, 260000, 260000, NA, 260000, NA,
260000, NA, NA, 260000, 260000, 260000, 260000, 40, 523, NA,
NA, 523), price = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, 27226012, NA, 21677.578125, NA, NA, 21047.84765625,
15398.7431640625, NA, 12181.1640625, 11378.0888671875, 11137.2998046875,
0.326765239238739, 0.326765239238739, 0.326765239238739, 0.352094233036041,
0.352094233036041, 0.307463765144348, 0.307463765144348, 0.280774921178818,
0.280774921178818, 0.240696549415588, 0.240696549415588, 0.168027445673943,
0.168027445673943, 0.144999995827675, 0.144999995827675, 0.131485313177109,
0.131485313177109, 0.129491910338402, 0.103749454021454, 0.14696241915226,
473.7353515625, NA, NA, NA, NA), resource = c("salt", "salt",
"natural gas", "natural gas", "natural gas", "natural gas", "natural gas",
"natural gas", "natural gas", "natural gas", "natural gas", "natural gas",
"natural gas", "natural gas", "tin", "natural gas", "tin", "natural gas",
"natural gas", "tin", "tin", "natural gas", "gold", "gold", "gold",
"diamond", "diamond", "diamond", "diamond", "diamond", "diamond",
"diamond", "diamond", "diamond", "diamond", "diamond", "diamond",
"diamond", "diamond", "diamond", "diamond", "diamond", "diamond",
"diamond", "diamond", "diamond", "natural gas", "natural gas",
"natural gas", "natural gas")), row.names = c(NA, 50L), class = "data.frame")
你可以这样做:
library(tidyverse)
# Collapse `df` to one row per resource: the total extraction plus one
# (price_k, n_k) column pair for every distinct price within the resource.
df |>
  group_by(resource) |>
  # n = number of rows sharing this price within the resource
  add_count(price) |>
  # Overwrite extraction with the per-resource total so that rows differ only
  # by price/n and distinct() can collapse the repeated prices.
  mutate(extraction = sum(extraction)) |>
  distinct() |>
  # Index the distinct prices within each resource. row_number() stays valid
  # for empty input, whereas 1:n() evaluates to c(1, 0) when n() is 0.
  mutate(id = row_number()) |>
  ungroup() |>
  pivot_wider(
    names_from = id,
    values_from = c(price, n),
    names_vary = "slowest" # keep each price_k next to its n_k
  ) |>
  # Order the rows by resource name instead of by a positional vector, which
  # only fits a result of exactly three rows in one fixed order.
  arrange(match(resource, c("gold", "silver", "bronze")))
# A tibble: 3 × 6
resource extraction price_1 n_1 price_2 n_2
<chr> <dbl> <dbl> <int> <dbl> <int>
1 gold 300 10 3 20 1
2 silver 500 12 2 15 1
3 bronze 200 15 1 NA NA
针对提问者更新后的可重现示例(reprex)的新解决方案。
基本上,只需将 na.rm = TRUE
参数添加到求和函数即可:
# Same collapse as before, but NA extraction values are ignored when summing,
# so a resource whose extraction values are all NA gets a total of 0.
df |>
  group_by(resource) |>
  # n = number of rows sharing this price within the resource (NA prices are
  # counted together as one value)
  add_count(price) |>
  mutate(extraction = sum(extraction, na.rm = TRUE)) |>
  distinct() |>
  # Index the distinct prices within each resource. row_number() stays valid
  # for empty input, whereas 1:n() evaluates to c(1, 0) when n() is 0.
  mutate(id = row_number()) |>
  ungroup() |>
  pivot_wider(
    names_from = id,
    values_from = c(price, n),
    names_vary = "slowest" # keep each price_k next to its n_k
  )
# A tibble: 5 × 26
extraction resource price_1 n_1 price_2 n_2 price_3 n_3 price_4 n_4 price_5 n_5 price_6 n_6 price_7 n_7 price_8 n_8 price_9 n_9 price_10 n_10 price_11 n_11 price_12 n_12
<dbl> <chr> <dbl> <int> <dbl> <int> <dbl> <int> <dbl> <int> <dbl> <int> <dbl> <int> <dbl> <int> <dbl> <int> <dbl> <int> <dbl> <int> <dbl> <int> <dbl> <int>
1 0 salt NA 2 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
2 63046 natural gas NA 20 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
3 1545 tin 27226012 1 21678. 1 21048. 1 15399. 1 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
4 41450 gold 12181. 1 11378. 1 11137. 1 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
5 2643040 diamond 0.327 3 0.352 2 0.307 2 0.281 2 0.241 2 0.168 2 0.145 2 0.131 2 0.129 1 0.104 1 0.147 1 474. 1