对多个变量使用 Pivot_Longer
Pivot_Longer with Multiple Variables
在上一个问题中,我学习了如何使用 R 中的“pivot_longer()”函数来格式化数据:
# Candidate values for the `dates` (academic year) and `types` columns.
v1 <- c("2010-2011", "2011-2012", "2012-2013", "2013-2014", "2014-2015")
v2 <- c("A", "B", "C", "D", "E")

# Simulate one data set with `n` rows: two normal variables plus randomly
# drawn `dates` and `types` factors. The draws happen in the order
# var_1, var_2, dates, types.
simulate_data <- function(n) {
  sim <- data.frame(var_1 = rnorm(n, 10, 10), var_2 = rnorm(n, 5, 5))
  sim$dates <- as.factor(
    sample(v1, n, replace = TRUE, prob = c(0.5, 0.2, 0.1, 0.1, 0.1))
  )
  sim$types <- as.factor(
    sample(v2, n, replace = TRUE, prob = c(0.3, 0.2, 0.1, 0.1, 0.1))
  )
  sim
}

data_1 <- simulate_data(871)
data_2 <- simulate_data(412)
data_3 <- simulate_data(332)

# Label every row with the data set it came from.
data_1$group <- as.factor("A")
data_2$group <- as.factor("B")
data_3$group <- as.factor("C")

# Stack the three data sets into a single data frame.
dt <- rbind(data_1, data_2, data_3)
library(tidyverse)
# Count rows per data set and academic year, then spread the years into one
# `counts_<year>` column each.
list(data_1, data_2, data_3) %>%
  # Name the list elements data_1, data_2, ... so that bind_rows() can record
  # each row's origin in `data_nr`. seq_along() is used instead of 1:length()
  # because the latter yields c(1, 0) for a zero-length input.
  set_names(paste0("data_", seq_along(.))) %>%
  bind_rows(.id = "data_nr") %>%
  count(data_nr, dates, name = "my_counts") %>%
  pivot_wider(
    names_from = dates,
    values_from = my_counts,
    names_prefix = "counts_"
  )
这会产生以下数据集:
# A tibble: 3 x 6
data_nr `counts_2010-2011` `counts_2011-2012` `counts_2012-2013` `counts_2013-2014` `counts_2014-2015`
<chr> <int> <int> <int> <int> <int>
1 data_1 443 171 83 93 81
2 data_2 200 78 44 47 43
3 data_3 172 61 32 33 34
我想在上面这张表的基础上再增加几列,比如:
mean_var1_2010-2011
mean_var1_2011-2012
等等
mean_var2_2010-2011
mean_var2_2011-2012
等等
30thquantile_var1_2010-2011
30thquantile_var1_2011-2012
等
30thquantile_var2_2010-2011
30thquantile_var2_2011-2012
等
我试图修改上面的代码来做到这一点:
# Asker's attempt, kept exactly as posted.
# Put everything in one data frame to make it easier: label each data set
# with a group factor first.
data_1$group = as.factor("A")
data_2$group = as.factor("B")
data_3$group = as.factor("C")
# Combined data frame with all three groups stacked row-wise.
dt = rbind(data_1, data_2, data_3)
# QUESTION: the chain below is the part being asked about.
# NOTE(review): `mean()` and `quantile()` are base R functions that summarise
# a vector; they are used here as if they were data-frame verbs accepting a
# `name =` argument. Also, count() keeps only the grouping columns and the
# count, so `var_1` / `var_2` are presumably gone by the time they are
# referenced. This chain is therefore not expected to produce the desired
# table -- confirm by running.
final = dt %>%
bind_rows(.id = "data_nr") %>%
count(data_nr, dates, name = "my_counts") %>%
mean(data_nr, dates, var_1, name = "my_mean_var_1") %>%
mean(data_nr, dates, var_2, name = "my_mean_var_2") %>%
quantile(data_nr, dates, var_1, probs = 0.3, name = "my_30_percentile_var_1") %>%
quantile(data_nr, dates, var_2, probs = 0.3, name = "my_30_percentile_var_2") %>%
pivot_wider(names_from = dates, values_from = c(my_counts, my_mean_var_1, my_mean_var_2, my_30_percentile_var_1, my_30_percentile_var_2), names_prefix = "counts_")
但我认为这不是正确的做法
有人可以告诉我怎么做吗?
谢谢!
假设你有一个数据框 dt,看起来像这样:
> head(dt)
var_1 var_2 dates types group
1 27.5979494 -0.1823654 2014-2015 C A
2 8.2266573 4.9165620 2011-2012 D A
3 14.9731504 0.5343270 2010-2011 A A
4 22.5124430 2.3846317 2010-2011 A A
5 7.5399511 -5.1710378 2014-2015 A A
6 0.1473477 5.2621775 2014-2015 A A
那么使用 summarize 和 across 可以更高效地完成您的任务。您可以这样做:
library(dplyr)
library(tidyr)
# Per (group, dates) cell: row count plus the mean and the 30% quantile of
# var_1 and var_2; afterwards every statistic is spread into one column per
# value of `dates`.
dt %>%
  # Grouping key changed from "data_nr" to "group".
  group_by(group, dates) %>%
  summarise(
    counts = n(),
    across(
      c(var_1, var_2),
      list(
        mean = mean,
        `30thquantile` = function(x) quantile(x, probs = 0.3)
      ),
      .names = "{.fn}_{.col}"
    ),
    .groups = "drop"
  ) %>%
  # ... and "group" (instead of "data_nr") is excluded from the value columns
  # here as well.
  pivot_wider(names_from = dates, values_from = -c(group, dates))
输出
# A tibble: 3 x 26
group `counts_2010-201~ `counts_2011-20~ `counts_2012-20~ `counts_2013-20~ `counts_2014-20~ `mean_var_1_201~ `mean_var_1_201~
<fct> <int> <int> <int> <int> <int> <dbl> <dbl>
1 A 412 193 84 85 97 9.88 9.75
2 B 209 74 37 49 43 10.0 11.4
3 C 135 87 44 33 33 10.5 9.33
# ... with 18 more variables: mean_var_1_2012-2013 <dbl>, mean_var_1_2013-2014 <dbl>, mean_var_1_2014-2015 <dbl>,
# 30thquantile_var_1_2010-2011 <dbl>, 30thquantile_var_1_2011-2012 <dbl>, 30thquantile_var_1_2012-2013 <dbl>,
# 30thquantile_var_1_2013-2014 <dbl>, 30thquantile_var_1_2014-2015 <dbl>, mean_var_2_2010-2011 <dbl>,
# mean_var_2_2011-2012 <dbl>, mean_var_2_2012-2013 <dbl>, mean_var_2_2013-2014 <dbl>, mean_var_2_2014-2015 <dbl>,
# 30thquantile_var_2_2010-2011 <dbl>, 30thquantile_var_2_2011-2012 <dbl>, 30thquantile_var_2_2012-2013 <dbl>,
# 30thquantile_var_2_2013-2014 <dbl>, 30thquantile_var_2_2014-2015 <dbl>
在上一个问题中,我学习了如何使用 R 中的“pivot_longer()”函数来格式化数据:
# Candidate values for the `dates` (academic year) and `types` columns.
v1 <- c("2010-2011", "2011-2012", "2012-2013", "2013-2014", "2014-2015")
v2 <- c("A", "B", "C", "D", "E")

# Simulate one data set with `n` rows: two normal variables plus randomly
# drawn `dates` and `types` factors. The draws happen in the order
# var_1, var_2, dates, types.
simulate_data <- function(n) {
  sim <- data.frame(var_1 = rnorm(n, 10, 10), var_2 = rnorm(n, 5, 5))
  sim$dates <- as.factor(
    sample(v1, n, replace = TRUE, prob = c(0.5, 0.2, 0.1, 0.1, 0.1))
  )
  sim$types <- as.factor(
    sample(v2, n, replace = TRUE, prob = c(0.3, 0.2, 0.1, 0.1, 0.1))
  )
  sim
}

data_1 <- simulate_data(871)
data_2 <- simulate_data(412)
data_3 <- simulate_data(332)

# Label every row with the data set it came from.
data_1$group <- as.factor("A")
data_2$group <- as.factor("B")
data_3$group <- as.factor("C")

# Stack the three data sets into a single data frame.
dt <- rbind(data_1, data_2, data_3)
library(tidyverse)
# Count rows per data set and academic year, then spread the years into one
# `counts_<year>` column each.
list(data_1, data_2, data_3) %>%
  # Name the list elements data_1, data_2, ... so that bind_rows() can record
  # each row's origin in `data_nr`. seq_along() is used instead of 1:length()
  # because the latter yields c(1, 0) for a zero-length input.
  set_names(paste0("data_", seq_along(.))) %>%
  bind_rows(.id = "data_nr") %>%
  count(data_nr, dates, name = "my_counts") %>%
  pivot_wider(
    names_from = dates,
    values_from = my_counts,
    names_prefix = "counts_"
  )
这会产生以下数据集:
# A tibble: 3 x 6
data_nr `counts_2010-2011` `counts_2011-2012` `counts_2012-2013` `counts_2013-2014` `counts_2014-2015`
<chr> <int> <int> <int> <int> <int>
1 data_1 443 171 83 93 81
2 data_2 200 78 44 47 43
3 data_3 172 61 32 33 34
我想在上面这张表的基础上再增加几列,比如:
mean_var1_2010-2011
mean_var1_2011-2012
等等
mean_var2_2010-2011
mean_var2_2011-2012
等等
30thquantile_var1_2010-2011
30thquantile_var1_2011-2012
等
30thquantile_var2_2010-2011
30thquantile_var2_2011-2012
等
我试图修改上面的代码来做到这一点:
# Asker's attempt, kept exactly as posted.
# Put everything in one data frame to make it easier: label each data set
# with a group factor first.
data_1$group = as.factor("A")
data_2$group = as.factor("B")
data_3$group = as.factor("C")
# Combined data frame with all three groups stacked row-wise.
dt = rbind(data_1, data_2, data_3)
# QUESTION: the chain below is the part being asked about.
# NOTE(review): `mean()` and `quantile()` are base R functions that summarise
# a vector; they are used here as if they were data-frame verbs accepting a
# `name =` argument. Also, count() keeps only the grouping columns and the
# count, so `var_1` / `var_2` are presumably gone by the time they are
# referenced. This chain is therefore not expected to produce the desired
# table -- confirm by running.
final = dt %>%
bind_rows(.id = "data_nr") %>%
count(data_nr, dates, name = "my_counts") %>%
mean(data_nr, dates, var_1, name = "my_mean_var_1") %>%
mean(data_nr, dates, var_2, name = "my_mean_var_2") %>%
quantile(data_nr, dates, var_1, probs = 0.3, name = "my_30_percentile_var_1") %>%
quantile(data_nr, dates, var_2, probs = 0.3, name = "my_30_percentile_var_2") %>%
pivot_wider(names_from = dates, values_from = c(my_counts, my_mean_var_1, my_mean_var_2, my_30_percentile_var_1, my_30_percentile_var_2), names_prefix = "counts_")
但我认为这不是正确的做法
有人可以告诉我怎么做吗?
谢谢!
假设你有一个数据框 dt,看起来像这样:
> head(dt)
var_1 var_2 dates types group
1 27.5979494 -0.1823654 2014-2015 C A
2 8.2266573 4.9165620 2011-2012 D A
3 14.9731504 0.5343270 2010-2011 A A
4 22.5124430 2.3846317 2010-2011 A A
5 7.5399511 -5.1710378 2014-2015 A A
6 0.1473477 5.2621775 2014-2015 A A
那么使用 summarize 和 across 可以更高效地完成您的任务。您可以这样做:
library(dplyr)
library(tidyr)
# Per (group, dates) cell: row count plus the mean and the 30% quantile of
# var_1 and var_2; afterwards every statistic is spread into one column per
# value of `dates`.
dt %>%
  # Grouping key changed from "data_nr" to "group".
  group_by(group, dates) %>%
  summarise(
    counts = n(),
    across(
      c(var_1, var_2),
      list(
        mean = mean,
        `30thquantile` = function(x) quantile(x, probs = 0.3)
      ),
      .names = "{.fn}_{.col}"
    ),
    .groups = "drop"
  ) %>%
  # ... and "group" (instead of "data_nr") is excluded from the value columns
  # here as well.
  pivot_wider(names_from = dates, values_from = -c(group, dates))
输出
# A tibble: 3 x 26
group `counts_2010-201~ `counts_2011-20~ `counts_2012-20~ `counts_2013-20~ `counts_2014-20~ `mean_var_1_201~ `mean_var_1_201~
<fct> <int> <int> <int> <int> <int> <dbl> <dbl>
1 A 412 193 84 85 97 9.88 9.75
2 B 209 74 37 49 43 10.0 11.4
3 C 135 87 44 33 33 10.5 9.33
# ... with 18 more variables: mean_var_1_2012-2013 <dbl>, mean_var_1_2013-2014 <dbl>, mean_var_1_2014-2015 <dbl>,
# 30thquantile_var_1_2010-2011 <dbl>, 30thquantile_var_1_2011-2012 <dbl>, 30thquantile_var_1_2012-2013 <dbl>,
# 30thquantile_var_1_2013-2014 <dbl>, 30thquantile_var_1_2014-2015 <dbl>, mean_var_2_2010-2011 <dbl>,
# mean_var_2_2011-2012 <dbl>, mean_var_2_2012-2013 <dbl>, mean_var_2_2013-2014 <dbl>, mean_var_2_2014-2015 <dbl>,
# 30thquantile_var_2_2010-2011 <dbl>, 30thquantile_var_2_2011-2012 <dbl>, 30thquantile_var_2_2012-2013 <dbl>,
# 30thquantile_var_2_2013-2014 <dbl>, 30thquantile_var_2_2014-2015 <dbl>