为每个唯一组创建一个新变量
Creating a New Variable for Each Unique Group
我正在使用 R。
我有以下5个数据集(data_1、data_2、data_3、data_4、data_5):
# Candidate values for the two categorical columns.
v1 <- c("2010-2011", "2011-2012", "2012-2013", "2013-2014", "2014-2015")
v2 <- c("A", "B", "C", "D", "E")

# Build one simulated data set with `n_rows` rows.
# Draw order is var_1, var_2, dates, types, so the random stream is consumed
# exactly as if each column were generated by hand, one statement at a time.
simulate_dataset <- function(n_rows) {
  simulated <- data.frame(
    var_1 = rnorm(n_rows, mean = 10, sd = 10),
    var_2 = rnorm(n_rows, mean = 5, sd = 5)
  )
  simulated$dates <- as.factor(
    sample(v1, size = n_rows, replace = TRUE, prob = c(0.5, 0.2, 0.1, 0.1, 0.1))
  )
  # These weights sum to 0.8; sample() rescales them internally.
  simulated$types <- as.factor(
    sample(v2, size = n_rows, replace = TRUE, prob = c(0.3, 0.2, 0.1, 0.1, 0.1))
  )
  simulated
}

data_1 <- simulate_dataset(871)
data_2 <- simulate_dataset(412)
data_3 <- simulate_dataset(332)
data_4 <- simulate_dataset(611)
data_5 <- simulate_dataset(789)

# Preview the first rows of one of the data sets.
head(data_1)
var_1 var_2 dates types
1 8.523382 4.945344 2010-2011 E
2 14.137515 3.223525 2012-2013 A
3 19.610770 7.762698 2011-2012 D
4 11.334196 10.879946 2012-2013 E
5 -1.406475 2.498347 2011-2012 E
6 11.116458 9.988073 2011-2012 E
根据以上数据,我做了一个table,提供汇总:
# Per-data-set summary: row count and the mean of each numeric variable.
# Every column lists its values in the order data_1, ..., data_5, so row i
# of the table describes data_i.
summary_table <- data.frame(
  names = c("data_1", "data_2", "data_3", "data_4", "data_5"),
  counts = c(
    nrow(data_1), nrow(data_2), nrow(data_3), nrow(data_4), nrow(data_5)
  ),
  mean_var_1 = c(
    mean(data_1$var_1), mean(data_2$var_1), mean(data_3$var_1),
    mean(data_4$var_1), mean(data_5$var_1)
  ),
  # The first entry must be data_1's var_2. It was mean(data_2$var_1), a
  # copy-paste slip. The sample output printed after this code was produced
  # with the old expression, so its first mean_var_2 value (9.853399) is
  # really data_2's mean of var_1.
  mean_var_2 = c(
    mean(data_1$var_2), mean(data_2$var_2), mean(data_3$var_2),
    mean(data_4$var_2), mean(data_5$var_2)
  )
)
names counts mean_var_1 mean_var_2
1 data_1 871 9.426475 9.853399
2 data_2 412 9.853399 4.680188
3 data_3 332 10.275049 5.256084
4 data_4 611 10.094421 5.323108
5 data_5 789 9.960050 4.946458
我想向上面的 table 添加 5 个新列,其中包含每个年份的计数。结果看起来像这样(这是一个空模板):
# Column names of the desired result: identifier, total count, then one
# count column per date range.
x <- c("names", "counts", "counts 2010-2011", "counts 2011-2012", "counts 2012-2013", "counts 2013-2014", "counts 2014-2015")

# Empty template: zero rows, one column per entry of `x`.
df <- setNames(data.frame(matrix(nrow = 0, ncol = 7)), x)
我知道如何手动执行此操作,但需要很长时间:
library(dplyr)
# Count the rows of one data set per level of `dates`.
# Returns a plain data.frame with the columns `dates` and `my_counts`.
count_by_dates <- function(data) {
  per_date <- summarise(group_by(data, dates), my_counts = n())
  data.frame(per_date)
}

summary_1 <- count_by_dates(data_1)
summary_2 <- count_by_dates(data_2)
summary_3 <- count_by_dates(data_3)
summary_4 <- count_by_dates(data_4)
summary_5 <- count_by_dates(data_5)

# View a sample of the output.
summary_1
dates my_counts
1 2010-2011 407
2 2011-2012 189
3 2012-2013 79
4 2013-2014 101
5 2014-2015 95
但我必须手动创建 5 个新列并手动复制这 25 个计数 (5 x 5 = 25)。
有人可以告诉我一个更快的方法吗?
谢谢!
所需输出示例
也许你可以考虑在下次使用随机值时使用 set.seed(),这样回答的人就可以使用完全相同的采样数据。此外,您可以考虑在定义 v1 时使用 _ 而不是 - 符号,以避免在结果数据框中使用反引号,因为 R 不喜欢列名中的减号。
您可以使用命名列表在 data_nr 列中包含 data_1、data_2 等值,而不是 1、2 等
library(tidyverse)
# Stack the five data sets, tag every row with its source ("data_1", ...),
# count rows per source and `dates` level, then spread the counts into one
# "counts_<dates>" column per level (one row per source).
list(data_1, data_2, data_3, data_4, data_5) %>%
  # seq_along() avoids the 1:0 pitfall that 1:length() has on empty input.
  set_names(paste0("data_", seq_along(.))) %>%
  bind_rows(.id = "data_nr") %>%
  count(data_nr, dates, name = "my_counts") %>%
  pivot_wider(
    names_from = dates,
    values_from = my_counts,
    names_prefix = "counts_",
    # A source with no rows for some `dates` level has a count of 0, not NA.
    values_fill = 0L
  )
data_nr `counts_2010-2011` `counts_2011-2012` `counts_2012-2013` `counts_2013-2014` `counts_2014-2015`
<chr> <int> <int> <int> <int> <int>
1 data_1 437 161 93 88 92
2 data_2 218 68 40 36 50
3 data_3 170 58 35 34 35
4 data_4 331 114 65 54 47
5 data_5 398 146 89 78 78
我正在使用 R。
我有以下5个数据集(data_1、data_2、data_3、data_4、data_5):
# Candidate values for the two categorical columns.
v1 <- c("2010-2011", "2011-2012", "2012-2013", "2013-2014", "2014-2015")
v2 <- c("A", "B", "C", "D", "E")

# Build one simulated data set with `n_rows` rows.
# Draw order is var_1, var_2, dates, types, so the random stream is consumed
# exactly as if each column were generated by hand, one statement at a time.
simulate_dataset <- function(n_rows) {
  simulated <- data.frame(
    var_1 = rnorm(n_rows, mean = 10, sd = 10),
    var_2 = rnorm(n_rows, mean = 5, sd = 5)
  )
  simulated$dates <- as.factor(
    sample(v1, size = n_rows, replace = TRUE, prob = c(0.5, 0.2, 0.1, 0.1, 0.1))
  )
  # These weights sum to 0.8; sample() rescales them internally.
  simulated$types <- as.factor(
    sample(v2, size = n_rows, replace = TRUE, prob = c(0.3, 0.2, 0.1, 0.1, 0.1))
  )
  simulated
}

data_1 <- simulate_dataset(871)
data_2 <- simulate_dataset(412)
data_3 <- simulate_dataset(332)
data_4 <- simulate_dataset(611)
data_5 <- simulate_dataset(789)

# Preview the first rows of one of the data sets.
head(data_1)
var_1 var_2 dates types
1 8.523382 4.945344 2010-2011 E
2 14.137515 3.223525 2012-2013 A
3 19.610770 7.762698 2011-2012 D
4 11.334196 10.879946 2012-2013 E
5 -1.406475 2.498347 2011-2012 E
6 11.116458 9.988073 2011-2012 E
根据以上数据,我做了一个table,提供汇总:
# Per-data-set summary: row count and the mean of each numeric variable.
# Every column lists its values in the order data_1, ..., data_5, so row i
# of the table describes data_i.
summary_table <- data.frame(
  names = c("data_1", "data_2", "data_3", "data_4", "data_5"),
  counts = c(
    nrow(data_1), nrow(data_2), nrow(data_3), nrow(data_4), nrow(data_5)
  ),
  mean_var_1 = c(
    mean(data_1$var_1), mean(data_2$var_1), mean(data_3$var_1),
    mean(data_4$var_1), mean(data_5$var_1)
  ),
  # The first entry must be data_1's var_2. It was mean(data_2$var_1), a
  # copy-paste slip. The sample output printed after this code was produced
  # with the old expression, so its first mean_var_2 value (9.853399) is
  # really data_2's mean of var_1.
  mean_var_2 = c(
    mean(data_1$var_2), mean(data_2$var_2), mean(data_3$var_2),
    mean(data_4$var_2), mean(data_5$var_2)
  )
)
names counts mean_var_1 mean_var_2
1 data_1 871 9.426475 9.853399
2 data_2 412 9.853399 4.680188
3 data_3 332 10.275049 5.256084
4 data_4 611 10.094421 5.323108
5 data_5 789 9.960050 4.946458
我想向上面的 table 添加 5 个新列,其中包含每个年份的计数。结果看起来像这样(这是一个空模板):
# Column names of the desired result: identifier, total count, then one
# count column per date range.
x <- c("names", "counts", "counts 2010-2011", "counts 2011-2012", "counts 2012-2013", "counts 2013-2014", "counts 2014-2015")

# Empty template: zero rows, one column per entry of `x`.
df <- setNames(data.frame(matrix(nrow = 0, ncol = 7)), x)
我知道如何手动执行此操作,但需要很长时间:
library(dplyr)
# Count the rows of one data set per level of `dates`.
# Returns a plain data.frame with the columns `dates` and `my_counts`.
count_by_dates <- function(data) {
  per_date <- summarise(group_by(data, dates), my_counts = n())
  data.frame(per_date)
}

summary_1 <- count_by_dates(data_1)
summary_2 <- count_by_dates(data_2)
summary_3 <- count_by_dates(data_3)
summary_4 <- count_by_dates(data_4)
summary_5 <- count_by_dates(data_5)

# View a sample of the output.
summary_1
dates my_counts
1 2010-2011 407
2 2011-2012 189
3 2012-2013 79
4 2013-2014 101
5 2014-2015 95
但我必须手动创建 5 个新列并手动复制这 25 个计数 (5 x 5 = 25)。
有人可以告诉我一个更快的方法吗?
谢谢!
所需输出示例
也许你可以考虑在下次使用随机值时使用 set.seed(),这样回答的人就可以使用完全相同的采样数据。此外,您可以考虑在定义 v1 时使用 _ 而不是 - 符号,以避免在结果数据框中使用反引号,因为 R 不喜欢列名中的减号。
您可以使用命名列表在 data_nr 列中包含 data_1、data_2 等值,而不是 1、2 等
library(tidyverse)
# Stack the five data sets, tag every row with its source ("data_1", ...),
# count rows per source and `dates` level, then spread the counts into one
# "counts_<dates>" column per level (one row per source).
list(data_1, data_2, data_3, data_4, data_5) %>%
  # seq_along() avoids the 1:0 pitfall that 1:length() has on empty input.
  set_names(paste0("data_", seq_along(.))) %>%
  bind_rows(.id = "data_nr") %>%
  count(data_nr, dates, name = "my_counts") %>%
  pivot_wider(
    names_from = dates,
    values_from = my_counts,
    names_prefix = "counts_",
    # A source with no rows for some `dates` level has a count of 0, not NA.
    values_fill = 0L
  )
data_nr `counts_2010-2011` `counts_2011-2012` `counts_2012-2013` `counts_2013-2014` `counts_2014-2015`
<chr> <int> <int> <int> <int> <int>
1 data_1 437 161 93 88 92
2 data_2 218 68 40 36 50
3 data_3 170 58 35 34 35
4 data_4 331 114 65 54 47
5 data_5 398 146 89 78 78