为每个唯一组创建一个新变量

Creating a New Variable for Each Unique Group

我正在使用 R。

我有以下5个数据集(data_1、data_2、data_3、data_4、data_5):

# Category labels shared by all five simulated data sets.
v1 <- c("2010-2011", "2011-2012", "2012-2013", "2013-2014", "2014-2015")
v2 <- c("A", "B", "C", "D", "E")

# Build one simulated data set with `n` rows.
#
# Args:
#   n: number of rows to generate.
# Returns:
#   A data.frame with numeric columns `var_1` (rnorm, mean 10, sd 10) and
#   `var_2` (rnorm, mean 5, sd 5), plus factor columns `dates` (sampled
#   from v1) and `types` (sampled from v2).
#
# Random draws are made in the order var_1, var_2, dates, types, so calling
# this helper once per data set consumes the RNG stream in a fixed order.
make_data <- function(n) {
  out <- data.frame(var_1 = rnorm(n, 10, 10), var_2 = rnorm(n, 5, 5))
  out$dates <- as.factor(
    sample(v1, n, replace = TRUE, prob = c(0.5, 0.2, 0.1, 0.1, 0.1))
  )
  # These weights sum to 0.8, not 1; sample() treats `prob` as relative
  # weights, so they do not need to sum to one.
  out$types <- as.factor(
    sample(v2, n, replace = TRUE, prob = c(0.3, 0.2, 0.1, 0.1, 0.1))
  )
  out
}

# The five data sets differ only in their number of rows.
data_1 <- make_data(871)
data_2 <- make_data(412)
data_3 <- make_data(332)
data_4 <- make_data(611)
data_5 <- make_data(789)

# Preview the first rows of one of the data sets (data_1)

head(data_1)
      var_1     var_2     dates types
1  8.523382  4.945344 2010-2011     E
2 14.137515  3.223525 2012-2013     A
3 19.610770  7.762698 2011-2012     D
4 11.334196 10.879946 2012-2013     E
5 -1.406475  2.498347 2011-2012     E
6 11.116458  9.988073 2011-2012     E

根据以上数据,我创建了一个汇总表(summary_table):

# One row per data set: its name, its row count, and the means of var_1 and
# var_2.
#
# Fix: the first element of mean_var_2 used to be mean(data_2$var_1), a
# copy-paste slip. The printed table that follows was produced with that
# slip, which is why its first mean_var_2 value (9.853399) repeats the
# data_2 entry of mean_var_1 instead of showing the mean of data_1$var_2.
summary_table <- data.frame(
  names = c("data_1", "data_2", "data_3", "data_4", "data_5"),
  counts = c(
    nrow(data_1), nrow(data_2), nrow(data_3), nrow(data_4), nrow(data_5)
  ),
  mean_var_1 = c(
    mean(data_1$var_1), mean(data_2$var_1), mean(data_3$var_1),
    mean(data_4$var_1), mean(data_5$var_1)
  ),
  mean_var_2 = c(
    mean(data_1$var_2), mean(data_2$var_2), mean(data_3$var_2),
    mean(data_4$var_2), mean(data_5$var_2)
  )
)


   names counts mean_var_1 mean_var_2
1 data_1    871   9.426475   9.853399
2 data_2    412   9.853399   4.680188
3 data_3    332  10.275049   5.256084
4 data_4    611  10.094421   5.323108
5 data_5    789   9.960050   4.946458

我想向上面的汇总表添加 5 个新列,分别包含每个年度区间(dates)的计数。结果应该像这样(这是一个空模板):

# Empty template (zero rows) showing the desired column layout: the data set
# name, its total row count, and one count column per period.
x <- c(
  "names", "counts",
  "counts 2010-2011", "counts 2011-2012", "counts 2012-2013",
  "counts 2013-2014", "counts 2014-2015"
)
df <- setNames(data.frame(matrix(nrow = 0, ncol = 7)), x)

我知道如何手动执行此操作,但需要很长时间:

    library(dplyr)
    
     # Row counts per `dates` level for each data set, stored as plain
     # data frames with columns `dates` and `my_counts`.
     summary_1 <- data_1 %>% count(dates, name = "my_counts") %>% as.data.frame()
     summary_2 <- data_2 %>% count(dates, name = "my_counts") %>% as.data.frame()
     summary_3 <- data_3 %>% count(dates, name = "my_counts") %>% as.data.frame()
     summary_4 <- data_4 %>% count(dates, name = "my_counts") %>% as.data.frame()
     summary_5 <- data_5 %>% count(dates, name = "my_counts") %>% as.data.frame()

# View a sample of the output: the per-period row counts for data_1
summary_1

      dates my_counts
1 2010-2011       407
2 2011-2012       189
3 2012-2013        79
4 2013-2014       101
5 2014-2015        95

但我必须手动创建 5 个新列并手动复制这 25 个计数 (5 x 5 = 25)。

有人可以告诉我一个更快的方法吗?

谢谢!

所需输出示例

也许您可以考虑在下次使用随机值时先调用 set.seed(),这样回答者就能使用完全相同的采样数据。此外,您可以考虑在定义 v1 时使用 _ 而不是 - 符号,以避免在结果数据框中使用反引号,因为在 R 中,含有减号的列名必须用反引号括起来才能引用。

您可以使用命名列表:给列表元素命名后,bind_rows(.id = "data_nr") 会在 data_nr 列中填入 data_1、data_2 等名称,而不是 1、2 等序号:

library(tidyverse)

# Stack the five data sets, count rows per data set and period, then spread
# the periods into one "counts_<period>" column each.
list(data_1, data_2, data_3, data_4, data_5) %>%
  # Name the list elements data_1, data_2, ... so that bind_rows() writes
  # those names (rather than 1, 2, ...) into the data_nr column.
  set_names(paste0("data_", seq_along(.))) %>%
  bind_rows(.id = "data_nr") %>%
  count(data_nr, dates, name = "my_counts") %>%
  pivot_wider(
    names_from = dates,
    values_from = my_counts,
    names_prefix = "counts_",
    # A data set with no rows for some period has no row in the counts;
    # report that as a count of 0 instead of NA.
    values_fill = 0L
  )

       data_nr `counts_2010-2011` `counts_2011-2012` `counts_2012-2013` `counts_2013-2014` `counts_2014-2015`
  <chr>                <int>              <int>              <int>              <int>              <int>
1 data_1                 437                161                 93                 88                 92
2 data_2                 218                 68                 40                 36                 50
3 data_3                 170                 58                 35                 34                 35
4 data_4                 331                114                 65                 54                 47
5 data_5                 398                146                 89                 78                 78