分组 summaries/subsets dplyr
group-wise summaries/subsets dplyr
我有一个包含 2 个不同学期的两门课程的数据集,其形式如下:
# Reproducible toy data: 200 rows, each with a semester, a course and two
# 0/1 indicator columns (d.gender, d.pass).
set.seed(200)
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
sem <- sample(c("1", "2"), 200, replace = TRUE)
course <- sample(c("1", "2"), 200, replace = TRUE)
# prob gives the sampling weights for the values 0 and 1, in that order.
d.gender <- sample(c(0, 1), 200, replace = TRUE, prob = c(0.6, 0.4))
d.pass <- sample(c(0, 1), 200, replace = TRUE, prob = c(0.7, 0.3))
df <- data.frame(sem, course, d.gender, d.pass)
我想高效地生成一张表,其中包含 4 种不同的学期(sem)与课程(course)组合,以及每种组合的总体通过率、d.gender = 1 的占比,还有两种性别各自的通过率。我可以先做出一张 table 来提供计算所需的全部计数,但我相信一定有更高效的方法,不必嵌套一堆不同的 group_by 和 summarise 调用,也不必创建一大堆 tbl 再用 left_join 拼接我想要的列。目前我通过索引和 subset 函数可以得到想要的结果(一个包含所需全部内容的 4 行矩阵),但我希望有更好的办法,因为下面的代码很难看、写起来很费时,而且很容易出错:
# Count the rows in every (sem, course, d.gender, d.pass) combination.
# Needs dplyr attached for %>%, group_by(), summarize(), n() and first().
df1 <- df %>% group_by(sem, course, d.gender, d.pass) %>% summarize(total = n())
# NOTE(review): every hard-coded row range below (1:4, 5:8, 9:12, 13:16)
# assumes df1 has exactly 16 rows ordered by sem, then course, i.e. four
# rows per (sem, course) pair -- confirm before reusing on other data.
# The filters compare the numeric d.gender / d.pass columns with the strings
# "0" / "1"; this relies on R coercing the numbers to character.
#
# total_pass: summed counts of the d.pass == 1 rows of each (sem, course)
# pair, written to all four rows of that pair.
df1$total_pass <- rep(NA, dim(df1)[1])
df1$total_pass[1:4] <- sum(subset(df1, sem == "1" & course == "1" & d.pass == "1",
select = total))
df1$total_pass[5:8] <- sum(subset(df1, sem == "1" & course == "2" & d.pass == "1",
select = total))
df1$total_pass[9:12] <- sum(subset(df1, sem == "2" & course == "1" & d.pass == "1",
select = total))
df1$total_pass[13:16] <- sum(subset(df1, sem == "2" & course == "2" & d.pass == "1",
select = total))
# n_male: summed counts of the d.gender == 1 rows of each pair.
df1$n_male <- rep(NA, dim(df1)[1])
df1$n_male[1:4] <- sum(subset(df1, sem == "1" & course == "1" & d.gender == "1",
select = total))
df1$n_male[5:8] <- sum(subset(df1, sem == "1" & course == "2" & d.gender == "1",
select = total))
df1$n_male[9:12] <- sum(subset(df1, sem == "2" & course == "1" & d.gender == "1",
select = total))
df1$n_male[13:16] <- sum(subset(df1, sem == "2" & course == "2" & d.gender == "1",
select = total))
# n_fem: summed counts of the d.gender == 0 rows of each pair.
df1$n_fem <- rep(NA, dim(df1)[1])
df1$n_fem[1:4] <- sum(subset(df1, sem == "1" & course == "1" & d.gender == "0", select = total))
df1$n_fem[5:8] <- sum(subset(df1, sem == "1" & course == "2" & d.gender == "0", select = total))
df1$n_fem[9:12] <- sum(subset(df1, sem == "2" & course == "1" & d.gender == "0",
select = total))
df1$n_fem[13:16] <- sum(subset(df1, sem == "2" & course == "2" & d.gender == "0",
select = total))
# pct_male: n_male divided by the total row count of the (sem, course) pair.
df1$pct_male <- rep(NA, dim(df1)[1])
df1$pct_male[1:4] <- df1$n_male[1:4]/sum(subset(df1, sem == "1" & course == "1",
select = total))
df1$pct_male[5:8] <- df1$n_male[5:8]/sum(subset(df1, sem == "1" & course == "2",
select = total))
df1$pct_male[9:12] <- df1$n_male[9:12]/sum(subset(df1, sem == "2" & course == "1",
select = total))
df1$pct_male[13:16] <- df1$n_male[13:16]/sum(subset(df1, sem == "2" & course == "2",
select = total))
# pct_fem is the complement of pct_male; the NA initialisation on the first
# line is overwritten immediately by the second.
df1$pct_fem <- rep(NA, dim(df1)[1])
df1$pct_fem <- 1 - df1$pct_male
# pct_pass: total_pass divided by the total row count of the pair.
df1$pct_pass <- rep(NA, dim(df1)[1])
df1$pct_pass[1:4] <- df1$total_pass[1:4]/sum(subset(df1, sem == "1" & course == "1",
select = total))
df1$pct_pass[5:8] <- df1$total_pass[5:8]/sum(subset(df1, sem == "1" & course == "2",
select = total))
df1$pct_pass[9:12] <- df1$total_pass[9:12]/sum(subset(df1, sem == "2" & course ==
"1", select = total))
df1$pct_pass[13:16] <- df1$total_pass[13:16]/sum(subset(df1, sem == "2" & course ==
"2", select = total))
# male_pass_pct: count of rows with d.gender == 1 and d.pass == 1, divided
# by n_male.
# NOTE(review): unlike the blocks above, the subset() result is not wrapped
# in sum(), so the right-hand side is a data frame rather than a plain
# number; presumably this is why these two columns need as.numeric() at the
# end -- confirm.
df1$male_pass_pct <- rep(NA, dim(df1)[1])
df1$male_pass_pct[1:4] <- subset(df1, sem == "1" & course == "1" & d.gender == "1" &
d.pass == "1", select = total)/df1$n_male[1:4]
df1$male_pass_pct[5:8] <- subset(df1, sem == "1" & course == "2" & d.gender == "1" &
d.pass == "1", select = total)/df1$n_male[5:8]
df1$male_pass_pct[9:12] <- subset(df1, sem == "2" & course == "1" & d.gender == "1" &
d.pass == "1", select = total)/df1$n_male[9:12]
df1$male_pass_pct[13:16] <- subset(df1, sem == "2" & course == "2" & d.gender ==
"1" & d.pass == "1", select = total)/df1$n_male[13:16]
# fem_pass_pct: same calculation for d.gender == 0, divided by n_fem.
df1$fem_pass_pct <- rep(NA, dim(df1)[1])
df1$fem_pass_pct[1:4] <- subset(df1, sem == "1" & course == "1" & d.gender == "0" &
d.pass == "1", select = total)/df1$n_fem[1:4]
df1$fem_pass_pct[5:8] <- subset(df1, sem == "1" & course == "2" & d.gender == "0" &
d.pass == "1", select = total)/df1$n_fem[5:8]
df1$fem_pass_pct[9:12] <- subset(df1, sem == "2" & course == "1" & d.gender == "0" &
d.pass == "1", select = total)/df1$n_fem[9:12]
df1$fem_pass_pct[13:16] <- subset(df1, sem == "2" & course == "2" & d.gender == "0" &
d.pass == "1", select = total)/df1$n_fem[13:16]
# First attempt at the 4-row result: keep the first value of every derived
# column within each (sem, course) pair. This df2 is overwritten by the
# unique() assignment that follows.
df2 <- df1 %>%
group_by(sem, course) %>%
summarize(total_pass = first(total_pass),
pct_pass = first(pct_pass),
n_male = first(n_male),
n_fem = first(n_fem),
pct_male = first(pct_male),
pct_fem = first(pct_fem),
male_pass_pct = first(male_pass_pct),
fem_pass_pct = first(fem_pass_pct))
# Second attempt: select columns by position and drop duplicate rows.
# Given the order in which the columns were created above, the positions are
# 1 = sem, 2 = course, 6 = total_pass, 11 = pct_pass, 7:10 = n_male, n_fem,
# pct_male, pct_fem, 12 = male_pass_pct, 13 = fem_pass_pct.
df2 <- unique(df1[, c(1, 2, 6, 11, 7:10, 12, 13)])
# Columns 9 and 10 of df2 are male_pass_pct and fem_pass_pct; convert them
# to numeric.
df2[, c(9, 10)] <- lapply(df2[, c(9, 10)], as.numeric)
为了得到一个只有 4 行的指标表,这样做实在太费力了,但我始终没能让这种聚合正常工作……非常感谢任何帮助!
只需对原始数据框分组,然后 summarise 即可。您可以使用 n()
来引用组中的行数,并且可以引用之前在同一个 summarise
调用中创建的变量,因此可以这样写:
# One output row per (sem, course) pair. Inside summarise(), n() is the number
# of rows in the current group, and later expressions can reuse columns
# defined earlier in the same call (total_pass, n_male, n_fem).
df %>%
  group_by(sem, course) %>%
  summarise(
    # Counts: summing the 0/1 indicators counts the 1s.
    total_pass = sum(d.pass),
    n_male = sum(d.gender),
    n_fem = sum(d.gender == 0),
    # Shares of the whole group.
    pct_male = n_male / n(),
    pct_fem = n_fem / n(),
    pct_pass = total_pass / n(),
    # Pass rates within each gender; & treats the non-zero values as TRUE.
    male_pass_pct = sum(d.gender & d.pass) / n_male,
    fem_pass_pct = sum(d.gender == 0 & d.pass) / n_fem
  )
## Source: local data frame [4 x 10]
## Groups: sem [?]
##
## sem course total_pass n_male n_fem pct_male pct_fem pct_pass male_pass_pct fem_pass_pct
## <fctr> <fctr> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 1 14 20 30 0.4000000 0.6000000 0.2800000 0.25000000 0.3000000
## 2 1 2 7 19 26 0.4222222 0.5777778 0.1555556 0.05263158 0.2307692
## 3 2 1 12 23 23 0.5000000 0.5000000 0.2608696 0.30434783 0.2173913
## 4 2 2 16 25 34 0.4237288 0.5762712 0.2711864 0.20000000 0.3235294
如果您愿意,也可以重塑数据,把性别从列名(headers)变成一个实际的变量,这样数据会更整洁,所需的操作也更少。
我有一个包含 2 个不同学期的两门课程的数据集,其形式如下:
# Reproducible toy data: 200 rows, each with a semester, a course and two
# 0/1 indicator columns (d.gender, d.pass).
set.seed(200)
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
sem <- sample(c("1", "2"), 200, replace = TRUE)
course <- sample(c("1", "2"), 200, replace = TRUE)
# prob gives the sampling weights for the values 0 and 1, in that order.
d.gender <- sample(c(0, 1), 200, replace = TRUE, prob = c(0.6, 0.4))
d.pass <- sample(c(0, 1), 200, replace = TRUE, prob = c(0.7, 0.3))
df <- data.frame(sem, course, d.gender, d.pass)
我想高效地生成一张表,其中包含 4 种不同的学期(sem)与课程(course)组合,以及每种组合的总体通过率、d.gender = 1 的占比,还有两种性别各自的通过率。我可以先做出一张 table 来提供计算所需的全部计数,但我相信一定有更高效的方法,不必嵌套一堆不同的 group_by 和 summarise 调用,也不必创建一大堆 tbl 再用 left_join 拼接我想要的列。目前我通过索引和 subset 函数可以得到想要的结果(一个包含所需全部内容的 4 行矩阵),但我希望有更好的办法,因为下面的代码很难看、写起来很费时,而且很容易出错:
# Count the rows in every (sem, course, d.gender, d.pass) combination.
# Needs dplyr attached for %>%, group_by(), summarize(), n() and first().
df1 <- df %>% group_by(sem, course, d.gender, d.pass) %>% summarize(total = n())
# NOTE(review): every hard-coded row range below (1:4, 5:8, 9:12, 13:16)
# assumes df1 has exactly 16 rows ordered by sem, then course, i.e. four
# rows per (sem, course) pair -- confirm before reusing on other data.
# The filters compare the numeric d.gender / d.pass columns with the strings
# "0" / "1"; this relies on R coercing the numbers to character.
#
# total_pass: summed counts of the d.pass == 1 rows of each (sem, course)
# pair, written to all four rows of that pair.
df1$total_pass <- rep(NA, dim(df1)[1])
df1$total_pass[1:4] <- sum(subset(df1, sem == "1" & course == "1" & d.pass == "1",
select = total))
df1$total_pass[5:8] <- sum(subset(df1, sem == "1" & course == "2" & d.pass == "1",
select = total))
df1$total_pass[9:12] <- sum(subset(df1, sem == "2" & course == "1" & d.pass == "1",
select = total))
df1$total_pass[13:16] <- sum(subset(df1, sem == "2" & course == "2" & d.pass == "1",
select = total))
# n_male: summed counts of the d.gender == 1 rows of each pair.
df1$n_male <- rep(NA, dim(df1)[1])
df1$n_male[1:4] <- sum(subset(df1, sem == "1" & course == "1" & d.gender == "1",
select = total))
df1$n_male[5:8] <- sum(subset(df1, sem == "1" & course == "2" & d.gender == "1",
select = total))
df1$n_male[9:12] <- sum(subset(df1, sem == "2" & course == "1" & d.gender == "1",
select = total))
df1$n_male[13:16] <- sum(subset(df1, sem == "2" & course == "2" & d.gender == "1",
select = total))
# n_fem: summed counts of the d.gender == 0 rows of each pair.
df1$n_fem <- rep(NA, dim(df1)[1])
df1$n_fem[1:4] <- sum(subset(df1, sem == "1" & course == "1" & d.gender == "0", select = total))
df1$n_fem[5:8] <- sum(subset(df1, sem == "1" & course == "2" & d.gender == "0", select = total))
df1$n_fem[9:12] <- sum(subset(df1, sem == "2" & course == "1" & d.gender == "0",
select = total))
df1$n_fem[13:16] <- sum(subset(df1, sem == "2" & course == "2" & d.gender == "0",
select = total))
# pct_male: n_male divided by the total row count of the (sem, course) pair.
df1$pct_male <- rep(NA, dim(df1)[1])
df1$pct_male[1:4] <- df1$n_male[1:4]/sum(subset(df1, sem == "1" & course == "1",
select = total))
df1$pct_male[5:8] <- df1$n_male[5:8]/sum(subset(df1, sem == "1" & course == "2",
select = total))
df1$pct_male[9:12] <- df1$n_male[9:12]/sum(subset(df1, sem == "2" & course == "1",
select = total))
df1$pct_male[13:16] <- df1$n_male[13:16]/sum(subset(df1, sem == "2" & course == "2",
select = total))
# pct_fem is the complement of pct_male; the NA initialisation on the first
# line is overwritten immediately by the second.
df1$pct_fem <- rep(NA, dim(df1)[1])
df1$pct_fem <- 1 - df1$pct_male
# pct_pass: total_pass divided by the total row count of the pair.
df1$pct_pass <- rep(NA, dim(df1)[1])
df1$pct_pass[1:4] <- df1$total_pass[1:4]/sum(subset(df1, sem == "1" & course == "1",
select = total))
df1$pct_pass[5:8] <- df1$total_pass[5:8]/sum(subset(df1, sem == "1" & course == "2",
select = total))
df1$pct_pass[9:12] <- df1$total_pass[9:12]/sum(subset(df1, sem == "2" & course ==
"1", select = total))
df1$pct_pass[13:16] <- df1$total_pass[13:16]/sum(subset(df1, sem == "2" & course ==
"2", select = total))
# male_pass_pct: count of rows with d.gender == 1 and d.pass == 1, divided
# by n_male.
# NOTE(review): unlike the blocks above, the subset() result is not wrapped
# in sum(), so the right-hand side is a data frame rather than a plain
# number; presumably this is why these two columns need as.numeric() at the
# end -- confirm.
df1$male_pass_pct <- rep(NA, dim(df1)[1])
df1$male_pass_pct[1:4] <- subset(df1, sem == "1" & course == "1" & d.gender == "1" &
d.pass == "1", select = total)/df1$n_male[1:4]
df1$male_pass_pct[5:8] <- subset(df1, sem == "1" & course == "2" & d.gender == "1" &
d.pass == "1", select = total)/df1$n_male[5:8]
df1$male_pass_pct[9:12] <- subset(df1, sem == "2" & course == "1" & d.gender == "1" &
d.pass == "1", select = total)/df1$n_male[9:12]
df1$male_pass_pct[13:16] <- subset(df1, sem == "2" & course == "2" & d.gender ==
"1" & d.pass == "1", select = total)/df1$n_male[13:16]
# fem_pass_pct: same calculation for d.gender == 0, divided by n_fem.
df1$fem_pass_pct <- rep(NA, dim(df1)[1])
df1$fem_pass_pct[1:4] <- subset(df1, sem == "1" & course == "1" & d.gender == "0" &
d.pass == "1", select = total)/df1$n_fem[1:4]
df1$fem_pass_pct[5:8] <- subset(df1, sem == "1" & course == "2" & d.gender == "0" &
d.pass == "1", select = total)/df1$n_fem[5:8]
df1$fem_pass_pct[9:12] <- subset(df1, sem == "2" & course == "1" & d.gender == "0" &
d.pass == "1", select = total)/df1$n_fem[9:12]
df1$fem_pass_pct[13:16] <- subset(df1, sem == "2" & course == "2" & d.gender == "0" &
d.pass == "1", select = total)/df1$n_fem[13:16]
# First attempt at the 4-row result: keep the first value of every derived
# column within each (sem, course) pair. This df2 is overwritten by the
# unique() assignment that follows.
df2 <- df1 %>%
group_by(sem, course) %>%
summarize(total_pass = first(total_pass),
pct_pass = first(pct_pass),
n_male = first(n_male),
n_fem = first(n_fem),
pct_male = first(pct_male),
pct_fem = first(pct_fem),
male_pass_pct = first(male_pass_pct),
fem_pass_pct = first(fem_pass_pct))
# Second attempt: select columns by position and drop duplicate rows.
# Given the order in which the columns were created above, the positions are
# 1 = sem, 2 = course, 6 = total_pass, 11 = pct_pass, 7:10 = n_male, n_fem,
# pct_male, pct_fem, 12 = male_pass_pct, 13 = fem_pass_pct.
df2 <- unique(df1[, c(1, 2, 6, 11, 7:10, 12, 13)])
# Columns 9 and 10 of df2 are male_pass_pct and fem_pass_pct; convert them
# to numeric.
df2[, c(9, 10)] <- lapply(df2[, c(9, 10)], as.numeric)
为了得到一个只有 4 行的指标表,这样做实在太费力了,但我始终没能让这种聚合正常工作……非常感谢任何帮助!
只需对原始数据框分组,然后 summarise 即可。您可以使用 n()
来引用组中的行数,并且可以引用之前在同一个 summarise
调用中创建的变量,因此可以这样写:
# One output row per (sem, course) pair. Inside summarise(), n() is the number
# of rows in the current group, and later expressions can reuse columns
# defined earlier in the same call (total_pass, n_male, n_fem).
df %>%
  group_by(sem, course) %>%
  summarise(
    # Counts: summing the 0/1 indicators counts the 1s.
    total_pass = sum(d.pass),
    n_male = sum(d.gender),
    n_fem = sum(d.gender == 0),
    # Shares of the whole group.
    pct_male = n_male / n(),
    pct_fem = n_fem / n(),
    pct_pass = total_pass / n(),
    # Pass rates within each gender; & treats the non-zero values as TRUE.
    male_pass_pct = sum(d.gender & d.pass) / n_male,
    fem_pass_pct = sum(d.gender == 0 & d.pass) / n_fem
  )
## Source: local data frame [4 x 10]
## Groups: sem [?]
##
## sem course total_pass n_male n_fem pct_male pct_fem pct_pass male_pass_pct fem_pass_pct
## <fctr> <fctr> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 1 14 20 30 0.4000000 0.6000000 0.2800000 0.25000000 0.3000000
## 2 1 2 7 19 26 0.4222222 0.5777778 0.1555556 0.05263158 0.2307692
## 3 2 1 12 23 23 0.5000000 0.5000000 0.2608696 0.30434783 0.2173913
## 4 2 2 16 25 34 0.4237288 0.5762712 0.2711864 0.20000000 0.3235294
如果您愿意,也可以重塑数据,把性别从列名(headers)变成一个实际的变量,这样数据会更整洁,所需的操作也更少。