创建一个包含计数和百分比的 table,但缺少数据
Create a table with counts and percentages with missing data
我正在尝试创建一个具有预设尺寸的 table 并让 R 填写计数和百分比。这是针对 R-markdown 报告的。
这是我的示例数据的代码。
#This is the most realistic data I could produce.
Maj <- rep("Major A", times=50)
set.seed(24601)
Race <- sample(c("Asian","Black", "Am Indian","Hawiian" ,"Hispanic","White","Two or More Races","Not Reported"),
prob=c(.01,.1,.01,.01,.02,.80,.05,.01),size=50, replace = T)
Sex <- sample(c("Female","Male"), prob=c(.98,.02),size=50,replace=T)
Enroll_MajorA <- cbind(Maj,Sex,Race)
我需要 table 来计算数据集中是否存在给定种族和性别组合的计数和百分比。这是我需要的样子。
我已经尝试单独计算 table 的每个值,R-markdown 给了我一个 "memory error"。我已经尝试创建一个计数和百分比 table 并将它们组合在一起,但它并没有提供报告所需的所有 Race/Sex 组合。我不知道从这里去哪里。请帮忙!
您可以使用 aggregate
。您可以保持矩阵不变,因为您可以使用 as.data.frame
,它会自动转换为可数因子。 NROW
(大写字母)不区分矩阵和向量。
m.agg <- do.call(data.frame,
aggregate(. ~ Sex + Race, as.data.frame(Enroll_MajorA), function(x)
c(count=as.integer(NROW(x)), share=NROW(x) / NROW(Enroll_MajorA))))
为了获得完整的集合,我们可能会合并一个 expand.grid
,我们可能需要稍微清理一下。
res <- merge(as.data.frame(m.agg), expand.grid(Sex=c("Female", "Male"),
Race=relevant.races), all=TRUE) # `relevant.races` below
res[, 3:4][is.na(res[, 3:4])] <- 0 # transform `NA` into 0 to be nice
res[order(res[, "Race"]), ] # order output
# Sex Race Maj.count Maj.share
# 1 Female Black 2 0.04
# 10 Male Black 0 0.00
# 2 Female Hawiian 1 0.02
# 3 Female Hispanic 1 0.02
# 11 Male Hispanic 0 0.00
# 4 Female Two or More Races 2 0.04
# 12 Male Two or More Races 0 0.00
# 5 Female White 44 0.88
# 13 Male White 0 0.00
# 6 Female Asian 0 0.00
# 14 Male Asian 0 0.00
# 7 Female Am. Indian 0 0.00
# 15 Male Am. Indian 0 0.00
# 8 Female Hawaiian 0 0.00
# 16 Male Hawaiian 0 0.00
# 9 Female Not Reported 0 0.00
# 17 Male Not Reported 0 0.00
数据
relevant.races <- c("Asian","Black", "Am. Indian", "Hawaiian" , "Hispanic", "White",
"Two or More Races", "Not Reported")
Enroll_MajorA <- structure(c("Major A", "Major A", "Major A", "Major A", "Major A",
"Major A", "Major A", "Major A", "Major A", "Major A", "Major A",
"Major A", "Major A", "Major A", "Major A", "Major A", "Major A",
"Major A", "Major A", "Major A", "Major A", "Major A", "Major A",
"Major A", "Major A", "Major A", "Major A", "Major A", "Major A",
"Major A", "Major A", "Major A", "Major A", "Major A", "Major A",
"Major A", "Major A", "Major A", "Major A", "Major A", "Major A",
"Major A", "Major A", "Major A", "Major A", "Major A", "Major A",
"Major A", "Major A", "Major A", "Female", "Female", "Female",
"Female", "Female", "Female", "Female", "Female", "Female", "Female",
"Female", "Female", "Female", "Female", "Female", "Female", "Female",
"Female", "Female", "Female", "Female", "Female", "Female", "Female",
"Female", "Female", "Female", "Female", "Female", "Female", "Female",
"Female", "Female", "Female", "Female", "Female", "Female", "Female",
"Female", "Female", "Female", "Female", "Female", "Female", "Female",
"Female", "Female", "Female", "Female", "Female", "White", "White",
"White", "Hawiian", "White", "White", "White", "White", "White",
"White", "White", "White", "White", "Two or More Races", "White",
"White", "White", "White", "White", "White", "White", "Hispanic",
"White", "White", "White", "White", "White", "White", "Two or More Races",
"White", "White", "White", "White", "White", "White", "White",
"White", "Black", "White", "White", "Black", "White", "White",
"White", "White", "White", "White", "White", "White", "White"
), .Dim = c(50L, 3L), .Dimnames = list(NULL, c("Maj", "Sex",
"Race")))
使用 tidyverse
的一种方法使用 .drop = FALSE
将包括缺失因子水平
library(tidyverse)
Enroll_MajorA %>%
group_by(Race, Sex, .drop = FALSE) %>%
summarise(count = n()) %>%
ungroup() %>%
mutate(perc = count/sum(count)) %>%
gather(key, value, -Sex, -Race) %>%
unite(Race, Race, key) %>%
spread(Race, value)
数据
正如@Cath 所评论的,我们需要在数据中明确包含所有级别。
Maj <- rep("Major A", times=50)
set.seed(24601)
Race <- factor(sample(c("Asian","Black", "Am Indian","Hawiian" ,"Hispanic","White","Two or More Races","Not Reported"),
prob=c(.01,.1,.01,.01,.02,.80,.05,.01),size=50, replace = T),
levels=c("Asian","Black", "Am Indian","Hawiian" ,"Hispanic","White","Two or More Races","Not Reported"))
Sex <- factor(sample(c("Female","Male"), prob=c(.98,.02),size=50,replace=T), levels = c("Female","Male"))
Enroll_MajorA <- data.frame(Maj,Sex,Race)
我正在尝试创建一个具有预设尺寸的 table 并让 R 填写计数和百分比。这是针对 R-markdown 报告的。
这是我的示例数据的代码。
#This is the most realistic data I could produce.
Maj <- rep("Major A", times=50)
set.seed(24601)
Race <- sample(c("Asian","Black", "Am Indian","Hawiian" ,"Hispanic","White","Two or More Races","Not Reported"),
prob=c(.01,.1,.01,.01,.02,.80,.05,.01),size=50, replace = T)
Sex <- sample(c("Female","Male"), prob=c(.98,.02),size=50,replace=T)
Enroll_MajorA <- cbind(Maj,Sex,Race)
我需要 table 来计算数据集中是否存在给定种族和性别组合的计数和百分比。这是我需要的样子。
我已经尝试单独计算 table 的每个值,R-markdown 给了我一个 "memory error"。我已经尝试创建一个计数和百分比 table 并将它们组合在一起,但它并没有提供报告所需的所有 Race/Sex 组合。我不知道从这里去哪里。请帮忙!
您可以使用 aggregate
。您可以保持矩阵不变,因为您可以使用 as.data.frame
,它会自动转换为可数因子。 NROW
(大写字母)不区分矩阵和向量。
m.agg <- do.call(data.frame,
aggregate(. ~ Sex + Race, as.data.frame(Enroll_MajorA), function(x)
c(count=as.integer(NROW(x)), share=NROW(x) / NROW(Enroll_MajorA))))
为了获得完整的集合,我们可能会合并一个 expand.grid
,我们可能需要稍微清理一下。
res <- merge(as.data.frame(m.agg), expand.grid(Sex=c("Female", "Male"),
Race=relevant.races), all=TRUE) # `relevant.races` below
res[, 3:4][is.na(res[, 3:4])] <- 0 # transform `NA` into 0 to be nice
res[order(res[, "Race"]), ] # order output
# Sex Race Maj.count Maj.share
# 1 Female Black 2 0.04
# 10 Male Black 0 0.00
# 2 Female Hawiian 1 0.02
# 3 Female Hispanic 1 0.02
# 11 Male Hispanic 0 0.00
# 4 Female Two or More Races 2 0.04
# 12 Male Two or More Races 0 0.00
# 5 Female White 44 0.88
# 13 Male White 0 0.00
# 6 Female Asian 0 0.00
# 14 Male Asian 0 0.00
# 7 Female Am. Indian 0 0.00
# 15 Male Am. Indian 0 0.00
# 8 Female Hawaiian 0 0.00
# 16 Male Hawaiian 0 0.00
# 9 Female Not Reported 0 0.00
# 17 Male Not Reported 0 0.00
数据
relevant.races <- c("Asian","Black", "Am. Indian", "Hawaiian" , "Hispanic", "White",
"Two or More Races", "Not Reported")
Enroll_MajorA <- structure(c("Major A", "Major A", "Major A", "Major A", "Major A",
"Major A", "Major A", "Major A", "Major A", "Major A", "Major A",
"Major A", "Major A", "Major A", "Major A", "Major A", "Major A",
"Major A", "Major A", "Major A", "Major A", "Major A", "Major A",
"Major A", "Major A", "Major A", "Major A", "Major A", "Major A",
"Major A", "Major A", "Major A", "Major A", "Major A", "Major A",
"Major A", "Major A", "Major A", "Major A", "Major A", "Major A",
"Major A", "Major A", "Major A", "Major A", "Major A", "Major A",
"Major A", "Major A", "Major A", "Female", "Female", "Female",
"Female", "Female", "Female", "Female", "Female", "Female", "Female",
"Female", "Female", "Female", "Female", "Female", "Female", "Female",
"Female", "Female", "Female", "Female", "Female", "Female", "Female",
"Female", "Female", "Female", "Female", "Female", "Female", "Female",
"Female", "Female", "Female", "Female", "Female", "Female", "Female",
"Female", "Female", "Female", "Female", "Female", "Female", "Female",
"Female", "Female", "Female", "Female", "Female", "White", "White",
"White", "Hawiian", "White", "White", "White", "White", "White",
"White", "White", "White", "White", "Two or More Races", "White",
"White", "White", "White", "White", "White", "White", "Hispanic",
"White", "White", "White", "White", "White", "White", "Two or More Races",
"White", "White", "White", "White", "White", "White", "White",
"White", "Black", "White", "White", "Black", "White", "White",
"White", "White", "White", "White", "White", "White", "White"
), .Dim = c(50L, 3L), .Dimnames = list(NULL, c("Maj", "Sex",
"Race")))
使用 tidyverse
的一种方法使用 .drop = FALSE
将包括缺失因子水平
library(tidyverse)
Enroll_MajorA %>%
group_by(Race, Sex, .drop = FALSE) %>%
summarise(count = n()) %>%
ungroup() %>%
mutate(perc = count/sum(count)) %>%
gather(key, value, -Sex, -Race) %>%
unite(Race, Race, key) %>%
spread(Race, value)
数据
正如@Cath 所评论的,我们需要在数据中明确包含所有级别。
Maj <- rep("Major A", times=50)
set.seed(24601)
Race <- factor(sample(c("Asian","Black", "Am Indian","Hawiian" ,"Hispanic","White","Two or More Races","Not Reported"),
prob=c(.01,.1,.01,.01,.02,.80,.05,.01),size=50, replace = T),
levels=c("Asian","Black", "Am Indian","Hawiian" ,"Hispanic","White","Two or More Races","Not Reported"))
Sex <- factor(sample(c("Female","Male"), prob=c(.98,.02),size=50,replace=T), levels = c("Female","Male"))
Enroll_MajorA <- data.frame(Maj,Sex,Race)