如何一次性按年龄组汇总多个变量/条目(variables/items)
How to summarise multiple variables/items across age groups at once
我有一个包含超过 150,000 个条目的数据框。示例如下:
ID <- 1111, 1222, 3333, 4444, 1555, 6666
V1 <- 1, 0, 1, 0, 0, 0
V2 <- 1, 0, 0, 0, 0, 1
V3 <- 0, 1, 1, 0, 0, 1
V4 <- 1, 0, 1, 1, 0, 0
AgeGr <- 15-24,24-35,15-24,35-48, 48+, 35-48
所有变量(示例中的V1-V4)都是0/1回答的二分问题。现在我想总结一下所有年龄组中每个变量的 0/1 出现情况。我期望这样的输出:
Variable V1 V2 V3 V4 # Variable names
Answer 0 1 0 1 0 1 0 1 # answer levels (1/0)
15-24 0 2 1 1 1 1 0 2 # the frequency of "0" and "1" under this age group
24-35 1 0 1 0 0 1 1 0
35-48 2 0 1 1 1 1 1 1
48+ 1 0 1 0 1 0 1 0
我已经尝试 janitor::tabyl,使用 tabyl(df,AgeGr, V1)。然而,它只在一行中总结了 V1。
当我尝试 tabyl(df,AgeGr, df[V1:V4]) 时,它失败了。我想知道是否可以把 tabyl() 与 apply() 之类的函数结合使用?还是我应该改用其他方法?
如有任何建议,我们将不胜感激。提前谢谢你:)
您可以执行以下操作:
# Rebuild the example data from the question as a data frame.
ID <- c(1111, 1222, 3333, 4444, 1555, 6666)
V1 <- c(1, 0, 1, 0, 0, 0)
V2 <- c(1, 0, 0, 0, 0, 1)
V3 <- c(0, 1, 1, 0, 0, 1)
V4 <- c(1, 0, 1, 1, 0, 0)
AgeGr <- c("15-24", "24-35", "15-24", "35-48", "48+", "35-48")
df <- data.frame(
  ID = ID, V1 = V1, V2 = V2, V3 = V3, V4 = V4, AgeGr = AgeGr,
  stringsAsFactors = FALSE
)

# Pick the 0/1 item columns by name pattern instead of listing them by hand,
# so the same code works when there are many more than four items.
answer_cols <- grep("^V[0-9]+$", names(df), value = TRUE)

# One data frame of item columns per age group.
ageAnswerSplit <- split(df[, answer_cols, drop = FALSE], df[["AgeGr"]])

# For each age group build a one-row data frame with a "<item>:0" and a
# "<item>:1" count column per item, then stack the rows. The names of the
# split list (the age groups) become the row names of the result.
summarized <- do.call("rbind", lapply(ageAnswerSplit, function(answerdf) {
  answertables <- lapply(names(answerdf), function(nam) {
    # Fixing the levels makes table() report 0 for an answer that never
    # occurs in this group, so no NA clean-up is needed afterwards and every
    # count column stays integer.
    at <- table(factor(answerdf[[nam]], levels = c(0, 1)))
    setNames(
      data.frame(at[["0"]], at[["1"]]),
      paste0(nam, ":", c(0, 1))
    )
  })
  do.call("cbind", answertables)
}))
结果如下:
> summarized
V1:0 V1:1 V2:0 V2:1 V3:0 V3:1 V4:0 V4:1
15-24 0 2 1 1 1 1 0 2
24-35 1 0 1 0 0 1 1 0
35-48 2 0 1 1 1 1 1 1
48+ 1 0 1 0 1 0 1 0
这里有一个 tidyverse 方案:
library(dplyr)
library(tidyr)
# Stack the item columns: one row per respondent and item, with the item
# name in `name` and the 0/1 answer in `value`.
long_answers <- pivot_longer(df, cols = starts_with("V"))

# Count how often each answer occurs for each item within each age group.
answer_counts <- count(long_answers, AgeGr, name, value)

# Merge item name and answer into a single label (e.g. "V1_0") and sort by
# it, so the wide columns come out in item/answer order.
labelled_counts <- arrange(unite(answer_counts, col, name, value), col)

# Spread the labels into columns; combinations that never occur get 0.
pivot_wider(
  labelled_counts,
  names_from = col,
  values_from = n,
  values_fill = 0
)
# AgeGr V1_0 V1_1 V2_0 V2_1 V3_0 V3_1 V4_0 V4_1
# <chr> <int> <int> <int> <int> <int> <int> <int> <int>
#1 24-35 1 0 1 0 0 1 1 0
#2 35-48 2 0 1 1 1 1 1 1
#3 48+ 1 0 1 0 1 0 1 0
#4 15-24 0 2 1 1 1 1 0 2
我有一个包含超过 150,000 个条目的数据框。示例如下:
ID <- 1111, 1222, 3333, 4444, 1555, 6666
V1 <- 1, 0, 1, 0, 0, 0
V2 <- 1, 0, 0, 0, 0, 1
V3 <- 0, 1, 1, 0, 0, 1
V4 <- 1, 0, 1, 1, 0, 0
AgeGr <- 15-24,24-35,15-24,35-48, 48+, 35-48
所有变量(示例中的V1-V4)都是0/1回答的二分问题。现在我想总结一下所有年龄组中每个变量的 0/1 出现情况。我期望这样的输出:
Variable V1 V2 V3 V4 # Variable names
Answer 0 1 0 1 0 1 0 1 # answer levels (1/0)
15-24 0 2 1 1 1 1 0 2 # the frequency of "0" and "1" under this age group
24-35 1 0 1 0 0 1 1 0
35-48 2 0 1 1 1 1 1 1
48+ 1 0 1 0 1 0 1 0
我已经尝试 janitor::tabyl,使用 tabyl(df,AgeGr, V1)。然而,它只在一行中总结了 V1。 当我尝试 tabyl(df,AgeGr, df[V1:V4]) 时,它失败了。我想知道是否可以把 tabyl() 与 apply() 之类的函数结合使用?还是我应该改用其他方法?
如有任何建议,我们将不胜感激。提前谢谢你:)
您可以执行以下操作:
# Rebuild the example data from the question as a data frame.
ID <- c(1111, 1222, 3333, 4444, 1555, 6666)
V1 <- c(1, 0, 1, 0, 0, 0)
V2 <- c(1, 0, 0, 0, 0, 1)
V3 <- c(0, 1, 1, 0, 0, 1)
V4 <- c(1, 0, 1, 1, 0, 0)
AgeGr <- c("15-24", "24-35", "15-24", "35-48", "48+", "35-48")
df <- data.frame(
  ID = ID, V1 = V1, V2 = V2, V3 = V3, V4 = V4, AgeGr = AgeGr,
  stringsAsFactors = FALSE
)

# Pick the 0/1 item columns by name pattern instead of listing them by hand,
# so the same code works when there are many more than four items.
answer_cols <- grep("^V[0-9]+$", names(df), value = TRUE)

# One data frame of item columns per age group.
ageAnswerSplit <- split(df[, answer_cols, drop = FALSE], df[["AgeGr"]])

# For each age group build a one-row data frame with a "<item>:0" and a
# "<item>:1" count column per item, then stack the rows. The names of the
# split list (the age groups) become the row names of the result.
summarized <- do.call("rbind", lapply(ageAnswerSplit, function(answerdf) {
  answertables <- lapply(names(answerdf), function(nam) {
    # Fixing the levels makes table() report 0 for an answer that never
    # occurs in this group, so no NA clean-up is needed afterwards and every
    # count column stays integer.
    at <- table(factor(answerdf[[nam]], levels = c(0, 1)))
    setNames(
      data.frame(at[["0"]], at[["1"]]),
      paste0(nam, ":", c(0, 1))
    )
  })
  do.call("cbind", answertables)
}))
结果如下:
> summarized
V1:0 V1:1 V2:0 V2:1 V3:0 V3:1 V4:0 V4:1
15-24 0 2 1 1 1 1 0 2
24-35 1 0 1 0 0 1 1 0
35-48 2 0 1 1 1 1 1 1
48+ 1 0 1 0 1 0 1 0
这里有一个 tidyverse 方案:
library(dplyr)
library(tidyr)
# Stack the item columns: one row per respondent and item, with the item
# name in `name` and the 0/1 answer in `value`.
long_answers <- pivot_longer(df, cols = starts_with("V"))

# Count how often each answer occurs for each item within each age group.
answer_counts <- count(long_answers, AgeGr, name, value)

# Merge item name and answer into a single label (e.g. "V1_0") and sort by
# it, so the wide columns come out in item/answer order.
labelled_counts <- arrange(unite(answer_counts, col, name, value), col)

# Spread the labels into columns; combinations that never occur get 0.
pivot_wider(
  labelled_counts,
  names_from = col,
  values_from = n,
  values_fill = 0
)
# AgeGr V1_0 V1_1 V2_0 V2_1 V3_0 V3_1 V4_0 V4_1
# <chr> <int> <int> <int> <int> <int> <int> <int> <int>
#1 24-35 1 0 1 0 0 1 1 0
#2 35-48 2 0 1 1 1 1 1 1
#3 48+ 1 0 1 0 1 0 1 0
#4 15-24 0 2 1 1 1 1 0 2