聚合因子水平计数 - 按因子
Aggregating factor level counts - by factor
我一直在尝试制作一个表格,用来显示各因子的水平在另一个因子各水平下的计数。为此,我查阅了数十个页面和问题……并尝试使用某些包(dplyr、reshape)中的函数来完成,但始终没能正确地用好它们。
这就是我得到的:
# my data: 15 observations of four categorical variables.
var1 <- c(
  "red", "blue", "red", "blue", "red", "red", "red", "red",
  "red", "red", "red", "red", "blue", "red", "blue"
)
var2 <- c(
  "0", "1", "0", "0", "0", "0", "0", "0",
  "0", "0", "1", "0", "0", "0", "0"
)
var3 <- c(
  "2", "2", "1", "1", "1", "3", "1", "2",
  "1", "1", "3", "1", "1", "2", "1"
)
var4 <- c(
  "0", "1", "0", "0", "0", "0", "1", "0",
  "1", "1", "0", "1", "0", "1", "1"
)
# The question is about factor levels, so request factors explicitly:
# since R 4.0.0 data.frame() keeps strings as character by default, while
# older versions converted them to factors.
mydata <- data.frame(var1, var2, var3, var4, stringsAsFactors = TRUE)
head(mydata)
尝试n+1:仅显示另一个因素的因素总数。
# Attempt n+1: apply sum() to every other column within each var1 group,
# then transpose so the var1 groups become columns.
t(
  aggregate(. ~ var1, data = mydata, FUN = sum)
)
[,1] [,2]
var1 "blue" "red"
var2 " 5" "12"
var3 " 5" "18"
var4 " 6" "16"
尝试 n+2:这是正确的格式,但我无法让它在多个因素上发挥作用。
# ddply() and the summarise() it is given here are plyr functions, not dplyr
# ones, so they are called through the plyr namespace. plyr is deliberately
# not attached, which avoids masking dplyr verbs of the same name.
# N = number of rows for every var1 x var3 combination.
data1 <- plyr::ddply(
  mydata, c("var1", "var3"), plyr::summarise,
  N = length(var1)
)
library(reshape)
# Cast to wide form (var1 in rows, var3 in columns, aggregated with sum),
# then transpose so the var3 levels end up in the rows.
df1 <- cast(data1, var1 ~ var3, sum)
df1 <- t(df1)
df1
blue red
1 3 6
2 1 3
3 0 2
我想要的是:
blue red
var2.0 3 10
var2.1 1 1
var3.1 3 6
var3.2 1 3
var3.3 0 2
var4.0 2 6
var4.1 2 5
如何获得这种格式?非常感谢,
我们可以以 'var1' 作为 id 变量对数据集执行 melt,然后使用 table:
library(reshape2)
# Stack every column except var1 into long (variable, value) pairs.
# id.vars is spelled out in full instead of relying on partial matching.
long <- melt(mydata, id.vars = "var1")
# Row label of the form "<variable>.<level>", e.g. "var2.0".
long$varN <- paste(long$variable, long$value, sep = ".")
# Cross-tabulate labels (rows) against var1 (columns); columns are picked by
# name rather than by position.
tbl <- table(long[c("varN", "var1")])
# Drop the dimension names so only the level labels are printed.
names(dimnames(tbl)) <- NULL
tbl
#
# blue red
# var2.0 3 10
# var2.1 1 1
# var3.1 3 6
# var3.2 1 3
# var3.3 0 2
# var4.0 2 6
# var4.1 2 5
或者使用 dplyr/tidyr:先用 gather 将数据集从 'wide' 格式转换为 'long' 格式,然后用 unite 把列('var'、'val')合并为 'varV',按 'var1' 和 'varV' 分组后用 tally 得到频数,最后用 spread 转换回 'wide' 格式。
library(dplyr)
library(tidyr)
mydata %>%
  # Wide -> long: one row per (var1, variable name, level).
  # pivot_longer() is the current replacement for the superseded gather().
  pivot_longer(-var1, names_to = "var", values_to = "val") %>%
  # Build the row label "<variable>.<level>", e.g. "var2.0".
  unite(varV, var, val, sep = ".") %>%
  # Frequency of each (label, var1) pair. Grouping by varV first keeps the
  # rows ordered by label after widening.
  group_by(varV, var1) %>%
  tally() %>%
  ungroup() %>%
  # Long -> wide: one column per var1 level, 0 where a pair never occurs.
  # pivot_wider() is the current replacement for the superseded spread().
  pivot_wider(names_from = var1, values_from = n, values_fill = list(n = 0L))
#   varV    blue   red
#   <chr>  <int> <int>
# 1 var2.0     3    10
# 2 var2.1     1     1
# 3 var3.1     3     6
# 4 var3.2     1     3
# 5 var3.3     0     2
# 6 var4.0     2     6
# 7 var4.1     2     5
我一直在尝试制作一个表格,用来显示各因子的水平在另一个因子各水平下的计数。为此,我查阅了数十个页面和问题……并尝试使用某些包(dplyr、reshape)中的函数来完成,但始终没能正确地用好它们。
这就是我得到的:
# my data: 15 observations of four categorical variables.
var1 <- c(
  "red", "blue", "red", "blue", "red", "red", "red", "red",
  "red", "red", "red", "red", "blue", "red", "blue"
)
var2 <- c(
  "0", "1", "0", "0", "0", "0", "0", "0",
  "0", "0", "1", "0", "0", "0", "0"
)
var3 <- c(
  "2", "2", "1", "1", "1", "3", "1", "2",
  "1", "1", "3", "1", "1", "2", "1"
)
var4 <- c(
  "0", "1", "0", "0", "0", "0", "1", "0",
  "1", "1", "0", "1", "0", "1", "1"
)
# The question is about factor levels, so request factors explicitly:
# since R 4.0.0 data.frame() keeps strings as character by default, while
# older versions converted them to factors.
mydata <- data.frame(var1, var2, var3, var4, stringsAsFactors = TRUE)
head(mydata)
尝试n+1:仅显示另一个因素的因素总数。
# Attempt n+1: apply sum() to every other column within each var1 group,
# then transpose so the var1 groups become columns.
t(
  aggregate(. ~ var1, data = mydata, FUN = sum)
)
[,1] [,2]
var1 "blue" "red"
var2 " 5" "12"
var3 " 5" "18"
var4 " 6" "16"
尝试 n+2:这是正确的格式,但我无法让它在多个因素上发挥作用。
# ddply() and the summarise() it is given here are plyr functions, not dplyr
# ones, so they are called through the plyr namespace. plyr is deliberately
# not attached, which avoids masking dplyr verbs of the same name.
# N = number of rows for every var1 x var3 combination.
data1 <- plyr::ddply(
  mydata, c("var1", "var3"), plyr::summarise,
  N = length(var1)
)
library(reshape)
# Cast to wide form (var1 in rows, var3 in columns, aggregated with sum),
# then transpose so the var3 levels end up in the rows.
df1 <- cast(data1, var1 ~ var3, sum)
df1 <- t(df1)
df1
blue red
1 3 6
2 1 3
3 0 2
我想要的是:
blue red
var2.0 3 10
var2.1 1 1
var3.1 3 6
var3.2 1 3
var3.3 0 2
var4.0 2 6
var4.1 2 5
如何获得这种格式?非常感谢,
我们可以以 'var1' 作为 id 变量对数据集执行 melt,然后使用 table:
library(reshape2)
# Stack every column except var1 into long (variable, value) pairs.
# id.vars is spelled out in full instead of relying on partial matching.
long <- melt(mydata, id.vars = "var1")
# Row label of the form "<variable>.<level>", e.g. "var2.0".
long$varN <- paste(long$variable, long$value, sep = ".")
# Cross-tabulate labels (rows) against var1 (columns); columns are picked by
# name rather than by position.
tbl <- table(long[c("varN", "var1")])
# Drop the dimension names so only the level labels are printed.
names(dimnames(tbl)) <- NULL
tbl
#
# blue red
# var2.0 3 10
# var2.1 1 1
# var3.1 3 6
# var3.2 1 3
# var3.3 0 2
# var4.0 2 6
# var4.1 2 5
或者使用 dplyr/tidyr:先用 gather 将数据集从 'wide' 格式转换为 'long' 格式,然后用 unite 把列('var'、'val')合并为 'varV',按 'var1' 和 'varV' 分组后用 tally 得到频数,最后用 spread 转换回 'wide' 格式。
library(dplyr)
library(tidyr)
mydata %>%
  # Wide -> long: one row per (var1, variable name, level).
  # pivot_longer() is the current replacement for the superseded gather().
  pivot_longer(-var1, names_to = "var", values_to = "val") %>%
  # Build the row label "<variable>.<level>", e.g. "var2.0".
  unite(varV, var, val, sep = ".") %>%
  # Frequency of each (label, var1) pair. Grouping by varV first keeps the
  # rows ordered by label after widening.
  group_by(varV, var1) %>%
  tally() %>%
  ungroup() %>%
  # Long -> wide: one column per var1 level, 0 where a pair never occurs.
  # pivot_wider() is the current replacement for the superseded spread().
  pivot_wider(names_from = var1, values_from = n, values_fill = list(n = 0L))
#   varV    blue   red
#   <chr>  <int> <int>
# 1 var2.0     3    10
# 2 var2.1     1     1
# 3 var3.1     3     6
# 4 var3.2     1     3
# 5 var3.3     0     2
# 6 var4.0     2     6
# 7 var4.1     2     5