在 R 中使用均值、总和、长度和 sd 实现频率计数的更简单方法
A simpler way to achieve a frequency count with mean, sum, length and sd in R
我的任务是创建包含统计摘要的频率表。我的目标是创建一个可以简单导出到 excel 的数据框。
其中大部分工作也许可以在 SQL 中用存储过程完成,但我决定在 R 中实现。我正在学习 R,所以我的做法可能绕了远路。这是之前一个问题的后续问题。
给定
# Sample survey data: Id, two grouping columns (ClassA, ClassB) and two
# answer columns (R, S); NA marks a missing value.
Id <- c(1,2,3,4,5,6,7,8,9,10)
ClassA <- c(1,NA,3,1,1,2,1,4,5,3)
ClassB <- c(2,1,1,3,3,2,1,1,3,3)
R <- c(1,2,3,NA,9,2,4,5,6,7)
S <- c(3,7,NA,9,5,8,7,NA,7,6)
df <- data.frame(Id,ClassA,ClassB,R,S)
# Factor levels used for the frequency count: 0..10 plus NA, so that values
# that never occur still get a row with count 0.
ZeroTenNAScale <- c(0:10,NA);
# Frequency table for R. exclude=NULL keeps NA as a countable level; the
# two columns produced by data.frame(table(...)) are renamed answer/value.
R.freq <- setNames(nm=c('answer','value'),data.frame(table(factor(df$R,levels=ZeroTenNAScale,exclude=NULL))));
# Turn the level labels in the first column back into numbers.
R.freq[, 1] <- as.numeric(as.character( R.freq[, 1] ))
# Tag every row with the question it belongs to.
R.freq <- cbind(question='R',R.freq)
# Same three steps for S.
S.freq <- setNames(nm=c('answer','value'),data.frame(table(factor(df$S,levels=ZeroTenNAScale,exclude=NULL))));
S.freq[, 1] <- as.numeric(as.character( S.freq[, 1] ))
S.freq <- cbind(question='S',S.freq)
# Summary statistics with missing values removed; *.length is the number of
# non-NA answers, not the number of rows.
R.mean = mean(df$R, na.rm = TRUE)
R.length = sum(!is.na(df$R))
R.sd = sd(df$R, na.rm = TRUE)
R.sum = sum(df$R, na.rm = TRUE)
S.mean = mean(df$S, na.rm = TRUE)
S.length = sum(!is.na(df$S))
S.sd = sd(df$S, na.rm = TRUE)
S.sum = sum(df$S, na.rm = TRUE)
# Append one row per statistic (sum, length, mean, sd) to S.freq.
# NOTE(review): cbind() of two strings and a number yields a character
# matrix, so `value` in these rows is text rather than numeric, and the
# rbind() presumably coerces the existing columns as well - confirm the
# column types before exporting.
S.row <- cbind('S','sum',as.numeric(S.sum))
S.row <- setNames(nm=c('question','answer','value'),data.frame(S.row))
S.freq = rbind(S.freq, S.row )
S.row <- cbind('S','length',as.numeric(S.length))
S.row <- setNames(nm=c('question','answer','value'),data.frame(S.row))
S.freq = rbind(S.freq, S.row )
S.row <- cbind('S','mean',as.numeric(S.mean))
S.row <- setNames(nm=c('question','answer','value'),data.frame(S.row))
S.freq = rbind(S.freq, S.row )
S.row <- cbind('S','sd',as.numeric(S.sd))
S.row <- setNames(nm=c('question','answer','value'),data.frame(S.row))
S.freq = rbind(S.freq, S.row )
# Same four statistic rows for R.
R.row <- cbind('R','sum',as.numeric(R.sum))
R.row <- setNames(nm=c('question','answer','value'),data.frame(R.row))
R.freq = rbind(R.freq, R.row )
R.row <- cbind('R','length',as.numeric(R.length))
R.row <- setNames(nm=c('question','answer','value'),data.frame(R.row))
R.freq = rbind(R.freq, R.row )
R.row <- cbind('R','mean',as.numeric(R.mean))
R.row <- setNames(nm=c('question','answer','value'),data.frame(R.row))
R.freq = rbind(R.freq, R.row )
R.row <- cbind('R','sd',as.numeric(R.sd))
R.row <- setNames(nm=c('question','answer','value'),data.frame(R.row))
R.freq = rbind(R.freq, R.row )
# Stack the R block on top of the S block and prepend a constant filter
# column with the value 'None'.
result <- rbind(R.freq,S.freq)
result <- cbind(filter='None',result)
result
我得到的结果是:
filter question answer value
1 None R 0 0
2 None R 1 1
3 None R 2 2
4 None R 3 1
5 None R 4 1
6 None R 5 1
7 None R 6 1
8 None R 7 1
9 None R 8 0
10 None R 9 1
11 None R 10 0
12 None R <NA> 1
13 None R sum 39
14 None R length 9
15 None R mean 4.33333333333333
16 None R sd 2.64575131106459
17 None S 0 0
18 None S 1 0
19 None S 2 0
20 None S 3 1
21 None S 4 0
22 None S 5 1
23 None S 6 1
24 None S 7 3
25 None S 8 1
26 None S 9 1
27 None S 10 0
28 None S <NA> 2
29 None S sum 52
30 None S length 8
31 None S mean 6.5
32 None S sd 1.8516401995451
这正是我要找的。我认为下一步是先把部分代码包装成函数以简化代码,然后再加入按 ClassA=1、ClassA=n+1 ... ClassA=NA、ClassB=1、ClassB=2 ... ClassB=NA 筛选得到的类似结果集。有更简单的方法吗?
#
# Summarise one answer column scored on the 0-10 scale.
#
# x: vector of answers; NA marks a missing answer.
#
# Returns a named numeric vector with 16 entries: the frequency of each of
# the levels 0..10, the number of missing answers (the entry whose name is
# NA), followed by sum, length, mean and sd of the non-missing answers.
summaryStatistics <- function(x) {
  xx <- na.omit(x)
  freq <- table(factor(x, levels=0:10), useNA='always', exclude=NULL)
  # length is the number of answered (non-NA) values, i.e. sum(!is.na(x)),
  # so that mean == sum / length; the missing answers are already reported
  # in the NA bucket of the frequency counts.
  c(freq,
    sum=sum(xx), length=length(xx), mean=mean(xx), sd=sd(xx))
}
# Sample data: Id, two grouping columns and the two answer columns R and S
# (NA marks a missing value).
Id <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
ClassA <- c(1, NA, 3, 1, 1, 2, 1, 4, 5, 3)
ClassB <- c(2, 1, 1, 3, 3, 2, 1, 1, 3, 3)
R <- c(1, 2, 3, NA, 9, 2, 4, 5, 6, 7)
S <- c(3, 7, NA, 9, 5, 8, 7, NA, 7, 6)
df <- data.frame(Id, ClassA, ClassB, R, S)
# Summarise each answer column; gives a list with one named vector per column.
result <- lapply(df[c("R", "S")], summaryStatistics)
# Bind the per-column vectors into a matrix (statistic x question) ...
result <- simplify2array(result)
# ... and melt it into long format, one row per (statistic, question) pair.
result <- as.data.frame(as.table(result))
names(result) <- c("answer", "question", "value")
# Put the columns in the order question, answer, value.
result <- result[, c("question", "answer", "value")]
# Prepend the constant filter column.
result <- cbind(filter = "None", result)
# Show the result.
result
这样就简单多了,也让我培训团队其他成员的工作简单多了。感谢 Ernest A 和 Imo。
关于我对 R 的理解的下一个问题是
减少代码大小的一个方法是将摘要统计信息包装在一个函数中:
# Summary statistics of x with missing values ignored.
#
# x: numeric vector, possibly containing NA.
#
# Returns a named numeric vector with the entries sum, length (number of
# non-NA values), mean and sd, in that order.
myStats <- function(x) {
  is_observed <- !is.na(x)
  c(
    sum = sum(x, na.rm = TRUE),
    length = sum(is_observed),
    mean = mean(x, na.rm = TRUE),
    sd = sd(x, na.rm = TRUE)
  )
}
这会返回一个带名称的摘要统计向量,其元素顺序与您输出中的顺序一致。然后,您可以用 rbind
将返回的值及其名称追加到您的频率表(table)中:
# Compute the summary statistics for column R and append them, one row per
# statistic, below the rows already in R.freq.
R.stats <- myStats(df[["R"]])
rbind(
  R.freq,
  data.frame(question = "R", answer = names(R.stats), value = R.stats)
)
是的,绝对可以简化。通常您会使用汇总函数,例如
# Frequency counts plus summary statistics for one answer column.
#
# x:      vector of answers; NA marks a missing answer.
# levels: the answer values to count (each gets an entry even when its
#         count is 0).
#
# Returns a named numeric vector: one count per level, the count of NA
# values, then sum, length, mean and sd. sum, mean and sd are computed on
# the non-missing values; length is the full length of x, NA included.
smry <- function(x, levels) {
  observed <- na.omit(x)
  counts <- table(factor(x, levels = levels), useNA = "always", exclude = NULL)
  moments <- c(
    sum = sum(observed),
    length = length(x),
    mean = mean(observed),
    sd = sqrt(var(observed))
  )
  c(counts, moments)
}
然后将其应用于数据的不同子集
> lapply(df[c('R', 'S')], smry, 0:10)
$R
0 1 2 3 4 5 6 7
0.000000 1.000000 2.000000 1.000000 1.000000 1.000000 1.000000 1.000000
8 9 10 <NA> sum length mean sd
0.000000 1.000000 0.000000 1.000000 39.000000 10.000000 4.333333 2.645751
$S
0 1 2 3 4 5 6 7
0.00000 0.00000 0.00000 1.00000 0.00000 1.00000 1.00000 3.00000
8 9 10 <NA> sum length mean sd
1.00000 1.00000 0.00000 2.00000 52.00000 10.00000 6.50000 1.85164
如果您绝对必须将所有内容放入数据框中
> as.data.frame(as.table(simplify2array(lapply(df[c('R', 'S')], smry, 0:10))))
Var1 Var2 Freq
1 0 R 0.000000
2 1 R 1.000000
3 2 R 2.000000
4 3 R 1.000000
5 4 R 1.000000
6 5 R 1.000000
7 6 R 1.000000
8 7 R 1.000000
9 8 R 0.000000
10 9 R 1.000000
11 10 R 0.000000
12 <NA> R 1.000000
13 sum R 39.000000
14 length R 10.000000
15 mean R 4.333333
16 sd R 2.645751
17 0 S 0.000000
18 1 S 0.000000
19 2 S 0.000000
20 3 S 1.000000
21 4 S 0.000000
22 5 S 1.000000
23 6 S 1.000000
24 7 S 3.000000
25 8 S 1.000000
26 9 S 1.000000
27 10 S 0.000000
28 <NA> S 2.000000
29 sum S 52.000000
30 length S 10.000000
31 mean S 6.500000
32 sd S 1.851640
然后根据需要更改列名/添加列。
我的任务是创建包含统计摘要的频率表。我的目标是创建一个可以简单导出到 excel 的数据框。
其中大部分工作也许可以在 SQL 中用存储过程完成,但我决定在 R 中实现。我正在学习 R,所以我的做法可能绕了远路。这是之前一个问题的后续问题。
给定
# Sample survey data: Id, two grouping columns (ClassA, ClassB) and two
# answer columns (R, S); NA marks a missing value.
Id <- c(1,2,3,4,5,6,7,8,9,10)
ClassA <- c(1,NA,3,1,1,2,1,4,5,3)
ClassB <- c(2,1,1,3,3,2,1,1,3,3)
R <- c(1,2,3,NA,9,2,4,5,6,7)
S <- c(3,7,NA,9,5,8,7,NA,7,6)
df <- data.frame(Id,ClassA,ClassB,R,S)
# Factor levels used for the frequency count: 0..10 plus NA, so that values
# that never occur still get a row with count 0.
ZeroTenNAScale <- c(0:10,NA);
# Frequency table for R. exclude=NULL keeps NA as a countable level; the
# two columns produced by data.frame(table(...)) are renamed answer/value.
R.freq <- setNames(nm=c('answer','value'),data.frame(table(factor(df$R,levels=ZeroTenNAScale,exclude=NULL))));
# Turn the level labels in the first column back into numbers.
R.freq[, 1] <- as.numeric(as.character( R.freq[, 1] ))
# Tag every row with the question it belongs to.
R.freq <- cbind(question='R',R.freq)
# Same three steps for S.
S.freq <- setNames(nm=c('answer','value'),data.frame(table(factor(df$S,levels=ZeroTenNAScale,exclude=NULL))));
S.freq[, 1] <- as.numeric(as.character( S.freq[, 1] ))
S.freq <- cbind(question='S',S.freq)
# Summary statistics with missing values removed; *.length is the number of
# non-NA answers, not the number of rows.
R.mean = mean(df$R, na.rm = TRUE)
R.length = sum(!is.na(df$R))
R.sd = sd(df$R, na.rm = TRUE)
R.sum = sum(df$R, na.rm = TRUE)
S.mean = mean(df$S, na.rm = TRUE)
S.length = sum(!is.na(df$S))
S.sd = sd(df$S, na.rm = TRUE)
S.sum = sum(df$S, na.rm = TRUE)
# Append one row per statistic (sum, length, mean, sd) to S.freq.
# NOTE(review): cbind() of two strings and a number yields a character
# matrix, so `value` in these rows is text rather than numeric, and the
# rbind() presumably coerces the existing columns as well - confirm the
# column types before exporting.
S.row <- cbind('S','sum',as.numeric(S.sum))
S.row <- setNames(nm=c('question','answer','value'),data.frame(S.row))
S.freq = rbind(S.freq, S.row )
S.row <- cbind('S','length',as.numeric(S.length))
S.row <- setNames(nm=c('question','answer','value'),data.frame(S.row))
S.freq = rbind(S.freq, S.row )
S.row <- cbind('S','mean',as.numeric(S.mean))
S.row <- setNames(nm=c('question','answer','value'),data.frame(S.row))
S.freq = rbind(S.freq, S.row )
S.row <- cbind('S','sd',as.numeric(S.sd))
S.row <- setNames(nm=c('question','answer','value'),data.frame(S.row))
S.freq = rbind(S.freq, S.row )
# Same four statistic rows for R.
R.row <- cbind('R','sum',as.numeric(R.sum))
R.row <- setNames(nm=c('question','answer','value'),data.frame(R.row))
R.freq = rbind(R.freq, R.row )
R.row <- cbind('R','length',as.numeric(R.length))
R.row <- setNames(nm=c('question','answer','value'),data.frame(R.row))
R.freq = rbind(R.freq, R.row )
R.row <- cbind('R','mean',as.numeric(R.mean))
R.row <- setNames(nm=c('question','answer','value'),data.frame(R.row))
R.freq = rbind(R.freq, R.row )
R.row <- cbind('R','sd',as.numeric(R.sd))
R.row <- setNames(nm=c('question','answer','value'),data.frame(R.row))
R.freq = rbind(R.freq, R.row )
# Stack the R block on top of the S block and prepend a constant filter
# column with the value 'None'.
result <- rbind(R.freq,S.freq)
result <- cbind(filter='None',result)
result
我得到的结果是:
filter question answer value
1 None R 0 0
2 None R 1 1
3 None R 2 2
4 None R 3 1
5 None R 4 1
6 None R 5 1
7 None R 6 1
8 None R 7 1
9 None R 8 0
10 None R 9 1
11 None R 10 0
12 None R <NA> 1
13 None R sum 39
14 None R length 9
15 None R mean 4.33333333333333
16 None R sd 2.64575131106459
17 None S 0 0
18 None S 1 0
19 None S 2 0
20 None S 3 1
21 None S 4 0
22 None S 5 1
23 None S 6 1
24 None S 7 3
25 None S 8 1
26 None S 9 1
27 None S 10 0
28 None S <NA> 2
29 None S sum 52
30 None S length 8
31 None S mean 6.5
32 None S sd 1.8516401995451
这正是我要找的。我认为下一步是先把部分代码包装成函数以简化代码,然后再加入按 ClassA=1、ClassA=n+1 ... ClassA=NA、ClassB=1、ClassB=2 ... ClassB=NA 筛选得到的类似结果集。有更简单的方法吗?
#
# Summarise one answer column scored on the 0-10 scale.
#
# x: vector of answers; NA marks a missing answer.
#
# Returns a named numeric vector with 16 entries: the frequency of each of
# the levels 0..10, the number of missing answers (the entry whose name is
# NA), followed by sum, length, mean and sd of the non-missing answers.
summaryStatistics <- function(x) {
  xx <- na.omit(x)
  freq <- table(factor(x, levels=0:10), useNA='always', exclude=NULL)
  # length is the number of answered (non-NA) values, i.e. sum(!is.na(x)),
  # so that mean == sum / length; the missing answers are already reported
  # in the NA bucket of the frequency counts.
  c(freq,
    sum=sum(xx), length=length(xx), mean=mean(xx), sd=sd(xx))
}
# Sample data: Id, two grouping columns and the two answer columns R and S
# (NA marks a missing value).
Id <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
ClassA <- c(1, NA, 3, 1, 1, 2, 1, 4, 5, 3)
ClassB <- c(2, 1, 1, 3, 3, 2, 1, 1, 3, 3)
R <- c(1, 2, 3, NA, 9, 2, 4, 5, 6, 7)
S <- c(3, 7, NA, 9, 5, 8, 7, NA, 7, 6)
df <- data.frame(Id, ClassA, ClassB, R, S)
# Summarise each answer column; gives a list with one named vector per column.
result <- lapply(df[c("R", "S")], summaryStatistics)
# Bind the per-column vectors into a matrix (statistic x question) ...
result <- simplify2array(result)
# ... and melt it into long format, one row per (statistic, question) pair.
result <- as.data.frame(as.table(result))
names(result) <- c("answer", "question", "value")
# Put the columns in the order question, answer, value.
result <- result[, c("question", "answer", "value")]
# Prepend the constant filter column.
result <- cbind(filter = "None", result)
# Show the result.
result
这样就简单多了,也让我培训团队其他成员的工作简单多了。感谢 Ernest A 和 Imo。
关于我对 R 的理解的下一个问题是
减少代码大小的一个方法是将摘要统计信息包装在一个函数中:
# Summary statistics of x with missing values ignored.
#
# x: numeric vector, possibly containing NA.
#
# Returns a named numeric vector with the entries sum, length (number of
# non-NA values), mean and sd, in that order.
myStats <- function(x) {
  is_observed <- !is.na(x)
  c(
    sum = sum(x, na.rm = TRUE),
    length = sum(is_observed),
    mean = mean(x, na.rm = TRUE),
    sd = sd(x, na.rm = TRUE)
  )
}
这会返回一个带名称的摘要统计向量,其元素顺序与您输出中的顺序一致。然后,您可以用 rbind
将返回的值及其名称追加到您的频率表(table)中:
# Compute the summary statistics for column R and append them, one row per
# statistic, below the rows already in R.freq.
R.stats <- myStats(df[["R"]])
rbind(
  R.freq,
  data.frame(question = "R", answer = names(R.stats), value = R.stats)
)
是的,绝对可以简化。通常您会使用汇总函数,例如
# Frequency counts plus summary statistics for one answer column.
#
# x:      vector of answers; NA marks a missing answer.
# levels: the answer values to count (each gets an entry even when its
#         count is 0).
#
# Returns a named numeric vector: one count per level, the count of NA
# values, then sum, length, mean and sd. sum, mean and sd are computed on
# the non-missing values; length is the full length of x, NA included.
smry <- function(x, levels) {
  observed <- na.omit(x)
  counts <- table(factor(x, levels = levels), useNA = "always", exclude = NULL)
  moments <- c(
    sum = sum(observed),
    length = length(x),
    mean = mean(observed),
    sd = sqrt(var(observed))
  )
  c(counts, moments)
}
然后将其应用于数据的不同子集
> lapply(df[c('R', 'S')], smry, 0:10)
$R
0 1 2 3 4 5 6 7
0.000000 1.000000 2.000000 1.000000 1.000000 1.000000 1.000000 1.000000
8 9 10 <NA> sum length mean sd
0.000000 1.000000 0.000000 1.000000 39.000000 10.000000 4.333333 2.645751
$S
0 1 2 3 4 5 6 7
0.00000 0.00000 0.00000 1.00000 0.00000 1.00000 1.00000 3.00000
8 9 10 <NA> sum length mean sd
1.00000 1.00000 0.00000 2.00000 52.00000 10.00000 6.50000 1.85164
如果您绝对必须将所有内容放入数据框中
> as.data.frame(as.table(simplify2array(lapply(df[c('R', 'S')], smry, 0:10))))
Var1 Var2 Freq
1 0 R 0.000000
2 1 R 1.000000
3 2 R 2.000000
4 3 R 1.000000
5 4 R 1.000000
6 5 R 1.000000
7 6 R 1.000000
8 7 R 1.000000
9 8 R 0.000000
10 9 R 1.000000
11 10 R 0.000000
12 <NA> R 1.000000
13 sum R 39.000000
14 length R 10.000000
15 mean R 4.333333
16 sd R 2.645751
17 0 S 0.000000
18 1 S 0.000000
19 2 S 0.000000
20 3 S 1.000000
21 4 S 0.000000
22 5 S 1.000000
23 6 S 1.000000
24 7 S 3.000000
25 8 S 1.000000
26 9 S 1.000000
27 10 S 0.000000
28 <NA> S 2.000000
29 sum S 52.000000
30 length S 10.000000
31 mean S 6.500000
32 sd S 1.851640
然后根据需要更改列名/添加列。