R - 获取包含数据框指定百分位数的摘要表
R - Get a summary table containing specified percentile levels for a dataframe
我想获得一个摘要表，它显示的内容比 R 中 summary(x) 函数生成的典型描述性统计信息更多，例如第 10 百分位数和第 90 百分位数。
我在网上找到的其他答案所推荐的方法可以得到这些结果，但不是以表格形式呈现。
我一直在寻找一种方法，可以在 summary(x) 函数生成的摘要表中添加指定的百分位数。
示例数据如下:
# Sample data: three numeric columns (a, b, d) and one character column (c).
df <- data.frame(
  "a" = 1:10,
  "b" = seq(10, 100, by = 10),
  "c" = letters[1:10],
  "d" = seq(5, 95, by = 10)
)
# Build a summary table with one row per statistic (the requested
# percentiles plus the mean) and one column per numeric variable of `df`.

# generate data
df <- data.frame(
  "a" = seq(1, 10), "b" = seq(10, 100, 10),
  "c" = letters[seq(1, 10)], "d" = seq(5, 95, 10)
)

# keep only the numeric columns (the character column "c" is dropped)
ndf <- Filter(is.numeric, df)
features <- colnames(ndf)

# percentiles required, given as probabilities in [0, 1]; this is the only
# vector that has to be edited when levels are added or removed
p_reqd <- c(0, 0.10, 0.25, 0.5, 0.75, 0.90, 0.95, 1)

# Row labels are derived from p_reqd so they can never drift out of sync
# with it. Probabilities 0 and 1 are shown as "Min" and "Max".
q_lab <- paste0(100 * p_reqd, "%")
q_lab[p_reqd == 0] <- "Min"
q_lab[p_reqd == 1] <- "Max"

# The mean is inserted right after the median when 0.5 is requested;
# otherwise it becomes the last row (nomatch = append at the end).
mean_after <- match(0.5, p_reqd, nomatch = length(p_reqd))
p_lev <- append(q_lab, "Mean", after = mean_after)

# empty data frame whose row names are the statistic labels
final <- data.frame(row.names = p_lev)

# one column per numeric feature
for (i in features) {
  x <- ndf[[i]]
  # unname() drops quantile()'s own "0%", "10%", ... names; the row names of
  # `final` already carry the labels. The mean is rounded to 2 decimals.
  final[[i]] <- append(
    unname(quantile(x, probs = p_reqd)),
    round(mean(x), 2),
    after = mean_after
  )
}

# custom summary table
final
也可以使用 dplyr 和 tidyr 来实现。
# Same summary table built with dplyr + tidyr: every statistic is computed
# for every numeric column, then the one-row result is reshaped so that
# statistics become rows and variables become columns.
df <- data.frame(
  "a" = seq(1, 10), "b" = seq(10, 100, 10),
  "c" = letters[seq(1, 10)], "d" = seq(5, 95, 10)
)
library(dplyr)
library(tidyr)

out <- df %>%
  # across() replaces the superseded summarise_if(); result columns are
  # named "<column>_<statistic>", e.g. "a_10%".
  summarise(across(
    where(is.numeric),
    .fns = list(
      "Min" = min,
      # names = FALSE keeps quantile() from attaching its own "10%" names
      "10%" = function(x) quantile(x, .1, names = FALSE),
      "25%" = function(x) quantile(x, .25, names = FALSE),
      "50%" = median,
      "Mean" = mean,
      "75%" = function(x) quantile(x, .75, names = FALSE),
      "90%" = function(x) quantile(x, .90, names = FALSE),
      "Max" = max
    )
  )) %>%
  # split "<column>_<statistic>" into a variable name and a statistic label
  pivot_longer(
    cols = everything(),
    names_pattern = "(.*)_(.*)",
    names_to = c("var", "stat"),
    values_to = "vals"
  ) %>%
  # one row per statistic, one column per variable
  pivot_wider(
    names_from = "var",
    values_from = "vals",
    id_cols = "stat"
  ) %>%
  as.data.frame()

# move the statistic labels from a column into the row names
rownames(out) <- out$stat
out <- out %>% select(-stat)
out
# a b d
# Min 1.00 10.0 5.0
# 10% 1.90 19.0 14.0
# 25% 3.25 32.5 27.5
# 50% 5.50 55.0 50.0
# Mean 5.50 55.0 50.0
# 75% 7.75 77.5 72.5
# 90% 9.10 91.0 86.0
# Max 10.00 100.0 95.0
我想获得一个摘要表，它显示的内容比 R 中 summary(x) 函数生成的典型描述性统计信息更多，例如第 10 百分位数和第 90 百分位数。
我在网上找到的其他答案所推荐的方法可以得到这些结果，但不是以表格形式呈现。
我一直在寻找一种方法，可以在 summary(x) 函数生成的摘要表中添加指定的百分位数。
示例数据如下:
# Sample data: three numeric columns (a, b, d) and one character column (c).
df <- data.frame(
  "a" = 1:10,
  "b" = seq(10, 100, by = 10),
  "c" = letters[1:10],
  "d" = seq(5, 95, by = 10)
)
# Build a summary table with one row per statistic (the requested
# percentiles plus the mean) and one column per numeric variable of `df`.

# generate data
df <- data.frame(
  "a" = seq(1, 10), "b" = seq(10, 100, 10),
  "c" = letters[seq(1, 10)], "d" = seq(5, 95, 10)
)

# keep only the numeric columns (the character column "c" is dropped)
ndf <- Filter(is.numeric, df)
features <- colnames(ndf)

# percentiles required, given as probabilities in [0, 1]; this is the only
# vector that has to be edited when levels are added or removed
p_reqd <- c(0, 0.10, 0.25, 0.5, 0.75, 0.90, 0.95, 1)

# Row labels are derived from p_reqd so they can never drift out of sync
# with it. Probabilities 0 and 1 are shown as "Min" and "Max".
q_lab <- paste0(100 * p_reqd, "%")
q_lab[p_reqd == 0] <- "Min"
q_lab[p_reqd == 1] <- "Max"

# The mean is inserted right after the median when 0.5 is requested;
# otherwise it becomes the last row (nomatch = append at the end).
mean_after <- match(0.5, p_reqd, nomatch = length(p_reqd))
p_lev <- append(q_lab, "Mean", after = mean_after)

# empty data frame whose row names are the statistic labels
final <- data.frame(row.names = p_lev)

# one column per numeric feature
for (i in features) {
  x <- ndf[[i]]
  # unname() drops quantile()'s own "0%", "10%", ... names; the row names of
  # `final` already carry the labels. The mean is rounded to 2 decimals.
  final[[i]] <- append(
    unname(quantile(x, probs = p_reqd)),
    round(mean(x), 2),
    after = mean_after
  )
}

# custom summary table
final
也可以使用 dplyr 和 tidyr 来实现。
# Same summary table built with dplyr + tidyr: every statistic is computed
# for every numeric column, then the one-row result is reshaped so that
# statistics become rows and variables become columns.
df <- data.frame(
  "a" = seq(1, 10), "b" = seq(10, 100, 10),
  "c" = letters[seq(1, 10)], "d" = seq(5, 95, 10)
)
library(dplyr)
library(tidyr)

out <- df %>%
  # across() replaces the superseded summarise_if(); result columns are
  # named "<column>_<statistic>", e.g. "a_10%".
  summarise(across(
    where(is.numeric),
    .fns = list(
      "Min" = min,
      # names = FALSE keeps quantile() from attaching its own "10%" names
      "10%" = function(x) quantile(x, .1, names = FALSE),
      "25%" = function(x) quantile(x, .25, names = FALSE),
      "50%" = median,
      "Mean" = mean,
      "75%" = function(x) quantile(x, .75, names = FALSE),
      "90%" = function(x) quantile(x, .90, names = FALSE),
      "Max" = max
    )
  )) %>%
  # split "<column>_<statistic>" into a variable name and a statistic label
  pivot_longer(
    cols = everything(),
    names_pattern = "(.*)_(.*)",
    names_to = c("var", "stat"),
    values_to = "vals"
  ) %>%
  # one row per statistic, one column per variable
  pivot_wider(
    names_from = "var",
    values_from = "vals",
    id_cols = "stat"
  ) %>%
  as.data.frame()

# move the statistic labels from a column into the row names
rownames(out) <- out$stat
out <- out %>% select(-stat)
out
# a b d
# Min 1.00 10.0 5.0
# 10% 1.90 19.0 14.0
# 25% 3.25 32.5 27.5
# 50% 5.50 55.0 50.0
# Mean 5.50 55.0 50.0
# 75% 7.75 77.5 72.5
# 90% 9.10 91.0 86.0
# Max 10.00 100.0 95.0