如何计算 R 中数值变量和分类变量的描述性统计?
How to calculate descriptive statistics for both numeric and categorical variables in R?
我正在尝试编写一个函数来计算数值变量和分类变量(因子)的描述性统计数据。
对于数值型变量,要计算均值(MEAN)、中位数(MEDIAN)、标准差(SD),统计缺失值的个数(NMiss)。
对于字符型变量,应列出变量每个水平(level)的计数,并统计缺失值的个数。
起始输入数据为:
ID GLUC TGL HDL LDL HRT MAMM SMOKE
1 A 88 NA 32 99 Y <NA> ever
2 B NA 150 60 NA <NA> no never
3 C 110 NA NA 120 N <NA> <NA>
4 D NA 200 65 165 <NA> yes never
我希望它看起来像这样:
> table1 (dat=patient, numvar=c("TGL", "HDL", "LDL"), charvar=c("HRT", "MAMM"))
$numericStats
varName MEAN MEDIAN SD NMiss
1 TGL 180.66667 180.0 23.03620 4
2 HDL 55.66667 62.5 19.00175 4
3 LDL 160.28571 165.0 40.06126 3
$FactorStats
varName group count
1 HRT N 2
2 Y 3
3 NMiss 5
4 MAMM no 2
5 yes 4
6 NMiss 4
这是我目前的代码:
#numericstats
# Summarise numeric columns of a data frame.
#
# dat:    data frame holding the columns to summarise.
# numvar: character vector naming the numeric columns of `dat`.
#
# Returns a data frame with one row per entry of `numvar` and the columns
# varname, mean, median, sd (NAs ignored, rounded to 2 decimals) and
# nmissing (number of NA values). An empty `numvar` gives a zero-row frame.
# Stops with an error if a requested column is not present in `dat`.
findnum <- function(dat, numvar) {
  unknown <- setdiff(numvar, names(dat))
  if (length(unknown) > 0L) {
    stop("columns not found in `dat`: ", paste(unknown, collapse = ", "),
         call. = FALSE)
  }
  if (length(numvar) == 0L) {
    return(data.frame(varname = character(0), mean = numeric(0),
                      median = numeric(0), sd = numeric(0),
                      nmissing = integer(0), stringsAsFactors = FALSE))
  }
  # One single-row data frame per variable. Iterating over the names
  # themselves visits every variable, not only the last one.
  rows <- lapply(numvar, function(v) {
    x <- dat[[v]]
    data.frame(
      varname = v,
      mean = round(mean(x, na.rm = TRUE), 2),
      median = round(median(x, na.rm = TRUE), 2),
      sd = round(sd(x, na.rm = TRUE), 2),
      nmissing = sum(is.na(x)),
      stringsAsFactors = FALSE
    )
  })
  # Stack the rows once at the end instead of overwriting inside a loop.
  numstats <- do.call(rbind, rows)
  rownames(numstats) <- NULL
  numstats
}
# Example call: numeric summaries for the TGL, HDL and LDL columns of `patient`.
findnum(dat=patient, numvar=c("TGL","HDL","LDL"))
#factorstats
# Tabulate categorical (character/factor) columns of a data frame.
#
# dat:     data frame holding the columns to tabulate.
# charvar: character vector naming the categorical columns of `dat`.
#
# Returns a data frame with the columns varname, group and count. For each
# variable there is one row per observed level, followed by an "NMiss" row
# holding the number of NA values. An empty `charvar` gives a zero-row frame.
# Stops with an error if a requested column is not present in `dat`.
findfactor <- function(dat, charvar) {
  unknown <- setdiff(charvar, names(dat))
  if (length(unknown) > 0L) {
    stop("columns not found in `dat`: ", paste(unknown, collapse = ", "),
         call. = FALSE)
  }
  if (length(charvar) == 0L) {
    return(data.frame(varname = character(0), group = character(0),
                      count = integer(0), stringsAsFactors = FALSE))
  }
  rows <- lapply(charvar, function(v) {
    x <- dat[[v]]
    # table() drops NA by default, so the missing count is appended separately.
    level_counts <- table(x)
    data.frame(
      varname = v,
      group = c(names(level_counts), "NMiss"),
      count = c(as.integer(level_counts), sum(is.na(x))),
      stringsAsFactors = FALSE
    )
  })
  factstats <- do.call(rbind, rows)
  rownames(factstats) <- NULL
  factstats
}
# Example call: level counts for the MAMM and SMOKE columns of `patient`.
findfactor(dat=patient, charvar=c("MAMM","SMOKE"))
#full function
# Build the combined descriptive-statistics table.
#
# dat:     data frame holding the variables to summarise.
# numvar:  character vector naming the numeric columns of `dat`.
# charvar: character vector naming the categorical columns of `dat`.
#
# Returns a list with two elements: `numericStats`, the result of
# findnum(dat, numvar), and `FactorStats`, the result of
# findfactor(dat, charvar).
table1 <- function(dat, numvar, charvar) {
  # The caller already says which columns are numeric and which are
  # categorical, so each group is passed straight to its helper. No loop over
  # column indices is needed. The two results have different shapes, so they
  # are returned as a list rather than forced into one data frame.
  list(
    numericStats = findnum(dat, numvar),
    FactorStats = findfactor(dat, charvar)
  )
}
这是使用 lapply 的一种方法:
# Descriptive statistics for selected numeric and categorical columns.
#
# df:      data frame holding the variables to summarise.
# numvar:  character vector naming the numeric columns of `df`.
# charvar: character vector naming the categorical columns of `df`.
#
# Returns a list with two data frames:
#   numericStats - one row per numeric variable with VarName, MEAN, MEDIAN,
#                  SD (NAs ignored) and NMiss (number of NA values).
#   FactorStats  - one row per level of each categorical variable plus an
#                  "Nmiss" row, with the columns Varname, group and count.
table1 <- function(df, numvar, charvar) {
  list(numericStats = cbind(VarName = numvar, do.call(rbind,
    lapply(df[numvar], function(x) {
      # NMiss is the number of missing values, hence is.na() and not !is.na().
      data.frame(MEAN = mean(x, na.rm = TRUE), MEDIAN = median(x, na.rm = TRUE),
                 SD = sd(x, na.rm = TRUE), NMiss = sum(is.na(x)))
    }))),
  FactorStats = do.call(rbind, lapply(charvar, function(x) {
    # Level counts followed by the NA count; stack() turns the named vector
    # into two columns, which are reordered to (name, value).
    tab <- stack(c(table(df[[x]]), Nmiss = sum(is.na(df[[x]]))))[2:1]
    names(tab) <- c('group', 'count')
    cbind(Varname = x, tab)
  })))
}
table1(patient, numvar=c("TGL", "HDL", "LDL"), charvar=c("HRT", "MAMM"))
#$numericStats
# VarName MEAN MEDIAN SD NMiss
#TGL TGL 175.0 175 35.4 2
#HDL HDL 52.3 60 17.8 1
#LDL LDL 128.0 120 33.7 1
#$FactorStats
# Varname group count
#1 HRT N 1
#2 HRT Y 1
#3 HRT Nmiss 2
#4 MAMM no 1
#5 MAMM yes 1
#6 MAMM Nmiss 2
数据
# Example data: four rows (IDs A-D) with the integer columns GLUC, TGL, HDL
# and LDL and the character columns HRT, MAMM and SMOKE. Every column except
# ID contains at least one NA.
patient <- data.frame(
  ID = c("A", "B", "C", "D"),
  GLUC = c(88L, NA, 110L, NA),
  TGL = c(NA, 150L, NA, 200L),
  HDL = c(32L, 60L, NA, 65L),
  LDL = c(99L, NA, 120L, 165L),
  HRT = c("Y", NA, "N", NA),
  MAMM = c(NA, "no", NA, "yes"),
  SMOKE = c("ever", "never", NA, "never"),
  stringsAsFactors = FALSE
)
我正在尝试编写一个函数来计算数值变量和分类变量(因子)的描述性统计数据。 对于数值型变量,要计算均值(MEAN)、中位数(MEDIAN)、标准差(SD),统计缺失值的个数(NMiss)。 对于字符型变量,应列出变量每个水平(level)的计数,并统计缺失值的个数。
起始输入数据为:
ID GLUC TGL HDL LDL HRT MAMM SMOKE
1 A 88 NA 32 99 Y <NA> ever
2 B NA 150 60 NA <NA> no never
3 C 110 NA NA 120 N <NA> <NA>
4 D NA 200 65 165 <NA> yes never
我希望它看起来像这样:
> table1 (dat=patient, numvar=c("TGL", "HDL", "LDL"), charvar=c("HRT", "MAMM"))
$numericStats
varName MEAN MEDIAN SD NMiss
1 TGL 180.66667 180.0 23.03620 4
2 HDL 55.66667 62.5 19.00175 4
3 LDL 160.28571 165.0 40.06126 3
$FactorStats
varName group count
1 HRT N 2
2 Y 3
3 NMiss 5
4 MAMM no 2
5 yes 4
6 NMiss 4
这是我目前的代码:
#numericstats
# Summarise numeric columns of a data frame.
#
# dat:    data frame holding the columns to summarise.
# numvar: character vector naming the numeric columns of `dat`.
#
# Returns a data frame with one row per entry of `numvar` and the columns
# varname, mean, median, sd (NAs ignored, rounded to 2 decimals) and
# nmissing (number of NA values). An empty `numvar` gives a zero-row frame.
# Stops with an error if a requested column is not present in `dat`.
findnum <- function(dat, numvar) {
  unknown <- setdiff(numvar, names(dat))
  if (length(unknown) > 0L) {
    stop("columns not found in `dat`: ", paste(unknown, collapse = ", "),
         call. = FALSE)
  }
  if (length(numvar) == 0L) {
    return(data.frame(varname = character(0), mean = numeric(0),
                      median = numeric(0), sd = numeric(0),
                      nmissing = integer(0), stringsAsFactors = FALSE))
  }
  # One single-row data frame per variable. Iterating over the names
  # themselves visits every variable, not only the last one.
  rows <- lapply(numvar, function(v) {
    x <- dat[[v]]
    data.frame(
      varname = v,
      mean = round(mean(x, na.rm = TRUE), 2),
      median = round(median(x, na.rm = TRUE), 2),
      sd = round(sd(x, na.rm = TRUE), 2),
      nmissing = sum(is.na(x)),
      stringsAsFactors = FALSE
    )
  })
  # Stack the rows once at the end instead of overwriting inside a loop.
  numstats <- do.call(rbind, rows)
  rownames(numstats) <- NULL
  numstats
}
# Example call: numeric summaries for the TGL, HDL and LDL columns of `patient`.
findnum(dat=patient, numvar=c("TGL","HDL","LDL"))
#factorstats
# Tabulate categorical (character/factor) columns of a data frame.
#
# dat:     data frame holding the columns to tabulate.
# charvar: character vector naming the categorical columns of `dat`.
#
# Returns a data frame with the columns varname, group and count. For each
# variable there is one row per observed level, followed by an "NMiss" row
# holding the number of NA values. An empty `charvar` gives a zero-row frame.
# Stops with an error if a requested column is not present in `dat`.
findfactor <- function(dat, charvar) {
  unknown <- setdiff(charvar, names(dat))
  if (length(unknown) > 0L) {
    stop("columns not found in `dat`: ", paste(unknown, collapse = ", "),
         call. = FALSE)
  }
  if (length(charvar) == 0L) {
    return(data.frame(varname = character(0), group = character(0),
                      count = integer(0), stringsAsFactors = FALSE))
  }
  rows <- lapply(charvar, function(v) {
    x <- dat[[v]]
    # table() drops NA by default, so the missing count is appended separately.
    level_counts <- table(x)
    data.frame(
      varname = v,
      group = c(names(level_counts), "NMiss"),
      count = c(as.integer(level_counts), sum(is.na(x))),
      stringsAsFactors = FALSE
    )
  })
  factstats <- do.call(rbind, rows)
  rownames(factstats) <- NULL
  factstats
}
# Example call: level counts for the MAMM and SMOKE columns of `patient`.
findfactor(dat=patient, charvar=c("MAMM","SMOKE"))
#full function
# Build the combined descriptive-statistics table.
#
# dat:     data frame holding the variables to summarise.
# numvar:  character vector naming the numeric columns of `dat`.
# charvar: character vector naming the categorical columns of `dat`.
#
# Returns a list with two elements: `numericStats`, the result of
# findnum(dat, numvar), and `FactorStats`, the result of
# findfactor(dat, charvar).
table1 <- function(dat, numvar, charvar) {
  # The caller already says which columns are numeric and which are
  # categorical, so each group is passed straight to its helper. No loop over
  # column indices is needed. The two results have different shapes, so they
  # are returned as a list rather than forced into one data frame.
  list(
    numericStats = findnum(dat, numvar),
    FactorStats = findfactor(dat, charvar)
  )
}
这是使用 lapply 的一种方法:
# Descriptive statistics for selected numeric and categorical columns.
#
# df:      data frame holding the variables to summarise.
# numvar:  character vector naming the numeric columns of `df`.
# charvar: character vector naming the categorical columns of `df`.
#
# Returns a list with two data frames:
#   numericStats - one row per numeric variable with VarName, MEAN, MEDIAN,
#                  SD (NAs ignored) and NMiss (number of NA values).
#   FactorStats  - one row per level of each categorical variable plus an
#                  "Nmiss" row, with the columns Varname, group and count.
table1 <- function(df, numvar, charvar) {
  list(numericStats = cbind(VarName = numvar, do.call(rbind,
    lapply(df[numvar], function(x) {
      # NMiss is the number of missing values, hence is.na() and not !is.na().
      data.frame(MEAN = mean(x, na.rm = TRUE), MEDIAN = median(x, na.rm = TRUE),
                 SD = sd(x, na.rm = TRUE), NMiss = sum(is.na(x)))
    }))),
  FactorStats = do.call(rbind, lapply(charvar, function(x) {
    # Level counts followed by the NA count; stack() turns the named vector
    # into two columns, which are reordered to (name, value).
    tab <- stack(c(table(df[[x]]), Nmiss = sum(is.na(df[[x]]))))[2:1]
    names(tab) <- c('group', 'count')
    cbind(Varname = x, tab)
  })))
}
table1(patient, numvar=c("TGL", "HDL", "LDL"), charvar=c("HRT", "MAMM"))
#$numericStats
# VarName MEAN MEDIAN SD NMiss
#TGL TGL 175.0 175 35.4 2
#HDL HDL 52.3 60 17.8 1
#LDL LDL 128.0 120 33.7 1
#$FactorStats
# Varname group count
#1 HRT N 1
#2 HRT Y 1
#3 HRT Nmiss 2
#4 MAMM no 1
#5 MAMM yes 1
#6 MAMM Nmiss 2
数据
# Example data: four rows (IDs A-D) with the integer columns GLUC, TGL, HDL
# and LDL and the character columns HRT, MAMM and SMOKE. Every column except
# ID contains at least one NA.
patient <- data.frame(
  ID = c("A", "B", "C", "D"),
  GLUC = c(88L, NA, 110L, NA),
  TGL = c(NA, 150L, NA, 200L),
  HDL = c(32L, 60L, NA, 65L),
  LDL = c(99L, NA, 120L, 165L),
  HRT = c("Y", NA, "N", NA),
  MAMM = c(NA, "no", NA, "yes"),
  SMOKE = c("ever", "never", NA, "never"),
  stringsAsFactors = FALSE
)