编辑 R 中的默认汇总函数会给出多个变量的错误
Edit default summary function in R gives error for multiple variables
我正在扩展默认的 summary() 函数,因为我需要更多的百分位数。它似乎对一个变量工作正常,但如果我添加一个包含多个变量的数据框,我会得到奇怪的值,而使用默认的 summary() 它可以工作。即使我完全复制了默认的汇总函数,所以没有添加更多的百分位数,它也不起作用。我使用这一行来获取代码:
getS3method('summary','default')
-
summary_adj <- function (object, ..., digits = max(3L, getOption("digits") -
3L))
{
if (is.factor(object))
return(summary.factor(object, ...))
else if (is.matrix(object))
return(summary.matrix(object, digits = digits, ...))
value <- if (is.logical(object))
c(Mode = "logical", {
tb <- table(object, exclude = NULL)
if (!is.null(n <- dimnames(tb)[[1L]]) && any(iN <- is.na(n))) dimnames(tb)[[1L]][iN] <- "NA's"
tb
})
else if (is.numeric(object)) {
nas <- is.na(object)
object <- object[!nas]
qq <- stats::quantile(object)
qq <- signif(c(qq[1L:3L], mean(object), qq[4L:5L]), digits)
names(qq) <- c("Min.", "1st Qu.", "Median", "Mean", "3rd Qu.",
"Max.")
if (any(nas))
c(qq, `NA's` = sum(nas))
else qq
}
else if (is.recursive(object) && !is.language(object) &&
(n <- length(object))) {
sumry <- array("", c(n, 3L), list(names(object), c("Length",
"Class", "Mode")))
ll <- numeric(n)
for (i in 1L:n) {
ii <- object[[i]]
ll[i] <- length(ii)
cls <- oldClass(ii)
sumry[i, 2L] <- if (length(cls))
cls[1L]
else "-none-"
sumry[i, 3L] <- mode(ii)
}
sumry[, 1L] <- format(as.integer(ll))
sumry
}
else c(Length = length(object), Class = class(object), Mode = mode(object))
class(value) <- c("summaryDefault", "table")
value
}
示例数据集:
nums <- data.frame(var1=rnorm(n=20,mean=5,sd=2),var2=rnorm(n=20,mean=10,sd=4))
-
> summary(nums)
var1 var2
Min. :1.821 Min. : 5.095
1st Qu.:3.705 1st Qu.: 7.827
Median :4.930 Median :10.440
Mean :4.975 Mean :10.176
3rd Qu.:6.553 3rd Qu.:12.247
Max. :7.802 Max. :14.862
> summary_adj(nums)
Length Class Mode
var1 20 -none- numeric
var2 20 -none- numeric
但它适用于 1 个变量:
> summary_adj(nums$var1)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.821 3.705 4.930 4.975 6.553 7.802
所以它似乎不适用于具有多个变量的数据框..
非常感谢任何帮助!
此致,
蒂姆
@ 根据要求编辑我添加了用于不同分位数的代码:
summary_adj<-function (object, ..., digits = max(3L, getOption("digits") -
3L))
{
if (is.factor(object))
return(summary.factor(object, ...))
else if (is.matrix(object))
return(summary.matrix(object, digits = digits, ...))
value <- if (is.logical(object))
c(Mode = "logical", {
tb <- table(object, exclude = NULL)
if (!is.null(n <- dimnames(tb)[[1L]]) && any(iN <- is.na(n))) dimnames(tb)[[1L]][iN] <- "NA's"
tb
})
else if (is.numeric(object)) {
nas <- is.na(object)
object <- object[!nas]
#qq <- stats::quantile(object)
qq <- stats::quantile(object,c(.05,.25,.5,.75,.95,1))
qq <- signif(c(qq[1L:3L], mean(object), qq[4L:6L],NROW(object)), digits)
names(qq) <- c("5th Perc", "25th Perc", "Median","Mean", "75th Perc","95th Perc",
"Max.","obs.")
if (any(nas))
c(qq, `NA's` = sum(nas))
else qq
}
else if (is.recursive(object) && !is.language(object) &&
(n <- length(object))) {
sumry <- array("", c(n, 3L), list(names(object), c("Length",
"Class", "Mode")))
ll <- numeric(n)
for (i in 1L:n) {
ii <- object[[i]]
ll[i] <- length(ii)
cls <- oldClass(ii)
sumry[i, 2L] <- if (length(cls))
cls[1L]
else "-none-"
sumry[i, 3L] <- mode(ii)
}
sumry[, 1L] <- format(as.integer(ll))
sumry
}
else c(Length = length(object), Class = class(object), Mode = mode(object))
class(value) <- c("summaryDefault", "table")
value
}
这适用于我的 df 中的一个变量:
summary_adj(nums$var1)
5th Perc 25th Perc Median Mean 75th Perc 95th Perc Max. obs.
1.984 3.705 4.930 4.975 6.553 7.491 7.802 20.000
但不适合所有人:
> summary_adj(nums)
Length Class Mode
var1 20 -none- numeric
var2 20 -none- numeric
而它与普通摘要一样:
> summary(nums)
var1 var2
Min. :1.821 Min. : 5.095
1st Qu.:3.705 1st Qu.: 7.827
Median :4.930 Median :10.440
Mean :4.975 Mean :10.176
3rd Qu.:6.553 3rd Qu.:12.247
Max. :7.802 Max. :14.862
您可以使用 'getS3method(summary.data.frame)' 作为原型定义一个新函数 summary_adj.data.frame
函数。请注意,我将 z
分配行更改为 lapply
.
使用 summary_adj.data.frame(df)
而非 summary_adj(df)
调用它。欢迎评论如何覆盖数据框的 summary_adj
。
summary_adj.data.frame<- function (object, maxsum = 7L, digits = max(3L, getOption("digits") -
3L), ...)
{
ncw <- function(x) {
z <- nchar(x, type = "w")
if (any(na <- is.na(z))) {
z[na] <- nchar(encodeString(z[na]), "b")
}
z
}
z <- lapply(X = as.list(object), FUN = summary_adj, maxsum = maxsum,
digits = 12L, ...)
nv <- length(object)
nm <- names(object)
lw <- numeric(nv)
nr <- if (nv)
max(unlist(lapply(z, NROW)))
else 0
for (i in seq_len(nv)) {
sms <- z[[i]]
if (is.matrix(sms)) {
cn <- paste(nm[i], gsub("^ +", "", colnames(sms),
useBytes = TRUE), sep = ".")
tmp <- format(sms)
if (nrow(sms) < nr)
tmp <- rbind(tmp, matrix("", nr - nrow(sms),
ncol(sms)))
sms <- apply(tmp, 1L, function(x) paste(x, collapse = " "))
wid <- sapply(tmp[1L, ], nchar, type = "w")
blanks <- paste(character(max(wid)), collapse = " ")
wcn <- ncw(cn)
pad0 <- floor((wid - wcn)/2)
pad1 <- wid - wcn - pad0
cn <- paste0(substring(blanks, 1L, pad0), cn, substring(blanks,
1L, pad1))
nm[i] <- paste(cn, collapse = " ")
z[[i]] <- sms
}
else {
sms <- format(sms, digits = digits)
lbs <- format(names(sms))
sms <- paste0(lbs, ":", sms, " ")
lw[i] <- ncw(lbs[1L])
length(sms) <- nr
z[[i]] <- sms
}
}
if (nv) {
z <- unlist(z, use.names = TRUE)
dim(z) <- c(nr, nv)
if (anyNA(lw))
warning("probably wrong encoding in names(.) of column ",
paste(which(is.na(lw)), collapse = ", "))
blanks <- paste(character(max(lw, na.rm = TRUE) + 2L),
collapse = " ")
pad <- floor(lw - ncw(nm)/2)
nm <- paste0(substring(blanks, 1, pad), nm)
dimnames(z) <- list(rep.int("", nr), nm)
}
else {
z <- character()
dim(z) <- c(nr, nv)
}
attr(z, "class") <- c("table")
z
}
我正在扩展默认的 summary() 函数,因为我需要更多的百分位数。它似乎对一个变量工作正常,但如果我添加一个包含多个变量的数据框,我会得到奇怪的值,而使用默认的 summary() 它可以工作。即使我完全复制了默认的汇总函数,所以没有添加更多的百分位数,它也不起作用。我使用这一行来获取代码:
getS3method('summary','default')
-
summary_adj <- function (object, ..., digits = max(3L, getOption("digits") -
3L))
{
if (is.factor(object))
return(summary.factor(object, ...))
else if (is.matrix(object))
return(summary.matrix(object, digits = digits, ...))
value <- if (is.logical(object))
c(Mode = "logical", {
tb <- table(object, exclude = NULL)
if (!is.null(n <- dimnames(tb)[[1L]]) && any(iN <- is.na(n))) dimnames(tb)[[1L]][iN] <- "NA's"
tb
})
else if (is.numeric(object)) {
nas <- is.na(object)
object <- object[!nas]
qq <- stats::quantile(object)
qq <- signif(c(qq[1L:3L], mean(object), qq[4L:5L]), digits)
names(qq) <- c("Min.", "1st Qu.", "Median", "Mean", "3rd Qu.",
"Max.")
if (any(nas))
c(qq, `NA's` = sum(nas))
else qq
}
else if (is.recursive(object) && !is.language(object) &&
(n <- length(object))) {
sumry <- array("", c(n, 3L), list(names(object), c("Length",
"Class", "Mode")))
ll <- numeric(n)
for (i in 1L:n) {
ii <- object[[i]]
ll[i] <- length(ii)
cls <- oldClass(ii)
sumry[i, 2L] <- if (length(cls))
cls[1L]
else "-none-"
sumry[i, 3L] <- mode(ii)
}
sumry[, 1L] <- format(as.integer(ll))
sumry
}
else c(Length = length(object), Class = class(object), Mode = mode(object))
class(value) <- c("summaryDefault", "table")
value
}
示例数据集:
nums <- data.frame(var1=rnorm(n=20,mean=5,sd=2),var2=rnorm(n=20,mean=10,sd=4))
-
> summary(nums)
var1 var2
Min. :1.821 Min. : 5.095
1st Qu.:3.705 1st Qu.: 7.827
Median :4.930 Median :10.440
Mean :4.975 Mean :10.176
3rd Qu.:6.553 3rd Qu.:12.247
Max. :7.802 Max. :14.862
> summary_adj(nums)
Length Class Mode
var1 20 -none- numeric
var2 20 -none- numeric
但它适用于 1 个变量:
> summary_adj(nums$var1)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.821 3.705 4.930 4.975 6.553 7.802
所以它似乎不适用于具有多个变量的数据框.. 非常感谢任何帮助!
此致, 蒂姆
@ 根据要求编辑我添加了用于不同分位数的代码:
summary_adj<-function (object, ..., digits = max(3L, getOption("digits") -
3L))
{
if (is.factor(object))
return(summary.factor(object, ...))
else if (is.matrix(object))
return(summary.matrix(object, digits = digits, ...))
value <- if (is.logical(object))
c(Mode = "logical", {
tb <- table(object, exclude = NULL)
if (!is.null(n <- dimnames(tb)[[1L]]) && any(iN <- is.na(n))) dimnames(tb)[[1L]][iN] <- "NA's"
tb
})
else if (is.numeric(object)) {
nas <- is.na(object)
object <- object[!nas]
#qq <- stats::quantile(object)
qq <- stats::quantile(object,c(.05,.25,.5,.75,.95,1))
qq <- signif(c(qq[1L:3L], mean(object), qq[4L:6L],NROW(object)), digits)
names(qq) <- c("5th Perc", "25th Perc", "Median","Mean", "75th Perc","95th Perc",
"Max.","obs.")
if (any(nas))
c(qq, `NA's` = sum(nas))
else qq
}
else if (is.recursive(object) && !is.language(object) &&
(n <- length(object))) {
sumry <- array("", c(n, 3L), list(names(object), c("Length",
"Class", "Mode")))
ll <- numeric(n)
for (i in 1L:n) {
ii <- object[[i]]
ll[i] <- length(ii)
cls <- oldClass(ii)
sumry[i, 2L] <- if (length(cls))
cls[1L]
else "-none-"
sumry[i, 3L] <- mode(ii)
}
sumry[, 1L] <- format(as.integer(ll))
sumry
}
else c(Length = length(object), Class = class(object), Mode = mode(object))
class(value) <- c("summaryDefault", "table")
value
}
这适用于我的 df 中的一个变量:
summary_adj(nums$var1)
5th Perc 25th Perc Median Mean 75th Perc 95th Perc Max. obs.
1.984 3.705 4.930 4.975 6.553 7.491 7.802 20.000
但不适合所有人:
> summary_adj(nums)
Length Class Mode
var1 20 -none- numeric
var2 20 -none- numeric
而它与普通摘要一样:
> summary(nums)
var1 var2
Min. :1.821 Min. : 5.095
1st Qu.:3.705 1st Qu.: 7.827
Median :4.930 Median :10.440
Mean :4.975 Mean :10.176
3rd Qu.:6.553 3rd Qu.:12.247
Max. :7.802 Max. :14.862
您可以使用 'getS3method(summary.data.frame)' 作为原型定义一个新函数 summary_adj.data.frame
函数。请注意,我将 z
分配行更改为 lapply
.
使用 summary_adj.data.frame(df)
而非 summary_adj(df)
调用它。欢迎评论如何覆盖数据框的 summary_adj
。
summary_adj.data.frame<- function (object, maxsum = 7L, digits = max(3L, getOption("digits") -
3L), ...)
{
ncw <- function(x) {
z <- nchar(x, type = "w")
if (any(na <- is.na(z))) {
z[na] <- nchar(encodeString(z[na]), "b")
}
z
}
z <- lapply(X = as.list(object), FUN = summary_adj, maxsum = maxsum,
digits = 12L, ...)
nv <- length(object)
nm <- names(object)
lw <- numeric(nv)
nr <- if (nv)
max(unlist(lapply(z, NROW)))
else 0
for (i in seq_len(nv)) {
sms <- z[[i]]
if (is.matrix(sms)) {
cn <- paste(nm[i], gsub("^ +", "", colnames(sms),
useBytes = TRUE), sep = ".")
tmp <- format(sms)
if (nrow(sms) < nr)
tmp <- rbind(tmp, matrix("", nr - nrow(sms),
ncol(sms)))
sms <- apply(tmp, 1L, function(x) paste(x, collapse = " "))
wid <- sapply(tmp[1L, ], nchar, type = "w")
blanks <- paste(character(max(wid)), collapse = " ")
wcn <- ncw(cn)
pad0 <- floor((wid - wcn)/2)
pad1 <- wid - wcn - pad0
cn <- paste0(substring(blanks, 1L, pad0), cn, substring(blanks,
1L, pad1))
nm[i] <- paste(cn, collapse = " ")
z[[i]] <- sms
}
else {
sms <- format(sms, digits = digits)
lbs <- format(names(sms))
sms <- paste0(lbs, ":", sms, " ")
lw[i] <- ncw(lbs[1L])
length(sms) <- nr
z[[i]] <- sms
}
}
if (nv) {
z <- unlist(z, use.names = TRUE)
dim(z) <- c(nr, nv)
if (anyNA(lw))
warning("probably wrong encoding in names(.) of column ",
paste(which(is.na(lw)), collapse = ", "))
blanks <- paste(character(max(lw, na.rm = TRUE) + 2L),
collapse = " ")
pad <- floor(lw - ncw(nm)/2)
nm <- paste0(substring(blanks, 1, pad), nm)
dimnames(z) <- list(rep.int("", nr), nm)
}
else {
z <- character()
dim(z) <- c(nr, nv)
}
attr(z, "class") <- c("table")
z
}