编辑 R 中的默认汇总函数会给出多个变量的错误

Question

我正在扩展默认的 summary() 函数，因为我需要更多的百分位数。它似乎对一个变量工作正常，但如果我添加一个包含多个变量的数据框，我会得到奇怪的值，而使用默认的 summary() 它可以工作。即使我完全复制了默认的汇总函数，所以没有添加更多的百分位数，它也不起作用。我使用这一行来获取代码：

getS3method('summary','default')

-

summary_adj <- function (object, ..., digits = max(3L, getOption("digits") - 
    3L)) 
{
    if (is.factor(object)) 
        return(summary.factor(object, ...))
    else if (is.matrix(object)) 
        return(summary.matrix(object, digits = digits, ...))
    value <- if (is.logical(object)) 
        c(Mode = "logical", {
            tb <- table(object, exclude = NULL)
            if (!is.null(n <- dimnames(tb)[[1L]]) && any(iN <- is.na(n))) dimnames(tb)[[1L]][iN] <- "NA's"
            tb
        })
    else if (is.numeric(object)) {
        nas <- is.na(object)
        object <- object[!nas]
        qq <- stats::quantile(object)
        qq <- signif(c(qq[1L:3L], mean(object), qq[4L:5L]), digits)
        names(qq) <- c("Min.", "1st Qu.", "Median", "Mean", "3rd Qu.", 
            "Max.")
        if (any(nas)) 
            c(qq, `NA's` = sum(nas))
        else qq
    }
    else if (is.recursive(object) && !is.language(object) && 
        (n <- length(object))) {
        sumry <- array("", c(n, 3L), list(names(object), c("Length", 
            "Class", "Mode")))
        ll <- numeric(n)
        for (i in 1L:n) {
            ii <- object[[i]]
            ll[i] <- length(ii)
            cls <- oldClass(ii)
            sumry[i, 2L] <- if (length(cls)) 
                cls[1L]
            else "-none-"
            sumry[i, 3L] <- mode(ii)
        }
        sumry[, 1L] <- format(as.integer(ll))
        sumry
    }
    else c(Length = length(object), Class = class(object), Mode = mode(object))
    class(value) <- c("summaryDefault", "table")
    value
}

示例数据集：

nums <- data.frame(var1=rnorm(n=20,mean=5,sd=2),var2=rnorm(n=20,mean=10,sd=4))

-

> summary(nums)
      var1            var2       
 Min.   :1.821   Min.   : 5.095  
 1st Qu.:3.705   1st Qu.: 7.827  
 Median :4.930   Median :10.440  
 Mean   :4.975   Mean   :10.176  
 3rd Qu.:6.553   3rd Qu.:12.247  
 Max.   :7.802   Max.   :14.862  
> summary_adj(nums)
     Length Class  Mode   
var1 20     -none- numeric
var2 20     -none- numeric

但它适用于 1 个变量：

 > summary_adj(nums$var1)
       Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
      1.821   3.705   4.930   4.975   6.553   7.802

所以它似乎不适用于具有多个变量的数据框.. 非常感谢任何帮助！

此致，蒂姆

@ 根据要求编辑我添加了用于不同分位数的代码：

summary_adj<-function (object, ..., digits = max(3L, getOption("digits") - 
                                                3L)) 
{
  if (is.factor(object)) 
    return(summary.factor(object, ...))
  else if (is.matrix(object)) 
    return(summary.matrix(object, digits = digits, ...))
  value <- if (is.logical(object)) 
    c(Mode = "logical", {
      tb <- table(object, exclude = NULL)
      if (!is.null(n <- dimnames(tb)[[1L]]) && any(iN <- is.na(n))) dimnames(tb)[[1L]][iN] <- "NA's"
      tb
    })
  else if (is.numeric(object)) {
    nas <- is.na(object)
    object <- object[!nas]
    #qq <- stats::quantile(object)
    qq <- stats::quantile(object,c(.05,.25,.5,.75,.95,1))
    qq <- signif(c(qq[1L:3L], mean(object), qq[4L:6L],NROW(object)), digits)
    names(qq) <- c("5th Perc", "25th Perc", "Median","Mean", "75th Perc","95th Perc", 
                   "Max.","obs.")
    if (any(nas)) 
      c(qq, `NA's` = sum(nas))
    else qq
  }
  else if (is.recursive(object) && !is.language(object) && 
             (n <- length(object))) {
    sumry <- array("", c(n, 3L), list(names(object), c("Length", 
                                                       "Class", "Mode")))
    ll <- numeric(n)
    for (i in 1L:n) {
      ii <- object[[i]]
      ll[i] <- length(ii)
      cls <- oldClass(ii)
      sumry[i, 2L] <- if (length(cls)) 
        cls[1L]
      else "-none-"
      sumry[i, 3L] <- mode(ii)
    }
    sumry[, 1L] <- format(as.integer(ll))
    sumry
  }
  else c(Length = length(object), Class = class(object), Mode = mode(object))
  class(value) <- c("summaryDefault", "table")
  value
}

这适用于我的 df 中的一个变量：

summary_adj(nums$var1)
 5th Perc 25th Perc    Median      Mean 75th Perc 95th Perc      Max.      obs. 
    1.984     3.705     4.930     4.975     6.553     7.491     7.802    20.000

但不适合所有人：

> summary_adj(nums)
     Length Class  Mode   
var1 20     -none- numeric
var2 20     -none- numeric

而它与普通摘要一样：

> summary(nums)
      var1            var2       
 Min.   :1.821   Min.   : 5.095  
 1st Qu.:3.705   1st Qu.: 7.827  
 Median :4.930   Median :10.440  
 Mean   :4.975   Mean   :10.176  
 3rd Qu.:6.553   3rd Qu.:12.247  
 Max.   :7.802   Max.   :14.862

Answer 1

您可以使用 'getS3method(summary.data.frame)' 作为原型定义一个新函数 summary_adj.data.frame 函数。请注意，我将 z 分配行更改为 lapply.

使用 summary_adj.data.frame(df) 而非 summary_adj(df) 调用它。欢迎评论如何覆盖数据框的 summary_adj。

summary_adj.data.frame<- function (object, maxsum = 7L, digits = max(3L, getOption("digits") - 
                                                3L), ...) 
{
    ncw <- function(x) {
        z <- nchar(x, type = "w")
        if (any(na <- is.na(z))) {
            z[na] <- nchar(encodeString(z[na]), "b")
        }
        z
    }
    z <- lapply(X = as.list(object), FUN = summary_adj, maxsum = maxsum, 
                digits = 12L, ...)
    nv <- length(object)
    nm <- names(object)
    lw <- numeric(nv)
    nr <- if (nv) 
        max(unlist(lapply(z, NROW)))
    else 0
    for (i in seq_len(nv)) {
        sms <- z[[i]]
        if (is.matrix(sms)) {
            cn <- paste(nm[i], gsub("^ +", "", colnames(sms), 
                                    useBytes = TRUE), sep = ".")
            tmp <- format(sms)
            if (nrow(sms) < nr) 
                tmp <- rbind(tmp, matrix("", nr - nrow(sms), 
                                         ncol(sms)))
            sms <- apply(tmp, 1L, function(x) paste(x, collapse = "  "))
            wid <- sapply(tmp[1L, ], nchar, type = "w")
            blanks <- paste(character(max(wid)), collapse = " ")
            wcn <- ncw(cn)
            pad0 <- floor((wid - wcn)/2)
            pad1 <- wid - wcn - pad0
            cn <- paste0(substring(blanks, 1L, pad0), cn, substring(blanks, 
                                                                    1L, pad1))
            nm[i] <- paste(cn, collapse = "  ")
            z[[i]] <- sms
        }
        else {
            sms <- format(sms, digits = digits)
            lbs <- format(names(sms))
            sms <- paste0(lbs, ":", sms, "  ")
            lw[i] <- ncw(lbs[1L])
            length(sms) <- nr
            z[[i]] <- sms
        }
    }
    if (nv) {
        z <- unlist(z, use.names = TRUE)
        dim(z) <- c(nr, nv)
        if (anyNA(lw)) 
            warning("probably wrong encoding in names(.) of column ", 
                    paste(which(is.na(lw)), collapse = ", "))
        blanks <- paste(character(max(lw, na.rm = TRUE) + 2L), 
                        collapse = " ")
        pad <- floor(lw - ncw(nm)/2)
        nm <- paste0(substring(blanks, 1, pad), nm)
        dimnames(z) <- list(rep.int("", nr), nm)
    }
    else {
        z <- character()
        dim(z) <- c(nr, nv)
    }
    attr(z, "class") <- c("table")
    z
}

编辑 R 中的默认汇总函数会给出多个变量的错误

Edit default summary function in R gives error for multiple variables

customization

r

function

summary

s