在 lm 中使用权重时更正 df

Correcting for the df's when using weights in lm

基本上我的问题很简单。我想知道 lm 函数内部的数据子集(NA 被删除的位置),基于公式中使用的所有变量。我想知道的原因是因为我想只使用子集数据(NA 被删除)而不是我的完整数据集来对变量求和。

背景:

我尝试将 R 中的 lm 函数改编为 lm 中的 。我认为最简单的解决方案是从函数内部计算子集数据的权重总和,具体取决于所选变量。因此,首先我根据所选变量查看 lm 数据集的子集,然后我计算 sum(ind$weight_freq),我想将其作为函数的输出之一提供,以便我可以参考它。

示例数据:

library(dplyr)

set.seed(1024)

# individual (true) dataset
x <- round(rnorm(1e5))
y <- round(x + x^2 + rnorm(1e5))
ind <- data.frame(x, y)

# Create an NA value
ind[1,1] <- NA

ind <- ind %>%
  group_by(x, y) %>%
  summarize(weight_freq= n())

我开始时只是将 lm 代码复制到一个新函数中,并将所有 <- 替换为 <<-,以查看数据在何处被子集化,使用 lm_plus(y ~ x, data = ind, weights = weight_freq),以及虽然我收到错误 Error in lm_plus(y ~ x, data = ind, weights = weight_freq) : argument "offset" is missing, with no default,但代码已经足够进行子集化了(因为 mf 应该是 99999,因为 NA,它是) :

lm_plus <- function (formula, data, subset, weights, na.action, method = "qr", 
    model = TRUE, x = FALSE, y = FALSE, qr = TRUE, singular.ok = TRUE, 
    contrasts = NULL, offset, ...) 
{
    ret.x <<- x
    ret.y <<- y
    cl <<- match.call()
    mf <<- match.call(expand.dots = FALSE)
    m <<- match(c("formula", "data", "subset", 
        "weights", "na.action", "offset"), 
        names(mf), 0L)
    mf <<- mf[c(1L, m)]
    mf$drop.unused.levels <<- TRUE
    mf[[1L]] <<- quote(stats::model.frame)
    mf <<- eval(mf, parent.frame())
    if (method == "model.frame") 
        return(mf)
    else if (method != "qr") 
        warning(gettextf("method = '%s' is not supported. Using 'qr'", 
            method), domain = NA)
    mt <<- attr(mf, "terms")
    y <<- model.response(mf, "numeric")
    w <<- as.vector(model.weights(mf))
    if (!is.null(w) && !is.numeric(w)) 
        stop("'weights' must be a numeric vector")
    offset <<- model.offset(mf)
    mlm <<- is.matrix(y)
    ny <<- if (mlm) 
        nrow(y)
    else length(y)
    if (!is.null(offset)) {
        if (!mlm) 
            offset <<- as.vector(offset)
        if (NROW(offset) != ny) 
            stop(gettextf("number of offsets is %d, should equal %d (number of observations)", 
                NROW(offset), ny), domain = NA)
    }
    if (is.empty.model(mt)) {
        x <<- NULL
        z <<- list(coefficients = if (mlm) matrix(NA_real_, 0, 
            ncol(y)) else numeric(), residuals = y, fitted.values = 0 * 
            y, weights = w, rank = 0L, df.residual = if (!is.null(w)) sum(w != 
            0) else ny)
        if (!is.null(offset)) {
            z$fitted.values <<- offset
            z$residuals <<- y - offset
        }
    }
    else {
        x <<- model.matrix(mt, mf, contrasts)
        z <<- if (is.null(w)) 
            lm.fit(x, y, offset = offset, singular.ok = singular.ok, 
                ...)
        else lm.wfit(x, y, w, offset = offset, singular.ok = singular.ok, 
            ...)
    }
    class(z) <<- c(if (mlm) "mlm", "lm")
    z$na.action <<- attr(mf, "na.action")
    z$offset <<- offset
    z$contrasts <<- attr(x, "contrasts")
    z$xlevels <<- .getXlevels(mt, mf)
    z$call <<- cl
    z$terms <<- mt
    if (model) 
        z$model <<- mf
    if (ret.x) 
        z$x <<- x
    if (ret.y) 
        z$y <<- y
    if (!qr) 
        z$qr <<- NULL
    z
}

然后我尝试将每个 mf 实例重命名为 mf1mf2mf3,以查看 mf 的实际子集位置,但是我被卡住了,因为我遇到了错误(尽管我认为我确保我在 mf 之间正确地引用了。

我也试着在各处输入 test <<- sum(mf$weights, na.rm=TRUE),但没有成功。

有没有人可以帮助我在正确的地方求和权重?

这个有效:

编辑:作为一个小笔记。我 认为 最好完全覆盖该函数。我认为它不会有任何缺点,我注意到当您不使用 lm.

时,其他一些依赖 lm 的函数开始抱怨
lm_plus <- function (formula, data, subset, weights, na.action, method = "qr", 
    model = TRUE, x = FALSE, y = FALSE, qr = TRUE, singular.ok = TRUE, 
    contrasts = NULL, offset, ...) 
{
    ret.x <- x
    ret.y <- y
    cl <- match.call()
    mf <- match.call(expand.dots = FALSE)
    m <- match(c("formula", "data", "subset", 
        "weights", "na.action", "offset"), 
        names(mf), 0L)
    mf <- mf[c(1L, m)]
    mf$drop.unused.levels <- TRUE
    mf[[1L]] <- quote(stats::model.frame)
    mf <- eval(mf, parent.frame())
    # test <<- sum(mf$weights, na.rm=TRUE)
    if (method == "model.frame") 
        return(mf)
    else if (method != "qr") 
        warning(gettextf("method = '%s' is not supported. Using 'qr'", 
            method), domain = NA)
    mt <- attr(mf, "terms")
    y <- model.response(mf, "numeric")
    w <- as.vector(model.weights(mf))
    sum_of_weights <- sum(as.vector(model.weights(mf)))
    if (!is.null(w) && !is.numeric(w)) 
        stop("'weights' must be a numeric vector")
    offset <- model.offset(mf)
    mlm <- is.matrix(y)
    ny <- if (mlm) 
        nrow(y)
    else length(y)
    if (!is.null(offset)) {
        if (!mlm) 
            offset <- as.vector(offset)
        if (NROW(offset) != ny) 
            stop(gettextf("number of offsets is %d, should equal %d (number of observations)", 
                NROW(offset), ny), domain = NA)
    }
    if (is.empty.model(mt)) {
        x <- NULL
        z <- list(coefficients = if (mlm) matrix(NA_real_, 0, 
            ncol(y)) else numeric(), residuals = y, fitted.values = 0 * 
            y, weights = w, rank = 0L, df.residual = if (!is.null(w)) sum(w != 
            0) else ny)
        if (!is.null(offset)) {
            z$fitted.values <- offset
            z$residuals <- y - offset
        }
    }
    else {
        x <- model.matrix(mt, mf, contrasts)
        z <- if (is.null(w)) 
            lm.fit(x, y, offset = offset, singular.ok = singular.ok, 
                ...)
        else lm.wfit(x, y, w, offset = offset, singular.ok = singular.ok, 
            ...)
    }
    class(z) <- c(if (mlm) "mlm", "lm")
    z$na.action <- attr(mf, "na.action")
    z$offset <- offset
    z$contrasts <- attr(x, "contrasts")
    z$xlevels <- .getXlevels(mt, mf)
    z$call <- cl
    z$terms <- mt
    if (model) 
        z$model <- mf
    if (ret.x) 
        z$x <- x
    if (ret.y) 
        z$y <- y
    if (!qr) 
        z$qr <- NULL
    z$df.residual <- sum_of_weights - length(coef(z))
    z
}

示例数据:

library(dplyr)
library(modelsummary)

set.seed(1024)

# individual (true) dataset
x <- round(rnorm(1e5))
y <- round(x + x^2 + rnorm(1e5))
ind <- data.frame(x, y)

# aggregated dataset
agg <- ind %>%
  group_by(x, y) %>%
  summarize(freq = n())

# Note that the last entry uses lm_plus
models <- list( 
  "True"                = lm(y ~ x, data = ind),
  "Aggregated"          = lm(y ~ x, data = agg),
  "Aggregated & W"      = lm(y ~ x, data = agg, weights=freq),
  "Aggregated & W & DF" = lm_plus(y ~ x, data = agg, weights=freq)
)

modelsummary(models, fmt=5)