在 lm 中使用权重时更正 df
Correcting for the df's when using weights in lm
基本上我的问题很简单。我想知道 lm
函数内部的数据子集(NA
被删除的位置),基于公式中使用的所有变量。我想知道的原因是因为我想只使用子集数据(NA
被删除)而不是我的完整数据集来对变量求和。
背景:
我尝试将 R
中的 lm
函数改编为 lm
中的 。我认为最简单的解决方案是从函数内部计算子集数据的权重总和,具体取决于所选变量。因此,首先我根据所选变量查看 lm
数据集的子集,然后我计算 sum(ind$weight_freq)
,我想将其作为函数的输出之一提供,以便我可以参考它。
示例数据:
library(dplyr)
set.seed(1024)
# individual (true) dataset
x <- round(rnorm(1e5))
y <- round(x + x^2 + rnorm(1e5))
ind <- data.frame(x, y)
# Create an NA value
ind[1,1] <- NA
ind <- ind %>%
group_by(x, y) %>%
summarize(weight_freq= n())
我开始时只是将 lm 代码复制到一个新函数中,并将所有 <-
替换为 <<-
,以查看数据在何处被子集化,使用 lm_plus(y ~ x, data = ind, weights = weight_freq)
,以及虽然我收到错误 Error in lm_plus(y ~ x, data = ind, weights = weight_freq) : argument "offset" is missing, with no default
,但代码已经足够进行子集化了(因为 mf
应该是 99999
,因为 NA
,它是) :
lm_plus <- function (formula, data, subset, weights, na.action, method = "qr",
model = TRUE, x = FALSE, y = FALSE, qr = TRUE, singular.ok = TRUE,
contrasts = NULL, offset, ...)
{
ret.x <<- x
ret.y <<- y
cl <<- match.call()
mf <<- match.call(expand.dots = FALSE)
m <<- match(c("formula", "data", "subset",
"weights", "na.action", "offset"),
names(mf), 0L)
mf <<- mf[c(1L, m)]
mf$drop.unused.levels <<- TRUE
mf[[1L]] <<- quote(stats::model.frame)
mf <<- eval(mf, parent.frame())
if (method == "model.frame")
return(mf)
else if (method != "qr")
warning(gettextf("method = '%s' is not supported. Using 'qr'",
method), domain = NA)
mt <<- attr(mf, "terms")
y <<- model.response(mf, "numeric")
w <<- as.vector(model.weights(mf))
if (!is.null(w) && !is.numeric(w))
stop("'weights' must be a numeric vector")
offset <<- model.offset(mf)
mlm <<- is.matrix(y)
ny <<- if (mlm)
nrow(y)
else length(y)
if (!is.null(offset)) {
if (!mlm)
offset <<- as.vector(offset)
if (NROW(offset) != ny)
stop(gettextf("number of offsets is %d, should equal %d (number of observations)",
NROW(offset), ny), domain = NA)
}
if (is.empty.model(mt)) {
x <<- NULL
z <<- list(coefficients = if (mlm) matrix(NA_real_, 0,
ncol(y)) else numeric(), residuals = y, fitted.values = 0 *
y, weights = w, rank = 0L, df.residual = if (!is.null(w)) sum(w !=
0) else ny)
if (!is.null(offset)) {
z$fitted.values <<- offset
z$residuals <<- y - offset
}
}
else {
x <<- model.matrix(mt, mf, contrasts)
z <<- if (is.null(w))
lm.fit(x, y, offset = offset, singular.ok = singular.ok,
...)
else lm.wfit(x, y, w, offset = offset, singular.ok = singular.ok,
...)
}
class(z) <<- c(if (mlm) "mlm", "lm")
z$na.action <<- attr(mf, "na.action")
z$offset <<- offset
z$contrasts <<- attr(x, "contrasts")
z$xlevels <<- .getXlevels(mt, mf)
z$call <<- cl
z$terms <<- mt
if (model)
z$model <<- mf
if (ret.x)
z$x <<- x
if (ret.y)
z$y <<- y
if (!qr)
z$qr <<- NULL
z
}
然后我尝试将每个 mf
实例重命名为 mf1
、mf2
、mf3
,以查看 mf
的实际子集位置,但是我被卡住了,因为我遇到了错误(尽管我认为我确保我在 mf
之间正确地引用了。
我也试着在各处输入 test <<- sum(mf$weights, na.rm=TRUE)
,但没有成功。
有没有人可以帮助我在正确的地方求和权重?
这个有效:
编辑:作为一个小笔记。我 认为 最好完全覆盖该函数。我认为它不会有任何缺点,我注意到当您不使用 lm
.
时,其他一些依赖 lm 的函数开始抱怨
lm_plus <- function (formula, data, subset, weights, na.action, method = "qr",
model = TRUE, x = FALSE, y = FALSE, qr = TRUE, singular.ok = TRUE,
contrasts = NULL, offset, ...)
{
ret.x <- x
ret.y <- y
cl <- match.call()
mf <- match.call(expand.dots = FALSE)
m <- match(c("formula", "data", "subset",
"weights", "na.action", "offset"),
names(mf), 0L)
mf <- mf[c(1L, m)]
mf$drop.unused.levels <- TRUE
mf[[1L]] <- quote(stats::model.frame)
mf <- eval(mf, parent.frame())
# test <<- sum(mf$weights, na.rm=TRUE)
if (method == "model.frame")
return(mf)
else if (method != "qr")
warning(gettextf("method = '%s' is not supported. Using 'qr'",
method), domain = NA)
mt <- attr(mf, "terms")
y <- model.response(mf, "numeric")
w <- as.vector(model.weights(mf))
sum_of_weights <- sum(as.vector(model.weights(mf)))
if (!is.null(w) && !is.numeric(w))
stop("'weights' must be a numeric vector")
offset <- model.offset(mf)
mlm <- is.matrix(y)
ny <- if (mlm)
nrow(y)
else length(y)
if (!is.null(offset)) {
if (!mlm)
offset <- as.vector(offset)
if (NROW(offset) != ny)
stop(gettextf("number of offsets is %d, should equal %d (number of observations)",
NROW(offset), ny), domain = NA)
}
if (is.empty.model(mt)) {
x <- NULL
z <- list(coefficients = if (mlm) matrix(NA_real_, 0,
ncol(y)) else numeric(), residuals = y, fitted.values = 0 *
y, weights = w, rank = 0L, df.residual = if (!is.null(w)) sum(w !=
0) else ny)
if (!is.null(offset)) {
z$fitted.values <- offset
z$residuals <- y - offset
}
}
else {
x <- model.matrix(mt, mf, contrasts)
z <- if (is.null(w))
lm.fit(x, y, offset = offset, singular.ok = singular.ok,
...)
else lm.wfit(x, y, w, offset = offset, singular.ok = singular.ok,
...)
}
class(z) <- c(if (mlm) "mlm", "lm")
z$na.action <- attr(mf, "na.action")
z$offset <- offset
z$contrasts <- attr(x, "contrasts")
z$xlevels <- .getXlevels(mt, mf)
z$call <- cl
z$terms <- mt
if (model)
z$model <- mf
if (ret.x)
z$x <- x
if (ret.y)
z$y <- y
if (!qr)
z$qr <- NULL
z$df.residual <- sum_of_weights - length(coef(z))
z
}
示例数据:
library(dplyr)
library(modelsummary)
set.seed(1024)
# individual (true) dataset
x <- round(rnorm(1e5))
y <- round(x + x^2 + rnorm(1e5))
ind <- data.frame(x, y)
# aggregated dataset
agg <- ind %>%
group_by(x, y) %>%
summarize(freq = n())
# Note that the last entry uses lm_plus
models <- list(
"True" = lm(y ~ x, data = ind),
"Aggregated" = lm(y ~ x, data = agg),
"Aggregated & W" = lm(y ~ x, data = agg, weights=freq),
"Aggregated & W & DF" = lm_plus(y ~ x, data = agg, weights=freq)
)
modelsummary(models, fmt=5)
基本上我的问题很简单。我想知道 lm
函数内部的数据子集(NA
被删除的位置),基于公式中使用的所有变量。我想知道的原因是因为我想只使用子集数据(NA
被删除)而不是我的完整数据集来对变量求和。
背景:
我尝试将 R
中的 lm
函数改编为 lm
中的 lm
数据集的子集,然后我计算 sum(ind$weight_freq)
,我想将其作为函数的输出之一提供,以便我可以参考它。
示例数据:
library(dplyr)
set.seed(1024)
# individual (true) dataset
x <- round(rnorm(1e5))
y <- round(x + x^2 + rnorm(1e5))
ind <- data.frame(x, y)
# Create an NA value
ind[1,1] <- NA
ind <- ind %>%
group_by(x, y) %>%
summarize(weight_freq= n())
我开始时只是将 lm 代码复制到一个新函数中,并将所有 <-
替换为 <<-
,以查看数据在何处被子集化,使用 lm_plus(y ~ x, data = ind, weights = weight_freq)
,以及虽然我收到错误 Error in lm_plus(y ~ x, data = ind, weights = weight_freq) : argument "offset" is missing, with no default
,但代码已经足够进行子集化了(因为 mf
应该是 99999
,因为 NA
,它是) :
lm_plus <- function (formula, data, subset, weights, na.action, method = "qr",
model = TRUE, x = FALSE, y = FALSE, qr = TRUE, singular.ok = TRUE,
contrasts = NULL, offset, ...)
{
ret.x <<- x
ret.y <<- y
cl <<- match.call()
mf <<- match.call(expand.dots = FALSE)
m <<- match(c("formula", "data", "subset",
"weights", "na.action", "offset"),
names(mf), 0L)
mf <<- mf[c(1L, m)]
mf$drop.unused.levels <<- TRUE
mf[[1L]] <<- quote(stats::model.frame)
mf <<- eval(mf, parent.frame())
if (method == "model.frame")
return(mf)
else if (method != "qr")
warning(gettextf("method = '%s' is not supported. Using 'qr'",
method), domain = NA)
mt <<- attr(mf, "terms")
y <<- model.response(mf, "numeric")
w <<- as.vector(model.weights(mf))
if (!is.null(w) && !is.numeric(w))
stop("'weights' must be a numeric vector")
offset <<- model.offset(mf)
mlm <<- is.matrix(y)
ny <<- if (mlm)
nrow(y)
else length(y)
if (!is.null(offset)) {
if (!mlm)
offset <<- as.vector(offset)
if (NROW(offset) != ny)
stop(gettextf("number of offsets is %d, should equal %d (number of observations)",
NROW(offset), ny), domain = NA)
}
if (is.empty.model(mt)) {
x <<- NULL
z <<- list(coefficients = if (mlm) matrix(NA_real_, 0,
ncol(y)) else numeric(), residuals = y, fitted.values = 0 *
y, weights = w, rank = 0L, df.residual = if (!is.null(w)) sum(w !=
0) else ny)
if (!is.null(offset)) {
z$fitted.values <<- offset
z$residuals <<- y - offset
}
}
else {
x <<- model.matrix(mt, mf, contrasts)
z <<- if (is.null(w))
lm.fit(x, y, offset = offset, singular.ok = singular.ok,
...)
else lm.wfit(x, y, w, offset = offset, singular.ok = singular.ok,
...)
}
class(z) <<- c(if (mlm) "mlm", "lm")
z$na.action <<- attr(mf, "na.action")
z$offset <<- offset
z$contrasts <<- attr(x, "contrasts")
z$xlevels <<- .getXlevels(mt, mf)
z$call <<- cl
z$terms <<- mt
if (model)
z$model <<- mf
if (ret.x)
z$x <<- x
if (ret.y)
z$y <<- y
if (!qr)
z$qr <<- NULL
z
}
然后我尝试将每个 mf
实例重命名为 mf1
、mf2
、mf3
,以查看 mf
的实际子集位置,但是我被卡住了,因为我遇到了错误(尽管我认为我确保我在 mf
之间正确地引用了。
我也试着在各处输入 test <<- sum(mf$weights, na.rm=TRUE)
,但没有成功。
有没有人可以帮助我在正确的地方求和权重?
这个有效:
编辑:作为一个小笔记。我 认为 最好完全覆盖该函数。我认为它不会有任何缺点,我注意到当您不使用 lm
.
lm_plus <- function (formula, data, subset, weights, na.action, method = "qr",
model = TRUE, x = FALSE, y = FALSE, qr = TRUE, singular.ok = TRUE,
contrasts = NULL, offset, ...)
{
ret.x <- x
ret.y <- y
cl <- match.call()
mf <- match.call(expand.dots = FALSE)
m <- match(c("formula", "data", "subset",
"weights", "na.action", "offset"),
names(mf), 0L)
mf <- mf[c(1L, m)]
mf$drop.unused.levels <- TRUE
mf[[1L]] <- quote(stats::model.frame)
mf <- eval(mf, parent.frame())
# test <<- sum(mf$weights, na.rm=TRUE)
if (method == "model.frame")
return(mf)
else if (method != "qr")
warning(gettextf("method = '%s' is not supported. Using 'qr'",
method), domain = NA)
mt <- attr(mf, "terms")
y <- model.response(mf, "numeric")
w <- as.vector(model.weights(mf))
sum_of_weights <- sum(as.vector(model.weights(mf)))
if (!is.null(w) && !is.numeric(w))
stop("'weights' must be a numeric vector")
offset <- model.offset(mf)
mlm <- is.matrix(y)
ny <- if (mlm)
nrow(y)
else length(y)
if (!is.null(offset)) {
if (!mlm)
offset <- as.vector(offset)
if (NROW(offset) != ny)
stop(gettextf("number of offsets is %d, should equal %d (number of observations)",
NROW(offset), ny), domain = NA)
}
if (is.empty.model(mt)) {
x <- NULL
z <- list(coefficients = if (mlm) matrix(NA_real_, 0,
ncol(y)) else numeric(), residuals = y, fitted.values = 0 *
y, weights = w, rank = 0L, df.residual = if (!is.null(w)) sum(w !=
0) else ny)
if (!is.null(offset)) {
z$fitted.values <- offset
z$residuals <- y - offset
}
}
else {
x <- model.matrix(mt, mf, contrasts)
z <- if (is.null(w))
lm.fit(x, y, offset = offset, singular.ok = singular.ok,
...)
else lm.wfit(x, y, w, offset = offset, singular.ok = singular.ok,
...)
}
class(z) <- c(if (mlm) "mlm", "lm")
z$na.action <- attr(mf, "na.action")
z$offset <- offset
z$contrasts <- attr(x, "contrasts")
z$xlevels <- .getXlevels(mt, mf)
z$call <- cl
z$terms <- mt
if (model)
z$model <- mf
if (ret.x)
z$x <- x
if (ret.y)
z$y <- y
if (!qr)
z$qr <- NULL
z$df.residual <- sum_of_weights - length(coef(z))
z
}
示例数据:
library(dplyr)
library(modelsummary)
set.seed(1024)
# individual (true) dataset
x <- round(rnorm(1e5))
y <- round(x + x^2 + rnorm(1e5))
ind <- data.frame(x, y)
# aggregated dataset
agg <- ind %>%
group_by(x, y) %>%
summarize(freq = n())
# Note that the last entry uses lm_plus
models <- list(
"True" = lm(y ~ x, data = ind),
"Aggregated" = lm(y ~ x, data = agg),
"Aggregated & W" = lm(y ~ x, data = agg, weights=freq),
"Aggregated & W & DF" = lm_plus(y ~ x, data = agg, weights=freq)
)
modelsummary(models, fmt=5)