防止空数据进入 R 中的 lm() 调用?
Prevent empty data to enter an lm() call in R?
我正在尝试想出一种机制来防止 lm()
输出中出现空结果。确切地说,我想先找到它们,然后阻止它们进入新的 lm()
调用。
例如,在下面的例子中,tmp
输出中的cf.type99:time3
&cf.type99:time4
是NA
。因此,在检测到它们之后,我想防止它们被包含在第二个 lm()
调用中(new_tmp
)。
这在 R 中可行吗?
p.s。我希望 new_tmp
不适合其 model.matrix()
中的 cf.type99:time3
和 cf.type99:time4
,而不仅仅是从其输出中物理删除 NA。
d5 <- read.csv('https://raw.githubusercontent.com/rnorouzian/m/master/v14.csv')
d5[c("cf.type","time")] <- lapply(d5[c("cf.type","time")], as.factor)
tmp <- lm(dint~cf.type*time, data = d5)
(coef.na <- is.na(coef(tmp))) # detects the `NA` in the output:
# cf.type2:time3 cf.type3:time3 cf.type8:time3 cf.type99:time3 cf.type1:time4
# FALSE FALSE FALSE TRUE FALSE
# cf.type2:time4 cf.type3:time4 cf.type8:time4 cf.type99:time4
# FALSE FALSE FALSE TRUE
new_tmp <- lm(dint~cf.type*time, data = d5) # NOW PREVENT `cf.type99:time3` & `cf.type99:time4` from entering new_tmp
您可以将列从 model.matrix
、
中删除
X.star <- model.matrix(tmp)[,!is.na(tmp$coefficients)]
然后用lm.fit
计算系数,
y <- d5$dint
fit.star <- lm.fit(X.star, y)
(beta.hat <- fit.star$coe)
# (Intercept) cf.type1 cf.type2 cf.type3 cf.type8
# 0.72824752 -0.20633963 0.72317588 -0.08369434 0.17387840
# cf.type99 time2 time3 time4 cf.type1:time2
# -0.53932949 -0.49471558 -0.34560481 0.01830757 0.28441094
# cf.type2:time2 cf.type3:time2 cf.type8:time2 cf.type99:time2 cf.type1:time3
# -0.44254640 0.77070527 0.71423737 0.51203670 1.15793712
# cf.type2:time3 cf.type3:time3 cf.type8:time3 cf.type1:time4 cf.type2:time4
# -0.66306413 0.18818360 -0.12287313 -0.06825029 -1.06941453
# cf.type3:time4 cf.type8:time4
# -0.37337606 -0.06063658
以及可选的手动标准错误。
## Standard errors
y.hat <- X.star %*% beta.hat
e.hat <- y - y.hat
n <- dim(X.star)[2]
m <- dim(X.star)[1]
sig2 <- (t(e.hat) %*% e.hat) / (m - (n + 1))
res.se <- sqrt(sig2)
diag.xx <- diag(solve(t(X.star) %*% X.star))
beta.se <- as.vector(res.se)*sqrt(diag.xx)
结果
cbind(Estimate=beta.hat, `Std. Error`=beta.se, `t value`=beta.hat/beta.se,
`Pr(>|t|)`=2*pt(-abs(beta.hat/beta.se), df=fit.star$df.residual))
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 0.72824752 0.1420687 5.12602302 6.252427e-07
# cf.type1 -0.20633963 0.2460702 -0.83853955 4.025946e-01
# cf.type2 0.72317588 0.4921405 1.46945012 1.430713e-01
# cf.type3 -0.08369434 0.2751149 -0.30421597 7.612372e-01
# cf.type8 0.17387840 0.2231278 0.77927713 4.366141e-01
# cf.type99 -0.53932949 0.6935709 -0.77761265 4.375931e-01
# time2 -0.49471558 0.1813017 -2.72868700 6.847850e-03
# time3 -0.34560481 0.1834099 -1.88432993 6.077612e-02
# time4 0.01830757 0.2391373 0.07655675 9.390424e-01
# cf.type1:time2 0.28441094 0.3152098 0.90229103 3.678421e-01
# cf.type2:time2 -0.44254640 0.5208981 -0.84958339 3.964364e-01
# cf.type3:time2 0.77070527 0.4209305 1.83095614 6.839524e-02
# cf.type8:time2 0.71423737 0.4165119 1.71480662 8.772179e-02
# cf.type99:time2 0.51203670 0.8460178 0.60523161 5.456192e-01
# cf.type1:time3 1.15793712 0.4035018 2.86971958 4.489380e-03
# cf.type2:time3 -0.66306413 0.5522639 -1.20062913 2.311248e-01
# cf.type3:time3 0.18818360 0.4218428 0.44609884 6.559437e-01
# cf.type8:time3 -0.12287313 0.3559152 -0.34523149 7.302345e-01
# cf.type1:time4 -0.06825029 0.3912267 -0.17445200 8.616630e-01
# cf.type2:time4 -1.06941453 0.6066406 -1.76284686 7.924862e-02
# cf.type3:time4 -0.37337606 0.4196728 -0.88968377 3.745613e-01
# cf.type8:time4 -0.06063658 0.3477727 -0.17435692 8.617377e-01
我正在尝试想出一种机制来防止 lm()
输出中出现空结果。确切地说,我想先找到它们,然后阻止它们进入新的 lm()
调用。
例如,在下面的例子中,tmp
输出中的cf.type99:time3
&cf.type99:time4
是NA
。因此,在检测到它们之后,我想防止它们被包含在第二个 lm()
调用中(new_tmp
)。
这在 R 中可行吗?
p.s。我希望 new_tmp
不适合其 model.matrix()
中的 cf.type99:time3
和 cf.type99:time4
,而不仅仅是从其输出中物理删除 NA。
d5 <- read.csv('https://raw.githubusercontent.com/rnorouzian/m/master/v14.csv')
d5[c("cf.type","time")] <- lapply(d5[c("cf.type","time")], as.factor)
tmp <- lm(dint~cf.type*time, data = d5)
(coef.na <- is.na(coef(tmp))) # detects the `NA` in the output:
# cf.type2:time3 cf.type3:time3 cf.type8:time3 cf.type99:time3 cf.type1:time4
# FALSE FALSE FALSE TRUE FALSE
# cf.type2:time4 cf.type3:time4 cf.type8:time4 cf.type99:time4
# FALSE FALSE FALSE TRUE
new_tmp <- lm(dint~cf.type*time, data = d5) # NOW PREVENT `cf.type99:time3` & `cf.type99:time4` from entering new_tmp
您可以将列从 model.matrix
、
X.star <- model.matrix(tmp)[,!is.na(tmp$coefficients)]
然后用lm.fit
计算系数,
y <- d5$dint
fit.star <- lm.fit(X.star, y)
(beta.hat <- fit.star$coe)
# (Intercept) cf.type1 cf.type2 cf.type3 cf.type8
# 0.72824752 -0.20633963 0.72317588 -0.08369434 0.17387840
# cf.type99 time2 time3 time4 cf.type1:time2
# -0.53932949 -0.49471558 -0.34560481 0.01830757 0.28441094
# cf.type2:time2 cf.type3:time2 cf.type8:time2 cf.type99:time2 cf.type1:time3
# -0.44254640 0.77070527 0.71423737 0.51203670 1.15793712
# cf.type2:time3 cf.type3:time3 cf.type8:time3 cf.type1:time4 cf.type2:time4
# -0.66306413 0.18818360 -0.12287313 -0.06825029 -1.06941453
# cf.type3:time4 cf.type8:time4
# -0.37337606 -0.06063658
以及可选的手动标准错误。
## Standard errors
y.hat <- X.star %*% beta.hat
e.hat <- y - y.hat
n <- dim(X.star)[2]
m <- dim(X.star)[1]
sig2 <- (t(e.hat) %*% e.hat) / (m - (n + 1))
res.se <- sqrt(sig2)
diag.xx <- diag(solve(t(X.star) %*% X.star))
beta.se <- as.vector(res.se)*sqrt(diag.xx)
结果
cbind(Estimate=beta.hat, `Std. Error`=beta.se, `t value`=beta.hat/beta.se,
`Pr(>|t|)`=2*pt(-abs(beta.hat/beta.se), df=fit.star$df.residual))
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 0.72824752 0.1420687 5.12602302 6.252427e-07
# cf.type1 -0.20633963 0.2460702 -0.83853955 4.025946e-01
# cf.type2 0.72317588 0.4921405 1.46945012 1.430713e-01
# cf.type3 -0.08369434 0.2751149 -0.30421597 7.612372e-01
# cf.type8 0.17387840 0.2231278 0.77927713 4.366141e-01
# cf.type99 -0.53932949 0.6935709 -0.77761265 4.375931e-01
# time2 -0.49471558 0.1813017 -2.72868700 6.847850e-03
# time3 -0.34560481 0.1834099 -1.88432993 6.077612e-02
# time4 0.01830757 0.2391373 0.07655675 9.390424e-01
# cf.type1:time2 0.28441094 0.3152098 0.90229103 3.678421e-01
# cf.type2:time2 -0.44254640 0.5208981 -0.84958339 3.964364e-01
# cf.type3:time2 0.77070527 0.4209305 1.83095614 6.839524e-02
# cf.type8:time2 0.71423737 0.4165119 1.71480662 8.772179e-02
# cf.type99:time2 0.51203670 0.8460178 0.60523161 5.456192e-01
# cf.type1:time3 1.15793712 0.4035018 2.86971958 4.489380e-03
# cf.type2:time3 -0.66306413 0.5522639 -1.20062913 2.311248e-01
# cf.type3:time3 0.18818360 0.4218428 0.44609884 6.559437e-01
# cf.type8:time3 -0.12287313 0.3559152 -0.34523149 7.302345e-01
# cf.type1:time4 -0.06825029 0.3912267 -0.17445200 8.616630e-01
# cf.type2:time4 -1.06941453 0.6066406 -1.76284686 7.924862e-02
# cf.type3:time4 -0.37337606 0.4196728 -0.88968377 3.745613e-01
# cf.type8:time4 -0.06063658 0.3477727 -0.17435692 8.617377e-01