mice:将长格式的插补完成数据转换为 mids 对象
mice: Convert long completed data to a mids object
我有一个宽格式的纵向数据框,各波次(wave)通过列名后缀区分(例如 “_Wave1”、“_Wave2” 等),我想对其进行多重插补(multiple imputation)并转换为长格式。此处有一个类似但尚未得到解答的问题。
首先,我尝试在将数据从宽格式转换为长格式*之后*再进行插补,但我意识到 mice 会插补本不应该插补的数据——例如,它会为第一波数据收集(“Wave 1”)插补量表 1(scale_1),而量表 1 在第 1 波中根本没有施测,因此不应为该波插补。于是我尝试编辑预测矩阵(predictor matrix),但由于我在插补之前就已经转换成了长格式,所有关于波次的信息都丢失了。
然后,我尝试在从宽格式转换为长格式*之前*进行插补,但遇到了 as.mids 错误:从宽格式转换为长格式之后,.id 会出现重复值,从而导致 as.mids 出错。
如果有任何见解或建议,我将不胜感激。
library(mice, warn.conflicts = FALSE)
library(tidyverse)
library(missMethods)
set.seed(123)

# Simulated wide-format panel data: one row per participant, one column per
# scale-by-wave combination that was actually administered.
#   * Scale 1 wasn't given at Waves 1 or 2, only Wave 3
#   * Scale 3 wasn't given at Wave 3
# so scale_1_Wave1, scale_1_Wave2 and scale_3_Wave3 are deliberately absent.
#
# The columns are created directly inside data.frame(), listed in the order
# the random draws were originally made, so the seeded random stream is
# consumed the same way. No temporary vectors are left behind, which removes
# the need to wipe the workspace with rm(list = setdiff(ls(), "df")) -- a call
# that would also delete any unrelated objects the user had in their session.
df <- data.frame(
  studyid = 1:10,
  color = sample(
    c("red", "blue", "yellow"), 10,
    replace = TRUE, prob = c(0.25, 0.25, 0.5)
  ),
  scale_2_Wave1 = sample(1:25, 10, replace = TRUE),
  scale_3_Wave1 = sample(1:25, 10, replace = TRUE),
  scale_2_Wave2 = sample(1:25, 10, replace = TRUE),
  scale_3_Wave2 = sample(1:25, 10, replace = TRUE),
  scale_1_Wave3 = sample(1:25, 10, replace = TRUE),
  scale_2_Wave3 = sample(1:25, 10, replace = TRUE)
)

# make some missingness in the six scale columns (columns 3 to 8)
df <- delete_MCAR(df, p = 0.2, cols_mis = 3:8)
## Attempt #1: convert wide to long, then impute
# Stack every "*Wave*" column, split the column name into scale and wave,
# then spread the scales back out so each scale is its own column.
df_long <- df %>%
  pivot_longer(
    cols = contains("Wave"),
    names_to = "scalename",
    values_to = "scalevalue"
  ) %>%
  separate(col = scalename, into = c("scale", "Wave"), sep = "_Wave") %>%
  pivot_wider(names_from = scale, values_from = scalevalue)

imp1 <- mice(df_long, m = 5, print = FALSE)
#> Warning: Number of logged events: 2
complete1 <- complete(imp1, action = "long", include = TRUE)

# Scale 1 shouldn't have any data at Waves 1 or 2, but it does
ggplot(data = complete1, mapping = aes(x = Wave, y = scale_1)) +
  geom_boxplot()
#> Warning: Removed 22 rows containing non-finite values (stat_boxplot).
## Attempt #2: impute, then convert wide to long
imp2 <- mice(df, m = 5, print = FALSE)
#> Warning: Number of logged events: 2
complete2 <- complete(imp2, action = "long", include = TRUE)

# convert imputed data from wide to long
# (all stacked data sets are reshaped in one pass, so every participant keeps
# the same .id on each of its wave rows)
complete_long <- complete2 %>%
  pivot_longer(
    cols = contains("Wave"),
    names_to = "scalename",
    values_to = "scalevalue"
  ) %>%
  separate(col = scalename, into = c("scale", "Wave"), sep = "_Wave") %>%
  pivot_wider(names_from = scale, values_from = scalevalue)

# now try to convert it back to mids
# (this is the call that fails: .id is repeated once per wave)
imp_long <- as.mids(complete_long, .imp = 1, .id = 2)
#> Warning: non-unique values when setting 'row.names': '1', '2', '3', '4', '5',
#> '6', '7', '8', '9', '10'
#> Error in `.rowNamesDF<-`(x, value = value): duplicate 'row.names' are not allowed
Created on 2022-01-05 by the reprex package (v2.0.1)
这篇帖子(This post)帮助我为第二种方案(先插补,再转换为长格式)写出了一个解决办法(我更喜欢这种方案,因为它能让我更好地控制哪些数据会被插补、哪些不会)。
另外,如果你想在转换为长格式之后计算任何东西(例如一个新变量),请在 pivot_wider 之后、但在 mutate(.id = ...)(即重新编号 .id 的那一步)之前进行计算。
library(mice, warn.conflicts = FALSE)
library(tidyverse)
library(missMethods)
library(miceadds)
#> * miceadds 3.11-6 (2021-01-21 11:48:47)
set.seed(123)

# Wide-format example data written out literally: ten participants, one
# column per administered scale-by-wave combination, NA = missing response.
df <- data.frame(
  studyid = 1:10,
  color = c(
    "yellow", "red", "yellow", "red", "red",
    "yellow", "blue", "red", "blue", "yellow"
  ),
  scale_2_Wave1 = c(NA, NA, 22L, 25L, 5L, 19L, 25L, 25L, 9L, 3L),
  scale_3_Wave1 = c(8L, NA, 10L, NA, 19L, 4L, 14L, 17L, 11L, 7L),
  scale_2_Wave2 = c(21L, 12L, 15L, 10L, NA, NA, 9L, 9L, 10L, 23L),
  scale_3_Wave2 = c(21L, 7L, NA, 6L, 25L, 2L, 5L, 8L, NA, 13L),
  scale_1_Wave3 = c(18L, 1L, 25L, NA, 6L, NA, 15L, 9L, 15L, 16L),
  scale_2_Wave3 = c(20L, 6L, 11L, 8L, 22L, 22L, 7L, 16L, NA, NA),
  stringsAsFactors = FALSE
)
# Impute while the data are still wide, so that every scale-by-wave column is
# a separate variable for mice().
imp <- mice(df, m = 5, print = FALSE)
#> Warning: Number of logged events: 1

# Stack the data sets in long form; include = TRUE keeps the original
# (un-imputed) data as .imp == 0. Note that this data frame shares its name
# with the complete() function; calls to complete(...) still reach the
# function.
complete <- mice::complete(imp, action = "long", include = TRUE)

# Reshape ONE stacked data set (a single value of .imp) from wide to long and
# renumber .id, so that .id is unique within that data set.
wave_columns_to_long <- function(one_imp) {
  one_imp %>%
    pivot_longer(
      cols = contains("Wave"),
      names_to = "scalename",
      values_to = "scalevalue"
    ) %>%
    separate(col = scalename, into = c("scale", "Wave"), sep = "_Wave") %>%
    pivot_wider(names_from = scale, values_from = scalevalue) %>%
    # Any variables derived from the long data belong here: after
    # pivot_wider() and before .id is renumbered.
    mutate(.id = row_number())
}

# One long-format data set per value of .imp, in ascending .imp order. Built
# with lapply() rather than by growing a list inside a for loop.
working_dats <- lapply(
  unname(split(complete, complete$.imp)),
  wave_columns_to_long
)

# Re-stack the reshaped data sets and rebuild the mids object from them.
imputed_long <- as.mids(bind_rows(working_dats))

write.mice.imputation(
  mi.res = imputed_long,
  name = "imputed_long",
  mids2spss = FALSE
)
我有一个宽格式的纵向数据框,各波次(wave)通过列名后缀区分(例如 “_Wave1”、“_Wave2” 等),我想对其进行多重插补(multiple imputation)并转换为长格式。此处有一个类似但尚未得到解答的问题。
首先,我尝试在将数据从宽格式转换为长格式*之后*再进行插补,但我意识到 mice 会插补本不应该插补的数据——例如,它会为第一波数据收集(“Wave 1”)插补量表 1(scale_1),而量表 1 在第 1 波中根本没有施测,因此不应为该波插补。于是我尝试编辑预测矩阵(predictor matrix),但由于我在插补之前就已经转换成了长格式,所有关于波次的信息都丢失了。
然后,我尝试在从宽格式转换为长格式*之前*进行插补,但遇到了 as.mids 错误:从宽格式转换为长格式之后,.id 会出现重复值,从而导致 as.mids 出错。
如果有任何见解或建议,我将不胜感激。
library(mice, warn.conflicts = FALSE)
library(tidyverse)
library(missMethods)
set.seed(123)

# Simulated wide-format panel data: one row per participant, one column per
# scale-by-wave combination that was actually administered.
#   * Scale 1 wasn't given at Waves 1 or 2, only Wave 3
#   * Scale 3 wasn't given at Wave 3
# so scale_1_Wave1, scale_1_Wave2 and scale_3_Wave3 are deliberately absent.
#
# The columns are created directly inside data.frame(), listed in the order
# the random draws were originally made, so the seeded random stream is
# consumed the same way. No temporary vectors are left behind, which removes
# the need to wipe the workspace with rm(list = setdiff(ls(), "df")) -- a call
# that would also delete any unrelated objects the user had in their session.
df <- data.frame(
  studyid = 1:10,
  color = sample(
    c("red", "blue", "yellow"), 10,
    replace = TRUE, prob = c(0.25, 0.25, 0.5)
  ),
  scale_2_Wave1 = sample(1:25, 10, replace = TRUE),
  scale_3_Wave1 = sample(1:25, 10, replace = TRUE),
  scale_2_Wave2 = sample(1:25, 10, replace = TRUE),
  scale_3_Wave2 = sample(1:25, 10, replace = TRUE),
  scale_1_Wave3 = sample(1:25, 10, replace = TRUE),
  scale_2_Wave3 = sample(1:25, 10, replace = TRUE)
)

# make some missingness in the six scale columns (columns 3 to 8)
df <- delete_MCAR(df, p = 0.2, cols_mis = 3:8)
## Attempt #1: convert wide to long, then impute
# Stack every "*Wave*" column, split the column name into scale and wave,
# then spread the scales back out so each scale is its own column.
df_long <- df %>%
  pivot_longer(
    cols = contains("Wave"),
    names_to = "scalename",
    values_to = "scalevalue"
  ) %>%
  separate(col = scalename, into = c("scale", "Wave"), sep = "_Wave") %>%
  pivot_wider(names_from = scale, values_from = scalevalue)

imp1 <- mice(df_long, m = 5, print = FALSE)
#> Warning: Number of logged events: 2
complete1 <- complete(imp1, action = "long", include = TRUE)

# Scale 1 shouldn't have any data at Waves 1 or 2, but it does
ggplot(data = complete1, mapping = aes(x = Wave, y = scale_1)) +
  geom_boxplot()
#> Warning: Removed 22 rows containing non-finite values (stat_boxplot).
## Attempt #2: impute, then convert wide to long
imp2 <- mice(df, m = 5, print = FALSE)
#> Warning: Number of logged events: 2
complete2 <- complete(imp2, action = "long", include = TRUE)

# convert imputed data from wide to long
# (all stacked data sets are reshaped in one pass, so every participant keeps
# the same .id on each of its wave rows)
complete_long <- complete2 %>%
  pivot_longer(
    cols = contains("Wave"),
    names_to = "scalename",
    values_to = "scalevalue"
  ) %>%
  separate(col = scalename, into = c("scale", "Wave"), sep = "_Wave") %>%
  pivot_wider(names_from = scale, values_from = scalevalue)

# now try to convert it back to mids
# (this is the call that fails: .id is repeated once per wave)
imp_long <- as.mids(complete_long, .imp = 1, .id = 2)
#> Warning: non-unique values when setting 'row.names': '1', '2', '3', '4', '5',
#> '6', '7', '8', '9', '10'
#> Error in `.rowNamesDF<-`(x, value = value): duplicate 'row.names' are not allowed
Created on 2022-01-05 by the reprex package (v2.0.1)
这篇帖子(This post)帮助我为第二种方案(先插补,再转换为长格式)写出了一个解决办法(我更喜欢这种方案,因为它能让我更好地控制哪些数据会被插补、哪些不会)。
另外,如果你想在转换为长格式之后计算任何东西(例如一个新变量),请在 pivot_wider 之后、但在 mutate(.id = ...)(即重新编号 .id 的那一步)之前进行计算。
library(mice, warn.conflicts = FALSE)
library(tidyverse)
library(missMethods)
library(miceadds)
#> * miceadds 3.11-6 (2021-01-21 11:48:47)
set.seed(123)

# Wide-format example data written out literally: ten participants, one
# column per administered scale-by-wave combination, NA = missing response.
df <- data.frame(
  studyid = 1:10,
  color = c(
    "yellow", "red", "yellow", "red", "red",
    "yellow", "blue", "red", "blue", "yellow"
  ),
  scale_2_Wave1 = c(NA, NA, 22L, 25L, 5L, 19L, 25L, 25L, 9L, 3L),
  scale_3_Wave1 = c(8L, NA, 10L, NA, 19L, 4L, 14L, 17L, 11L, 7L),
  scale_2_Wave2 = c(21L, 12L, 15L, 10L, NA, NA, 9L, 9L, 10L, 23L),
  scale_3_Wave2 = c(21L, 7L, NA, 6L, 25L, 2L, 5L, 8L, NA, 13L),
  scale_1_Wave3 = c(18L, 1L, 25L, NA, 6L, NA, 15L, 9L, 15L, 16L),
  scale_2_Wave3 = c(20L, 6L, 11L, 8L, 22L, 22L, 7L, 16L, NA, NA),
  stringsAsFactors = FALSE
)
# Impute while the data are still wide, so that every scale-by-wave column is
# a separate variable for mice().
imp <- mice(df, m = 5, print = FALSE)
#> Warning: Number of logged events: 1

# Stack the data sets in long form; include = TRUE keeps the original
# (un-imputed) data as .imp == 0. Note that this data frame shares its name
# with the complete() function; calls to complete(...) still reach the
# function.
complete <- mice::complete(imp, action = "long", include = TRUE)

# Reshape ONE stacked data set (a single value of .imp) from wide to long and
# renumber .id, so that .id is unique within that data set.
wave_columns_to_long <- function(one_imp) {
  one_imp %>%
    pivot_longer(
      cols = contains("Wave"),
      names_to = "scalename",
      values_to = "scalevalue"
    ) %>%
    separate(col = scalename, into = c("scale", "Wave"), sep = "_Wave") %>%
    pivot_wider(names_from = scale, values_from = scalevalue) %>%
    # Any variables derived from the long data belong here: after
    # pivot_wider() and before .id is renumbered.
    mutate(.id = row_number())
}

# One long-format data set per value of .imp, in ascending .imp order. Built
# with lapply() rather than by growing a list inside a for loop.
working_dats <- lapply(
  unname(split(complete, complete$.imp)),
  wave_columns_to_long
)

# Re-stack the reshaped data sets and rebuild the mids object from them.
imputed_long <- as.mids(bind_rows(working_dats))

write.mice.imputation(
  mi.res = imputed_long,
  name = "imputed_long",
  mids2spss = FALSE
)