使用 tidyr::gather 收集多个不同类型的列
tidyr::gather multiple columns of varying types
我的问题与这个问题(this question)类似:我想用 tidyr::gather 同时收集多列。
但是,该链接中给出的解决方案并不理想,因为各列的属性(attributes)通常并不相同,收集时这些属性会被丢弃。
注意:我知道如何用 base R 完成这个操作,但我想学习如何用 tidyr 和/或 dplyr 实现等效的操作。
下面我模拟了一些数据(做法比较粗糙,但很快),用来说明我经常遇到的情况(不过我通常有更多遵循相同模式的列)。
我还用 stats::reshape 给出了 base R 的解决方案,这样您可以看到我想要的输出。
如能提供任何帮助,我将不胜感激。
# Simulate a wide, one-row-per-student data set with grade-suffixed columns
# (<measure>_g6, <measure>_g7, <measure>_g8).
set.seed(123)
n_students <- 100L

# Grade-6 demographic indicators; the data frame below reuses the same
# values for grades 7 and 8.
male_g6 <- rbinom(n_students, size = 1, prob = 0.5)
ell_g6 <- rbinom(n_students, size = 1, prob = 0.1)
sped_g6 <- rbinom(n_students, size = 1, prob = 0.15)
pullouts_g6 <- rbinom(n_students, size = 5, prob = 0.1)

# One disability code per student, drawn one student at a time so the random
# number stream is consumed in the same order as a replicate() call.
disability_codes <- c("asd", "cd", "ed", "hi", "id", "ohi", "ld", "none")
disability_probs <- c(rep(0.01, 6), 0.05, 0.89)
disability_g6 <- vapply(
  seq_len(n_students),
  function(i) sample(disability_codes, 1, prob = disability_probs),
  character(1)
)

# Scores are the only measure that changes between grades: each grade adds
# 5 points plus noise to the previous grade's score.
score_g6 <- rnorm(n_students, mean = 200, sd = 10)
score_g7 <- score_g6 + 5 + rnorm(n_students, mean = 0, sd = 2)
score_g8 <- score_g7 + 5 + rnorm(n_students, mean = 0, sd = 2)

# Copy one vector into three columns named <stem>_g6, <stem>_g7, <stem>_g8.
grade_suffixes <- paste0("_g", 6:8)
carry_forward <- function(values, stem) {
  setNames(
    rep(list(values), length(grade_suffixes)),
    paste0(stem, grade_suffixes)
  )
}

d <- do.call(
  data.frame,
  c(
    list(SID = seq_len(n_students)),
    carry_forward(male_g6, "male"),
    carry_forward(ell_g6, "ell"),
    carry_forward(sped_g6, "sped"),
    carry_forward(pullouts_g6, "pullouts"),
    carry_forward(disability_g6, "disability"),
    list(score_g6 = score_g6, score_g7 = score_g7, score_g8 = score_g8)
  )
)
使用 base R 进行重塑
# Base R wide-to-long reshape of `d`: the three grade-suffixed columns of each
# measure are stacked into a single column, with the grade recorded in `Grade`.
measures <- c("male", "ell", "sped", "pullouts", "disability", "score")
grades <- 6:8

ld <- stats::reshape(
  d,
  direction = "long",
  idvar = "SID",
  timevar = "Grade",
  times = grades,
  v.names = measures,
  # One character vector per measure, e.g. c("male_g6", "male_g7", "male_g8"),
  # listed in the same order as `v.names`.
  varying = lapply(measures, function(stem) paste0(stem, "_g", grades))
)

# Order the rows by student id.
ld <- ld[order(ld[["SID"]]), ]
你需要先把数据收集(gather)成比最终想要的更长的形式,这样才能把年级从列名(headers)中拆分出来,然后再展开(spread)回较宽的形式:
# tidyr provides pivot_longer() and re-exports the %>% pipe used below.
library(tidyr)

# Reshape `d` to one row per student (SID) and grade.
#
# pivot_longer() with the ".value" sentinel does the old
# gather() -> separate() -> spread() round trip in a single step: each column
# name is split at "_g", the part before it names an output column and the
# part after it becomes `grade`. Every measure keeps its own type, so nothing
# is coerced to character and then guessed back, and numeric scores are not
# re-parsed from text.
ld2 <- d %>%
  pivot_longer(
    cols = -SID,
    names_to = c(".value", "grade"),
    names_sep = "_g",
    names_transform = list(grade = as.integer)
  )

# Keep the layout the gather()/spread() version produced: a plain data frame
# with the id columns first and the measures in alphabetical order.
measure_cols <- sort(setdiff(names(ld2), c("SID", "grade")))
ld2 <- as.data.frame(ld2[, c("SID", "grade", measure_cols)])
head(ld2)
## SID grade disability ell male pullouts score sped
## 1 1 6 cd 0 0 1 196.2440 0
## 2 1 7 cd 0 0 1 203.2739 0
## 3 1 8 cd 0 0 1 211.1347 0
## 4 2 6 none 0 1 0 194.3812 1
## 5 2 7 none 0 1 0 195.3957 1
## 6 2 8 none 0 1 0 202.4890 1
我的问题与这个问题(this question)类似:我想用 tidyr::gather 同时收集多列。
但是,该链接中给出的解决方案并不理想,因为各列的属性(attributes)通常并不相同,收集时这些属性会被丢弃。
注意:我知道如何用 base R 完成这个操作,但我想学习如何用 tidyr 和/或 dplyr 实现等效的操作。
下面我模拟了一些数据(做法比较粗糙,但很快),用来说明我经常遇到的情况(不过我通常有更多遵循相同模式的列)。
我还用 stats::reshape 给出了 base R 的解决方案,这样您可以看到我想要的输出。
如能提供任何帮助,我将不胜感激。
# Simulate a wide, one-row-per-student data set with grade-suffixed columns
# (<measure>_g6, <measure>_g7, <measure>_g8).
set.seed(123)
n_students <- 100L

# Grade-6 demographic indicators; the data frame below reuses the same
# values for grades 7 and 8.
male_g6 <- rbinom(n_students, size = 1, prob = 0.5)
ell_g6 <- rbinom(n_students, size = 1, prob = 0.1)
sped_g6 <- rbinom(n_students, size = 1, prob = 0.15)
pullouts_g6 <- rbinom(n_students, size = 5, prob = 0.1)

# One disability code per student, drawn one student at a time so the random
# number stream is consumed in the same order as a replicate() call.
disability_codes <- c("asd", "cd", "ed", "hi", "id", "ohi", "ld", "none")
disability_probs <- c(rep(0.01, 6), 0.05, 0.89)
disability_g6 <- vapply(
  seq_len(n_students),
  function(i) sample(disability_codes, 1, prob = disability_probs),
  character(1)
)

# Scores are the only measure that changes between grades: each grade adds
# 5 points plus noise to the previous grade's score.
score_g6 <- rnorm(n_students, mean = 200, sd = 10)
score_g7 <- score_g6 + 5 + rnorm(n_students, mean = 0, sd = 2)
score_g8 <- score_g7 + 5 + rnorm(n_students, mean = 0, sd = 2)

# Copy one vector into three columns named <stem>_g6, <stem>_g7, <stem>_g8.
grade_suffixes <- paste0("_g", 6:8)
carry_forward <- function(values, stem) {
  setNames(
    rep(list(values), length(grade_suffixes)),
    paste0(stem, grade_suffixes)
  )
}

d <- do.call(
  data.frame,
  c(
    list(SID = seq_len(n_students)),
    carry_forward(male_g6, "male"),
    carry_forward(ell_g6, "ell"),
    carry_forward(sped_g6, "sped"),
    carry_forward(pullouts_g6, "pullouts"),
    carry_forward(disability_g6, "disability"),
    list(score_g6 = score_g6, score_g7 = score_g7, score_g8 = score_g8)
  )
)
使用 base R 进行重塑
# Base R wide-to-long reshape of `d`: the three grade-suffixed columns of each
# measure are stacked into a single column, with the grade recorded in `Grade`.
measures <- c("male", "ell", "sped", "pullouts", "disability", "score")
grades <- 6:8

ld <- stats::reshape(
  d,
  direction = "long",
  idvar = "SID",
  timevar = "Grade",
  times = grades,
  v.names = measures,
  # One character vector per measure, e.g. c("male_g6", "male_g7", "male_g8"),
  # listed in the same order as `v.names`.
  varying = lapply(measures, function(stem) paste0(stem, "_g", grades))
)

# Order the rows by student id.
ld <- ld[order(ld[["SID"]]), ]
你需要先把数据收集(gather)成比最终想要的更长的形式,这样才能把年级从列名(headers)中拆分出来,然后再展开(spread)回较宽的形式:
# tidyr provides pivot_longer() and re-exports the %>% pipe used below.
library(tidyr)

# Reshape `d` to one row per student (SID) and grade.
#
# pivot_longer() with the ".value" sentinel does the old
# gather() -> separate() -> spread() round trip in a single step: each column
# name is split at "_g", the part before it names an output column and the
# part after it becomes `grade`. Every measure keeps its own type, so nothing
# is coerced to character and then guessed back, and numeric scores are not
# re-parsed from text.
ld2 <- d %>%
  pivot_longer(
    cols = -SID,
    names_to = c(".value", "grade"),
    names_sep = "_g",
    names_transform = list(grade = as.integer)
  )

# Keep the layout the gather()/spread() version produced: a plain data frame
# with the id columns first and the measures in alphabetical order.
measure_cols <- sort(setdiff(names(ld2), c("SID", "grade")))
ld2 <- as.data.frame(ld2[, c("SID", "grade", measure_cols)])
head(ld2)
## SID grade disability ell male pullouts score sped
## 1 1 6 cd 0 0 1 196.2440 0
## 2 1 7 cd 0 0 1 203.2739 0
## 3 1 8 cd 0 0 1 211.1347 0
## 4 2 6 none 0 1 0 194.3812 1
## 5 2 7 none 0 1 0 195.3957 1
## 6 2 8 none 0 1 0 202.4890 1