重新编码调查多项选择题的输出
Recode output from survey multiple choice question
我用 LimeSurvey 进行了一项调查,并将结果导出为 CSV 文件,然后将其导入 R。
其中一个问题是多项选择题,参与者可以在其中选出他们学习的科目。LimeSurvey 的输出看起来有点像这样(但有更多的科目和更多的参与者):
Participant | Maths | Physics | English | Biology
1 | Y | | Y |
2 | | Y | Y |
3 | Y | Y | | Y
我想要得到如下所示的结果
Participant | Subject 1 | Subject 2| Subject 3 |
1 | Maths | English | |
2 | Physics | English | |
3 | Maths | Physics | Biology |
如有任何指点,我将不胜感激。
我不太熟悉这种数据整理,但这里有一些建议。
首先假设您的数据是这种形式:
# Example survey export: one row per participant and one "Y"/"N" flag per
# subject column.
dtf <- data.frame(
  Participant = c("1", "2", "3", "4"),
  Physics = c("Y", "Y", "N", "N"),
  Chemistry = c("Y", "N", "N", "N"),
  Math = c("N", "Y", "Y", "Y"),
  Biology = c("N", "Y", "N", "Y"),
  stringsAsFactors = FALSE
)
然后我们可以这样重新排列
# Locate every "Y" cell as a (row, col) index pair.
wh <- which(dtf == "Y", arr.ind = TRUE)
# Group the matching column indices by row and map them to column names.
tapply(wh[, "col"], wh[, "row"], function(cols) colnames(dtf)[cols])
# $`1`
# [1] "Physics" "Chemistry"
# $`2`
# [1] "Physics" "Math" "Biology"
# $`3`
# [1] "Math"
# $`4`
# [1] "Math" "Biology"
或
# Keep the participant column and attach a list column with the names of the
# subjects each participant marked with "Y".
dtf2 <- dtf[1]
subject_cols <- names(dtf)[-1]
# Compare only the subject columns; the result is a logical matrix.
is_selected <- dtf[-1] == "Y"
# lapply() always returns a list. apply() would collapse to a matrix whenever
# every participant picked the same number of subjects, which breaks the
# column assignment. which() drops NA comparisons from missing answers.
dtf2$Subject <- lapply(seq_len(nrow(dtf)), function(i) {
  subject_cols[which(is_selected[i, ])]
})
dtf2
# Participant Subject
# 1 1 Physics, Chemistry
# 2 2 Physics, Math, Biology
# 3 3 Math
# 4 4 Math, Biology
或者使用 reshape2 包中的 melt() 和 dcast():
library(reshape2)
# Long format: one row per (Participant, subject) pair with its "Y"/"N" value.
dtf.m <- melt(dtf, id.vars = "Participant")
# Keep only the selected subjects; which() skips rows whose value is NA.
selected <- dtf.m[which(dtf.m$value == "Y"), c("Participant", "variable")]
# Spread back to wide format, filling each cell with the subject name itself.
# value.var is given explicitly so dcast() does not have to guess it.
dcast(selected, Participant ~ variable, value.var = "variable")
# Participant Physics Chemistry Math Biology
# 1 1 Physics Chemistry <NA> <NA>
# 2 2 Physics <NA> Math Biology
# 3 3 <NA> <NA> Math <NA>
# 4 4 <NA> <NA> Math Biology
这是我根据要求生成预期数据帧的尝试:
library(tidyverse)
library(gtools)
# Build a random example survey: each subject column holds "Y" (selected) or
# NA (not selected) for every participant.
rand_list <- c("Y", NA)
n_participants <- 10
df <- data.frame(
  # Participant ids run from 1 to n_participants (R indexing is 1-based).
  participant = seq(1, n_participants, by = 1),
  Maths = sample(rand_list, n_participants, replace = TRUE),
  Physics = sample(rand_list, n_participants, replace = TRUE),
  English = sample(rand_list, n_participants, replace = TRUE),
  Biology = sample(rand_list, n_participants, replace = TRUE)
)
#' Replace "Y" flags with subject names and rename the columns generically
#'
#' The first column is treated as the participant identifier and is left
#' untouched. In every other column each "Y" is replaced by that column's
#' original name; all other values (including NA) are kept as they are.
#' Afterwards the columns are renamed to "participant", "Subject 1",
#' "Subject 2", ...
#'
#' @param data A data frame whose first column identifies the participant and
#'   whose remaining columns contain "Y" where the subject was selected.
#' @return `data` with recoded subject columns and the new column names.
df_to_new_format <- function(data) {
  subject_names <- colnames(data)
  n_cols <- length(subject_names)
  # Positions of the subject columns (everything except the first column).
  subject_idx <- seq_len(n_cols)[-1]

  # Subset to n_cols so zero-column input yields no names at all.
  new_names <- c(
    "participant",
    sprintf("Subject %d", subject_idx - 1L)
  )[seq_len(n_cols)]

  for (j in subject_idx) {
    column <- data[[j]]
    if (is.factor(column)) {
      # Relabel the level so the factor stays a factor.
      levels(column)[levels(column) == "Y"] <- subject_names[j]
    } else {
      # which() ignores NA comparisons. Skipping empty matches avoids
      # coercing columns without any "Y" (e.g. all-NA logical columns from
      # an unanswered subject).
      hits <- which(column == "Y")
      if (length(hits) > 0) {
        column[hits] <- subject_names[j]
      }
    }
    data[[j]] <- column
  }

  colnames(data) <- new_names
  data
}
# Recode "Y" to subject names and switch to generic "Subject n" column names.
df <- df_to_new_format(data = df)

# Sort the values of every row so that the entries come first and the NAs are
# pushed to the end. Rows are collected in a list and bound once, instead of
# growing the result with rbind() inside the loop.
sorted_rows <- lapply(seq_len(nrow(df)), function(m) {
  temp <- mixedsort(as.matrix(df[m, ]))
  print(temp) # shows each compacted row as it is produced
  temp
})

# An empty input yields an empty matrix with the right number of columns.
row_matrix <- if (length(sorted_rows) > 0) {
  do.call(rbind, sorted_rows)
} else {
  matrix(character(0), nrow = 0, ncol = ncol(df))
}

df_new_format <- as.data.frame(row_matrix)
colnames(df_new_format) <- colnames(df)
我用 LimeSurvey 进行了一项调查,并将结果导出为 CSV 文件,然后将其导入 R。
其中一个问题是多项选择题,参与者可以在其中选出他们学习的科目。LimeSurvey 的输出看起来有点像这样(但有更多的科目和更多的参与者):
Participant | Maths | Physics | English | Biology
1 | Y | | Y |
2 | | Y | Y |
3 | Y | Y | | Y
我想要得到如下所示的结果
Participant | Subject 1 | Subject 2| Subject 3 |
1 | Maths | English | |
2 | Physics | English | |
3 | Maths | Physics | Biology |
如有任何指点,我将不胜感激。
我不太熟悉这种数据整理,但这里有一些建议。
首先假设您的数据是这种形式:
# Example survey export: one row per participant and one "Y"/"N" flag per
# subject column.
dtf <- data.frame(
  Participant = c("1", "2", "3", "4"),
  Physics = c("Y", "Y", "N", "N"),
  Chemistry = c("Y", "N", "N", "N"),
  Math = c("N", "Y", "Y", "Y"),
  Biology = c("N", "Y", "N", "Y"),
  stringsAsFactors = FALSE
)
然后我们可以这样重新排列
# Locate every "Y" cell as a (row, col) index pair.
wh <- which(dtf == "Y", arr.ind = TRUE)
# Group the matching column indices by row and map them to column names.
tapply(wh[, "col"], wh[, "row"], function(cols) colnames(dtf)[cols])
# $`1`
# [1] "Physics" "Chemistry"
# $`2`
# [1] "Physics" "Math" "Biology"
# $`3`
# [1] "Math"
# $`4`
# [1] "Math" "Biology"
或
# Keep the participant column and attach a list column with the names of the
# subjects each participant marked with "Y".
dtf2 <- dtf[1]
subject_cols <- names(dtf)[-1]
# Compare only the subject columns; the result is a logical matrix.
is_selected <- dtf[-1] == "Y"
# lapply() always returns a list. apply() would collapse to a matrix whenever
# every participant picked the same number of subjects, which breaks the
# column assignment. which() drops NA comparisons from missing answers.
dtf2$Subject <- lapply(seq_len(nrow(dtf)), function(i) {
  subject_cols[which(is_selected[i, ])]
})
dtf2
# Participant Subject
# 1 1 Physics, Chemistry
# 2 2 Physics, Math, Biology
# 3 3 Math
# 4 4 Math, Biology
或者使用 reshape2 包中的 melt() 和 dcast():
library(reshape2)
# Long format: one row per (Participant, subject) pair with its "Y"/"N" value.
dtf.m <- melt(dtf, id.vars = "Participant")
# Keep only the selected subjects; which() skips rows whose value is NA.
selected <- dtf.m[which(dtf.m$value == "Y"), c("Participant", "variable")]
# Spread back to wide format, filling each cell with the subject name itself.
# value.var is given explicitly so dcast() does not have to guess it.
dcast(selected, Participant ~ variable, value.var = "variable")
# Participant Physics Chemistry Math Biology
# 1 1 Physics Chemistry <NA> <NA>
# 2 2 Physics <NA> Math Biology
# 3 3 <NA> <NA> Math <NA>
# 4 4 <NA> <NA> Math Biology
这是我根据要求生成预期数据帧的尝试:
library(tidyverse)
library(gtools)
# Build a random example survey: each subject column holds "Y" (selected) or
# NA (not selected) for every participant.
rand_list <- c("Y", NA)
n_participants <- 10
df <- data.frame(
  # Participant ids run from 1 to n_participants (R indexing is 1-based).
  participant = seq(1, n_participants, by = 1),
  Maths = sample(rand_list, n_participants, replace = TRUE),
  Physics = sample(rand_list, n_participants, replace = TRUE),
  English = sample(rand_list, n_participants, replace = TRUE),
  Biology = sample(rand_list, n_participants, replace = TRUE)
)
#' Replace "Y" flags with subject names and rename the columns generically
#'
#' The first column is treated as the participant identifier and is left
#' untouched. In every other column each "Y" is replaced by that column's
#' original name; all other values (including NA) are kept as they are.
#' Afterwards the columns are renamed to "participant", "Subject 1",
#' "Subject 2", ...
#'
#' @param data A data frame whose first column identifies the participant and
#'   whose remaining columns contain "Y" where the subject was selected.
#' @return `data` with recoded subject columns and the new column names.
df_to_new_format <- function(data) {
  subject_names <- colnames(data)
  n_cols <- length(subject_names)
  # Positions of the subject columns (everything except the first column).
  subject_idx <- seq_len(n_cols)[-1]

  # Subset to n_cols so zero-column input yields no names at all.
  new_names <- c(
    "participant",
    sprintf("Subject %d", subject_idx - 1L)
  )[seq_len(n_cols)]

  for (j in subject_idx) {
    column <- data[[j]]
    if (is.factor(column)) {
      # Relabel the level so the factor stays a factor.
      levels(column)[levels(column) == "Y"] <- subject_names[j]
    } else {
      # which() ignores NA comparisons. Skipping empty matches avoids
      # coercing columns without any "Y" (e.g. all-NA logical columns from
      # an unanswered subject).
      hits <- which(column == "Y")
      if (length(hits) > 0) {
        column[hits] <- subject_names[j]
      }
    }
    data[[j]] <- column
  }

  colnames(data) <- new_names
  data
}
# Recode "Y" to subject names and switch to generic "Subject n" column names.
df <- df_to_new_format(data = df)

# Sort the values of every row so that the entries come first and the NAs are
# pushed to the end. Rows are collected in a list and bound once, instead of
# growing the result with rbind() inside the loop.
sorted_rows <- lapply(seq_len(nrow(df)), function(m) {
  temp <- mixedsort(as.matrix(df[m, ]))
  print(temp) # shows each compacted row as it is produced
  temp
})

# An empty input yields an empty matrix with the right number of columns.
row_matrix <- if (length(sorted_rows) > 0) {
  do.call(rbind, sorted_rows)
} else {
  matrix(character(0), nrow = 0, ncol = ncol(df))
}

df_new_format <- as.data.frame(row_matrix)
colnames(df_new_format) <- colnames(df)