如何清理调查数据?
how to clean survey data?
我应该如何(通过 R)按以下方式整理我的数据集:
输入
预期输出
利用 tidyr 包
我正在考虑使用 tidyr,但我还不知道如何继续。有什么建议吗?
数据
输入
# Wide-form survey responses: 12 rows (one per observation ID) and one factor
# column per (proposal, rank) pair. A cell holds the rank label ("first",
# "second" or "last") or the empty level "".
input <- local({
  # Two-level factor: code 1 is "", code 2 is `label`.
  rank_col <- function(codes, label) {
    structure(codes, .Label = c("", label), class = "factor")
  }

  # Levels are in alphabetical order, hence "obs 10" directly after "obs 1".
  obs_levels <- c(
    "obs 1", "obs 10", "obs 11", "obs 12", "obs 2", "obs 3",
    "obs 4", "obs 5", "obs 6", "obs 7", "obs 8", "obs 9"
  )

  columns <- list(
    ID = structure(
      c(1L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 2L, 3L, 4L),
      .Label = obs_levels,
      class = "factor"
    ),
    Proposal.1...first = rank_col(
      c(2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L), "first"
    ),
    Proposal.1...second = rank_col(
      c(1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L), "second"
    ),
    Proposal.1...last = rank_col(
      c(1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), "last"
    ),
    Proposal.2...first = rank_col(
      c(1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L), "first"
    ),
    Proposal.2...second = rank_col(
      c(2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), "second"
    ),
    Proposal.2...last = rank_col(
      c(1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L), "last"
    ),
    Proposal.3...first = rank_col(
      c(1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L), "first"
    ),
    Proposal.3...second = rank_col(
      c(1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L), "second"
    ),
    Proposal.3...last = rank_col(
      c(2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), "last"
    ),
    Proposal.4...first = rank_col(
      c(1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L), "first"
    ),
    Proposal.4...second = rank_col(
      c(1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), "second"
    ),
    Proposal.4...last = rank_col(
      c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L), "last"
    )
  )

  # The list is already named, so only class and row names need attaching.
  structure(columns, class = "data.frame", row.names = c(NA, -12L))
})
预期产出
# Expected result: one row per observation ID, with factor columns `first`,
# `second` and `last` naming the proposal that received each rank.
output <- local({
  proposal_levels <- c("Proposal 1", "Proposal 2", "Proposal 3", "Proposal 4")

  # Factor over the four proposals, built from integer level codes.
  proposal_col <- function(codes) {
    structure(codes, .Label = proposal_levels, class = "factor")
  }

  # Levels are in alphabetical order, hence "obs 10" directly after "obs 1".
  obs_levels <- c(
    "obs 1", "obs 10", "obs 11", "obs 12", "obs 2", "obs 3", "obs 4", "obs 5",
    "obs 6", "obs 7", "obs 8", "obs 9"
  )

  columns <- list(
    ID = structure(
      c(1L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 2L, 3L, 4L),
      .Label = obs_levels,
      class = "factor"
    ),
    first = proposal_col(c(1L, 1L, 2L, 4L, 2L, 3L, 2L, 4L, 1L, 1L, 4L, 2L)),
    second = proposal_col(c(2L, 4L, 3L, 3L, 4L, 1L, 3L, 1L, 3L, 3L, 3L, 3L)),
    last = proposal_col(c(3L, 3L, 1L, 1L, 1L, 2L, 4L, 2L, 4L, 4L, 2L, 4L))
  )

  # The list is already named, so only class and row names need attaching.
  structure(columns, class = "data.frame", row.names = c(NA, -12L))
})
谢谢!
对于 dplyr 和 tidyr,您可以结合使用 gather 和 spread:
library(dplyr)
library(tidyr)
# Reshape the wide survey table into one row per ID with columns first/second/last.
# Backslashes are doubled in the patterns: "\." is not a valid escape in an R
# string literal, so the regex escape for a literal dot must be written "\\.".
gather(input, proposal, value, -ID) %>%                 # wide -> long: one row per (ID, column)
  mutate(proposal = sub("\\.{3}.*", "", proposal)) %>%  # drop the "...first|second|last" suffix
  mutate(proposal = sub("\\.", " ", proposal)) %>%      # optional: "Proposal.1" -> "Proposal 1"
  filter(value != "") %>%                               # keep only cells that carry a rank
  spread(value, proposal) %>%                           # long -> wide: one column per rank
  select(ID, first, second, last) %>%                   # optional: fix the column order
  arrange(as.numeric(sub("obs ", "", ID)))              # optional: sort rows by observation number
输出
ID first second last
1 obs 1 Proposal 1 Proposal 2 Proposal 3
2 obs 2 Proposal 1 Proposal 4 Proposal 3
3 obs 3 Proposal 2 Proposal 3 Proposal 1
4 obs 4 Proposal 4 Proposal 3 Proposal 1
5 obs 5 Proposal 2 Proposal 4 Proposal 1
6 obs 6 Proposal 3 Proposal 1 Proposal 2
7 obs 7 Proposal 2 Proposal 3 Proposal 4
8 obs 8 Proposal 4 Proposal 1 Proposal 2
9 obs 9 Proposal 1 Proposal 3 Proposal 4
10 obs 10 Proposal 1 Proposal 3 Proposal 4
11 obs 11 Proposal 4 Proposal 3 Proposal 2
12 obs 12 Proposal 2 Proposal 3 Proposal 4
我应该如何(通过 R)按以下方式整理我的数据集:
输入
预期输出
利用 tidyr 包
我正在考虑使用 tidyr,但我还不知道如何继续。有什么建议吗?
数据
# Input (section heading "输入"): wide-form survey responses, 12 rows (one per
# observation ID) and one factor column per (proposal, rank) pair. A cell holds
# the rank label ("first", "second" or "last") or the empty level "".
# The heading is kept as a comment so the assignment target is `input`.
input <- structure(list(ID = structure(c(1L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 2L, 3L, 4L),
# Levels are in alphabetical order, hence "obs 10" directly after "obs 1".
.Label = c("obs 1", "obs 10", "obs 11", "obs 12", "obs 2", "obs 3",
"obs 4", "obs 5", "obs 6", "obs 7", "obs 8", "obs 9"),
class = "factor"),
# Each rank column is a two-level factor: code 1 is "", code 2 is the rank label.
Proposal.1...first = structure(c(2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L),
.Label = c("", "first"), class = "factor"),
Proposal.1...second = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L),
.Label = c("", "second"), class = "factor"),
Proposal.1...last = structure(c(1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
.Label = c("", "last"), class = "factor"),
Proposal.2...first = structure(c(1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L),
.Label = c("", "first"), class = "factor"),
Proposal.2...second = structure(c(2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
.Label = c("", "second"), class = "factor"),
Proposal.2...last = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L),
.Label = c("", "last"), class = "factor"),
Proposal.3...first = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L),
.Label = c("", "first"), class = "factor"),
Proposal.3...second = structure(c(1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L),
.Label = c("", "second"), class = "factor"),
Proposal.3...last = structure(c(2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
.Label = c("", "last"), class = "factor"),
Proposal.4...first = structure(c(1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L),
.Label = c("", "first"), class = "factor"),
Proposal.4...second = structure(c(1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
.Label = c("", "second"), class = "factor"),
Proposal.4...last = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L),
.Label = c("", "last"), class = "factor")),
.Names = c("ID", "Proposal.1...first", "Proposal.1...second", "Proposal.1...last", "Proposal.2...first",
"Proposal.2...second", "Proposal.2...last", "Proposal.3...first","Proposal.3...second",
"Proposal.3...last", "Proposal.4...first", "Proposal.4...second", "Proposal.4...last"),
class = "data.frame",
row.names = c(NA, -12L))
预期产出
# Expected result: one row per observation ID, with factor columns `first`,
# `second` and `last` naming the proposal that received each rank.
output <- local({
  proposal_levels <- c("Proposal 1", "Proposal 2", "Proposal 3", "Proposal 4")

  # Factor over the four proposals, built from integer level codes.
  proposal_col <- function(codes) {
    structure(codes, .Label = proposal_levels, class = "factor")
  }

  # Levels are in alphabetical order, hence "obs 10" directly after "obs 1".
  obs_levels <- c(
    "obs 1", "obs 10", "obs 11", "obs 12", "obs 2", "obs 3", "obs 4", "obs 5",
    "obs 6", "obs 7", "obs 8", "obs 9"
  )

  columns <- list(
    ID = structure(
      c(1L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 2L, 3L, 4L),
      .Label = obs_levels,
      class = "factor"
    ),
    first = proposal_col(c(1L, 1L, 2L, 4L, 2L, 3L, 2L, 4L, 1L, 1L, 4L, 2L)),
    second = proposal_col(c(2L, 4L, 3L, 3L, 4L, 1L, 3L, 1L, 3L, 3L, 3L, 3L)),
    last = proposal_col(c(3L, 3L, 1L, 1L, 1L, 2L, 4L, 2L, 4L, 4L, 2L, 4L))
  )

  # The list is already named, so only class and row names need attaching.
  structure(columns, class = "data.frame", row.names = c(NA, -12L))
})
谢谢!
对于 dplyr 和 tidyr,您可以结合使用 gather 和 spread:
library(dplyr)
library(tidyr)
# Reshape the wide survey table into one row per ID with columns first/second/last.
# Backslashes are doubled in the patterns: "\." is not a valid escape in an R
# string literal, so the regex escape for a literal dot must be written "\\.".
gather(input, proposal, value, -ID) %>%                 # wide -> long: one row per (ID, column)
  mutate(proposal = sub("\\.{3}.*", "", proposal)) %>%  # drop the "...first|second|last" suffix
  mutate(proposal = sub("\\.", " ", proposal)) %>%      # optional: "Proposal.1" -> "Proposal 1"
  filter(value != "") %>%                               # keep only cells that carry a rank
  spread(value, proposal) %>%                           # long -> wide: one column per rank
  select(ID, first, second, last) %>%                   # optional: fix the column order
  arrange(as.numeric(sub("obs ", "", ID)))              # optional: sort rows by observation number
输出
ID first second last
1 obs 1 Proposal 1 Proposal 2 Proposal 3
2 obs 2 Proposal 1 Proposal 4 Proposal 3
3 obs 3 Proposal 2 Proposal 3 Proposal 1
4 obs 4 Proposal 4 Proposal 3 Proposal 1
5 obs 5 Proposal 2 Proposal 4 Proposal 1
6 obs 6 Proposal 3 Proposal 1 Proposal 2
7 obs 7 Proposal 2 Proposal 3 Proposal 4
8 obs 8 Proposal 4 Proposal 1 Proposal 2
9 obs 9 Proposal 1 Proposal 3 Proposal 4
10 obs 10 Proposal 1 Proposal 3 Proposal 4
11 obs 11 Proposal 4 Proposal 3 Proposal 2
12 obs 12 Proposal 2 Proposal 3 Proposal 4