从宽到长的数据框,具有多个变量和 id R
Data frame from wide to long with multiple variables and ids R
我有一个数据框,其中包含参与者对两个文本的判断。假设每篇课文都有一个正确答案和一个标识符,每篇课文被判断多次。
set.seed(123)
wide_df = data.frame('participant_id' = LETTERS[1:12]
, 'judgment_1' = round(rnorm(12)*100)
, 'correct_1' = round(rnorm(12)*100)
, 'text_id_1' = sample(1:12, 12, replace = F)
, 'judgment_2' = round(rnorm(12)*100)
, 'correct_2' = round(rnorm(12)*100)
, 'text_id_2' = sample(13:24, 12, replace = F)
)
这样:
participant_id judgment_1 correct_1 text_id_1 judgment_2 correct_2 text_id_2
1 A -56 40 4 43 -127 17
2 B -23 11 10 -30 217 14
3 C 156 -56 1 90 121 22
4 D 7 179 12 88 -112 15
5 E 13 50 7 82 -40 13
...
我想将其转换为包含以下列的长格式:
participant_id text_id judgment correct
A 4 -56 40
A 17 43 127
...
我找到并遵循了 SO 建议 here:
wide_df %>%
gather(v, value, judgment_1:text_id_2) %>%
separate(v, c("var", "col")) %>%
arrange(participant_id) %>%
spread(col, value)
但是那种重塑的方式returns错误Error: Duplicate identifiers for rows (3, 6), (9, 12)
我想我做的事情在概念上是错误的,但找不到它。我的错误在哪里?谢谢!
一些玩弄data.table
:
library(data.table)
superlong_df <- melt(wide_df, id.vars = "participant_id")
setDT(superlong_df)[, `:=`(varn = gsub(".*_(\d)", "\1", variable),
variable = gsub("_\d$", "", variable))]
dcast(superlong_df, participant_id + varn ~ variable)[, !"varn"]
participant_id correct judgment text_id
1: A 40 -56 4
2: A -127 43 17
3: B 11 -23 10
4: B 217 -30 14
5: C -56 156 1
6: C 121 90 22
7: D 179 7 12
8: D -112 88 15
9: E 50 13 7
10: E -40 82 13
11: F -197 172 11
12: F -47 69 19
13: G 70 46 9
14: G 78 55 24
15: H -47 -127 2
16: H -8 -6 20
17: I -107 -69 8
18: I 25 -31 21
19: J -22 -45 3
20: J -3 -38 16
21: K -103 122 5
22: K -4 -69 23
23: L -73 36 6
24: L 137 -21 18
participant_id correct judgment text_id
这是 tidyr
的方法。基本上你会做 unite()
+ separate_rows()
。 unite
合并列,separate_rows
合并行 -
wide_df %>%
unite(text_id, text_id_1, text_id_2) %>%
unite(judgment, judgment_1, judgment_2) %>%
unite(correct, correct_1, correct_2) %>%
separate_rows(2:4, sep = "_")
participant_id judgment correct text_id
1 A -56 40 4
2 A 43 -127 17
3 B -23 11 10
4 B -30 217 14
5 C 156 -56 1
6 C 90 121 22
7 D 7 179 12
8 D 88 -112 15
9 E 13 50 7
10 E 82 -40 13
11 F 172 -197 11
12 F 69 -47 19
13 G 46 70 9
14 G 55 78 24
15 H -127 -47 2
16 H -6 -8 20
17 I -69 -107 8
18 I -31 25 21
19 J -45 -22 3
20 J -38 -3 16
21 K 122 -103 5
22 K -69 -4 23
23 L 36 -73 6
24 L -21 137 18
与我的其他答案相比,这是一种更动态的方式。这不需要手动合并所需的列,但确实依赖于列名称模式。
wide_df %>%
gather(variable, value, -participant_id) %>%
mutate(
variable = substr(variable, 1, nchar(variable)-2),
rn = ave(1:length(participant_id), participant_id, variable, FUN = seq_along)
) %>%
spread(variable, value) %>%
select(-rn)
participant_id correct judgment text_id
1 A 40 -56 4
2 A -127 43 17
3 B 11 -23 10
4 B 217 -30 14
5 C -56 156 1
6 C 121 90 22
7 D 179 7 12
8 D -112 88 15
9 E 50 13 7
10 E -40 82 13
11 F -197 172 11
12 F -47 69 19
13 G 70 46 9
14 G 78 55 24
15 H -47 -127 2
16 H -8 -6 20
17 I -107 -69 8
18 I 25 -31 21
19 J -22 -45 3
20 J -3 -38 16
21 K -103 122 5
22 K -4 -69 23
23 L -73 36 6
24 L 137 -21 18
答案已存在:
例如,
set.seed(123)
wide_df = data.frame('participant_id' = LETTERS[1:12]
, 'judgment_1' = round(rnorm(12)*100)
, 'correct_1' = round(rnorm(12)*100)
, 'text_id_1' = sample(1:12, 12, replace = F)
, 'judgment_2' = round(rnorm(12)*100)
, 'correct_2' = round(rnorm(12)*100)
, 'text_id_2' = sample(13:24, 12, replace = F)
)
dl <- reshape(data = wide_df,
idvar = "participant_id",
varying = list(judgment=c(2,5),correct=c(3,6),text_id=c(4,7)),
direction="long",
v.names = c("judgment","correct","text_id"),
sep="_")
结果:
participant_id time judgment correct text_id
A.1 A 1 -56 40 4
B.1 B 1 -23 11 10
C.1 C 1 156 -56 1
D.1 D 1 7 179 12
E.1 E 1 13 50 7
F.1 F 1 172 -197 11
G.1 G 1 46 70 9
H.1 H 1 -127 -47 2
I.1 I 1 -69 -107 8
J.1 J 1 -45 -22 3
K.1 K 1 122 -103 5
L.1 L 1 36 -73 6
A.2 A 2 43 -127 17
B.2 B 2 -30 217 14
C.2 C 2 90 121 22
D.2 D 2 88 -112 15
E.2 E 2 82 -40 13
F.2 F 2 69 -47 19
G.2 G 2 55 78 24
H.2 H 2 -6 -8 20
I.2 I 2 -31 25 21
J.2 J 2 -38 -3 16
K.2 K 2 -69 -4 23
L.2 L 2 -21 137 18
我有一个数据框,其中包含参与者对两个文本的判断。假设每篇课文都有一个正确答案和一个标识符,每篇课文被判断多次。
set.seed(123)
wide_df = data.frame('participant_id' = LETTERS[1:12]
, 'judgment_1' = round(rnorm(12)*100)
, 'correct_1' = round(rnorm(12)*100)
, 'text_id_1' = sample(1:12, 12, replace = F)
, 'judgment_2' = round(rnorm(12)*100)
, 'correct_2' = round(rnorm(12)*100)
, 'text_id_2' = sample(13:24, 12, replace = F)
)
这样:
participant_id judgment_1 correct_1 text_id_1 judgment_2 correct_2 text_id_2
1 A -56 40 4 43 -127 17
2 B -23 11 10 -30 217 14
3 C 156 -56 1 90 121 22
4 D 7 179 12 88 -112 15
5 E 13 50 7 82 -40 13
...
我想将其转换为包含以下列的长格式:
participant_id text_id judgment correct
A 4 -56 40
A 17 43 127
...
我找到并遵循了 SO 建议 here:
wide_df %>%
gather(v, value, judgment_1:text_id_2) %>%
separate(v, c("var", "col")) %>%
arrange(participant_id) %>%
spread(col, value)
但是那种重塑的方式returns错误Error: Duplicate identifiers for rows (3, 6), (9, 12)
我想我做的事情在概念上是错误的,但找不到它。我的错误在哪里?谢谢!
一些玩弄data.table
:
library(data.table)
superlong_df <- melt(wide_df, id.vars = "participant_id")
setDT(superlong_df)[, `:=`(varn = gsub(".*_(\d)", "\1", variable),
variable = gsub("_\d$", "", variable))]
dcast(superlong_df, participant_id + varn ~ variable)[, !"varn"]
participant_id correct judgment text_id
1: A 40 -56 4
2: A -127 43 17
3: B 11 -23 10
4: B 217 -30 14
5: C -56 156 1
6: C 121 90 22
7: D 179 7 12
8: D -112 88 15
9: E 50 13 7
10: E -40 82 13
11: F -197 172 11
12: F -47 69 19
13: G 70 46 9
14: G 78 55 24
15: H -47 -127 2
16: H -8 -6 20
17: I -107 -69 8
18: I 25 -31 21
19: J -22 -45 3
20: J -3 -38 16
21: K -103 122 5
22: K -4 -69 23
23: L -73 36 6
24: L 137 -21 18
participant_id correct judgment text_id
这是 tidyr
的方法。基本上你会做 unite()
+ separate_rows()
。 unite
合并列,separate_rows
合并行 -
wide_df %>%
unite(text_id, text_id_1, text_id_2) %>%
unite(judgment, judgment_1, judgment_2) %>%
unite(correct, correct_1, correct_2) %>%
separate_rows(2:4, sep = "_")
participant_id judgment correct text_id
1 A -56 40 4
2 A 43 -127 17
3 B -23 11 10
4 B -30 217 14
5 C 156 -56 1
6 C 90 121 22
7 D 7 179 12
8 D 88 -112 15
9 E 13 50 7
10 E 82 -40 13
11 F 172 -197 11
12 F 69 -47 19
13 G 46 70 9
14 G 55 78 24
15 H -127 -47 2
16 H -6 -8 20
17 I -69 -107 8
18 I -31 25 21
19 J -45 -22 3
20 J -38 -3 16
21 K 122 -103 5
22 K -69 -4 23
23 L 36 -73 6
24 L -21 137 18
与我的其他答案相比,这是一种更动态的方式。这不需要手动合并所需的列,但确实依赖于列名称模式。
wide_df %>%
gather(variable, value, -participant_id) %>%
mutate(
variable = substr(variable, 1, nchar(variable)-2),
rn = ave(1:length(participant_id), participant_id, variable, FUN = seq_along)
) %>%
spread(variable, value) %>%
select(-rn)
participant_id correct judgment text_id
1 A 40 -56 4
2 A -127 43 17
3 B 11 -23 10
4 B 217 -30 14
5 C -56 156 1
6 C 121 90 22
7 D 179 7 12
8 D -112 88 15
9 E 50 13 7
10 E -40 82 13
11 F -197 172 11
12 F -47 69 19
13 G 70 46 9
14 G 78 55 24
15 H -47 -127 2
16 H -8 -6 20
17 I -107 -69 8
18 I 25 -31 21
19 J -22 -45 3
20 J -3 -38 16
21 K -103 122 5
22 K -4 -69 23
23 L -73 36 6
24 L 137 -21 18
答案已存在:
例如,
set.seed(123)
wide_df = data.frame('participant_id' = LETTERS[1:12]
, 'judgment_1' = round(rnorm(12)*100)
, 'correct_1' = round(rnorm(12)*100)
, 'text_id_1' = sample(1:12, 12, replace = F)
, 'judgment_2' = round(rnorm(12)*100)
, 'correct_2' = round(rnorm(12)*100)
, 'text_id_2' = sample(13:24, 12, replace = F)
)
dl <- reshape(data = wide_df,
idvar = "participant_id",
varying = list(judgment=c(2,5),correct=c(3,6),text_id=c(4,7)),
direction="long",
v.names = c("judgment","correct","text_id"),
sep="_")
结果:
participant_id time judgment correct text_id
A.1 A 1 -56 40 4
B.1 B 1 -23 11 10
C.1 C 1 156 -56 1
D.1 D 1 7 179 12
E.1 E 1 13 50 7
F.1 F 1 172 -197 11
G.1 G 1 46 70 9
H.1 H 1 -127 -47 2
I.1 I 1 -69 -107 8
J.1 J 1 -45 -22 3
K.1 K 1 122 -103 5
L.1 L 1 36 -73 6
A.2 A 2 43 -127 17
B.2 B 2 -30 217 14
C.2 C 2 90 121 22
D.2 D 2 88 -112 15
E.2 E 2 82 -40 13
F.2 F 2 69 -47 19
G.2 G 2 55 78 24
H.2 H 2 -6 -8 20
I.2 I 2 -31 25 21
J.2 J 2 -38 -3 16
K.2 K 2 -69 -4 23
L.2 L 2 -21 137 18