按重复测量将整洁(tidy)数据部分展开为宽格式
partially spread out tidy data by repeated measures
我有一些数据结构如下:
structure(list(subject = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), group = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("group1", "group2"), class = "factor"), measurement = c("color", "time", "color", "time", "color", "time", "color", "time", "color", "time", "color", "time", "color", "time", "color", "time", "color", "time", "color", "time", "color", "time", "color", "time", "color", "time", "color", "time", "color", "time", "color", "time"), item_pos = c("1", "1", "2", "2", "3", "3", "4", "4", "1", "1", "2", "2", "3", "3", "4", "4", "1", "1", "2", "2", "3", "3", "4", "4", "1", "1", "2", "2", "3", "3", "4", "4"), value = c("blue", "1508", "orange", "752", "black", "585", "red", "842", "red", "879", "white", "1455", "green", "1757", "orange", "2241", "white", "2251", "yellow", "1740", "red", "1962", "yellow", "1854", "green", "1859", "blue", "2156", "yellow", "2494", "green", "1757"), item = c("A", "A", "B", "B", "B", "B", "A", "A", "A", "A", "B", "B", "B", "B", "A", "A", "C", "C", "C", "C", "D", "D", "D", "D", "C", "C", "C", "C", "D", "D", "D", "D")), .Names = c("subject", "group", "measurement", "item_pos", "value", "item"), row.names = c(NA, -32L), class = "data.frame")
每个项目都有多条观测记录,因此受试者 1(subject == 1)的数据如下所示:
> filter(df.tidy, subject==1)
subject group measurement item_pos value item
1 1 group1 color 1 blue A
2 1 group1 time 1 1508 A
3 1 group1 color 2 orange B
4 1 group1 time 2 752 B
5 1 group1 color 3 black B
6 1 group1 time 3 585 B
7 1 group1 color 4 red A
8 1 group1 time 4 842 A
也就是说,在每个 group 内,每个 item 会出现两次,并且每次出现都各有一条颜色(color)和一条时间(time)的 measurement 记录。各项目出现的先后顺序记录在 item_pos 中。
虽然我喜欢这种长格式,但一位同事需要稍微"宽"(wider)一些的格式:按项目把重复出现的颜色和时间测量值分别放到各自的列中。所需格式如下:
subject group item color1 color2 time1 time2
1 group1 A blue red 1508 842
1 group1 B orange black 752 585
...
4 group2 D yellow green 2494 1757
我的感觉是,这应该可以通过组合 gather()、spread() 和其他 dplyr 动词来实现,但我不确定下面这件事用 dplyr 该如何表达(用 for 循环的说法来讲):按组循环遍历各个项目,并把颜色和时间的观测值依次收集到后续的列中。非常感谢帮助!
我查阅过的相关问题:
- Is it possible to use spread on multiple columns in tidyr similar to dcast?
我们可以尝试使用 library(data.table) 中的 dcast。先将 'data.frame' 转换为 'data.table'(setDT(df.tidy)),按 'subject'、'measurement' 和 'item' 分组创建序号列 "N",然后使用 dcast 将 'long' 格式转换为 'wide' 格式。
library(data.table)
# Convert df.tidy to a data.table by reference, then number the repeated
# occurrences of each item (in row order) within every subject/measurement
# combination. seq_len(.N) is the safe-sequence form of 1:.N.
setDT(df.tidy)
df.tidy[, N := seq_len(.N), by = .(subject, measurement, item)]

# Cast long -> wide: one row per subject/group/item and one column per
# measurement + occurrence number (color1, color2, time1, time2).
dcast(
  df.tidy,
  subject + group + item ~ measurement + N,
  value.var = "value",
  sep = ""
)
# subject group item color1 color2 time1 time2
#1: 1 group1 A blue red 1508 842
#2: 1 group1 B orange black 752 585
#3: 2 group1 A red orange 879 2241
#4: 2 group1 B white green 1455 1757
#5: 3 group2 C white yellow 2251 1740
#6: 3 group2 D red yellow 1962 1854
#7: 4 group2 C green blue 1859 2156
#8: 4 group2 D yellow green 2494 1757
或者使用 dplyr/tidyr:我们按相同的列分组,创建序号列("N"),然后 ungroup,用 unite 将 'measurement' 和 'N' 两列拼接成 'measurementN',最后用 spread 将数据转换为 'wide' 格式。
library(dplyr)
library(tidyr)
# Reshape long -> wide with one row per subject/group/item.
df.tidy %>%
  # item_pos differs between repeats of an item, so it must be dropped before
  # spreading or it would keep the repeats on separate rows.
  select(-item_pos) %>%
  # Number the repeats of each item (in row order) per subject/measurement.
  group_by(subject, measurement, item) %>%
  mutate(N = row_number()) %>%
  ungroup() %>%
  # Fuse measurement and N into the column keys color1, color2, time1, time2.
  unite(measurementN, measurement, N, sep = "") %>%
  spread(measurementN, value)
# subject group item color1 color2 time1 time2
# (int) (fctr) (chr) (chr) (chr) (chr) (chr)
#1 1 group1 A blue red 1508 842
#2 1 group1 B orange black 752 585
#3 2 group1 A red orange 879 2241
#4 2 group1 B white green 1455 1757
#5 3 group2 C white yellow 2251 1740
#6 3 group2 D red yellow 1962 1854
#7 4 group2 C green blue 1859 2156
#8 4 group2 D yellow green 2494 1757
我有一些数据结构如下:
structure(list(subject = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), group = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("group1", "group2"), class = "factor"), measurement = c("color", "time", "color", "time", "color", "time", "color", "time", "color", "time", "color", "time", "color", "time", "color", "time", "color", "time", "color", "time", "color", "time", "color", "time", "color", "time", "color", "time", "color", "time", "color", "time"), item_pos = c("1", "1", "2", "2", "3", "3", "4", "4", "1", "1", "2", "2", "3", "3", "4", "4", "1", "1", "2", "2", "3", "3", "4", "4", "1", "1", "2", "2", "3", "3", "4", "4"), value = c("blue", "1508", "orange", "752", "black", "585", "red", "842", "red", "879", "white", "1455", "green", "1757", "orange", "2241", "white", "2251", "yellow", "1740", "red", "1962", "yellow", "1854", "green", "1859", "blue", "2156", "yellow", "2494", "green", "1757"), item = c("A", "A", "B", "B", "B", "B", "A", "A", "A", "A", "B", "B", "B", "B", "A", "A", "C", "C", "C", "C", "D", "D", "D", "D", "C", "C", "C", "C", "D", "D", "D", "D")), .Names = c("subject", "group", "measurement", "item_pos", "value", "item"), row.names = c(NA, -32L), class = "data.frame")
每个项目都有多条观测记录,因此受试者 1(subject == 1)的数据如下所示:
> filter(df.tidy, subject==1)
subject group measurement item_pos value item
1 1 group1 color 1 blue A
2 1 group1 time 1 1508 A
3 1 group1 color 2 orange B
4 1 group1 time 2 752 B
5 1 group1 color 3 black B
6 1 group1 time 3 585 B
7 1 group1 color 4 red A
8 1 group1 time 4 842 A
也就是说,在每个 group 内,每个 item 会出现两次,并且每次出现都各有一条颜色(color)和一条时间(time)的 measurement 记录。各项目出现的先后顺序记录在 item_pos 中。
虽然我喜欢这种长格式,但一位同事需要稍微"宽"(wider)一些的格式:按项目把重复出现的颜色和时间测量值分别放到各自的列中。所需格式如下:
subject group item color1 color2 time1 time2
1 group1 A blue red 1508 842
1 group1 B orange black 752 585
...
4 group2 D yellow green 2494 1757
我的感觉是,这应该可以通过组合 gather()、spread() 和其他 dplyr 动词来实现,但我不确定下面这件事用 dplyr 该如何表达(用 for 循环的说法来讲):按组循环遍历各个项目,并把颜色和时间的观测值依次收集到后续的列中。非常感谢帮助!
我查阅过的相关问题:
- Is it possible to use spread on multiple columns in tidyr similar to dcast?
我们可以尝试使用 library(data.table) 中的 dcast。先将 'data.frame' 转换为 'data.table'(setDT(df.tidy)),按 'subject'、'measurement' 和 'item' 分组创建序号列 "N",然后使用 dcast 将 'long' 格式转换为 'wide' 格式。
library(data.table)
# Convert df.tidy to a data.table by reference, then number the repeated
# occurrences of each item (in row order) within every subject/measurement
# combination. seq_len(.N) is the safe-sequence form of 1:.N.
setDT(df.tidy)
df.tidy[, N := seq_len(.N), by = .(subject, measurement, item)]

# Cast long -> wide: one row per subject/group/item and one column per
# measurement + occurrence number (color1, color2, time1, time2).
dcast(
  df.tidy,
  subject + group + item ~ measurement + N,
  value.var = "value",
  sep = ""
)
# subject group item color1 color2 time1 time2
#1: 1 group1 A blue red 1508 842
#2: 1 group1 B orange black 752 585
#3: 2 group1 A red orange 879 2241
#4: 2 group1 B white green 1455 1757
#5: 3 group2 C white yellow 2251 1740
#6: 3 group2 D red yellow 1962 1854
#7: 4 group2 C green blue 1859 2156
#8: 4 group2 D yellow green 2494 1757
或者使用 dplyr/tidyr:我们按相同的列分组,创建序号列("N"),然后 ungroup,用 unite 将 'measurement' 和 'N' 两列拼接成 'measurementN',最后用 spread 将数据转换为 'wide' 格式。
library(dplyr)
library(tidyr)
# Reshape long -> wide with one row per subject/group/item.
df.tidy %>%
  # item_pos differs between repeats of an item, so it must be dropped before
  # spreading or it would keep the repeats on separate rows.
  select(-item_pos) %>%
  # Number the repeats of each item (in row order) per subject/measurement.
  group_by(subject, measurement, item) %>%
  mutate(N = row_number()) %>%
  ungroup() %>%
  # Fuse measurement and N into the column keys color1, color2, time1, time2.
  unite(measurementN, measurement, N, sep = "") %>%
  spread(measurementN, value)
# subject group item color1 color2 time1 time2
# (int) (fctr) (chr) (chr) (chr) (chr) (chr)
#1 1 group1 A blue red 1508 842
#2 1 group1 B orange black 752 585
#3 2 group1 A red orange 879 2241
#4 2 group1 B white green 1455 1757
#5 3 group2 C white yellow 2251 1740
#6 3 group2 D red yellow 1962 1854
#7 4 group2 C green blue 1859 2156
#8 4 group2 D yellow green 2494 1757