当变量是列名时清洗数据
Cleaning Data When Variables are Column Names
我经常拿到这样的数据集:变量的取值(例如组别)被编码在列名(header)中,并且每个测量列还附带一个对应的误差列。
https://drive.google.com/file/d/0BwSh24a5hm4kSERESlFkeHZXOFE/view?usp=sharing
我的问题是如何快速简单地整理这个数据集,使其看起来像这样:
https://drive.google.com/file/d/0BwSh24a5hm4kRDNiSFJoaWFub0E/view?usp=sharing
我对使用 dplyr + tidyr 和不使用 dplyr + tidyr 的答案很感兴趣。
感谢您的帮助!
暴力解法:我会只使用 dplyr
library(dplyr)

# Example data in the "wide" layout from the question: one measurement column
# (GroupN) and one error column (Error_GroupN) per group.
df <- data.frame(
  Timepoint = c(0L, 7L, 14L, 21L, 28L),
  Group1 = c(50L, 60L, 66L, 88L, 90L),
  Error_Group1 = c(3, 4, 6, 8, 2),
  Group2 = c(30L, 60L, 90L, 120L, 150L),
  Error_Group2 = c(10L, 14L, 16L, 13L, 25L),
  Group3 = c(44L, 78L, 64L, 88L, 91L),
  Error_Group3 = c(2L, 13L, 16L, 4L, 9L)
)

# For each group number, pull out Timepoint plus the two columns whose names
# end in that number, tag the rows with the group id, and give the columns
# uniform names so the per-group pieces can be stacked with rbind().
df <- do.call(
  rbind,
  lapply(1:3, function(group_id) {
    df %>%
      select(Timepoint, ends_with(as.character(group_id))) %>%
      mutate(Group = group_id) %>%
      # Column order at this point is Timepoint, GroupN, Error_GroupN, Group.
      setNames(c("Timepoint", "Measure", "Error", "Group")) %>%
      select(Timepoint, Group, Measure, Error)
  })
)
df
再配合 tidyr,
写法更优雅:
library(dplyr)
library(tidyr)

# Expects `df` in the wide layout: Timepoint, Group1, Error_Group1, Group2, ...
df <- df %>%
  # Stack every column except Timepoint into (temp = column name, values).
  gather(temp, values, -Timepoint) %>%
  mutate(
    # Drop the leading non-digits so only the group number remains.
    # Backslashes must be doubled inside an R string literal.
    Group = sub("\\D+", "", temp),
    # Recode the original column name to the kind of value it holds, so that
    # spread() creates columns named Error / Measure and never collides with
    # the new Group column.
    temp = ifelse(grepl("^Error", temp), "Error", "Measure")
  ) %>%
  # One row per Timepoint/Group; columns: Timepoint, Group, Error, Measure.
  spread(temp, values)
df
从 v1.9.5 开始,data.table 可以同时 melt
多列。它既快速又节省内存。
library(data.table) ## v1.9.5+

# Expects `df` in the wide layout: Timepoint, Group1, Error_Group1, Group2, ...
# NOTE: setDT() converts `df` to a data.table by reference, so `df` itself is
# a data.table after this call.
# Each pattern in patterns() selects one family of columns; every family is
# melted into its own value column, named in the same order by value.name.
melt(
  setDT(df),
  id.vars = 1L,
  measure.vars = patterns("^Group", "^Error"),
  variable.name = "Group",
  value.name = c("Measure", "Error")
)
# Timepoint Group Measure Error
# 1: 0 1 50 3
# 2: 7 1 60 4
# 3: 14 1 66 6
# 4: 21 1 88 8
# 5: 28 1 90 2
# ...
使用 dplyr 和 tidyr:
# Expects `df` in the wide layout: Timepoint, Group1, Error_Group1, Group2, ...
df %>%
  # 1. Pivot the table: stack every column except Timepoint into
  #    (g = column name, m = value).
  gather(g, m, -Timepoint) %>%
  # 2. Split each column name into its prefix ("Group" / "Error_Group") and
  #    the trailing group number (mGroup). A regex is used instead of
  #    separate(sep = -2): the meaning of negative positions changed in
  #    tidyr 0.8.0, and the regex also accepts multi-digit group numbers.
  tidyr::extract(g, c("Measure", "mGroup"), "^(\\D+)(\\d+)$") %>%
  # 3. Spread the actual Error and Measure values into two columns
  #    (named after the prefixes: Error_Group and Group).
  spread(Measure, m) %>%
  # 4. Assign the correct names to the final columns.
  select(Timepoint, Group = mGroup, Measure = Group, Error = Error_Group) %>%
  # 5. Sort as requested.
  arrange(Group, Timepoint)
我经常拿到这样的数据集:变量的取值(例如组别)被编码在列名(header)中,并且每个测量列还附带一个对应的误差列。
https://drive.google.com/file/d/0BwSh24a5hm4kSERESlFkeHZXOFE/view?usp=sharing
我的问题是如何快速简单地整理这个数据集,使其看起来像这样:
https://drive.google.com/file/d/0BwSh24a5hm4kRDNiSFJoaWFub0E/view?usp=sharing
我对使用 dplyr + tidyr 和不使用 dplyr + tidyr 的答案很感兴趣。
感谢您的帮助!
暴力解法:我会只使用 dplyr
library(dplyr)

# Example data in the "wide" layout from the question: one measurement column
# (GroupN) and one error column (Error_GroupN) per group.
df <- data.frame(
  Timepoint = c(0L, 7L, 14L, 21L, 28L),
  Group1 = c(50L, 60L, 66L, 88L, 90L),
  Error_Group1 = c(3, 4, 6, 8, 2),
  Group2 = c(30L, 60L, 90L, 120L, 150L),
  Error_Group2 = c(10L, 14L, 16L, 13L, 25L),
  Group3 = c(44L, 78L, 64L, 88L, 91L),
  Error_Group3 = c(2L, 13L, 16L, 4L, 9L)
)

# For each group number, pull out Timepoint plus the two columns whose names
# end in that number, tag the rows with the group id, and give the columns
# uniform names so the per-group pieces can be stacked with rbind().
df <- do.call(
  rbind,
  lapply(1:3, function(group_id) {
    df %>%
      select(Timepoint, ends_with(as.character(group_id))) %>%
      mutate(Group = group_id) %>%
      # Column order at this point is Timepoint, GroupN, Error_GroupN, Group.
      setNames(c("Timepoint", "Measure", "Error", "Group")) %>%
      select(Timepoint, Group, Measure, Error)
  })
)
df
再配合 tidyr,
写法更优雅:
library(dplyr)
library(tidyr)

# Expects `df` in the wide layout: Timepoint, Group1, Error_Group1, Group2, ...
df <- df %>%
  # Stack every column except Timepoint into (temp = column name, values).
  gather(temp, values, -Timepoint) %>%
  mutate(
    # Drop the leading non-digits so only the group number remains.
    # Backslashes must be doubled inside an R string literal.
    Group = sub("\\D+", "", temp),
    # Recode the original column name to the kind of value it holds, so that
    # spread() creates columns named Error / Measure and never collides with
    # the new Group column.
    temp = ifelse(grepl("^Error", temp), "Error", "Measure")
  ) %>%
  # One row per Timepoint/Group; columns: Timepoint, Group, Error, Measure.
  spread(temp, values)
df
从 v1.9.5 开始,data.table 可以同时 melt
多列。它既快速又节省内存。
library(data.table) ## v1.9.5+

# Expects `df` in the wide layout: Timepoint, Group1, Error_Group1, Group2, ...
# NOTE: setDT() converts `df` to a data.table by reference, so `df` itself is
# a data.table after this call.
# Each pattern in patterns() selects one family of columns; every family is
# melted into its own value column, named in the same order by value.name.
melt(
  setDT(df),
  id.vars = 1L,
  measure.vars = patterns("^Group", "^Error"),
  variable.name = "Group",
  value.name = c("Measure", "Error")
)
# Timepoint Group Measure Error
# 1: 0 1 50 3
# 2: 7 1 60 4
# 3: 14 1 66 6
# 4: 21 1 88 8
# 5: 28 1 90 2
# ...
使用 dplyr 和 tidyr:
# Expects `df` in the wide layout: Timepoint, Group1, Error_Group1, Group2, ...
df %>%
  # 1. Pivot the table: stack every column except Timepoint into
  #    (g = column name, m = value).
  gather(g, m, -Timepoint) %>%
  # 2. Split each column name into its prefix ("Group" / "Error_Group") and
  #    the trailing group number (mGroup). A regex is used instead of
  #    separate(sep = -2): the meaning of negative positions changed in
  #    tidyr 0.8.0, and the regex also accepts multi-digit group numbers.
  tidyr::extract(g, c("Measure", "mGroup"), "^(\\D+)(\\d+)$") %>%
  # 3. Spread the actual Error and Measure values into two columns
  #    (named after the prefixes: Error_Group and Group).
  spread(Measure, m) %>%
  # 4. Assign the correct names to the final columns.
  select(Timepoint, Group = mGroup, Measure = Group, Error = Error_Group) %>%
  # 5. Sort as requested.
  arrange(Group, Timepoint)