将 4 列的宽格式数据框转换为 3 列的长格式数据框
Wide data frame with 4 columns to long data frame with 3 columns
我有一个数据框(下面的例子),如下:
# Example data: two "Stage" columns stored as character (missing values are
# the literal string "NA") and two logical "ClinicalActivity" columns, one per
# look-back period (1y / 2y). structure() attaches the tibble classes directly.
df <- structure(
  list(
    Stage1yBefore = c("3.1", "1", "4", "2", "NA"),
    Stage2yBefore = c("NA", "2", "3.2", "2", "NA"),
    ClinicalActivity1yBefore = c(TRUE, TRUE, TRUE, TRUE, FALSE),
    ClinicalActivity2yBefore = c(FALSE, TRUE, TRUE, TRUE, FALSE)
  ),
  class = c("tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -5L)
)
我想使用 dplyr 将其转换为长格式,但由于某种原因出现错误。
输出应如下所示(转换 df 的第一行):
# Desired result for the first row of df: one output row per look-back period.
# tibble() (exported by tibble, re-exported by dplyr) replaces the deprecated
# data_frame(); TRUE/FALSE are spelled out because T and F are ordinary,
# reassignable variables. Stage is 3.1 to match Stage1yBefore in row 1 of df.
Output <- tibble(
  TimeFrame = c("1y", "2y"),
  Stage = c(3.1, NA),
  Clinical = c(TRUE, FALSE)
)
因此 df 的每一行在输出中变成 2 行。
我试过的方法不起作用(实际上我不确定具体该怎么做):
# Failing attempt (produces the error quoted below): gather() takes one key
# name and one value name (TimeFrame, Stage); every further argument is treated
# as a column selection, so `Clinical` is looked up as a column of df.
Output = gather(df, TimeFrame, Stage, Clinical, Stage1yBefore:ClinicalActivity2yBefore)
我得到:
Error in .f(.x[[i]],...): Object 'Clinical' not found.
有什么想法吗?
library(dplyr)
library(stringr)
library(tidyr)
# Reshape df so that each input row yields one output row per period (1y, 2y),
# with the Stage and Clinical values side by side.
df %>%
  # Row id keeps the values of one input row together through the
  # gather()/spread() round trip. Built with dplyr only: rownames_to_column()
  # lives in tibble, which the library() calls above do not attach.
  mutate(rowname = as.character(row_number())) %>%
  # Stack all four columns into TimeFrame (old column name) / Stage (value).
  gather(TimeFrame, Stage, Stage1yBefore:ClinicalActivity2yBefore) %>%
  # From TimeFrame extract a digit followed by y, also Stage or Clinical.
  # The backslash must be doubled inside an R string ("\\d"); a bare "\d"
  # is rejected by the parser as an unrecognized escape.
  mutate(
    Time = str_extract(TimeFrame, "\\dy"),
    Key = str_extract(TimeFrame, "Stage|Clinical")
  ) %>%
  dplyr::select(-TimeFrame) %>%
  # One column per Key (Clinical, Stage), filled with the stacked values.
  spread(Key, Stage)
# A tibble: 10 x 4
rowname Time Clinical Stage
<chr> <chr> <chr> <chr>
1 1 1y TRUE 3.1
2 1 2y FALSE NA
3 2 1y TRUE 1
4 2 2y TRUE 2
5 3 1y TRUE 4
6 3 2y TRUE 3.2
7 4 1y TRUE 2
8 4 2y TRUE 2
9 5 1y FALSE NA
10 5 2y FALSE NA
这是另一种做法,使用 tidyr 中的 extract:
library(dplyr)
library(tidyr)
# Alternative: split each column name into a measure ("Stage" or
# "ClinicalActivity") and a period ("1y", "2y") with tidyr::extract().
df %>%
  # Row id so spread() can pair the values that came from the same input row.
  mutate(row = row_number()) %>%
  gather(key, value, -row) %>%
  # Group 1 is the measure name, group 2 the period. "\\d" is the doubled
  # backslash form R strings require; a bare "\d" does not parse.
  extract(key, c("key", "Time"), regex = "(Stage|Clinical.*)(\\d+y)") %>%
  spread(key, value) %>%
  # The helper id is no longer needed once the values are paired up.
  select(-row)
# Time ClinicalActivity Stage
# <chr> <chr> <chr>
# 1 1y TRUE 3.1
# 2 2y FALSE NA
# 3 1y TRUE 1
# 4 2y TRUE 2
# 5 1y TRUE 4
# 6 2y TRUE 3.2
# 7 1y TRUE 2
# 8 2y TRUE 2
# 9 1y FALSE NA
#10 2y FALSE NA
我们可以用 data.table 轻松做到这一点:它的 melt 可以通过 measure 参数(配合 patterns)同时 melt 多组列
library(data.table)
# melt() several groups of columns at once: each pattern passed to patterns()
# selects one group, and each group gets its own value column.
# Note: setDT() converts df to a data.table by reference (df itself changes).
melt(
  setDT(df),
  # Full argument name instead of relying on partial matching of `measure`.
  measure.vars = patterns("^Stage", "Clinical"),
  value.name = c("Stage", "Clinical"),
  variable.name = "Time"
  # Append "y" to the melted Time values so they read "1y" / "2y"; the
  # trailing [] makes the result print after the := assignment.
)[, Time := paste0(Time, "y")][]
# Time Stage Clinical
# 1: 1y 3.1 TRUE
# 2: 1y 1 TRUE
# 3: 1y 4 TRUE
# 4: 1y 2 TRUE
# 5: 1y NA FALSE
# 6: 2y NA FALSE
# 7: 2y 2 TRUE
# 8: 2y 3.2 TRUE
# 9: 2y 2 TRUE
#10: 2y NA FALSE
我有一个数据框(下面的例子),如下:
# Example data: two "Stage" columns stored as character (missing values are
# the literal string "NA") and two logical "ClinicalActivity" columns, one per
# look-back period (1y / 2y). structure() attaches the tibble classes directly.
df <- structure(
  list(
    Stage1yBefore = c("3.1", "1", "4", "2", "NA"),
    Stage2yBefore = c("NA", "2", "3.2", "2", "NA"),
    ClinicalActivity1yBefore = c(TRUE, TRUE, TRUE, TRUE, FALSE),
    ClinicalActivity2yBefore = c(FALSE, TRUE, TRUE, TRUE, FALSE)
  ),
  class = c("tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -5L)
)
我想使用 dplyr 将其转换为长格式,但由于某种原因出现错误。
输出应如下所示(转换 df 的第一行):
# Desired result for the first row of df: one output row per look-back period.
# tibble() (exported by tibble, re-exported by dplyr) replaces the deprecated
# data_frame(); TRUE/FALSE are spelled out because T and F are ordinary,
# reassignable variables. Stage is 3.1 to match Stage1yBefore in row 1 of df.
Output <- tibble(
  TimeFrame = c("1y", "2y"),
  Stage = c(3.1, NA),
  Clinical = c(TRUE, FALSE)
)
因此 df 的每一行在输出中变成 2 行。
我试过的方法不起作用(实际上我不确定具体该怎么做):
# Failing attempt (produces the error quoted below): gather() takes one key
# name and one value name (TimeFrame, Stage); every further argument is treated
# as a column selection, so `Clinical` is looked up as a column of df.
Output = gather(df, TimeFrame, Stage, Clinical, Stage1yBefore:ClinicalActivity2yBefore)
我得到:
Error in .f(.x[[i]],...): Object 'Clinical' not found.
有什么想法吗?
library(dplyr)
library(stringr)
library(tidyr)
# Reshape df so that each input row yields one output row per period (1y, 2y),
# with the Stage and Clinical values side by side.
df %>%
  # Row id keeps the values of one input row together through the
  # gather()/spread() round trip. Built with dplyr only: rownames_to_column()
  # lives in tibble, which the library() calls above do not attach.
  mutate(rowname = as.character(row_number())) %>%
  # Stack all four columns into TimeFrame (old column name) / Stage (value).
  gather(TimeFrame, Stage, Stage1yBefore:ClinicalActivity2yBefore) %>%
  # From TimeFrame extract a digit followed by y, also Stage or Clinical.
  # The backslash must be doubled inside an R string ("\\d"); a bare "\d"
  # is rejected by the parser as an unrecognized escape.
  mutate(
    Time = str_extract(TimeFrame, "\\dy"),
    Key = str_extract(TimeFrame, "Stage|Clinical")
  ) %>%
  dplyr::select(-TimeFrame) %>%
  # One column per Key (Clinical, Stage), filled with the stacked values.
  spread(Key, Stage)
# A tibble: 10 x 4
rowname Time Clinical Stage
<chr> <chr> <chr> <chr>
1 1 1y TRUE 3.1
2 1 2y FALSE NA
3 2 1y TRUE 1
4 2 2y TRUE 2
5 3 1y TRUE 4
6 3 2y TRUE 3.2
7 4 1y TRUE 2
8 4 2y TRUE 2
9 5 1y FALSE NA
10 5 2y FALSE NA
这是另一种做法,使用 tidyr 中的 extract:
library(dplyr)
library(tidyr)
# Alternative: split each column name into a measure ("Stage" or
# "ClinicalActivity") and a period ("1y", "2y") with tidyr::extract().
df %>%
  # Row id so spread() can pair the values that came from the same input row.
  mutate(row = row_number()) %>%
  gather(key, value, -row) %>%
  # Group 1 is the measure name, group 2 the period. "\\d" is the doubled
  # backslash form R strings require; a bare "\d" does not parse.
  extract(key, c("key", "Time"), regex = "(Stage|Clinical.*)(\\d+y)") %>%
  spread(key, value) %>%
  # The helper id is no longer needed once the values are paired up.
  select(-row)
# Time ClinicalActivity Stage
# <chr> <chr> <chr>
# 1 1y TRUE 3.1
# 2 2y FALSE NA
# 3 1y TRUE 1
# 4 2y TRUE 2
# 5 1y TRUE 4
# 6 2y TRUE 3.2
# 7 1y TRUE 2
# 8 2y TRUE 2
# 9 1y FALSE NA
#10 2y FALSE NA
我们可以用 data.table 轻松做到这一点:它的 melt 可以通过 measure 参数(配合 patterns)同时 melt 多组列
library(data.table)
# melt() several groups of columns at once: each pattern passed to patterns()
# selects one group, and each group gets its own value column.
# Note: setDT() converts df to a data.table by reference (df itself changes).
melt(
  setDT(df),
  # Full argument name instead of relying on partial matching of `measure`.
  measure.vars = patterns("^Stage", "Clinical"),
  value.name = c("Stage", "Clinical"),
  variable.name = "Time"
  # Append "y" to the melted Time values so they read "1y" / "2y"; the
  # trailing [] makes the result print after the := assignment.
)[, Time := paste0(Time, "y")][]
# Time Stage Clinical
# 1: 1y 3.1 TRUE
# 2: 1y 1 TRUE
# 3: 1y 4 TRUE
# 4: 1y 2 TRUE
# 5: 1y NA FALSE
# 6: 2y NA FALSE
# 7: 2y 2 TRUE
# 8: 2y 3.2 TRUE
# 9: 2y 2 TRUE
#10: 2y NA FALSE