根据R中的日期变量重组多个变量
Reorganize multiple variables based on date variable in R
如果我的数据集包含在不同时间点收集的相同度量的分数,我该如何组织这些 dates/times 以便它们代表特定日期之后的时间点?这可以在 R 中完成,还是我在另一个程序中更容易完成?
我有一个目前看起来像这样的数据集:
id date score1_date score1 score2_date score2 score3_date score3
101 1/6/2020 1/1/2020 20 1/8/2020 18 1/15/2020 16
102 2/27/2020 2/14/2020 16 2/21/2020 16 2/28/2020 10
103 1/10/2020 1/7/2020 30 1/14/2020 25 1/21/2020 20
104 3/5/2020 3/6/2020 40 3/13/2020 42 3/20/2020 40
我想找到最接近 [date] 的 [score#_date] 并将其标识为 [time1],然后将后面的所有内容设为 [time2]、[time3] 等
这是生成上面表格的代码:
# structure() literal for the example tibble shown in the table above.
# Date columns are stored as day counts carrying class "Date"
# (e.g. 18262 corresponds to 2020-01-01, 18267 to 2020-01-06).
# The literal is not assigned to a name here; bind it to a variable
# before running any pipeline against it.
structure(list(id = c(101, 102, 103, 104), date = structure(c(18267,
18319, 18271, 18326), class = "Date"), score1_date = structure(c(18262,
18306, 18268, 18327), class = "Date"), score1 = c(20, 16, 30,
40), score2_date = structure(c(18269, 18313, 18275, 18334), class = "Date"),
score2 = c(18, 16, 25, 42), score3_date = structure(c(18276,
18320, 18282, 18341), class = "Date"), score3 = c(16, 10,
20, 40)), row.names = c(NA, -4L), class = c("tbl_df", "tbl",
"data.frame"))
所以我最终希望数据集具有如下所示的变量:
id date time1_date time1_score time2_date time2_score time3_date time3_score
101 1/6/2020 1/8/2020 18 1/15/2020 16 NA NA
102 2/27/2020 2/28/2020 10 NA NA NA NA
103 1/10/2020 1/7/2020 30 1/14/2020 25 1/21/2020 20
104 3/5/2020 3/6/2020 40 3/13/2020 42 3/20/2020 40
非常感谢!
使用 tidyverse 的函数,你可以这样做:
library(dplyr)
library(tidyr)
# Reshape the wide score table so that, within each id, the measurement
# closest to `date` becomes time1 and every later one becomes time2, time3, ...
df %>%
  # Rename date column to base_date so it does not clash with the long-format
  # `date` column created by pivot_longer() below
  rename(base_date = date) %>%
  # Rename score1, score2 etc to score1_value, score2_value etc.
  # The backslash is doubled because "\d" is not a valid escape in an R string.
  rename_with(~paste0(., '_value'), matches('^score\\d+$')) %>%
  # Get the data in long format with date and value as two columns
  pivot_longer(cols = starts_with('score'),
               names_to = c('score', '.value'),
               names_sep = '_') %>%
  group_by(id) %>%
  # Keep only rows dated on or after the measurement closest to base_date
  # (which.min() picks the first one when several are equally close)
  filter(date >= date[which.min(abs(date - base_date))]) %>%
  # Arrange the data chronologically within each id
  arrange(id, date) %>%
  # Create new column name: time1, time2, ... per id
  mutate(score = paste0('time', row_number())) %>%
  ungroup() %>%
  # Get the data in wide format
  pivot_wider(names_from = score, values_from = c(date, value)) %>%
  # Arrange the columns by the time index embedded in their names.
  # id and base_date contain no number (parse_number() gives NA with a
  # warning, hence suppressWarnings), so they are listed explicitly first.
  select(id, base_date, order(suppressWarnings(readr::parse_number(names(.)))))
# id base_date date_time1 value_time1 date_time2 value_time2 date_time3 value_time3
# <dbl> <date> <date> <dbl> <date> <dbl> <date> <dbl>
#1 101 2020-01-06 2020-01-08 18 2020-01-15 16 NA NA
#2 102 2020-02-27 2020-02-28 10 NA NA NA NA
#3 103 2020-01-10 2020-01-07 30 2020-01-14 25 2020-01-21 20
#4 104 2020-03-05 2020-03-06 40 2020-03-13 42 2020-03-20 40
这里有一个稍微不同的方法,包括重新排序最后的列:
library(tidyverse)
# Alternative approach: go fully long, rebuild one row per (id, measurement),
# keep the measurements dated on or after the target date, then widen again.
# NOTE: the filter keeps only dates >= target, so a measurement taken before
# the target date is dropped even when it is the closest one.
data %>%
  rename(target = date) %>%
  # Convert every column to character so dates and scores can share a single
  # value column in pivot_longer()
  mutate(across(everything(), as.character)) %>%
  pivot_longer(
    -c(id, target),
    names_pattern = "score([0-9]+)_?(.*)",
    names_to = c("num", "variable")
  ) %>%
  # Plain "scoreN" columns leave the second capture group empty; label them
  mutate(variable = case_when(variable == "" ~ "score", TRUE ~ variable)) %>%
  pivot_wider(
    id_cols = c("id", "num", "target"),
    names_from = variable,
    values_from = "value"
  ) %>%
  filter(date >= target) %>%
  group_by(id) %>%
  arrange(date) %>%
  # Renumber the remaining measurements 1, 2, ... within each id
  mutate(new_num = row_number()) %>%
  select(id, target, new_num, date, score) %>%
  pivot_wider(
    id_cols = c("id", "target"),
    names_from = new_num,
    values_from = c("date", "score")
  ) %>%
  group_by(id, target) %>%
  # Interleave the columns as date_1, score_1, date_2, score_2, ...
  # The number of pairs is derived from the column count (all columns except
  # id and target) instead of a hard-coded 1:3 with a fractional `each`.
  select(
    paste0(
      c("date", "score"), "_",
      rep(seq_len((ncol(.) - 2) / 2), each = 2)
    )
  )
#Adding missing grouping variables: `id`, `target`
## A tibble: 4 x 8
## Groups: id, target [4]
# id target date_1 score_1 date_2 score_2 date_3 score_3
# <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#1 101 2020-01-06 2020-01-08 18 2020-01-15 16 NA NA
#2 103 2020-01-10 2020-01-14 25 2020-01-21 20 NA NA
#3 102 2020-02-27 2020-02-28 10 NA NA NA NA
#4 104 2020-03-05 2020-03-06 40 2020-03-13 42 2020-03-20 40
我建议采用不同的策略来过滤行,使用 cummin 和 abs 函数,如下所示:
# Variant of the reshape that selects rows with cummin()/abs(): within each
# id, keep the measurement closest to base_date and everything after it.
df %>%
  rename(base_date = date) %>%
  # The backslash is doubled because "\d" is not a valid escape in an R string
  rename_with(~paste0(., '_value'), matches('^score\\d+$')) %>%
  pivot_longer(cols = starts_with('score'),
               names_to = c('score', '.value'),
               names_sep = '_') %>%
  group_by(id) %>%
  # d: absolute distance in days between each measurement and base_date;
  # strip the trailing digit so `score` can be renumbered after filtering.
  # str_remove() is namespace-qualified so stringr need not be attached.
  mutate(d = abs(as.numeric(difftime(base_date, date, units = "days"))),
         score = stringr::str_remove(score, "\\d$")) %>%
  # cummin(d) == min(d) turns TRUE at the first row that reaches the group's
  # minimum distance and stays TRUE, so this keeps that row and all rows after
  # it (assumes score1, score2, ... are already in chronological order)
  filter(cumsum(cummin(d) == min(d)) != 0) %>%
  mutate(score = paste0(score, row_number())) %>%
  pivot_wider(id_cols = c(id, base_date),
              names_from = score,
              values_from = c(date, value))
# A tibble: 4 x 8
# Groups: id [4]
id base_date date_score1 date_score2 date_score3 value_score1 value_score2 value_score3
<dbl> <date> <date> <date> <date> <dbl> <dbl> <dbl>
1 101 2020-01-06 2020-01-08 2020-01-15 NA 18 16 NA
2 102 2020-02-27 2020-02-28 NA NA 10 NA NA
3 103 2020-01-10 2020-01-07 2020-01-14 2020-01-21 30 25 20
4 104 2020-03-05 2020-03-06 2020-03-13 2020-03-20 40 42 40
pivot_wider 会对列进行重排(GitHub 上的 issue #839 仍未关闭),因此你可以手动重新排列这些列,或者采用 Ian 在其回答最后一行中提出的策略。
如果我的数据集包含在不同时间点收集的相同度量的分数,我该如何组织这些 dates/times 以便它们代表特定日期之后的时间点?这可以在 R 中完成,还是我在另一个程序中更容易完成?
我有一个目前看起来像这样的数据集:
id date score1_date score1 score2_date score2 score3_date score3
101 1/6/2020 1/1/2020 20 1/8/2020 18 1/15/2020 16
102 2/27/2020 2/14/2020 16 2/21/2020 16 2/28/2020 10
103 1/10/2020 1/7/2020 30 1/14/2020 25 1/21/2020 20
104 3/5/2020 3/6/2020 40 3/13/2020 42 3/20/2020 40
我想找到最接近 [date] 的 [score#_date] 并将其标识为 [time1],然后将后面的所有内容设为 [time2]、[time3] 等
这是生成上面表格的代码:
# structure() literal for the example tibble shown in the table above.
# Date columns are stored as day counts carrying class "Date"
# (e.g. 18262 corresponds to 2020-01-01, 18267 to 2020-01-06).
# The literal is not assigned to a name here; bind it to a variable
# before running any pipeline against it.
structure(list(id = c(101, 102, 103, 104), date = structure(c(18267,
18319, 18271, 18326), class = "Date"), score1_date = structure(c(18262,
18306, 18268, 18327), class = "Date"), score1 = c(20, 16, 30,
40), score2_date = structure(c(18269, 18313, 18275, 18334), class = "Date"),
score2 = c(18, 16, 25, 42), score3_date = structure(c(18276,
18320, 18282, 18341), class = "Date"), score3 = c(16, 10,
20, 40)), row.names = c(NA, -4L), class = c("tbl_df", "tbl",
"data.frame"))
所以我最终希望数据集具有如下所示的变量:
id date time1_date time1_score time2_date time2_score time3_date time3_score
101 1/6/2020 1/8/2020 18 1/15/2020 16 NA NA
102 2/27/2020 2/28/2020 10 NA NA NA NA
103 1/10/2020 1/7/2020 30 1/14/2020 25 1/21/2020 20
104 3/5/2020 3/6/2020 40 3/13/2020 42 3/20/2020 40
非常感谢!
使用 tidyverse 的函数,你可以这样做:
library(dplyr)
library(tidyr)
# Reshape the wide score table so that, within each id, the measurement
# closest to `date` becomes time1 and every later one becomes time2, time3, ...
df %>%
  # Rename date column to base_date so it does not clash with the long-format
  # `date` column created by pivot_longer() below
  rename(base_date = date) %>%
  # Rename score1, score2 etc to score1_value, score2_value etc.
  # The backslash is doubled because "\d" is not a valid escape in an R string.
  rename_with(~paste0(., '_value'), matches('^score\\d+$')) %>%
  # Get the data in long format with date and value as two columns
  pivot_longer(cols = starts_with('score'),
               names_to = c('score', '.value'),
               names_sep = '_') %>%
  group_by(id) %>%
  # Keep only rows dated on or after the measurement closest to base_date
  # (which.min() picks the first one when several are equally close)
  filter(date >= date[which.min(abs(date - base_date))]) %>%
  # Arrange the data chronologically within each id
  arrange(id, date) %>%
  # Create new column name: time1, time2, ... per id
  mutate(score = paste0('time', row_number())) %>%
  ungroup() %>%
  # Get the data in wide format
  pivot_wider(names_from = score, values_from = c(date, value)) %>%
  # Arrange the columns by the time index embedded in their names.
  # id and base_date contain no number (parse_number() gives NA with a
  # warning, hence suppressWarnings), so they are listed explicitly first.
  select(id, base_date, order(suppressWarnings(readr::parse_number(names(.)))))
# id base_date date_time1 value_time1 date_time2 value_time2 date_time3 value_time3
# <dbl> <date> <date> <dbl> <date> <dbl> <date> <dbl>
#1 101 2020-01-06 2020-01-08 18 2020-01-15 16 NA NA
#2 102 2020-02-27 2020-02-28 10 NA NA NA NA
#3 103 2020-01-10 2020-01-07 30 2020-01-14 25 2020-01-21 20
#4 104 2020-03-05 2020-03-06 40 2020-03-13 42 2020-03-20 40
这里有一个稍微不同的方法,包括重新排序最后的列:
library(tidyverse)
# Alternative approach: go fully long, rebuild one row per (id, measurement),
# keep the measurements dated on or after the target date, then widen again.
# NOTE: the filter keeps only dates >= target, so a measurement taken before
# the target date is dropped even when it is the closest one.
data %>%
  rename(target = date) %>%
  # Convert every column to character so dates and scores can share a single
  # value column in pivot_longer()
  mutate(across(everything(), as.character)) %>%
  pivot_longer(
    -c(id, target),
    names_pattern = "score([0-9]+)_?(.*)",
    names_to = c("num", "variable")
  ) %>%
  # Plain "scoreN" columns leave the second capture group empty; label them
  mutate(variable = case_when(variable == "" ~ "score", TRUE ~ variable)) %>%
  pivot_wider(
    id_cols = c("id", "num", "target"),
    names_from = variable,
    values_from = "value"
  ) %>%
  filter(date >= target) %>%
  group_by(id) %>%
  arrange(date) %>%
  # Renumber the remaining measurements 1, 2, ... within each id
  mutate(new_num = row_number()) %>%
  select(id, target, new_num, date, score) %>%
  pivot_wider(
    id_cols = c("id", "target"),
    names_from = new_num,
    values_from = c("date", "score")
  ) %>%
  group_by(id, target) %>%
  # Interleave the columns as date_1, score_1, date_2, score_2, ...
  # The number of pairs is derived from the column count (all columns except
  # id and target) instead of a hard-coded 1:3 with a fractional `each`.
  select(
    paste0(
      c("date", "score"), "_",
      rep(seq_len((ncol(.) - 2) / 2), each = 2)
    )
  )
#Adding missing grouping variables: `id`, `target`
## A tibble: 4 x 8
## Groups: id, target [4]
# id target date_1 score_1 date_2 score_2 date_3 score_3
# <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#1 101 2020-01-06 2020-01-08 18 2020-01-15 16 NA NA
#2 103 2020-01-10 2020-01-14 25 2020-01-21 20 NA NA
#3 102 2020-02-27 2020-02-28 10 NA NA NA NA
#4 104 2020-03-05 2020-03-06 40 2020-03-13 42 2020-03-20 40
我建议采用不同的策略来过滤行,使用 cummin 和 abs 函数,如下所示:
# Variant of the reshape that selects rows with cummin()/abs(): within each
# id, keep the measurement closest to base_date and everything after it.
df %>%
  rename(base_date = date) %>%
  # The backslash is doubled because "\d" is not a valid escape in an R string
  rename_with(~paste0(., '_value'), matches('^score\\d+$')) %>%
  pivot_longer(cols = starts_with('score'),
               names_to = c('score', '.value'),
               names_sep = '_') %>%
  group_by(id) %>%
  # d: absolute distance in days between each measurement and base_date;
  # strip the trailing digit so `score` can be renumbered after filtering.
  # str_remove() is namespace-qualified so stringr need not be attached.
  mutate(d = abs(as.numeric(difftime(base_date, date, units = "days"))),
         score = stringr::str_remove(score, "\\d$")) %>%
  # cummin(d) == min(d) turns TRUE at the first row that reaches the group's
  # minimum distance and stays TRUE, so this keeps that row and all rows after
  # it (assumes score1, score2, ... are already in chronological order)
  filter(cumsum(cummin(d) == min(d)) != 0) %>%
  mutate(score = paste0(score, row_number())) %>%
  pivot_wider(id_cols = c(id, base_date),
              names_from = score,
              values_from = c(date, value))
# A tibble: 4 x 8
# Groups: id [4]
id base_date date_score1 date_score2 date_score3 value_score1 value_score2 value_score3
<dbl> <date> <date> <date> <date> <dbl> <dbl> <dbl>
1 101 2020-01-06 2020-01-08 2020-01-15 NA 18 16 NA
2 102 2020-02-27 2020-02-28 NA NA 10 NA NA
3 103 2020-01-10 2020-01-07 2020-01-14 2020-01-21 30 25 20
4 104 2020-03-05 2020-03-06 2020-03-13 2020-03-20 40 42 40
pivot_wider 会对列进行重排(GitHub 上的 issue #839 仍未关闭),因此你可以手动重新排列这些列,或者采用 Ian 在其回答最后一行中提出的策略。