如何在 R 中的数据框中查找开始和结束日期以及记录的每一列的总天数?
How to find start and end dates, and total days recorded for each column in a dataframe in R?
我想为 R 中的一个数据框生成一些摘要信息。对于每一列,我想知道开始日期/时间(Start Date/Time)、结束日期/时间(End Date/Time)以及有记录值的总天数(Total Days)。下面是数据框的示例:
# Example data: six rows, one timestamp column plus five numeric value columns.
# NA marks a timestamp at which that column has no recorded value.
# Note that the timestamps are NOT in chronological order.
df <- structure(
  list(
    # POSIXct values (seconds since the Unix epoch), tagged with tzone "EST".
    Date_Time_GMT_3 = structure(
      c(
        1594233000, 1594533900, 1597235700,
        1595234800, 1594336600, 1595237500
      ),
      class = c("POSIXct", "POSIXt"),
      tzone = "EST"
    ),
    # Fully populated column.
    `20874285_33MR` = c(14.996, 15.091, 15.187, 15.282, 15.378, 15.378),
    # Column with no recorded values at all.
    `20874290_103MR` = rep(NA_real_, 6L),
    # Partially populated columns.
    `20874287_102MR` = c(NA, 15.091, 15.187, 15.282, NA, NA),
    `20874299_54MR` = c(NA, 15.378, 15.378, NA, NA, NA),
    `20874316_AIR_90MR` = c(NA, NA, NA, 15.091, 15.187, 15.282)
  ),
  row.names = c(NA, 6L),
  class = "data.frame"
)
我有这段代码告诉我每列有值的总天数(不包括 N/A)
library(dplyr)
# Count, for every column, the number of calendar dates on which at least one
# non-NA value was recorded.
df %>%
  # Derive the calendar date of each timestamp and group rows by it.
  mutate(date = as.Date(Date_Time_GMT_3)) %>%
  group_by(date) %>%
  # One row per date: TRUE where the column has any non-NA value that day.
  summarise(
    across(everything(), function(column) any(!is.na(column)))
  ) %>%
  # Collapse to a single row: number of dates flagged TRUE per column.
  summarise(across(-date, sum))
另外,我还有下面这段代码,用来为每一列查找开始(Start)和结束(End)日期/时间(这里不考虑 NA,即从第一条记录的日期/时间到最后一条记录的日期/时间):
# Asker's attempt at deriving a start and an end timestamp per column.
# The StartTime/EndTime labels are assigned by alternating over the rows
# (rep() with length.out = n()), not by taking the earliest/latest timestamp,
# which is why the result holds several entries per name instead of one.
# Both tidyr verbs are namespace-qualified so the snippet runs with only dplyr
# attached.
df_MetadataStart <- df %>%
  # Long format: one row per (timestamp, column name) pair.
  tidyr::pivot_longer(-c(Date_Time_GMT_3)) %>%
  select(name, Date_Time_GMT_3) %>%
  # Alternate the labels StartTime / EndTime down the rows.
  group_by(name, col = rep(c("StartTime", "EndTime"), length.out = n())) %>%
  # Running index within each (name, label) group; used to pair rows up.
  mutate(id = row_number()) %>%
  tidyr::pivot_wider(names_from = col, values_from = Date_Time_GMT_3) %>%
  ungroup() %>%
  select(-id)
但是这些代码有两个问题。首先,我为开始/结束日期/时间(Start/End Dates/Times)编写的代码,结果中同一列会出现多条记录。其次,这是两段相互独立的代码,无法直接得到我想要的结果。我希望最终得到的数据框是下面这个样子:
Name Start Date End Date Total Days
<chr> <Pos> <Pos> <int>
使用辅助函数
#' Run-length group ids
#'
#' Labels every element of `x` with the index of the run of consecutive
#' equal values it belongs to: the first run gets 1, the next run 2, and so on.
#'
#' @param x An atomic vector, as accepted by [rle()].
#' @return An integer vector of the same length as `x` holding the run index
#'   of each element.
#' @examples
#' myrleid(c(TRUE, TRUE, FALSE, TRUE)) # 1 1 2 3
myrleid <- function(x) {
  run_lengths <- rle(x)$lengths
  # Repeat each run's index once per element of that run.
  rep.int(seq_along(run_lengths), run_lengths)
}
我们可以先按 `name` 分组,再按连续的非 NA 值段(run)分组,从而得到如下结果:
library(dplyr)
library(tidyr) # pivot_longer
# Summarise every run of consecutive non-NA values per column: first and last
# timestamp of the run and the elapsed time between them, in days.
df %>%
  # Long format: one row per (timestamp, column name) pair.
  pivot_longer(
    cols = -Date_Time_GMT_3,
    names_to = "name",
    values_to = "value"
  ) %>%
  # Runs must be detected in chronological order; the input is unsorted.
  arrange(Date_Time_GMT_3) %>%
  group_by(name) %>%
  # Number the alternating NA / non-NA runs within each name.
  mutate(grp = myrleid(is.na(value))) %>%
  # A run is either all NA or all non-NA, so dropping NA rows removes exactly
  # the NA runs.
  filter(!is.na(value)) %>%
  group_by(name, grp) %>%
  summarise(
    Start = min(Date_Time_GMT_3),
    End = max(Date_Time_GMT_3),
    TotalDays = as.numeric(End - Start, units = "days")
  ) %>%
  ungroup() %>%
  select(name, Start, End, TotalDays)
# # A tibble: 7 x 4
# name Start End TotalDays
# <chr> <dttm> <dttm> <dbl>
# 1 20874285_33MR 2020-07-08 13:30:00 2020-08-12 07:35:00 34.8
# 2 20874287_102MR 2020-07-12 01:05:00 2020-07-20 03:46:40 8.11
# 3 20874287_102MR 2020-08-12 07:35:00 2020-08-12 07:35:00 0
# 4 20874299_54MR 2020-07-12 01:05:00 2020-07-12 01:05:00 0
# 5 20874299_54MR 2020-08-12 07:35:00 2020-08-12 07:35:00 0
# 6 20874316_AIR_90MR 2020-07-09 18:16:40 2020-07-09 18:16:40 0
# 7 20874316_AIR_90MR 2020-07-20 03:46:40 2020-07-20 04:31:40 0.0312
仅供参考:`102MR` 以及其他一些名称之所以出现两行,是因为原始数据的时间戳是乱序的。我推断应当先按时间戳排序,再按连续的非 NA 值段分组;而按时间戳排序之后,同一个 `name` 组内就会出现两个这样的值段。
我想为 R 中的一个数据框生成一些摘要信息。对于每一列,我想知道开始日期/时间(Start Date/Time)、结束日期/时间(End Date/Time)以及有记录值的总天数(Total Days)。下面是数据框的示例:
# Example data: six rows, one timestamp column plus five numeric value columns.
# NA marks a timestamp at which that column has no recorded value.
# Note that the timestamps are NOT in chronological order.
df <- structure(
  list(
    # POSIXct values (seconds since the Unix epoch), tagged with tzone "EST".
    Date_Time_GMT_3 = structure(
      c(
        1594233000, 1594533900, 1597235700,
        1595234800, 1594336600, 1595237500
      ),
      class = c("POSIXct", "POSIXt"),
      tzone = "EST"
    ),
    # Fully populated column.
    `20874285_33MR` = c(14.996, 15.091, 15.187, 15.282, 15.378, 15.378),
    # Column with no recorded values at all.
    `20874290_103MR` = rep(NA_real_, 6L),
    # Partially populated columns.
    `20874287_102MR` = c(NA, 15.091, 15.187, 15.282, NA, NA),
    `20874299_54MR` = c(NA, 15.378, 15.378, NA, NA, NA),
    `20874316_AIR_90MR` = c(NA, NA, NA, 15.091, 15.187, 15.282)
  ),
  row.names = c(NA, 6L),
  class = "data.frame"
)
我有这段代码告诉我每列有值的总天数(不包括 N/A)
library(dplyr)
# Count, for every column, the number of calendar dates on which at least one
# non-NA value was recorded.
df %>%
  # Derive the calendar date of each timestamp and group rows by it.
  mutate(date = as.Date(Date_Time_GMT_3)) %>%
  group_by(date) %>%
  # One row per date: TRUE where the column has any non-NA value that day.
  summarise(
    across(everything(), function(column) any(!is.na(column)))
  ) %>%
  # Collapse to a single row: number of dates flagged TRUE per column.
  summarise(across(-date, sum))
另外,我还有下面这段代码,用来为每一列查找开始(Start)和结束(End)日期/时间(这里不考虑 NA,即从第一条记录的日期/时间到最后一条记录的日期/时间):
# Asker's attempt at deriving a start and an end timestamp per column.
# The StartTime/EndTime labels are assigned by alternating over the rows
# (rep() with length.out = n()), not by taking the earliest/latest timestamp,
# which is why the result holds several entries per name instead of one.
# Both tidyr verbs are namespace-qualified so the snippet runs with only dplyr
# attached.
df_MetadataStart <- df %>%
  # Long format: one row per (timestamp, column name) pair.
  tidyr::pivot_longer(-c(Date_Time_GMT_3)) %>%
  select(name, Date_Time_GMT_3) %>%
  # Alternate the labels StartTime / EndTime down the rows.
  group_by(name, col = rep(c("StartTime", "EndTime"), length.out = n())) %>%
  # Running index within each (name, label) group; used to pair rows up.
  mutate(id = row_number()) %>%
  tidyr::pivot_wider(names_from = col, values_from = Date_Time_GMT_3) %>%
  ungroup() %>%
  select(-id)
但是这些代码有两个问题。首先,我为开始/结束日期/时间(Start/End Dates/Times)编写的代码,结果中同一列会出现多条记录。其次,这是两段相互独立的代码,无法直接得到我想要的结果。我希望最终得到的数据框是下面这个样子:
Name Start Date End Date Total Days
<chr> <Pos> <Pos> <int>
使用辅助函数
#' Run-length group ids
#'
#' Labels every element of `x` with the index of the run of consecutive
#' equal values it belongs to: the first run gets 1, the next run 2, and so on.
#'
#' @param x An atomic vector, as accepted by [rle()].
#' @return An integer vector of the same length as `x` holding the run index
#'   of each element.
#' @examples
#' myrleid(c(TRUE, TRUE, FALSE, TRUE)) # 1 1 2 3
myrleid <- function(x) {
  run_lengths <- rle(x)$lengths
  # Repeat each run's index once per element of that run.
  rep.int(seq_along(run_lengths), run_lengths)
}
我们可以先按 `name` 分组,再按连续的非 NA 值段(run)分组,从而得到如下结果:
library(dplyr)
library(tidyr) # pivot_longer
# Summarise every run of consecutive non-NA values per column: first and last
# timestamp of the run and the elapsed time between them, in days.
df %>%
  # Long format: one row per (timestamp, column name) pair.
  pivot_longer(
    cols = -Date_Time_GMT_3,
    names_to = "name",
    values_to = "value"
  ) %>%
  # Runs must be detected in chronological order; the input is unsorted.
  arrange(Date_Time_GMT_3) %>%
  group_by(name) %>%
  # Number the alternating NA / non-NA runs within each name.
  mutate(grp = myrleid(is.na(value))) %>%
  # A run is either all NA or all non-NA, so dropping NA rows removes exactly
  # the NA runs.
  filter(!is.na(value)) %>%
  group_by(name, grp) %>%
  summarise(
    Start = min(Date_Time_GMT_3),
    End = max(Date_Time_GMT_3),
    TotalDays = as.numeric(End - Start, units = "days")
  ) %>%
  ungroup() %>%
  select(name, Start, End, TotalDays)
# # A tibble: 7 x 4
# name Start End TotalDays
# <chr> <dttm> <dttm> <dbl>
# 1 20874285_33MR 2020-07-08 13:30:00 2020-08-12 07:35:00 34.8
# 2 20874287_102MR 2020-07-12 01:05:00 2020-07-20 03:46:40 8.11
# 3 20874287_102MR 2020-08-12 07:35:00 2020-08-12 07:35:00 0
# 4 20874299_54MR 2020-07-12 01:05:00 2020-07-12 01:05:00 0
# 5 20874299_54MR 2020-08-12 07:35:00 2020-08-12 07:35:00 0
# 6 20874316_AIR_90MR 2020-07-09 18:16:40 2020-07-09 18:16:40 0
# 7 20874316_AIR_90MR 2020-07-20 03:46:40 2020-07-20 04:31:40 0.0312
仅供参考:`102MR` 以及其他一些名称之所以出现两行,是因为原始数据的时间戳是乱序的。我推断应当先按时间戳排序,再按连续的非 NA 值段分组;而按时间戳排序之后,同一个 `name` 组内就会出现两个这样的值段。