如何在 R 中按组逐分钟补全缺失的时间点
How to fill in missing dates by minute by group in R
我正在尝试为一个包含多个分组的数据框补全缺失的分钟，并希望缺失分钟对应的值用零填充。
我尝试过使用此方法，但没有找到补全缺失分钟的办法。
Datetime | Group | Value |
2019-01-01 00:00:00 | 1 | 5 |
2019-01-01 00:00:00 | 2 | 4 |
2019-01-01 00:00:00 | 3 | 2 |
2019-01-01 00:01:00 | 1 | 1 |
2019-01-01 00:02:00 | 1 | 2 |
2019-01-01 00:02:00 | 2 | 2 |
2019-01-01 00:02:00 | 3 | 1 |
2019-01-01 00:03:00 | 1 | 1 |
2019-01-01 00:03:00 | 2 | 2 |
2019-01-01 00:04:00 | 1 | 1 |
我希望最终的表格如下所示：
Datetime | Group | Value |
2019-01-01 00:00:00 | 1 | 5 |
2019-01-01 00:00:00 | 2 | 4 |
2019-01-01 00:00:00 | 3 | 2 |
2019-01-01 00:01:00 | 1 | 1 |
2019-01-01 00:01:00 | 2 | 0 |
2019-01-01 00:01:00 | 3 | 0 |
2019-01-01 00:02:00 | 1 | 2 |
2019-01-01 00:02:00 | 2 | 2 |
2019-01-01 00:02:00 | 3 | 1 |
2019-01-01 00:03:00 | 1 | 1 |
2019-01-01 00:03:00 | 2 | 2 |
2019-01-01 00:03:00 | 3 | 0 |
2019-01-01 00:04:00 | 1 | 1 |
2019-01-01 00:04:00 | 2 | 0 |
2019-01-01 00:04:00 | 3 | 0 |
library(dplyr)
library(padr)

# Pad every group onto the same one-minute grid, then zero-fill the new rows.
# start_val/end_val are set to the overall first/last timestamp: without them
# pad() only fills gaps between each group's own first and last observation,
# so trailing minutes (00:04 for Group 2, 00:03-00:04 for Group 3) are not
# created and the result has 12 rows instead of the 15 that were asked for.
df %>%
  pad(
    interval = "min", # explicitly pad in 1-minute steps
    start_val = min(df$Datetime),
    end_val = max(df$Datetime),
    group = "Group"
  ) %>%
  fill_by_value(Value, value = 0) %>%
  arrange(Datetime, Group) # same row order as the requested table
# Expected result (rows marked "added" were created by pad()):
#pad applied on the interval: min
# Datetime Group Value
#1 2019-01-01 00:00:00 1 5
#2 2019-01-01 00:00:00 2 4
#3 2019-01-01 00:00:00 3 2
#4 2019-01-01 00:01:00 1 1
#5 2019-01-01 00:01:00 2 0 # added
#6 2019-01-01 00:01:00 3 0 # added
#7 2019-01-01 00:02:00 1 2
#8 2019-01-01 00:02:00 2 2
#9 2019-01-01 00:02:00 3 1
#10 2019-01-01 00:03:00 1 1
#11 2019-01-01 00:03:00 2 2
#12 2019-01-01 00:03:00 3 0 # added
#13 2019-01-01 00:04:00 1 1
#14 2019-01-01 00:04:00 2 0 # added
#15 2019-01-01 00:04:00 3 0 # added
数据
# Rebuild the example data from the pipe-delimited table in the question.
# strip.white = TRUE drops the spaces around each "|" so the character
# Datetime field does not carry trailing whitespace.
df <- read.table(
  header = TRUE,
  sep = "|",
  strip.white = TRUE,
  stringsAsFactors = FALSE,
  text = "Datetime | Group | Value
2019-01-01 00:00:00 | 1 | 5
2019-01-01 00:00:00 | 2 | 4
2019-01-01 00:00:00 | 3 | 2
2019-01-01 00:01:00 | 1 | 1
2019-01-01 00:02:00 | 1 | 2
2019-01-01 00:02:00 | 2 | 2
2019-01-01 00:02:00 | 3 | 1
2019-01-01 00:03:00 | 1 | 1
2019-01-01 00:03:00 | 2 | 2
2019-01-01 00:04:00 | 1 | 1"
)
# Convert the timestamp strings to date-times so they can be sequenced by
# minute later on.
df$Datetime <- lubridate::ymd_hms(df$Datetime)
使用 base R：
# Build the full grid: every minute in the observed range crossed with every
# group that actually occurs in the data (instead of a hard-coded 1:3).
date_groups <- expand.grid(
  Datetime = seq(min(df$Datetime), max(df$Datetime), by = "min"),
  Group = sort(unique(df$Group))
)
# Left-join the observations onto the grid; grid rows without a matching
# observation get NA in Value.
date_groups <- merge(
  date_groups, df,
  by = c("Datetime", "Group"),
  all.x = TRUE
)
# Zero-fill only the Value column rather than every NA in the data frame.
date_groups$Value[is.na(date_groups$Value)] <- 0
我们可以使用 tidyr 的 complete()：
library(tidyverse)

# Expand to every Group x minute combination between the first and last
# timestamp, giving the newly created rows Value = 0, then sort by time and
# put the columns back in the order they have in df.
df %>%
  complete(
    Group,
    Datetime = seq(from = min(Datetime), to = max(Datetime), by = "1 min"),
    fill = list(Value = 0)
  ) %>%
  arrange(Datetime) %>%
  select(names(df))
# A tibble: 15 x 3
# Datetime Group Value
# <dttm> <dbl> <dbl>
# 1 2019-01-01 00:00:00 1 5
# 2 2019-01-01 00:00:00 2 4
# 3 2019-01-01 00:00:00 3 2
# 4 2019-01-01 00:01:00 1 1
# 5 2019-01-01 00:01:00 2 0
# 6 2019-01-01 00:01:00 3 0
# 7 2019-01-01 00:02:00 1 2
# 8 2019-01-01 00:02:00 2 2
# 9 2019-01-01 00:02:00 3 1
#10 2019-01-01 00:03:00 1 1
#11 2019-01-01 00:03:00 2 2
#12 2019-01-01 00:03:00 3 0
#13 2019-01-01 00:04:00 1 1
#14 2019-01-01 00:04:00 2 0
#15 2019-01-01 00:04:00 3 0
数据
# Example data: ten observations at one-minute resolution (UTC) for three
# groups; some group/minute combinations are absent.
df <- data.frame(
  # Timestamps written as whole-minute offsets from the first observation.
  Datetime = as.POSIXct("2019-01-01 00:00:00", tz = "UTC") +
    60 * c(0, 0, 0, 1, 2, 2, 2, 3, 3, 4),
  Group = c(1, 2, 3, 1, 1, 2, 3, 1, 2, 1),
  Value = c(5, 4, 2, 1, 2, 2, 1, 1, 2, 1)
)
我正在尝试为一个包含多个分组的数据框补全缺失的分钟，并希望缺失分钟对应的值用零填充。
我尝试过使用此方法，但没有找到补全缺失分钟的办法。
Datetime | Group | Value |
2019-01-01 00:00:00 | 1 | 5 |
2019-01-01 00:00:00 | 2 | 4 |
2019-01-01 00:00:00 | 3 | 2 |
2019-01-01 00:01:00 | 1 | 1 |
2019-01-01 00:02:00 | 1 | 2 |
2019-01-01 00:02:00 | 2 | 2 |
2019-01-01 00:02:00 | 3 | 1 |
2019-01-01 00:03:00 | 1 | 1 |
2019-01-01 00:03:00 | 2 | 2 |
2019-01-01 00:04:00 | 1 | 1 |
我希望最终的表格如下所示：
Datetime | Group | Value |
2019-01-01 00:00:00 | 1 | 5 |
2019-01-01 00:00:00 | 2 | 4 |
2019-01-01 00:00:00 | 3 | 2 |
2019-01-01 00:01:00 | 1 | 1 |
2019-01-01 00:01:00 | 2 | 0 |
2019-01-01 00:01:00 | 3 | 0 |
2019-01-01 00:02:00 | 1 | 2 |
2019-01-01 00:02:00 | 2 | 2 |
2019-01-01 00:02:00 | 3 | 1 |
2019-01-01 00:03:00 | 1 | 1 |
2019-01-01 00:03:00 | 2 | 2 |
2019-01-01 00:03:00 | 3 | 0 |
2019-01-01 00:04:00 | 1 | 1 |
2019-01-01 00:04:00 | 2 | 0 |
2019-01-01 00:04:00 | 3 | 0 |
library(dplyr)
library(padr)

# Pad every group onto the same one-minute grid, then zero-fill the new rows.
# start_val/end_val are set to the overall first/last timestamp: without them
# pad() only fills gaps between each group's own first and last observation,
# so trailing minutes (00:04 for Group 2, 00:03-00:04 for Group 3) are not
# created and the result has 12 rows instead of the 15 that were asked for.
df %>%
  pad(
    interval = "min", # explicitly pad in 1-minute steps
    start_val = min(df$Datetime),
    end_val = max(df$Datetime),
    group = "Group"
  ) %>%
  fill_by_value(Value, value = 0) %>%
  arrange(Datetime, Group) # same row order as the requested table
# Expected result (rows marked "added" were created by pad()):
#pad applied on the interval: min
# Datetime Group Value
#1 2019-01-01 00:00:00 1 5
#2 2019-01-01 00:00:00 2 4
#3 2019-01-01 00:00:00 3 2
#4 2019-01-01 00:01:00 1 1
#5 2019-01-01 00:01:00 2 0 # added
#6 2019-01-01 00:01:00 3 0 # added
#7 2019-01-01 00:02:00 1 2
#8 2019-01-01 00:02:00 2 2
#9 2019-01-01 00:02:00 3 1
#10 2019-01-01 00:03:00 1 1
#11 2019-01-01 00:03:00 2 2
#12 2019-01-01 00:03:00 3 0 # added
#13 2019-01-01 00:04:00 1 1
#14 2019-01-01 00:04:00 2 0 # added
#15 2019-01-01 00:04:00 3 0 # added
数据
# Rebuild the example data from the pipe-delimited table in the question.
# strip.white = TRUE drops the spaces around each "|" so the character
# Datetime field does not carry trailing whitespace.
df <- read.table(
  header = TRUE,
  sep = "|",
  strip.white = TRUE,
  stringsAsFactors = FALSE,
  text = "Datetime | Group | Value
2019-01-01 00:00:00 | 1 | 5
2019-01-01 00:00:00 | 2 | 4
2019-01-01 00:00:00 | 3 | 2
2019-01-01 00:01:00 | 1 | 1
2019-01-01 00:02:00 | 1 | 2
2019-01-01 00:02:00 | 2 | 2
2019-01-01 00:02:00 | 3 | 1
2019-01-01 00:03:00 | 1 | 1
2019-01-01 00:03:00 | 2 | 2
2019-01-01 00:04:00 | 1 | 1"
)
# Convert the timestamp strings to date-times so they can be sequenced by
# minute later on.
df$Datetime <- lubridate::ymd_hms(df$Datetime)
使用 base R：
# Build the full grid: every minute in the observed range crossed with every
# group that actually occurs in the data (instead of a hard-coded 1:3).
date_groups <- expand.grid(
  Datetime = seq(min(df$Datetime), max(df$Datetime), by = "min"),
  Group = sort(unique(df$Group))
)
# Left-join the observations onto the grid; grid rows without a matching
# observation get NA in Value.
date_groups <- merge(
  date_groups, df,
  by = c("Datetime", "Group"),
  all.x = TRUE
)
# Zero-fill only the Value column rather than every NA in the data frame.
date_groups$Value[is.na(date_groups$Value)] <- 0
我们可以使用 tidyr 的 complete()：
library(tidyverse)

# Expand to every Group x minute combination between the first and last
# timestamp, giving the newly created rows Value = 0, then sort by time and
# put the columns back in the order they have in df.
df %>%
  complete(
    Group,
    Datetime = seq(from = min(Datetime), to = max(Datetime), by = "1 min"),
    fill = list(Value = 0)
  ) %>%
  arrange(Datetime) %>%
  select(names(df))
# A tibble: 15 x 3
# Datetime Group Value
# <dttm> <dbl> <dbl>
# 1 2019-01-01 00:00:00 1 5
# 2 2019-01-01 00:00:00 2 4
# 3 2019-01-01 00:00:00 3 2
# 4 2019-01-01 00:01:00 1 1
# 5 2019-01-01 00:01:00 2 0
# 6 2019-01-01 00:01:00 3 0
# 7 2019-01-01 00:02:00 1 2
# 8 2019-01-01 00:02:00 2 2
# 9 2019-01-01 00:02:00 3 1
#10 2019-01-01 00:03:00 1 1
#11 2019-01-01 00:03:00 2 2
#12 2019-01-01 00:03:00 3 0
#13 2019-01-01 00:04:00 1 1
#14 2019-01-01 00:04:00 2 0
#15 2019-01-01 00:04:00 3 0
数据
# Example data: ten observations at one-minute resolution (UTC) for three
# groups; some group/minute combinations are absent.
df <- data.frame(
  # Timestamps written as whole-minute offsets from the first observation.
  Datetime = as.POSIXct("2019-01-01 00:00:00", tz = "UTC") +
    60 * c(0, 0, 0, 1, 2, 2, 2, 3, 3, 4),
  Group = c(1, 2, 3, 1, 1, 2, 3, 1, 2, 1),
  Value = c(5, 4, 2, 1, 2, 2, 1, 1, 2, 1)
)