根据多个时间段计算列的平均值
Calculating average of a column based on multiple time periods
我需要帮助来弄清楚如何每 ___ 小时计算一个变量的平均值。我想每 1/2 小时计算一次平均值,然后每 1、2、4 和 6 小时计算一次平均值。
这是我的数据集:
dput(head(R3L12, 10))
structure(list(Date = c("2015-05-23", "2015-05-23", "2015-05-23",
"2015-05-23", "2015-05-23", "2015-05-23", "2015-05-23", "2015-05-23",
"2015-05-23", "2015-05-23"), Time = c("07:25:00", "07:40:00",
"07:45:00", "09:10:00", "11:45:00", "11:55:00", "12:05:00", "12:35:00",
"12:45:00", "13:30:00"), Turtle = structure(c(3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L), .Label = c("R3L1", "R3L11", "R3L12",
"R3L2", "R3L4", "R3L8", "R3L9", "R4L8", "R8L1", "R8L4", "R8NAT123"
), class = "factor"), Tex = c(11.891, 12.008, 12.055, 13.219,
18.727, 18.992, 19.477, 20.367, 20.641, 28.305), m.Tb = c(12.477,
12.54, 12.54, 12.978, 16.362, 16.612, 17.238, 19.617, 19.993,
24.371), m.HR = c(7.56457, 6.66759, 17.51107, 9.72277, 19.44553,
13.07674, 28.115, 14.99467, 17.16947, 40.40479), season = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("beginning",
"end", "middle"), class = "factor"), year = c(2015L, 2015L, 2015L,
2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L), Mass = c(360L,
360L, 360L, 360L, 360L, 360L, 360L, 360L, 360L, 360L)), row.names = c(NA,
10L), class = "data.frame")
我希望能够针对每个日期、按每个时间段计算 m.Tb 的平均值。例如,对于 2015-05-23,我想得到 m.Tb 每 30 分钟、1 小时、2 小时、4 小时和 6 小时的平均值,然后对第二天重复同样的计算。“时间”列中有时会“缺少”某些行,这是因为含 NA 的行已被删除。
如果您需要进一步说明或有任何疑问,请告诉我,因为我还是 R 的新手。
希望这就是您要找的。由于生成的各个数据框行数不同,我不得不将它们存储在一个列表中。为此,我首先创建了一个字符向量,其中包含所有需要计算平均值的时间跨度,然后使用 purrr 包中的 map 函数对其进行迭代,每次把当前时间跨度传给 cut 的 breaks 参数,从而按该时间跨度对数据分组。
library(dplyr)
library(lubridate)
library(purrr)
library(tidyr)  # provides unite(), which the pipeline below relies on

# Time spans to aggregate over; each one is passed to cut() as `breaks`.
breaks <- c("15 min", "30 min", "1 hour", "2 hour", "4 hour", "6 hour")

# For every time span, bin the readings with cut() and average m.Tb per
# date and bin. The per-span results have different row counts, so they are
# collected in a list whose elements are named after the spans.
# R3L12 is the data frame from the question; the bare name `df` was never
# defined in this snippet and would resolve to the stats::df function.
breaks %>%
  map(~ R3L12 %>%
    unite("Date-Time", c("Date", "Time"), sep = " ", remove = FALSE) %>%
    mutate(`Date-Time` = ymd_hms(`Date-Time`)) %>%
    mutate(DS = cut(`Date-Time`, breaks = .x)) %>%
    group_by(ymd(Date), DS) %>%
    summarise(avg = mean(m.Tb))) %>%
  set_names(breaks)
$`15 min`
# A tibble: 8 x 3
# Groups: ymd(Date) [1]
`ymd(Date)` DS avg
<date> <fct> <dbl>
1 2015-05-23 2015-05-23 07:25:00 12.5
2 2015-05-23 2015-05-23 07:40:00 12.5
3 2015-05-23 2015-05-23 09:10:00 13.0
4 2015-05-23 2015-05-23 11:40:00 16.4
5 2015-05-23 2015-05-23 11:55:00 16.9
6 2015-05-23 2015-05-23 12:25:00 19.6
7 2015-05-23 2015-05-23 12:40:00 20.0
8 2015-05-23 2015-05-23 13:25:00 24.4
$`30 min`
# A tibble: 6 x 3
# Groups: ymd(Date) [1]
`ymd(Date)` DS avg
<date> <fct> <dbl>
1 2015-05-23 2015-05-23 07:25:00 12.5
2 2015-05-23 2015-05-23 08:55:00 13.0
3 2015-05-23 2015-05-23 11:25:00 16.4
4 2015-05-23 2015-05-23 11:55:00 16.9
5 2015-05-23 2015-05-23 12:25:00 19.8
6 2015-05-23 2015-05-23 13:25:00 24.4
$`1 hour`
# A tibble: 5 x 3
# Groups: ymd(Date) [1]
`ymd(Date)` DS avg
<date> <fct> <dbl>
1 2015-05-23 2015-05-23 07:00:00 12.5
2 2015-05-23 2015-05-23 09:00:00 13.0
3 2015-05-23 2015-05-23 11:00:00 16.5
4 2015-05-23 2015-05-23 12:00:00 18.9
5 2015-05-23 2015-05-23 13:00:00 24.4
$`2 hour`
# A tibble: 4 x 3
# Groups: ymd(Date) [1]
`ymd(Date)` DS avg
<date> <fct> <dbl>
1 2015-05-23 2015-05-23 07:00:00 12.5
2 2015-05-23 2015-05-23 09:00:00 13.0
3 2015-05-23 2015-05-23 11:00:00 18.0
4 2015-05-23 2015-05-23 13:00:00 24.4
$`4 hour`
# A tibble: 2 x 3
# Groups: ymd(Date) [1]
`ymd(Date)` DS avg
<date> <fct> <dbl>
1 2015-05-23 2015-05-23 07:00:00 12.6
2 2015-05-23 2015-05-23 11:00:00 19.0
$`6 hour`
# A tibble: 2 x 3
# Groups: ymd(Date) [1]
`ymd(Date)` DS avg
<date> <fct> <dbl>
1 2015-05-23 2015-05-23 07:00:00 15.6
2 2015-05-23 2015-05-23 13:00:00 24.4
我们可以使用 lubridate 中的 ceiling_date:
library(lubridate)
library(dplyr)
library(stringr)
# Round each reading's timestamp up to a 30-minute boundary, average m.Tb
# over the readings that share a boundary, then add the calendar date.
R3L12 %>%
  mutate(
    DS = ceiling_date(
      as.POSIXct(str_c(Date, Time, sep = " ")),
      unit = "30 min"
    )
  ) %>%
  group_by(DS) %>%
  summarise(avg_30 = mean(m.Tb)) %>%
  mutate(date = as.Date(DS))
输出:
# A tibble: 7 x 3
# DS avg_30 date
# <dttm> <dbl> <date>
#1 2015-05-23 07:30:00 12.5 2015-05-23
#2 2015-05-23 08:00:00 12.5 2015-05-23
#3 2015-05-23 09:30:00 13.0 2015-05-23
#4 2015-05-23 12:00:00 16.5 2015-05-23
#5 2015-05-23 12:30:00 17.2 2015-05-23
#6 2015-05-23 13:00:00 19.8 2015-05-23
#7 2015-05-23 13:30:00 24.4 2015-05-23
我会这样做。不过你的数据中有很多缺失的时间段,因此半小时聚合的输出并不理想。
# Example data: a 10-row data frame of readings, all dated 2015-05-23 and
# all for turtle "R3L12" (factor code 3). Date and Time are character
# columns; m.Tb is the variable averaged further down.
data_example <- structure(list(Date = c("2015-05-23", "2015-05-23", "2015-05-23",
"2015-05-23", "2015-05-23", "2015-05-23", "2015-05-23", "2015-05-23",
"2015-05-23", "2015-05-23"), Time = c("07:25:00", "07:40:00",
"07:45:00", "09:10:00", "11:45:00", "11:55:00", "12:05:00", "12:35:00",
"12:45:00", "13:30:00"), Turtle = structure(c(3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L), .Label = c("R3L1", "R3L11", "R3L12",
"R3L2", "R3L4", "R3L8", "R3L9", "R4L8", "R8L1", "R8L4", "R8NAT123"
), class = "factor"), Tex = c(11.891, 12.008, 12.055, 13.219,
18.727, 18.992, 19.477, 20.367, 20.641, 28.305), m.Tb = c(12.477,
12.54, 12.54, 12.978, 16.362, 16.612, 17.238, 19.617, 19.993,
24.371), m.HR = c(7.56457, 6.66759, 17.51107, 9.72277, 19.44553,
13.07674, 28.115, 14.99467, 17.16947, 40.40479), season = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("beginning",
"end", "middle"), class = "factor"), year = c(2015L, 2015L, 2015L,
2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L), Mass = c(360L,
360L, 360L, 360L, 360L, 360L, 360L, 360L, 360L, 360L)), row.names = c(NA,
10L), class = "data.frame")
library(tidyverse)
# Round a date-time down to a multiple of 30 minutes via clock::date_floor().
floor_30 <- function(x) {
  clock::date_floor(x, precision = "minute", n = 30)
}
# Average of the per-interval m.Tb totals.
#
# data          data frame containing an `m.Tb` column and the date-time
#               column named by `date_col`.
# date_col      unquoted name of the date-time column in `data`.
# interval_func function applied to that column to produce the interval each
#               row belongs to (e.g. floor_30).
#
# Returns a tibble with one column, `mean_interval`: m.Tb is first summed
# within each interval, then those sums are averaged.
# NOTE(review): this is the mean of interval sums, not a per-interval mean
# of m.Tb -- confirm that is the intended statistic.
mean_at_inteval <- function(data, date_col, interval_func) {
  interval_sums <- data |>
    group_by(interval = interval_func({{ date_col }})) |>
    summarise(sum_interval = sum(m.Tb))

  interval_sums |>
    summarise(mean_interval = mean(sum_interval))
}
# Build a parsed date-time column and nest the readings by calendar date,
# giving one row per Date with that day's readings in the `data` list-column.
# Date and Time are joined with an explicit space so the string matches
# clock's default "%Y-%m-%d %H:%M:%S" format instead of relying on the
# parser tolerating a missing separator.
nest_example_data <- data_example |>
  mutate(
    date_timer = clock::date_time_parse(
      str_c(Date, Time, sep = " "),
      zone = "UTC"
    )
  ) |>
  nest_by(Date)

# For each date, apply mean_at_inteval() to that day's readings using
# 30-minute floors; the result is stored in the `floor_30` column.
final_data <- nest_example_data |>
  mutate(
    floor_30 = mean_at_inteval(
      data,
      date_col = date_timer,
      interval_func = floor_30
    )
  )
final_data
#> # A tibble: 1 x 3
#> # Rowwise: Date
#> Date data floor_30$mean_interval
#> <chr> <list<tibble[,9]>> <dbl>
#> 1 2015-05-23 [10 x 9] 23.5
由 reprex 包 (v2.0.0) 于 2021-05-30 创建
我需要帮助来弄清楚如何每 ___ 小时计算一个变量的平均值。我想每 1/2 小时计算一次平均值,然后每 1、2、4 和 6 小时计算一次平均值。
这是我的数据集:
dput(head(R3L12, 10))
structure(list(Date = c("2015-05-23", "2015-05-23", "2015-05-23",
"2015-05-23", "2015-05-23", "2015-05-23", "2015-05-23", "2015-05-23",
"2015-05-23", "2015-05-23"), Time = c("07:25:00", "07:40:00",
"07:45:00", "09:10:00", "11:45:00", "11:55:00", "12:05:00", "12:35:00",
"12:45:00", "13:30:00"), Turtle = structure(c(3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L), .Label = c("R3L1", "R3L11", "R3L12",
"R3L2", "R3L4", "R3L8", "R3L9", "R4L8", "R8L1", "R8L4", "R8NAT123"
), class = "factor"), Tex = c(11.891, 12.008, 12.055, 13.219,
18.727, 18.992, 19.477, 20.367, 20.641, 28.305), m.Tb = c(12.477,
12.54, 12.54, 12.978, 16.362, 16.612, 17.238, 19.617, 19.993,
24.371), m.HR = c(7.56457, 6.66759, 17.51107, 9.72277, 19.44553,
13.07674, 28.115, 14.99467, 17.16947, 40.40479), season = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("beginning",
"end", "middle"), class = "factor"), year = c(2015L, 2015L, 2015L,
2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L), Mass = c(360L,
360L, 360L, 360L, 360L, 360L, 360L, 360L, 360L, 360L)), row.names = c(NA,
10L), class = "data.frame")
我希望能够针对每个日期、按每个时间段计算 m.Tb 的平均值。例如,对于 2015-05-23,我想得到 m.Tb 每 30 分钟、1 小时、2 小时、4 小时和 6 小时的平均值,然后对第二天重复同样的计算。“时间”列中有时会“缺少”某些行,这是因为含 NA 的行已被删除。
如果您需要进一步说明或有任何疑问,请告诉我,因为我还是 R 的新手。
希望这就是您要找的。由于生成的各个数据框行数不同,我不得不将它们存储在一个列表中。为此,我首先创建了一个字符向量,其中包含所有需要计算平均值的时间跨度,然后使用 purrr 包中的 map 函数对其进行迭代,每次把当前时间跨度传给 cut 的 breaks 参数,从而按该时间跨度对数据分组。
library(dplyr)
library(lubridate)
library(purrr)
library(tidyr)  # provides unite(), which the pipeline below relies on

# Time spans to aggregate over; each one is passed to cut() as `breaks`.
breaks <- c("15 min", "30 min", "1 hour", "2 hour", "4 hour", "6 hour")

# For every time span, bin the readings with cut() and average m.Tb per
# date and bin. The per-span results have different row counts, so they are
# collected in a list whose elements are named after the spans.
# R3L12 is the data frame from the question; the bare name `df` was never
# defined in this snippet and would resolve to the stats::df function.
breaks %>%
  map(~ R3L12 %>%
    unite("Date-Time", c("Date", "Time"), sep = " ", remove = FALSE) %>%
    mutate(`Date-Time` = ymd_hms(`Date-Time`)) %>%
    mutate(DS = cut(`Date-Time`, breaks = .x)) %>%
    group_by(ymd(Date), DS) %>%
    summarise(avg = mean(m.Tb))) %>%
  set_names(breaks)
$`15 min`
# A tibble: 8 x 3
# Groups: ymd(Date) [1]
`ymd(Date)` DS avg
<date> <fct> <dbl>
1 2015-05-23 2015-05-23 07:25:00 12.5
2 2015-05-23 2015-05-23 07:40:00 12.5
3 2015-05-23 2015-05-23 09:10:00 13.0
4 2015-05-23 2015-05-23 11:40:00 16.4
5 2015-05-23 2015-05-23 11:55:00 16.9
6 2015-05-23 2015-05-23 12:25:00 19.6
7 2015-05-23 2015-05-23 12:40:00 20.0
8 2015-05-23 2015-05-23 13:25:00 24.4
$`30 min`
# A tibble: 6 x 3
# Groups: ymd(Date) [1]
`ymd(Date)` DS avg
<date> <fct> <dbl>
1 2015-05-23 2015-05-23 07:25:00 12.5
2 2015-05-23 2015-05-23 08:55:00 13.0
3 2015-05-23 2015-05-23 11:25:00 16.4
4 2015-05-23 2015-05-23 11:55:00 16.9
5 2015-05-23 2015-05-23 12:25:00 19.8
6 2015-05-23 2015-05-23 13:25:00 24.4
$`1 hour`
# A tibble: 5 x 3
# Groups: ymd(Date) [1]
`ymd(Date)` DS avg
<date> <fct> <dbl>
1 2015-05-23 2015-05-23 07:00:00 12.5
2 2015-05-23 2015-05-23 09:00:00 13.0
3 2015-05-23 2015-05-23 11:00:00 16.5
4 2015-05-23 2015-05-23 12:00:00 18.9
5 2015-05-23 2015-05-23 13:00:00 24.4
$`2 hour`
# A tibble: 4 x 3
# Groups: ymd(Date) [1]
`ymd(Date)` DS avg
<date> <fct> <dbl>
1 2015-05-23 2015-05-23 07:00:00 12.5
2 2015-05-23 2015-05-23 09:00:00 13.0
3 2015-05-23 2015-05-23 11:00:00 18.0
4 2015-05-23 2015-05-23 13:00:00 24.4
$`4 hour`
# A tibble: 2 x 3
# Groups: ymd(Date) [1]
`ymd(Date)` DS avg
<date> <fct> <dbl>
1 2015-05-23 2015-05-23 07:00:00 12.6
2 2015-05-23 2015-05-23 11:00:00 19.0
$`6 hour`
# A tibble: 2 x 3
# Groups: ymd(Date) [1]
`ymd(Date)` DS avg
<date> <fct> <dbl>
1 2015-05-23 2015-05-23 07:00:00 15.6
2 2015-05-23 2015-05-23 13:00:00 24.4
我们可以使用 lubridate 中的 ceiling_date:
library(lubridate)
library(dplyr)
library(stringr)
# Round each reading's timestamp up to a 30-minute boundary, average m.Tb
# over the readings that share a boundary, then add the calendar date.
R3L12 %>%
  mutate(
    DS = ceiling_date(
      as.POSIXct(str_c(Date, Time, sep = " ")),
      unit = "30 min"
    )
  ) %>%
  group_by(DS) %>%
  summarise(avg_30 = mean(m.Tb)) %>%
  mutate(date = as.Date(DS))
输出:
# A tibble: 7 x 3
# DS avg_30 date
# <dttm> <dbl> <date>
#1 2015-05-23 07:30:00 12.5 2015-05-23
#2 2015-05-23 08:00:00 12.5 2015-05-23
#3 2015-05-23 09:30:00 13.0 2015-05-23
#4 2015-05-23 12:00:00 16.5 2015-05-23
#5 2015-05-23 12:30:00 17.2 2015-05-23
#6 2015-05-23 13:00:00 19.8 2015-05-23
#7 2015-05-23 13:30:00 24.4 2015-05-23
我会这样做。不过你的数据中有很多缺失的时间段,因此半小时聚合的输出并不理想。
# Example data: a 10-row data frame of readings, all dated 2015-05-23 and
# all for turtle "R3L12" (factor code 3). Date and Time are character
# columns; m.Tb is the variable averaged further down.
data_example <- structure(list(Date = c("2015-05-23", "2015-05-23", "2015-05-23",
"2015-05-23", "2015-05-23", "2015-05-23", "2015-05-23", "2015-05-23",
"2015-05-23", "2015-05-23"), Time = c("07:25:00", "07:40:00",
"07:45:00", "09:10:00", "11:45:00", "11:55:00", "12:05:00", "12:35:00",
"12:45:00", "13:30:00"), Turtle = structure(c(3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L), .Label = c("R3L1", "R3L11", "R3L12",
"R3L2", "R3L4", "R3L8", "R3L9", "R4L8", "R8L1", "R8L4", "R8NAT123"
), class = "factor"), Tex = c(11.891, 12.008, 12.055, 13.219,
18.727, 18.992, 19.477, 20.367, 20.641, 28.305), m.Tb = c(12.477,
12.54, 12.54, 12.978, 16.362, 16.612, 17.238, 19.617, 19.993,
24.371), m.HR = c(7.56457, 6.66759, 17.51107, 9.72277, 19.44553,
13.07674, 28.115, 14.99467, 17.16947, 40.40479), season = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("beginning",
"end", "middle"), class = "factor"), year = c(2015L, 2015L, 2015L,
2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L), Mass = c(360L,
360L, 360L, 360L, 360L, 360L, 360L, 360L, 360L, 360L)), row.names = c(NA,
10L), class = "data.frame")
library(tidyverse)
# Round a date-time down to a multiple of 30 minutes via clock::date_floor().
floor_30 <- function(x) {
  clock::date_floor(x, precision = "minute", n = 30)
}
# Average of the per-interval m.Tb totals.
#
# data          data frame containing an `m.Tb` column and the date-time
#               column named by `date_col`.
# date_col      unquoted name of the date-time column in `data`.
# interval_func function applied to that column to produce the interval each
#               row belongs to (e.g. floor_30).
#
# Returns a tibble with one column, `mean_interval`: m.Tb is first summed
# within each interval, then those sums are averaged.
# NOTE(review): this is the mean of interval sums, not a per-interval mean
# of m.Tb -- confirm that is the intended statistic.
mean_at_inteval <- function(data, date_col, interval_func) {
  interval_sums <- data |>
    group_by(interval = interval_func({{ date_col }})) |>
    summarise(sum_interval = sum(m.Tb))

  interval_sums |>
    summarise(mean_interval = mean(sum_interval))
}
# Build a parsed date-time column and nest the readings by calendar date,
# giving one row per Date with that day's readings in the `data` list-column.
# Date and Time are joined with an explicit space so the string matches
# clock's default "%Y-%m-%d %H:%M:%S" format instead of relying on the
# parser tolerating a missing separator.
nest_example_data <- data_example |>
  mutate(
    date_timer = clock::date_time_parse(
      str_c(Date, Time, sep = " "),
      zone = "UTC"
    )
  ) |>
  nest_by(Date)

# For each date, apply mean_at_inteval() to that day's readings using
# 30-minute floors; the result is stored in the `floor_30` column.
final_data <- nest_example_data |>
  mutate(
    floor_30 = mean_at_inteval(
      data,
      date_col = date_timer,
      interval_func = floor_30
    )
  )
final_data
#> # A tibble: 1 x 3
#> # Rowwise: Date
#> Date data floor_30$mean_interval
#> <chr> <list<tibble[,9]>> <dbl>
#> 1 2015-05-23 [10 x 9] 23.5
由 reprex 包 (v2.0.0) 于 2021-05-30 创建