从具有行中每小时值和多个感兴趣列的数据框中计算每日参数
Calculate daily parameters from a dataframe with hourly-values in rows and with several columns of interest
数据框 df1 汇总了每小时(Datetime)不同深度(T5m、T15m、T25m、T35m)的水温。数据框示例如下:
# Example data: hourly water temperatures at four depths (T5m, T15m, T25m,
# T35m), four readings on each of two consecutive days.
df1 <- data.frame(
  Datetime = c(
    "2016-08-12 12:00:00", "2016-08-12 13:00:00",
    "2016-08-12 14:00:00", "2016-08-12 15:00:00",
    "2016-08-13 12:00:00", "2016-08-13 13:00:00",
    "2016-08-13 14:00:00", "2016-08-13 15:00:00"
  ),
  T5m = c(10, 20, 20, 10, 10, 20, 20, 10),
  T15m = c(10, 20, 10, 20, 10, 20, 10, 20),
  T25m = c(20, 20, 20, 30, 20, 20, 20, 30),
  T35m = c(20, 20, 10, 10, 20, 20, 10, 10)
)
# Parse the complete timestamp. A format that stops at "%H" makes strptime
# ignore the trailing ":MM:SS", so non-zero minutes/seconds would be dropped
# silently. An explicit time zone keeps later per-day grouping independent of
# the session's local time zone.
df1$Datetime <- as.POSIXct(
  df1$Datetime,
  format = "%Y-%m-%d %H:%M:%S",
  tz = "UTC"
)
df1
Datetime T5m T15m T25m T35m
1 2016-08-12 12:00:00 10 10 20 20
2 2016-08-12 13:00:00 20 20 20 20
3 2016-08-12 14:00:00 20 10 20 10
4 2016-08-12 15:00:00 10 20 30 10
5 2016-08-13 12:00:00 10 10 20 20
6 2016-08-13 13:00:00 20 20 20 20
7 2016-08-13 14:00:00 20 10 20 10
8 2016-08-13 15:00:00 10 20 30 10
我想创建一个新的数据框 df2,其中包含每个深度以及整个水柱的每日平均水温和标准误差估计。我希望结果如下(我是手工计算的,所以可能会有一些错误):
> df2
Date meanT5m meanT15m meanT25m meanT35m meanTotal seT5m seT15m seT25m seT35m seTotal
1 2016-08-12 15 15 22.5 15 16.875 2.88 2.88 2.5 2.88 1.29
2 2016-08-13 15 15 22.5 15 16.875 2.88 2.88 2.5 2.88 1.29
我特别想知道如何使用 data.table 来完成,因为我要处理非常大的 data.frame,而且我认为 data.table 非常高效。
为了计算标准误差,我知道 plotrix 包中的函数 std.error()。
这是一种使用 dplyr 和 tidyr 分两部分计算的方法:
library(dplyr)
library(tidyr)

# Depth columns are selected with the same pattern the data.table snippet uses.
depth_cols <- grep("T[0-9]+m", names(df1), value = TRUE)

# Part 1: per-day mean and standard error for every depth and for the hourly
# total across depths, in long form (columns: Datetime, key, total, value;
# the `total` column holds the statistic name, "se" or "mean").
# The total is added as a column BEFORE summarising so that its standard error
# is computed from the hourly totals. Summing the per-depth standard errors
# afterwards does not give a standard error.
df2 <- df1 %>%
  mutate(Datetime = as.Date(Datetime),
         total = rowSums(as.data.frame(df1)[depth_cols])) %>%
  gather(key, value, -Datetime) %>%
  group_by(Datetime, key) %>%
  summarise(se = plotrix::std.error(value),
            mean = mean(value)) %>%
  gather(total, value, -key, -Datetime)

# Part 2: one row per day, one "<column>_<statistic>" column per combination.
# The labels come straight from the data, so no hand-written suffix vector has
# to line up with the group order (previously: total_mean_mean, total_se_se).
df2 %>%
  unite(key, key, total) %>%
  spread(key, value)
# A tibble: 2 x 11
# Groups:   Datetime [2]
#   Datetime   T15m_mean T15m_se T25m_mean T25m_se T35m_mean
#   <date>         <dbl>   <dbl>     <dbl>   <dbl>     <dbl>
#1 2016-08-12        15    2.89      22.5     2.5        15
#2 2016-08-13        15    2.89      22.5     2.5        15
# … with 5 more variables: T35m_se <dbl>, T5m_mean <dbl>,
#   T5m_se <dbl>, total_mean <dbl>, total_se <dbl>
# (for this data total_mean is 67.5 and total_se is about 4.79 on both days)
根据@chinsoon 的评论更新
首先将你的数据框转换成 data.table:
# Attach data.table; the snippets that follow rely on its `:=`, `.SD` and
# `.SDcols` syntax.
library(data.table)
# Turn df1 into a data.table. The result is not assigned because later
# snippets keep using the name df1 with data.table syntax.
setDT(df1)
创建一个 total 列:
# Add `total`: the row-wise sum over every column whose name matches
# "T[0-9]+m" (the depth columns).
df1[
  ,
  total := rowSums(.SD),
  .SDcols = grep(pattern = "T[0-9]+m", x = names(df1))
]
# Print the updated table.
df1[]
#               Datetime T5m T15m T25m T35m total
# 1: 2016-08-12 12:00:00  10   10   20   20    60
# 2: 2016-08-12 13:00:00  20   20   20   20    80
# 3: 2016-08-12 14:00:00  20   10   20   10    60
# 4: 2016-08-12 15:00:00  10   20   30   10    70
# 5: 2016-08-13 12:00:00  10   10   20   20    60
# 6: 2016-08-13 13:00:00  20   20   20   20    80
# 7: 2016-08-13 14:00:00  20   10   20   10    60
# 8: 2016-08-13 15:00:00  10   20   30   10    70
对每一天(day)应用该函数:
library(lubridate)
# Per-day mean and standard error of the mean (sd / sqrt(n)) for each column
# in .SD. unlist() flattens the per-column c(mean, sem) pairs into names such
# as "T5m.mean", and as.list() turns them into one output column each.
# Rows are grouped by calendar date. Grouping by day(Datetime) would use the
# day of the month only and would merge, for example, 12 August with
# 12 September.
(df3 <- df1[
  ,
  as.list(unlist(lapply(.SD, function(x) {
    c(mean = mean(x), sem = sd(x) / sqrt(length(x)))
  }))),
  by = .(Date = as_date(Datetime))
])
#          Date T5m.mean  T5m.sem T15m.mean T15m.sem T25m.mean T25m.sem T35m.mean
# 1: 2016-08-12       15 2.886751        15 2.886751      22.5      2.5        15
# 2: 2016-08-13       15 2.886751        15 2.886751      22.5      2.5        15
#    T35m.sem total.mean total.sem
# 1: 2.886751       67.5  4.787136
# 2: 2.886751       67.5  4.787136
数据框 df1 汇总了每小时(Datetime)不同深度(T5m、T15m、T25m、T35m)的水温。数据框示例如下:
# Example data: hourly water temperatures at four depths (T5m, T15m, T25m,
# T35m), four readings on each of two consecutive days.
df1 <- data.frame(
  Datetime = c(
    "2016-08-12 12:00:00", "2016-08-12 13:00:00",
    "2016-08-12 14:00:00", "2016-08-12 15:00:00",
    "2016-08-13 12:00:00", "2016-08-13 13:00:00",
    "2016-08-13 14:00:00", "2016-08-13 15:00:00"
  ),
  T5m = c(10, 20, 20, 10, 10, 20, 20, 10),
  T15m = c(10, 20, 10, 20, 10, 20, 10, 20),
  T25m = c(20, 20, 20, 30, 20, 20, 20, 30),
  T35m = c(20, 20, 10, 10, 20, 20, 10, 10)
)
# Parse the complete timestamp. A format that stops at "%H" makes strptime
# ignore the trailing ":MM:SS", so non-zero minutes/seconds would be dropped
# silently. An explicit time zone keeps later per-day grouping independent of
# the session's local time zone.
df1$Datetime <- as.POSIXct(
  df1$Datetime,
  format = "%Y-%m-%d %H:%M:%S",
  tz = "UTC"
)
df1
Datetime T5m T15m T25m T35m
1 2016-08-12 12:00:00 10 10 20 20
2 2016-08-12 13:00:00 20 20 20 20
3 2016-08-12 14:00:00 20 10 20 10
4 2016-08-12 15:00:00 10 20 30 10
5 2016-08-13 12:00:00 10 10 20 20
6 2016-08-13 13:00:00 20 20 20 20
7 2016-08-13 14:00:00 20 10 20 10
8 2016-08-13 15:00:00 10 20 30 10
我想创建一个新的数据框 df2,其中包含每个深度以及整个水柱的每日平均水温和标准误差估计。我希望结果如下(我是手工计算的,所以可能会有一些错误):
> df2
Date meanT5m meanT15m meanT25m meanT35m meanTotal seT5m seT15m seT25m seT35m seTotal
1 2016-08-12 15 15 22.5 15 16.875 2.88 2.88 2.5 2.88 1.29
2 2016-08-13 15 15 22.5 15 16.875 2.88 2.88 2.5 2.88 1.29
我特别想知道如何使用 data.table 来完成,因为我要处理非常大的 data.frame,而且我认为 data.table 非常高效。
为了计算标准误差,我知道 plotrix 包中的函数 std.error()。
这是一种使用 dplyr 和 tidyr 分两部分计算的方法:
library(dplyr)
library(tidyr)

# Depth columns are selected with the same pattern the data.table snippet uses.
depth_cols <- grep("T[0-9]+m", names(df1), value = TRUE)

# Part 1: per-day mean and standard error for every depth and for the hourly
# total across depths, in long form (columns: Datetime, key, total, value;
# the `total` column holds the statistic name, "se" or "mean").
# The total is added as a column BEFORE summarising so that its standard error
# is computed from the hourly totals. Summing the per-depth standard errors
# afterwards does not give a standard error.
df2 <- df1 %>%
  mutate(Datetime = as.Date(Datetime),
         total = rowSums(as.data.frame(df1)[depth_cols])) %>%
  gather(key, value, -Datetime) %>%
  group_by(Datetime, key) %>%
  summarise(se = plotrix::std.error(value),
            mean = mean(value)) %>%
  gather(total, value, -key, -Datetime)

# Part 2: one row per day, one "<column>_<statistic>" column per combination.
# The labels come straight from the data, so no hand-written suffix vector has
# to line up with the group order (previously: total_mean_mean, total_se_se).
df2 %>%
  unite(key, key, total) %>%
  spread(key, value)
# A tibble: 2 x 11
# Groups:   Datetime [2]
#   Datetime   T15m_mean T15m_se T25m_mean T25m_se T35m_mean
#   <date>         <dbl>   <dbl>     <dbl>   <dbl>     <dbl>
#1 2016-08-12        15    2.89      22.5     2.5        15
#2 2016-08-13        15    2.89      22.5     2.5        15
# … with 5 more variables: T35m_se <dbl>, T5m_mean <dbl>,
#   T5m_se <dbl>, total_mean <dbl>, total_se <dbl>
# (for this data total_mean is 67.5 and total_se is about 4.79 on both days)
根据@chinsoon 的评论更新
首先将你的数据框转换成 data.table:
# Attach data.table; the snippets that follow rely on its `:=`, `.SD` and
# `.SDcols` syntax.
library(data.table)
# Turn df1 into a data.table. The two statements must be on separate lines:
# written on one line without a separator they do not parse.
setDT(df1)
创建一个 total
列:
# Add `total`: the row-wise sum over every column whose name matches
# "T[0-9]+m" (the depth columns). The trailing [] prints the updated table.
df1[, total := rowSums(.SD), .SDcols = grep("T[0-9]+m", names(df1))][]
#               Datetime T5m T15m T25m T35m total
# 1: 2016-08-12 12:00:00  10   10   20   20    60
# 2: 2016-08-12 13:00:00  20   20   20   20    80
# 3: 2016-08-12 14:00:00  20   10   20   10    60
# 4: 2016-08-12 15:00:00  10   20   30   10    70
# 5: 2016-08-13 12:00:00  10   10   20   20    60
# 6: 2016-08-13 13:00:00  20   20   20   20    80
# 7: 2016-08-13 14:00:00  20   10   20   10    60
# 8: 2016-08-13 15:00:00  10   20   30   10    70
对每一天(day)应用该函数
:
library(lubridate)
# Per-day mean and standard error of the mean (sd / sqrt(n)) for each column
# in .SD. unlist() flattens the per-column c(mean, sem) pairs into names such
# as "T5m.mean", and as.list() turns them into one output column each.
# Rows are grouped by calendar date. Grouping by day(Datetime) would use the
# day of the month only and would merge, for example, 12 August with
# 12 September.
(df3 <- df1[
  ,
  as.list(unlist(lapply(.SD, function(x) {
    c(mean = mean(x), sem = sd(x) / sqrt(length(x)))
  }))),
  by = .(Date = as_date(Datetime))
])
#          Date T5m.mean  T5m.sem T15m.mean T15m.sem T25m.mean T25m.sem T35m.mean
# 1: 2016-08-12       15 2.886751        15 2.886751      22.5      2.5        15
# 2: 2016-08-13       15 2.886751        15 2.886751      22.5      2.5        15
#    T35m.sem total.mean total.sem
# 1: 2.886751       67.5  4.787136
# 2: 2.886751       67.5  4.787136