如何计算 4 年数据集中每天的最高温度,并将其作为新列添加到数据集中?
How to calculate the maximum temperature of a day on a 4 year dataset while adding this to the dataset?
我有一个 4 年的数据集,我想计算每天的最高温度(提供每小时 1 次测量)。如何将此温度添加到我的数据集中的附加列中?我不知道如何在不删除其他列的情况下执行此操作。
我的数据集是这样的:
# dput() output of the asker's sample data: a tibble with 48 rows
# (row.names = c(NA, -48L)) and four columns.
#   DateTime          - POSIXct timestamps with tzone "UTC", 3600 seconds apart
#   Tmin, Tmax, Tmean - numeric columns, one value per timestamp
# The expression is not assigned to a name here; the answer further down
# refers to this object as `dd`.
structure(list(DateTime = structure(c(1420070400, 1420074000,
1420077600, 1420081200, 1420084800, 1420088400, 1420092000, 1420095600,
1420099200, 1420102800, 1420106400, 1420110000, 1420113600, 1420117200,
1420120800, 1420124400, 1420128000, 1420131600, 1420135200, 1420138800,
1420142400, 1420146000, 1420149600, 1420153200, 1420156800, 1420160400,
1420164000, 1420167600, 1420171200, 1420174800, 1420178400, 1420182000,
1420185600, 1420189200, 1420192800, 1420196400, 1420200000, 1420203600,
1420207200, 1420210800, 1420214400, 1420218000, 1420221600, 1420225200,
1420228800, 1420232400, 1420236000, 1420239600), tzone = "UTC", class = c("POSIXct",
"POSIXt")), Tmin = c(3.33733696, 3.2377765, 2.83953466, 3.03865558,
2.7399742, 2.44129282, 2.34173236, 2.2421719, 2.34173236, 2.54085328,
2.7399742, 2.83953466, 3.33733696, 3.43689742, 3.53645788, 2.93909512,
2.7399742, 2.83953466, 2.83953466, 2.7399742, 2.7399742, 2.7399742,
2.83953466, 2.93909512, 3.03865558, 3.43689742, 3.7355788, 3.7355788,
3.43689742, 4.03426018, 4.7311834, 3.2377765, 3.33733696, 4.53206248,
4.93030432, 6.02546938, 7.02107398, 4.93030432, 4.7311834, 4.33294156,
4.7311834, 3.83513926, 4.83074386, 4.33294156, 3.83513926, 3.2377765,
2.93909512, 2.7399742), Tmax = c(3.77972493, 3.38212841, 3.18333015,
3.18333015, 2.98453189, 2.48753624, 2.38813711, 2.28873798, 2.38813711,
2.78573363, 2.78573363, 3.18333015, 3.48152754, 4.27672058, 4.17732145,
3.48152754, 2.88513276, 2.88513276, 2.88513276, 2.88513276, 2.78573363,
2.6863345, 3.18333015, 2.98453189, 3.28272928, 3.6803258, 3.97852319,
4.27672058, 3.87912406, 4.6743171, 4.87311536, 4.47551884, 4.47551884,
4.77371623, 5.76770753, 6.86109796, 7.85508926, 8.74968143, 5.37011101,
4.47551884, 6.16530405, 4.77371623, 4.97251449, 5.17131275, 4.87311536,
4.6743171, 3.18333015, 2.98453189), Tmean = c(3.62254694166667,
3.30742526, 3.00888893, 3.07523033666667, 2.87620611666667, 2.49474302833333,
2.39523091833333, 2.262548105, 2.37864556666667, 2.7103526, 2.74352330333333,
2.959132875, 3.37376666666667, 3.77181510666667, 3.854741865,
3.32401061166667, 2.80986471, 2.84303541333333, 2.82645006166667,
2.80986471, 2.760108655, 2.7103526, 2.92596217166667, 2.97571822666667,
3.09181568833333, 3.58937623833333, 3.82157116166667, 3.98742467833333,
3.63913229333333, 4.41864382166667, 4.83327761333333, 3.62254694166667,
4.08693678833333, 4.68400944833333, 5.33083816333333, 6.49181278,
7.56986063833333, 7.28790966, 5.06547253666667, 4.451814525,
5.71230125166667, 4.38547311833333, 4.849862965, 4.849862965,
4.50157058, 3.98742467833333, 2.99230357833333, 2.79327935833333
)), row.names = c(NA, -48L), class = c("tbl_df", "tbl", "data.frame"
))
因为我正好在等一个模型运行:
library(dplyr)

# Toy data: 75 random values, each tagged with a date drawn (with
# replacement) from January 2021, so most dates occur more than once.
d <- data.frame(
  date = sample(
    seq(as.Date("2021-01-01"), as.Date("2021-01-31"), by = "day"),
    75,
    replace = TRUE # spelled out: `T` is an ordinary variable and can be reassigned
  ),
  value = rnorm(75)
)

# Summarise to one row per date (its maximum `value`), then join that summary
# back onto the full data so every original row and column is kept.
d %>%
  left_join(
    d %>%
      group_by(date) %>%
      summarize(max = max(value)),
    # Name the key explicitly instead of letting dplyr guess it from the
    # shared column names (which also prints a "Joining with ..." message).
    by = "date"
  ) %>%
  arrange(date)
`dplyr` 库非常有用。它是 `tidyverse` 软件包的一部分,具有许多有用的数据管理功能。
我创建了一组数据来展示如何使用 `summarize()` 计算每个 `group_by()` 日期的 `max()` 值,然后用 `left_join()` 将汇总后的数据集连接回原始数据集。
编辑: 这是@dario 的建议:
# Attach each date's maximum `value` to every row of `d` as `max_val`.
# mutate() on grouped data keeps all rows and columns, so no join is needed.
d %>%
  group_by(date) %>%
  mutate(max_val = max(value)) %>%
  # Drop the grouping so that later verbs act on the whole table again.
  ungroup() %>%
  arrange(date)
更容易处理,不确定我是如何在第一轮中错过这个的!
编辑:
添加了日期时间到日期的转换:
使用 `mutate()`(而不是 `summarize()`)来计算每日最大值:
其中 `dd` 是问题中提供的数据结构:
library(dplyr)

# `dd` is the hourly data set from the question (columns DateTime, Tmin, Tmax,
# Tmean). Every row gets two extra columns: its calendar day (`dmy`) and the
# highest Tmax recorded on that day (`max_temp`). No rows or columns are lost.
dd_new <- dd %>%
  # Take the calendar day of each timestamp. DateTime carries tzone "UTC", so
  # days are cut at UTC midnight. The previous
  # as.POSIXct(as.Date(DateTime), "%d-%m-%Y") put a format string in the
  # position of the `tz` argument; depending on the R version it is ignored or
  # treated as an (invalid) time zone, and the day key was displayed shifted by
  # the local UTC offset.
  mutate(dmy = as.Date(DateTime, tz = "UTC")) %>%
  group_by(dmy) %>%
  mutate(max_temp = max(Tmax)) %>%
  ungroup() %>%
  as.data.frame()

# `dmy` is reported by str() as a Date column (e.g. "2015-01-01").
str(dd_new)
现在返回:
'data.frame': 48 obs. of 6 variables:
$ DateTime: POSIXct, format: "2015-01-01 00:00:00" "2015-01-01 01:00:00" "2015-01-01 02:00:00" ...
$ Tmin : num 3.34 3.24 2.84 3.04 2.74 ...
$ Tmax : num 3.78 3.38 3.18 3.18 2.98 ...
$ Tmean : num 3.62 3.31 3.01 3.08 2.88 ...
$ dmy : POSIXct, format: "2015-01-01 01:00:00" "2015-01-01 01:00:00" "2015-01-01 01:00:00" ...
$ max_temp: num 4.28 4.28 4.28 4.28 4.28 ...
我有一个 4 年的数据集,我想计算每天的最高温度(提供每小时 1 次测量)。如何将此温度添加到我的数据集中的附加列中?我不知道如何在不删除其他列的情况下执行此操作。
我的数据集是这样的:
# dput() output of the asker's sample data: a tibble with 48 rows
# (row.names = c(NA, -48L)) and four columns.
#   DateTime          - POSIXct timestamps with tzone "UTC", 3600 seconds apart
#   Tmin, Tmax, Tmean - numeric columns, one value per timestamp
# The expression is not assigned to a name here; the answer further down
# refers to this object as `dd`.
structure(list(DateTime = structure(c(1420070400, 1420074000,
1420077600, 1420081200, 1420084800, 1420088400, 1420092000, 1420095600,
1420099200, 1420102800, 1420106400, 1420110000, 1420113600, 1420117200,
1420120800, 1420124400, 1420128000, 1420131600, 1420135200, 1420138800,
1420142400, 1420146000, 1420149600, 1420153200, 1420156800, 1420160400,
1420164000, 1420167600, 1420171200, 1420174800, 1420178400, 1420182000,
1420185600, 1420189200, 1420192800, 1420196400, 1420200000, 1420203600,
1420207200, 1420210800, 1420214400, 1420218000, 1420221600, 1420225200,
1420228800, 1420232400, 1420236000, 1420239600), tzone = "UTC", class = c("POSIXct",
"POSIXt")), Tmin = c(3.33733696, 3.2377765, 2.83953466, 3.03865558,
2.7399742, 2.44129282, 2.34173236, 2.2421719, 2.34173236, 2.54085328,
2.7399742, 2.83953466, 3.33733696, 3.43689742, 3.53645788, 2.93909512,
2.7399742, 2.83953466, 2.83953466, 2.7399742, 2.7399742, 2.7399742,
2.83953466, 2.93909512, 3.03865558, 3.43689742, 3.7355788, 3.7355788,
3.43689742, 4.03426018, 4.7311834, 3.2377765, 3.33733696, 4.53206248,
4.93030432, 6.02546938, 7.02107398, 4.93030432, 4.7311834, 4.33294156,
4.7311834, 3.83513926, 4.83074386, 4.33294156, 3.83513926, 3.2377765,
2.93909512, 2.7399742), Tmax = c(3.77972493, 3.38212841, 3.18333015,
3.18333015, 2.98453189, 2.48753624, 2.38813711, 2.28873798, 2.38813711,
2.78573363, 2.78573363, 3.18333015, 3.48152754, 4.27672058, 4.17732145,
3.48152754, 2.88513276, 2.88513276, 2.88513276, 2.88513276, 2.78573363,
2.6863345, 3.18333015, 2.98453189, 3.28272928, 3.6803258, 3.97852319,
4.27672058, 3.87912406, 4.6743171, 4.87311536, 4.47551884, 4.47551884,
4.77371623, 5.76770753, 6.86109796, 7.85508926, 8.74968143, 5.37011101,
4.47551884, 6.16530405, 4.77371623, 4.97251449, 5.17131275, 4.87311536,
4.6743171, 3.18333015, 2.98453189), Tmean = c(3.62254694166667,
3.30742526, 3.00888893, 3.07523033666667, 2.87620611666667, 2.49474302833333,
2.39523091833333, 2.262548105, 2.37864556666667, 2.7103526, 2.74352330333333,
2.959132875, 3.37376666666667, 3.77181510666667, 3.854741865,
3.32401061166667, 2.80986471, 2.84303541333333, 2.82645006166667,
2.80986471, 2.760108655, 2.7103526, 2.92596217166667, 2.97571822666667,
3.09181568833333, 3.58937623833333, 3.82157116166667, 3.98742467833333,
3.63913229333333, 4.41864382166667, 4.83327761333333, 3.62254694166667,
4.08693678833333, 4.68400944833333, 5.33083816333333, 6.49181278,
7.56986063833333, 7.28790966, 5.06547253666667, 4.451814525,
5.71230125166667, 4.38547311833333, 4.849862965, 4.849862965,
4.50157058, 3.98742467833333, 2.99230357833333, 2.79327935833333
)), row.names = c(NA, -48L), class = c("tbl_df", "tbl", "data.frame"
))
因为我正好在等一个模型运行:
library(dplyr)

# Toy data: 75 random values, each tagged with a date drawn (with
# replacement) from January 2021, so most dates occur more than once.
d <- data.frame(
  date = sample(
    seq(as.Date("2021-01-01"), as.Date("2021-01-31"), by = "day"),
    75,
    replace = TRUE # spelled out: `T` is an ordinary variable and can be reassigned
  ),
  value = rnorm(75)
)

# Summarise to one row per date (its maximum `value`), then join that summary
# back onto the full data so every original row and column is kept.
d %>%
  left_join(
    d %>%
      group_by(date) %>%
      summarize(max = max(value)),
    # Name the key explicitly instead of letting dplyr guess it from the
    # shared column names (which also prints a "Joining with ..." message).
    by = "date"
  ) %>%
  arrange(date)
`dplyr` 库非常有用。它是 `tidyverse` 软件包的一部分,具有许多有用的数据管理功能。
我创建了一组数据来展示如何使用 `summarize()` 计算每个 `group_by()` 日期的 `max()` 值,然后用 `left_join()` 将汇总后的数据集连接回原始数据集。
编辑: 这是@dario 的建议:
# Attach each date's maximum `value` to every row of `d` as `max_val`.
# mutate() on grouped data keeps all rows and columns, so no join is needed.
d %>%
  group_by(date) %>%
  mutate(max_val = max(value)) %>%
  # Drop the grouping so that later verbs act on the whole table again.
  ungroup() %>%
  arrange(date)
更容易处理,不确定我是如何在第一轮中错过这个的!
编辑:
添加了日期时间到日期的转换:
使用 `mutate()`(而不是 `summarize()`)来计算每日最大值:
其中 `dd` 是问题中提供的数据结构:
library(dplyr)

# `dd` is the hourly data set from the question (columns DateTime, Tmin, Tmax,
# Tmean). Every row gets two extra columns: its calendar day (`dmy`) and the
# highest Tmax recorded on that day (`max_temp`). No rows or columns are lost.
dd_new <- dd %>%
  # Take the calendar day of each timestamp. DateTime carries tzone "UTC", so
  # days are cut at UTC midnight. The previous
  # as.POSIXct(as.Date(DateTime), "%d-%m-%Y") put a format string in the
  # position of the `tz` argument; depending on the R version it is ignored or
  # treated as an (invalid) time zone, and the day key was displayed shifted by
  # the local UTC offset.
  mutate(dmy = as.Date(DateTime, tz = "UTC")) %>%
  group_by(dmy) %>%
  mutate(max_temp = max(Tmax)) %>%
  ungroup() %>%
  as.data.frame()

# `dmy` is reported by str() as a Date column (e.g. "2015-01-01").
str(dd_new)
现在返回:
'data.frame': 48 obs. of 6 variables:
$ DateTime: POSIXct, format: "2015-01-01 00:00:00" "2015-01-01 01:00:00" "2015-01-01 02:00:00" ...
$ Tmin : num 3.34 3.24 2.84 3.04 2.74 ...
$ Tmax : num 3.78 3.38 3.18 3.18 2.98 ...
$ Tmean : num 3.62 3.31 3.01 3.08 2.88 ...
$ dmy : POSIXct, format: "2015-01-01 01:00:00" "2015-01-01 01:00:00" "2015-01-01 01:00:00" ...
$ max_temp: num 4.28 4.28 4.28 4.28 4.28 ...