基于一天中的时间的密度图
Density plot based on time of the day
我有以下数据集:
https://app.box.com/s/au58xaw60r1hyeek5cua6q20byumgvmj
我想根据一天中的时间创建密度图。这是我到目前为止所做的:
library("ggplot2")
library("scales")
library("lubridate")
# Keep only the clock time of each timestamp; format() returns character
# strings such as "13:45:00", not a date-time object.
# NOTE(review): the values are read from `hn_tweets` but stored in
# `timestamp_df` -- confirm the two data frames have matching rows.
timestamp_df$timestamp_time <- format(ymd_hms(hn_tweets$timestamp), "%H:%M:%S")
# Density over the time of day, with an axis break every two hours.
# scale_x_datetime() expects POSIXct input, while `timestamp_time` is
# character here.
ggplot(timestamp_df, aes(timestamp_time)) +
geom_density(aes(fill = ..count..)) +
scale_x_datetime(breaks = date_breaks("2 hours"),labels=date_format("%H:%M"))
它给出了以下错误:
Error: Invalid input: time_trans works with objects of class POSIXct only
如果我将其转换为 POSIXct,它会将日期添加到数据中。
更新 1
以下代码会把数据转换为 'NA':
# NOTE(review): the format string contains a stray "%" ("%M%:%S"), which is
# the likely reason the values parse to NA; the call is also missing its
# closing parenthesis.
timestamp_df$timestamp_time <- as.POSIXct(timestamp_df$timestamp_time, format = "%H:%M%:%S", tz = "UTC"
更新 2
以下是我想要实现的:
这是一种方法:
library(ggplot2)
library(lubridate)
library(scales)
# Read the dataset linked in the question.
df <- read.csv(file = "data.csv")
将字符转换为 POSIXct
# Parse the "month/day/year hour:minute" strings as UTC date-times.
df$timestamp <- as.POSIXct(
  strptime(df$timestamp, format = "%m/%d/%Y %H:%M", tz = "UTC")
)
library(hms)
提取小时和分钟:
# Rebuild each timestamp as a time-of-day value from its second, minute and
# hour components only, so the date part is left out.
df$time <- hms::hms(
  seconds = second(df$timestamp),
  minutes = minute(df$timestamp),
  hours = hour(df$timestamp)
)
再次转换为 POSIXct,因为 ggplot 不适用于 hms 类。
# Convert the hms column to POSIXct so it can be used with scale_x_datetime().
df$time <- as.POSIXct(df$time)
# Density of the observations over the day, with an axis break every two hours.
ggplot(data = df, mapping = aes(x = time)) +
  geom_density(fill = "red", alpha = 0.5) + # also try e.g. adjust = 0.5
  scale_x_datetime(
    breaks = date_breaks("2 hours"),
    labels = date_format("%H:%M")
  )
要将密度缩放到最大值为 1:
# Same plot, but using the `scaled` variable computed by the density stat
# for the y axis instead of the raw density.
ggplot(data = df) +
  geom_density(
    mapping = aes(x = time, y = ..scaled..),
    fill = "red",
    alpha = 0.5
  ) +
  scale_x_datetime(
    breaks = date_breaks("2 hours"),
    labels = date_format("%H:%M")
  )
其中 ..scaled.. 是在绘图创建期间为 stat_density 计算的变量。
这里发布的解决方案有一个问题:它们忽略了这类数据是循环(circular/polar)数据的事实(即 00hrs == 24hrs)。您可以在另一个答案的图表上看到,图表的两端并不吻合。这对这个特定的数据集影响不大,但对于发生在午夜附近的事件,可能会得到偏差极大的密度估计。这是我的解决方案,它考虑了时间数据的循环性质:
# modified code from https://freakonometrics.hypotheses.org/2239
library(dplyr)
library(ggplot2)
library(lubridate)
library(circular)
# Read the raw timestamps and parse them into date-times.
# NOTE(review): the order string uses a lower-case `%h`; confirm lubridate
# reads it as the hour for this data.
df <- read.csv("data.csv")
datetimes <- lubridate::parse_date_time(df$timestamp, "%m/%d/%Y %h:%M")

# Express each time of day as fractional hours, then as an angle in radians
# (a full day corresponds to 2 * pi).
times_in_decimal <- lubridate::hour(datetimes) +
  lubridate::minute(datetimes) / 60
times_in_radians <- 2 * pi * (times_in_decimal / 24)

# Doing this just for bandwidth estimation:
basic_dens <- density(times_in_radians, from = 0, to = 2 * pi)

# Circular kernel density estimate with a wrapped-normal kernel, reusing the
# bandwidth obtained above.
res <- circular::density.circular(
  circular::circular(
    times_in_radians,
    type = "angle",
    units = "radians",
    rotation = "clock"
  ),
  kernel = "wrappednormal",
  bw = basic_dens$bw
)

time_pdf <- data.frame(
  # Convert from radians back to 24h clock
  time = as.numeric(24 * (2 * pi + res$x) / (2 * pi)),
  likelihood = res$y
)

# Area plot of the estimated density, with one axis break per hour.
p <- ggplot(time_pdf) +
  geom_area(aes(x = time, y = likelihood), fill = "#619CFF") +
  scale_x_continuous(name = "Hour of Day", breaks = 0:24, labels = 0:24) +
  scale_y_continuous(name = "Likelihood of Data") +
  theme_classic()
请注意,密度图的值和斜率在 00h 和 24h 点匹配。
我有以下数据集:
https://app.box.com/s/au58xaw60r1hyeek5cua6q20byumgvmj
我想根据一天中的时间创建密度图。这是我到目前为止所做的:
library("ggplot2")
library("scales")
library("lubridate")
# Keep only the clock time of each timestamp; format() returns character
# strings such as "13:45:00", not a date-time object.
# NOTE(review): the values are read from `hn_tweets` but stored in
# `timestamp_df` -- confirm the two data frames have matching rows.
timestamp_df$timestamp_time <- format(ymd_hms(hn_tweets$timestamp), "%H:%M:%S")
# Density over the time of day, with an axis break every two hours.
# scale_x_datetime() expects POSIXct input, while `timestamp_time` is
# character here.
ggplot(timestamp_df, aes(timestamp_time)) +
geom_density(aes(fill = ..count..)) +
scale_x_datetime(breaks = date_breaks("2 hours"),labels=date_format("%H:%M"))
它给出了以下错误:
Error: Invalid input: time_trans works with objects of class POSIXct only
如果我将其转换为 POSIXct,它会将日期添加到数据中。
更新 1
以下代码会把数据转换为 'NA':
# NOTE(review): the format string contains a stray "%" ("%M%:%S"), which is
# the likely reason the values parse to NA; the call is also missing its
# closing parenthesis.
timestamp_df$timestamp_time <- as.POSIXct(timestamp_df$timestamp_time, format = "%H:%M%:%S", tz = "UTC"
更新 2
以下是我想要实现的:
这是一种方法:
library(ggplot2)
library(lubridate)
library(scales)
# Read the dataset linked in the question.
df <- read.csv(file = "data.csv")
将字符转换为 POSIXct
# Parse the "month/day/year hour:minute" strings as UTC date-times.
df$timestamp <- as.POSIXct(
  strptime(df$timestamp, format = "%m/%d/%Y %H:%M", tz = "UTC")
)
library(hms)
提取小时和分钟:
# Rebuild each timestamp as a time-of-day value from its second, minute and
# hour components only, so the date part is left out.
df$time <- hms::hms(
  seconds = second(df$timestamp),
  minutes = minute(df$timestamp),
  hours = hour(df$timestamp)
)
再次转换为 POSIXct,因为 ggplot 不适用于 hms 类。
# Convert the hms column to POSIXct so it can be used with scale_x_datetime().
df$time <- as.POSIXct(df$time)
# Density of the observations over the day, with an axis break every two hours.
ggplot(data = df, mapping = aes(x = time)) +
  geom_density(fill = "red", alpha = 0.5) + # also try e.g. adjust = 0.5
  scale_x_datetime(
    breaks = date_breaks("2 hours"),
    labels = date_format("%H:%M")
  )
要将密度缩放到最大值为 1:
# Same plot, but using the `scaled` variable computed by the density stat
# for the y axis instead of the raw density.
ggplot(data = df) +
  geom_density(
    mapping = aes(x = time, y = ..scaled..),
    fill = "red",
    alpha = 0.5
  ) +
  scale_x_datetime(
    breaks = date_breaks("2 hours"),
    labels = date_format("%H:%M")
  )
其中 ..scaled.. 是在绘图创建期间为 stat_density 计算的变量。
这里发布的解决方案有一个问题:它们忽略了这类数据是循环(circular/polar)数据的事实(即 00hrs == 24hrs)。您可以在另一个答案的图表上看到,图表的两端并不吻合。这对这个特定的数据集影响不大,但对于发生在午夜附近的事件,可能会得到偏差极大的密度估计。这是我的解决方案,它考虑了时间数据的循环性质:
# modified code from https://freakonometrics.hypotheses.org/2239
library(dplyr)
library(ggplot2)
library(lubridate)
library(circular)
# Read the raw timestamps and parse them into date-times.
# NOTE(review): the order string uses a lower-case `%h`; confirm lubridate
# reads it as the hour for this data.
df <- read.csv("data.csv")
datetimes <- lubridate::parse_date_time(df$timestamp, "%m/%d/%Y %h:%M")

# Express each time of day as fractional hours, then as an angle in radians
# (a full day corresponds to 2 * pi).
times_in_decimal <- lubridate::hour(datetimes) +
  lubridate::minute(datetimes) / 60
times_in_radians <- 2 * pi * (times_in_decimal / 24)

# Doing this just for bandwidth estimation:
basic_dens <- density(times_in_radians, from = 0, to = 2 * pi)

# Circular kernel density estimate with a wrapped-normal kernel, reusing the
# bandwidth obtained above.
res <- circular::density.circular(
  circular::circular(
    times_in_radians,
    type = "angle",
    units = "radians",
    rotation = "clock"
  ),
  kernel = "wrappednormal",
  bw = basic_dens$bw
)

time_pdf <- data.frame(
  # Convert from radians back to 24h clock
  time = as.numeric(24 * (2 * pi + res$x) / (2 * pi)),
  likelihood = res$y
)

# Area plot of the estimated density, with one axis break per hour.
p <- ggplot(time_pdf) +
  geom_area(aes(x = time, y = likelihood), fill = "#619CFF") +
  scale_x_continuous(name = "Hour of Day", breaks = 0:24, labels = 0:24) +
  scale_y_continuous(name = "Likelihood of Data") +
  theme_classic()
请注意,密度图的值和斜率在 00h 和 24h 点匹配。