在 R 中有条件地按组计算事件之间的平均时间
Conditionally calculating average time between events by group in R
我正在处理来自电话热线服务的通话记录数据集。通话结果分为三种:已接听、已放弃和已接听。我试图找出每个来电者在放弃上一个电话后再次联系热线所花费的平均时间。时差可以是秒、分钟、小时或天,但如果可能的话我想得到所有四个。
这是一些模拟数据,其中包含我正在使用的变量:-
library(wakefield)#for generating the Status variable
library(dplyr)
library(stringi)
library(Pareto)
library(uuid)
n_users<-1300
n_rows <- 365000
set.seed(1)
#data<-data.frame()
Date<-seq(as.Date("2015-01-01"), as.Date("2015-12-31"), by = "1 day")
Date<-sample(rep(Date,each=1000),replace = T)
u <- runif(length(Date), 0, 60*60*12) # "noise" to add or subtract from some timepoint
CallDateTime<-as.POSIXlt(u, origin = paste0(Date,"00:00:00"))
CallDateTime
CallOutcome<-r_sample_factor(x = c("Answered", "Abandoned", "Engaged"), n=length(Date))
CallOutcome
data<-data.frame(Date,CallDateTime,CallOutcome)
relative_probs <- rPareto(n = n_users, t = 1, alpha = 0.3, truncation = 500)
unique_ids <- UUIDgenerate(n = n_users)
data$CallerId <- sample(unique_ids, size = n_rows, prob = relative_probs, replace = TRUE)
data<-data%>%arrange(CallDateTime)
head(data)
所以重申一下,如果呼叫者放弃了他们的呼叫(在 CallOutcome 列中用“放弃”表示),我想知道呼叫者再次调用该服务所花费的平均时间,在四个我提到的时间单位。任何关于我如何实现这一点的指示都会很棒 :)
在数据中保留当前行为 "Abandoned"
而下一行不是每个 ID
的行。找出每两行之间的时间差异,以获得调用者在放弃后再次调用服务所需的时间,取每个持续时间的平均值以获得平均时间。
library(dplyr)
data %>%
#Test the answer on smaller subset
#slice(1:1000) %>%
arrange(CallerId, CallDateTime) %>%
group_by(CallerId) %>%
filter(CallOutcome == 'Abandoned' & dplyr::lead(CallOutcome) != 'Abandoned' |
CallOutcome != 'Abandoned' & dplyr::lag(CallOutcome) == 'Abandoned') %>%
mutate(group = rep(row_number(), each = 2, length.out = n())) %>%
group_by(group, .add = TRUE) %>%
summarise(avg_sec = difftime(CallDateTime[2], CallDateTime[1], units = 'secs')) %>%
mutate(avg_sec = as.numeric(mean(avg_sec)),
avg_min = avg_sec/60,
avg_hour = avg_min/60,
avg_day = avg_hour/24) -> result
result
首先,我将创建引导变量(基本上按组计算“下一个”值是什么。然后就可以简单地为 difftime
使用您想要的任何单位。密度图可以帮助您分析这些差异,如下图
data <-
data %>%
group_by(CallerId) %>%
mutate(CallDateTime_Next = lead(CallDateTime)) %>%
ungroup() %>%
mutate(
diff_days = difftime(CallDateTime_Next, CallDateTime, units = 'days'),
diff_hours = difftime(CallDateTime_Next, CallDateTime, units = 'hours'),
diff_mins = difftime(CallDateTime_Next, CallDateTime, units = 'mins'),
diff_secs = difftime(CallDateTime_Next, CallDateTime, units = 'secs')
)
data %>%
filter(CallOutcome == 'Abandoned') %>%
ggplot() +
geom_density(aes(x = diff_days))
我正在处理来自电话热线服务的通话记录数据集。通话结果分为三种:已接听、已放弃和已接听。我试图找出每个来电者在放弃上一个电话后再次联系热线所花费的平均时间。时差可以是秒、分钟、小时或天,但如果可能的话我想得到所有四个。
这是一些模拟数据,其中包含我正在使用的变量:-
library(wakefield)#for generating the Status variable
library(dplyr)
library(stringi)
library(Pareto)
library(uuid)
n_users<-1300
n_rows <- 365000
set.seed(1)
#data<-data.frame()
Date<-seq(as.Date("2015-01-01"), as.Date("2015-12-31"), by = "1 day")
Date<-sample(rep(Date,each=1000),replace = T)
u <- runif(length(Date), 0, 60*60*12) # "noise" to add or subtract from some timepoint
CallDateTime<-as.POSIXlt(u, origin = paste0(Date,"00:00:00"))
CallDateTime
CallOutcome<-r_sample_factor(x = c("Answered", "Abandoned", "Engaged"), n=length(Date))
CallOutcome
data<-data.frame(Date,CallDateTime,CallOutcome)
relative_probs <- rPareto(n = n_users, t = 1, alpha = 0.3, truncation = 500)
unique_ids <- UUIDgenerate(n = n_users)
data$CallerId <- sample(unique_ids, size = n_rows, prob = relative_probs, replace = TRUE)
data<-data%>%arrange(CallDateTime)
head(data)
所以重申一下,如果呼叫者放弃了他们的呼叫(在 CallOutcome 列中用“放弃”表示),我想知道呼叫者再次调用该服务所花费的平均时间,在四个我提到的时间单位。任何关于我如何实现这一点的指示都会很棒 :)
在数据中保留当前行为 "Abandoned"
而下一行不是每个 ID
的行。找出每两行之间的时间差异,以获得调用者在放弃后再次调用服务所需的时间,取每个持续时间的平均值以获得平均时间。
library(dplyr)
data %>%
#Test the answer on smaller subset
#slice(1:1000) %>%
arrange(CallerId, CallDateTime) %>%
group_by(CallerId) %>%
filter(CallOutcome == 'Abandoned' & dplyr::lead(CallOutcome) != 'Abandoned' |
CallOutcome != 'Abandoned' & dplyr::lag(CallOutcome) == 'Abandoned') %>%
mutate(group = rep(row_number(), each = 2, length.out = n())) %>%
group_by(group, .add = TRUE) %>%
summarise(avg_sec = difftime(CallDateTime[2], CallDateTime[1], units = 'secs')) %>%
mutate(avg_sec = as.numeric(mean(avg_sec)),
avg_min = avg_sec/60,
avg_hour = avg_min/60,
avg_day = avg_hour/24) -> result
result
首先,我将创建引导变量(基本上按组计算“下一个”值是什么。然后就可以简单地为 difftime
使用您想要的任何单位。密度图可以帮助您分析这些差异,如下图
data <-
data %>%
group_by(CallerId) %>%
mutate(CallDateTime_Next = lead(CallDateTime)) %>%
ungroup() %>%
mutate(
diff_days = difftime(CallDateTime_Next, CallDateTime, units = 'days'),
diff_hours = difftime(CallDateTime_Next, CallDateTime, units = 'hours'),
diff_mins = difftime(CallDateTime_Next, CallDateTime, units = 'mins'),
diff_secs = difftime(CallDateTime_Next, CallDateTime, units = 'secs')
)
data %>%
filter(CallOutcome == 'Abandoned') %>%
ggplot() +
geom_density(aes(x = diff_days))