How to count unique values over time in sparklyr (example given)?
I am trying to count the unique devices seen in the last 10 minutes based on a timestamp (rounded to the minute). I can do this in data.table, but I don't know how to replicate it with sparklyr in R. The 540 refers to the number of seconds added to the current timestamp.
An example is provided below to explain my problem.
Given data:
library(data.table)

df <- data.frame(device_subscriber_id = c("x","a","z","x","a","z","x","y","a","z"),
                 start_timestamp = c("2020-12-11 14:21:00","2020-12-11 14:22:00","2020-12-11 14:23:00",
                                     "2020-12-11 14:26:00","2020-12-11 14:24:00","2020-12-11 14:25:00",
                                     "2020-12-11 14:26:00","2020-12-11 14:28:00","2020-12-11 14:31:00","2020-12-11 14:38:00"))
df$start_timestamp <- as.POSIXct(df$start_timestamp, format = "%Y-%m-%d %H:%M:%S")
dt <- setDT(df)
Expected data:
# Non-equi self-join: for each row, match all rows within the preceding 540 seconds;
# adding 540 back in `by` restores the original timestamp after the join.
expected_dt <- dt[dt[, .(start_timestamp3 = start_timestamp, start_timestamp2 = start_timestamp - 540, device_subscriber_id)],
                  on = .(start_timestamp >= start_timestamp2, start_timestamp <= start_timestamp3),
                  allow.cartesian = TRUE][, .(unique_devices_seen = uniqueN(device_subscriber_id)), by = .(start_timestamp = start_timestamp + 540)]
expected_dt
start_timestamp unique_devices_seen
2020-12-11 14:21:00 1
2020-12-11 14:22:00 2
2020-12-11 14:23:00 3
2020-12-11 14:26:00 3
2020-12-11 14:24:00 3
2020-12-11 14:25:00 3
2020-12-11 14:28:00 4
2020-12-11 14:31:00 4
2020-12-11 14:38:00 2
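For reference, the same counts can be cross-checked with a simpler (if slower) per-timestamp lookup; a minimal sketch, assuming the dt defined above (check_dt is just an illustrative name):

# For each timestamp, count the distinct devices seen in the window
# [start_timestamp - 540, start_timestamp]; slower than the non-equi join,
# but easy to read and useful as a sanity check.
check_dt <- dt[, .(start_timestamp,
                   unique_devices_seen = sapply(start_timestamp, function(ts)
                     uniqueN(dt[start_timestamp >= ts - 540 &
                                start_timestamp <= ts, device_subscriber_id])))]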
If we compute this with SQL, we can query the Spark cluster through DBI:
library(data.table)
library(sparklyr)
sc <- spark_connect(master = "local")
copy_to(sc, dt)
sdf_sql(sc, "
SELECT COUNT(DISTINCT dt1.device_subscriber_id) as unique_devices_seen
, dt2.start_timestamp
FROM dt dt1
INNER JOIN dt dt2 ON dt1.start_timestamp >= dt2.start_timestamp - INTERVAL 9 minutes
AND dt1.start_timestamp <= dt2.start_timestamp
GROUP BY dt2.start_timestamp
ORDER BY start_timestamp
")
## # Source: spark<?> [?? x 2]
## unique_devices_seen start_timestamp
## <dbl> <dttm>
## 1 1 2020-12-11 19:21:00
## 2 2 2020-12-11 19:22:00
## 3 3 2020-12-11 19:23:00
## 4 3 2020-12-11 19:24:00
## 5 3 2020-12-11 19:25:00
## 6 3 2020-12-11 19:26:00
## 7 4 2020-12-11 19:28:00
## 8 4 2020-12-11 19:31:00
## 9 2 2020-12-11 19:38:00
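Since a sparklyr connection also implements the DBI interface, the same query can be sent with DBI::dbGetQuery(), which returns a local data.frame instead of a lazy Spark table; a minimal sketch, reusing the sc connection and the dt table copied above:

library(DBI)

# dbGetQuery() runs the SQL on the Spark connection and collects the result
# into a local data.frame (sdf_sql() instead returns a lazy tbl_spark).
res <- dbGetQuery(sc, "
  SELECT COUNT(DISTINCT dt1.device_subscriber_id) AS unique_devices_seen,
         dt2.start_timestamp
  FROM dt dt1
  INNER JOIN dt dt2
    ON  dt1.start_timestamp >= dt2.start_timestamp - INTERVAL 9 minutes
    AND dt1.start_timestamp <= dt2.start_timestamp
  GROUP BY dt2.start_timestamp
  ORDER BY start_timestamp")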
SQL seems like a good middle ground here; data.table logic translates quite well into SQL.
I suggest using the SQL window function OVER with a range between the current row and the preceding 540 seconds. count(distinct device_subscriber_id) leads to Error: org.apache.spark.sql.AnalysisException: Distinct window functions are not supported. The workaround is to collect a set of unique IDs and return the size of the array. The timestamps are converted to epoch seconds so that the range can be expressed in seconds.
library(sparklyr)
library(tidyverse)
sc <- spark_connect(master="local[4]", version = "3.0.1")
sdf <- copy_to(sc, df, name = "df", overwrite = TRUE)
sdf_sql(sc, "
SELECT
start_timestamp,
size(collect_set(device_subscriber_id)
OVER (ORDER BY start_ts_epoch ASC
RANGE BETWEEN 540 PRECEDING AND CURRENT ROW)) as unique_devices_seen
FROM (SELECT *, unix_timestamp(start_timestamp) as start_ts_epoch FROM `df`)")
Result:
# Source: spark<?> [?? x 2]
start_timestamp unique_devices_seen
<dttm> <int>
1 2020-12-11 13:21:00 1
2 2020-12-11 13:22:00 2
3 2020-12-11 13:23:00 3
4 2020-12-11 13:24:00 3
5 2020-12-11 13:25:00 3
6 2020-12-11 13:26:00 3
7 2020-12-11 13:26:00 3
8 2020-12-11 13:28:00 4
9 2020-12-11 13:31:00 4
10 2020-12-11 13:38:00 2
Reference: Spark SQL Window Functions API
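If the raw timestamps were not already rounded to the minute (as the question assumes), they could be truncated on the Spark side before applying the window; a minimal sketch, assuming Spark >= 2.3 where date_trunc is available (start_timestamp_min is just an illustrative column name):

# Same window as above, but applied to timestamps truncated to the minute.
sdf_sql(sc, "
SELECT
  start_timestamp_min,
  size(collect_set(device_subscriber_id)
    OVER (ORDER BY start_ts_epoch ASC
          RANGE BETWEEN 540 PRECEDING AND CURRENT ROW)) as unique_devices_seen
FROM (SELECT device_subscriber_id,
             date_trunc('MINUTE', start_timestamp) as start_timestamp_min,
             unix_timestamp(date_trunc('MINUTE', start_timestamp)) as start_ts_epoch
      FROM `df`)")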
Bonus: if you need the missing timestamps, you have to join the device data with a table containing all possible timestamps. Missing timestamps will have a NULL device ID and do not affect the count.
df_ts <- data.frame(start_timestamp=seq(min(df$start_timestamp), max(df$start_timestamp), by = "min"))
sdf_ts <- copy_to(sc, df_ts, name = "df_ts", overwrite = TRUE)
sdf_sql(sc, "
SELECT DISTINCT
start_timestamp
, size(collect_set(device_subscriber_id)
OVER (ORDER BY start_ts_epoch ASC
RANGE BETWEEN 540 PRECEDING AND CURRENT ROW)) as unique_devices_seen
, concat_ws(',', collect_set(device_subscriber_id)
OVER (ORDER BY start_ts_epoch ASC
RANGE BETWEEN 540 PRECEDING AND CURRENT ROW)) as unique_devices_seen_csv
FROM (SELECT
device_subscriber_id
, df_ts.start_timestamp
, unix_timestamp(df_ts.start_timestamp) as start_ts_epoch
FROM df
FULL JOIN df_ts ON (df.start_timestamp = df_ts.start_timestamp))") %>% print(n=30)
Note that I added unique_devices_seen_csv to show what is happening behind the scenes. It concatenates the device IDs of the sliding window.
Result:
# Source: spark<?> [?? x 3]
start_timestamp unique_devices_seen unique_devices_seen_csv
<dttm> <int> <chr>
1 2020-12-11 13:21:00 1 x
2 2020-12-11 13:22:00 2 x,a
3 2020-12-11 13:23:00 3 z,x,a
4 2020-12-11 13:24:00 3 z,x,a
5 2020-12-11 13:25:00 3 z,x,a
6 2020-12-11 13:26:00 3 z,x,a
7 2020-12-11 13:27:00 3 z,x,a
8 2020-12-11 13:28:00 4 z,y,x,a
9 2020-12-11 13:29:00 4 z,y,x,a
10 2020-12-11 13:30:00 4 z,y,x,a
11 2020-12-11 13:31:00 4 z,y,x,a
12 2020-12-11 13:32:00 4 z,y,x,a
13 2020-12-11 13:33:00 4 z,y,x,a
14 2020-12-11 13:34:00 4 z,y,x,a
15 2020-12-11 13:35:00 3 y,x,a
16 2020-12-11 13:36:00 2 y,a
17 2020-12-11 13:37:00 2 y,a
18 2020-12-11 13:38:00 2 z,a
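As a final note, the helper timestamp table does not have to be built in R and copied over with copy_to(); it could also be generated on the Spark side, assuming Spark >= 2.4 where sequence() supports timestamps with an interval step. A minimal sketch with the bounds hard-coded to match the example data:

# One row per minute between the first and last observed timestamp,
# generated directly in Spark instead of copying df_ts from R.
sdf_sql(sc, "
SELECT explode(sequence(to_timestamp('2020-12-11 14:21:00'),
                        to_timestamp('2020-12-11 14:38:00'),
                        interval 1 minute)) as start_timestamp")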