使用 2 个表制作新功能
Make new feature using 2 tables
table1 <- data.frame(user_id=c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2),
product_id = c(14, 24, 38, 40, 66, 2, 19, 30, 71, 98, 7, 16),
first_order = c(1, 2, 1, 4, 5, 3, 2, 4, 2, 4, 2, 3),
last_order = c(4, 7, 5, 8, 8, 3, 4, 7, 5, 9, 4, 5))
table2 <- data.frame(user_id=c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2),
order_number=c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6),
days_cumsum = c(0, 7, 15, 26, 34, 43, 53, 59, 66, 74, 82, 91, 5, 11, 17, 24, 29, 35))
我想使用 table2 向 table1 添加新功能。
新特性是每个用户对每个产品的订购间隔。
例如,我们来看table1。第一行有 (user_id==1), (product_id==14), (first order==1) and (last order ==4).这意味着产品 14 是在订单 1、2、3、4 订购的。我们可以在 table 2 中找到此订单号。
新功能是关于第一个订单和最后一个订单之间的订单间隔。我们可以在 table 2 中推导出它,使用 "days_cumsum" 这是先前订单后天数的累计总和。第一行的新特征值为 26(=26-0)。
我觉得可以用join来完成,但是我不能用join因为每个table都非常实际上很大。
所以我在下面将这个函数与 for 循环一起使用:
f <- function(i){
a <- table2 %>%
filter(user_id==table1[i, 1]) %>%
filter(order_number==table1[i, 3] | order_number==table1[i, 4])
ifelse(nrow(a)==2, a[2, 3] - a[1, 3], 999999) # first_order==last_order
}
逐行计算每一个新的特征值,但是速度很慢,需要大量的计算。我经常遇到这个问题(make new feature using two tables),但每次都遇到困难。
有更好的代码吗?我在等你的帮助。
您能否分享 join
和使用循环的版本的 runtime/computation 时间比较?
下面是一个使用连接的解决方案。
library(tidyverse)
df1 <- as.data.frame(table1)
df2 <- as.data.frame(table2)
df1 %>%
left_join(df2, by = c("user_id"="user_id", "first_order" = "order_number")) %>%
rename(dayMin = days_cumsum) %>%
left_join(df2, by = c("user_id"="user_id", "last_order" = "order_number")) %>%
rename(dayMax = days_cumsum) %>%
mutate(newVar = dayMax-dayMin) %>%
select(user_id, product_id, first_order, last_order, newVar)
给出:
user_id product_id first_order last_order newVar
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 14 1 4 26
2 1 24 2 7 46
3 1 38 1 5 34
4 1 40 4 8 33
5 1 66 5 8 25
6 1 2 3 3 0
7 1 19 2 4 19
8 1 30 4 7 27
9 1 71 2 5 27
10 1 98 4 9 40
11 2 7 2 4 13
12 2 16 3 5 12
为了比较,一些解决方案使用data.table
。
table1 <- data.frame(user_id=c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2),
product_id = c(14, 24, 38, 40, 66, 2, 19, 30, 71, 98, 7, 16),
first_order = c(1, 2, 1, 4, 5, 3, 2, 4, 2, 4, 2, 3),
last_order = c(4, 7, 5, 8, 8, 3, 4, 7, 5, 9, 4, 5))
table2 <- data.frame(user_id=c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2),
order_number=c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6),
days_cumsum = c(0, 7, 15, 26, 34, 43, 53, 59, 66, 74, 82, 91, 5, 11, 17, 24, 29, 35))
library(data.table)
setDT(table1)
setDT(table2)
table1
#> user_id product_id first_order last_order
#> 1: 1 14 1 4
#> 2: 1 24 2 7
#> 3: 1 38 1 5
#> 4: 1 40 4 8
#> 5: 1 66 5 8
#> 6: 1 2 3 3
#> 7: 1 19 2 4
#> 8: 1 30 4 7
#> 9: 1 71 2 5
#> 10: 1 98 4 9
#> 11: 2 7 2 4
#> 12: 2 16 3 5
table2
#> user_id order_number days_cumsum
#> 1: 1 1 0
#> 2: 1 2 7
#> 3: 1 3 15
#> 4: 1 4 26
#> 5: 1 5 34
#> 6: 1 6 43
#> 7: 1 7 53
#> 8: 1 8 59
#> 9: 1 9 66
#> 10: 1 10 74
#> 11: 1 11 82
#> 12: 1 12 91
#> 13: 2 1 5
#> 14: 2 2 11
#> 15: 2 3 17
#> 16: 2 4 24
#> 17: 2 5 29
#> 18: 2 6 35
DayMin <- table1[table2, on = .(user_id, first_order = order_number), nomatch = 0]
setnames(DayMin, "days_cumsum", "dayMin")
DayMax <- table1[table2, on = .(user_id, last_order = order_number), nomatch = 0]
setnames(DayMax, "days_cumsum", "dayMax")
res <- DayMin[DayMax, on = .(user_id, product_id, first_order, last_order)]
# calculate diff and delete column
res[, c("diff", "dayMax", "dayMin") := list(dayMax - dayMin, NULL, NULL)]
res[]
#> user_id product_id first_order last_order diff
#> 1: 1 2 3 3 0
#> 2: 1 14 1 4 26
#> 3: 1 19 2 4 19
#> 4: 1 38 1 5 34
#> 5: 1 71 2 5 27
#> 6: 1 24 2 7 46
#> 7: 1 30 4 7 27
#> 8: 1 40 4 8 33
#> 9: 1 66 5 8 25
#> 10: 1 98 4 9 40
#> 11: 2 7 2 4 13
#> 12: 2 16 3 5 12
"piped-like" 未重命名的版本
table1[table2, on = .(user_id, first_order = order_number), nomatch = 0][
table2, on = .(user_id , last_order = order_number), nomatch = 0][
, `:=`(
diff = i.days_cumsum - days_cumsum,
days_cumsum = NULL,
i.days_cumsum = NULL
)][]
#> user_id product_id first_order last_order diff
#> 1: 1 2 3 3 0
#> 2: 1 14 1 4 26
#> 3: 1 19 2 4 19
#> 4: 1 38 1 5 34
#> 5: 1 71 2 5 27
#> 6: 1 24 2 7 46
#> 7: 1 30 4 7 27
#> 8: 1 40 4 8 33
#> 9: 1 66 5 8 25
#> 10: 1 98 4 9 40
#> 11: 2 7 2 4 13
#> 12: 2 16 3 5 12
仅对一次合并使用整形
tab <- melt(table1, id = 1:2, value.name = "order_number")[table2, on = .(user_id, order_number), nomatch = 0]
res <- dcast(tab, user_id + product_id ~ variable, value.var = c("order_number", "days_cumsum"), sep = "#")
setnames(res, c("user_id", "product_id", "first_order", "last_order", "dayMin", "dayMax"))
res[, c("diff", "dayMax", "dayMin") := list(dayMax - dayMin, NULL, NULL)]
res
#> user_id product_id first_order last_order diff
#> 1: 1 2 3 3 0
#> 2: 1 14 1 4 26
#> 3: 1 19 2 4 19
#> 4: 1 24 2 7 46
#> 5: 1 30 4 7 27
#> 6: 1 38 1 5 34
#> 7: 1 40 4 8 33
#> 8: 1 66 5 8 25
#> 9: 1 71 2 5 27
#> 10: 1 98 4 9 40
#> 11: 2 7 2 4 13
#> 12: 2 16 3 5 12
table1 <- data.frame(user_id=c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2),
product_id = c(14, 24, 38, 40, 66, 2, 19, 30, 71, 98, 7, 16),
first_order = c(1, 2, 1, 4, 5, 3, 2, 4, 2, 4, 2, 3),
last_order = c(4, 7, 5, 8, 8, 3, 4, 7, 5, 9, 4, 5))
table2 <- data.frame(user_id=c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2),
order_number=c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6),
days_cumsum = c(0, 7, 15, 26, 34, 43, 53, 59, 66, 74, 82, 91, 5, 11, 17, 24, 29, 35))
我想使用 table2 向 table1 添加新功能。 新特性是每个用户对每个产品的订购间隔。
例如,我们来看table1。第一行有 (user_id==1), (product_id==14), (first order==1) and (last order ==4).这意味着产品 14 是在订单 1、2、3、4 订购的。我们可以在 table 2 中找到此订单号。 新功能是关于第一个订单和最后一个订单之间的订单间隔。我们可以在 table 2 中推导出它,使用 "days_cumsum" 这是先前订单后天数的累计总和。第一行的新特征值为 26(=26-0)。
我觉得可以用join来完成,但是我不能用join因为每个table都非常实际上很大。
所以我在下面将这个函数与 for 循环一起使用:
f <- function(i){
a <- table2 %>%
filter(user_id==table1[i, 1]) %>%
filter(order_number==table1[i, 3] | order_number==table1[i, 4])
ifelse(nrow(a)==2, a[2, 3] - a[1, 3], 999999) # first_order==last_order
}
逐行计算每一个新的特征值,但是速度很慢,需要大量的计算。我经常遇到这个问题(make new feature using two tables),但每次都遇到困难。
有更好的代码吗?我在等你的帮助。
您能否分享 join
和使用循环的版本的 runtime/computation 时间比较?
下面是一个使用连接的解决方案。
library(tidyverse)
df1 <- as.data.frame(table1)
df2 <- as.data.frame(table2)
df1 %>%
left_join(df2, by = c("user_id"="user_id", "first_order" = "order_number")) %>%
rename(dayMin = days_cumsum) %>%
left_join(df2, by = c("user_id"="user_id", "last_order" = "order_number")) %>%
rename(dayMax = days_cumsum) %>%
mutate(newVar = dayMax-dayMin) %>%
select(user_id, product_id, first_order, last_order, newVar)
给出:
user_id product_id first_order last_order newVar
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 14 1 4 26
2 1 24 2 7 46
3 1 38 1 5 34
4 1 40 4 8 33
5 1 66 5 8 25
6 1 2 3 3 0
7 1 19 2 4 19
8 1 30 4 7 27
9 1 71 2 5 27
10 1 98 4 9 40
11 2 7 2 4 13
12 2 16 3 5 12
为了比较,一些解决方案使用data.table
。
table1 <- data.frame(user_id=c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2),
product_id = c(14, 24, 38, 40, 66, 2, 19, 30, 71, 98, 7, 16),
first_order = c(1, 2, 1, 4, 5, 3, 2, 4, 2, 4, 2, 3),
last_order = c(4, 7, 5, 8, 8, 3, 4, 7, 5, 9, 4, 5))
table2 <- data.frame(user_id=c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2),
order_number=c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6),
days_cumsum = c(0, 7, 15, 26, 34, 43, 53, 59, 66, 74, 82, 91, 5, 11, 17, 24, 29, 35))
library(data.table)
setDT(table1)
setDT(table2)
table1
#> user_id product_id first_order last_order
#> 1: 1 14 1 4
#> 2: 1 24 2 7
#> 3: 1 38 1 5
#> 4: 1 40 4 8
#> 5: 1 66 5 8
#> 6: 1 2 3 3
#> 7: 1 19 2 4
#> 8: 1 30 4 7
#> 9: 1 71 2 5
#> 10: 1 98 4 9
#> 11: 2 7 2 4
#> 12: 2 16 3 5
table2
#> user_id order_number days_cumsum
#> 1: 1 1 0
#> 2: 1 2 7
#> 3: 1 3 15
#> 4: 1 4 26
#> 5: 1 5 34
#> 6: 1 6 43
#> 7: 1 7 53
#> 8: 1 8 59
#> 9: 1 9 66
#> 10: 1 10 74
#> 11: 1 11 82
#> 12: 1 12 91
#> 13: 2 1 5
#> 14: 2 2 11
#> 15: 2 3 17
#> 16: 2 4 24
#> 17: 2 5 29
#> 18: 2 6 35
DayMin <- table1[table2, on = .(user_id, first_order = order_number), nomatch = 0]
setnames(DayMin, "days_cumsum", "dayMin")
DayMax <- table1[table2, on = .(user_id, last_order = order_number), nomatch = 0]
setnames(DayMax, "days_cumsum", "dayMax")
res <- DayMin[DayMax, on = .(user_id, product_id, first_order, last_order)]
# calculate diff and delete column
res[, c("diff", "dayMax", "dayMin") := list(dayMax - dayMin, NULL, NULL)]
res[]
#> user_id product_id first_order last_order diff
#> 1: 1 2 3 3 0
#> 2: 1 14 1 4 26
#> 3: 1 19 2 4 19
#> 4: 1 38 1 5 34
#> 5: 1 71 2 5 27
#> 6: 1 24 2 7 46
#> 7: 1 30 4 7 27
#> 8: 1 40 4 8 33
#> 9: 1 66 5 8 25
#> 10: 1 98 4 9 40
#> 11: 2 7 2 4 13
#> 12: 2 16 3 5 12
"piped-like" 未重命名的版本
table1[table2, on = .(user_id, first_order = order_number), nomatch = 0][
table2, on = .(user_id , last_order = order_number), nomatch = 0][
, `:=`(
diff = i.days_cumsum - days_cumsum,
days_cumsum = NULL,
i.days_cumsum = NULL
)][]
#> user_id product_id first_order last_order diff
#> 1: 1 2 3 3 0
#> 2: 1 14 1 4 26
#> 3: 1 19 2 4 19
#> 4: 1 38 1 5 34
#> 5: 1 71 2 5 27
#> 6: 1 24 2 7 46
#> 7: 1 30 4 7 27
#> 8: 1 40 4 8 33
#> 9: 1 66 5 8 25
#> 10: 1 98 4 9 40
#> 11: 2 7 2 4 13
#> 12: 2 16 3 5 12
仅对一次合并使用整形
tab <- melt(table1, id = 1:2, value.name = "order_number")[table2, on = .(user_id, order_number), nomatch = 0]
res <- dcast(tab, user_id + product_id ~ variable, value.var = c("order_number", "days_cumsum"), sep = "#")
setnames(res, c("user_id", "product_id", "first_order", "last_order", "dayMin", "dayMax"))
res[, c("diff", "dayMax", "dayMin") := list(dayMax - dayMin, NULL, NULL)]
res
#> user_id product_id first_order last_order diff
#> 1: 1 2 3 3 0
#> 2: 1 14 1 4 26
#> 3: 1 19 2 4 19
#> 4: 1 24 2 7 46
#> 5: 1 30 4 7 27
#> 6: 1 38 1 5 34
#> 7: 1 40 4 8 33
#> 8: 1 66 5 8 25
#> 9: 1 71 2 5 27
#> 10: 1 98 4 9 40
#> 11: 2 7 2 4 13
#> 12: 2 16 3 5 12