如何在 sparklyr 包中运行 FPGrowth
how to run FPGrowth in sparklyr package
我有数据“li”,想对它运行 FPGrowth 算法,但不知道该怎么做:
set.seed(123)
# make fake data
li <- list()
for(i in 1:10) li[[i]] <- make.unique(letters[sample(1:26,sample(5:20,1),rep = T)])
require(sparklyr)
sc <- spark_connect(master = "local",version = "3.0.1")
df <- copy_to(sc, **....??????what should be here??????...** )
fp_growth_model <- ml_fpgrowth(df)
有一个类似问题的答案,但它对我不起作用,运行时出现了以下错误:
sc <- spark_connect(master = "local", version = "2.3")
tb <- tibble::tibble(items=c("a b c", "a b", "c f g", "b c"))
df <- copy_to(sc, tb) %>%
mutate(items = split(items, "\\s+"))
Error in mutate(., items = split(items, "\\s+")) :
could not find function "mutate"
/// plyr::mutate
df <- copy_to(sc, tb) %>%
plyr::mutate(items = split(items, "\\s+"))
Error in sdf_import.default(x, sc, name, memory, repartition, overwrite, :
table tb already exists (pass overwrite = TRUE to overwrite)
/// SparkR::mutate
df <- copy_to(sc, tb) %>%
SparkR::mutate(items = split(items, "\\s+"))
Error in sdf_import.default(x, sc, name, memory, repartition, overwrite, :
table tb already exists (pass overwrite = TRUE to overwrite)
上述答案中的代码示例本身是有效的。第一个错误是因为提供 mutate 函数的 dplyr 包没有加载;第二个错误是因为名为 tb 的表已经存在于 Spark 中(如错误信息所提示,可以传入 overwrite = TRUE 来覆盖它)。
请在新会话中尝试运行以下代码:
library(tidyverse)
library(sparklyr)
sc <- spark_connect(master = "local")
tb <- tibble::tibble(items=c("a b c", "a b", "c f g", "b c"))
df <- copy_to(sc, tb) %>%
mutate(items = split(items, "\\s+"))
fp_growth_model <- ml_fpgrowth(df)
ml_association_rules(fp_growth_model)
ml_freq_itemsets(fp_growth_model)
要对您的数据集 li 执行 FP-growth,需要先转换它的格式。
函数 ml_fpgrowth 需要一个 Spark DataFrame,其中有一列的每个元素都是一个由项目组成的列表。带有列表列的 R data frame 不能直接传输到 Spark,因此要先把每个序列拼接成一个字符串并据此创建 Spark DataFrame,然后再使用 mutate 和 split 函数把字符串拆分成列表。
这是应用于您的数据的代码。
> tb_li <- tibble(items=sapply(li, function(x) paste(x, collapse=" ")))
> tb_li
# A tibble: 10 x 1
items
<chr>
1 o s n c j r v k e t n.1 v.1 y z e.1 s.1 y.1 y.2 i
2 c h z g j i s d n q k g.1 u l o j.1 m
3 i i.1 j w u g u.1 f y b e
4 l m r a y y.1 f u o i o.1 z
5 p t f k h v v.1 g p.1 q v.2 r q.1 b d m
6 v s y t v.1 y.1 n y.2 w
7 h p l y n c n.1
8 g c w v z o u e h s j r j.1 l b j.2 v.1
9 l t n q n.1 v c h n.2 s o x q.1 w k g o.1 w.1 z
10 n g j e f p x u w k
将数据传输到 Spark 并生成列表:
> df_li <- copy_to(sc, tb_li, overwrite = TRUE) %>%
+ mutate(items = split(items, "\\s+"))
> df_li
# Source: spark<?> [?? x 1]
items
<list>
1 <list [19]>
2 <list [17]>
3 <list [11]>
4 <list [12]>
5 <list [16]>
6 <list [9]>
7 <list [7]>
8 <list [17]>
9 <list [19]>
10 <list [10]>
现在数据已准备好,可以像上面的示例一样传给模型:
> fp_growth_model_li <- ml_fpgrowth(df_li)
> ml_association_rules(fp_growth_model_li)
# Source: spark<?> [?? x 4]
antecedent consequent confidence lift
<list> <list> <dbl> <dbl>
1 <list [4]> <list [1]> 1 2
2 <list [3]> <list [1]> 1 2
3 <list [3]> <list [1]> 1 2
4 <list [3]> <list [1]> 1 2
5 <list [5]> <list [1]> 1 2
6 <list [5]> <list [1]> 1 2
7 <list [3]> <list [1]> 1 2
8 <list [3]> <list [1]> 1 2
9 <list [3]> <list [1]> 1 2
10 <list [3]> <list [1]> 1 2
# ... with more rows
> ml_freq_itemsets(fp_growth_model_li)
# Source: spark<?> [?? x 2]
items freq
<list> <dbl>
1 <list [1]> 3
2 <list [2]> 3
3 <list [3]> 3
4 <list [2]> 3
5 <list [1]> 5
6 <list [2]> 3
7 <list [3]> 3
8 <list [3]> 3
9 <list [4]> 3
10 <list [2]> 4
# ... with more rows
我有数据“li”,想对它运行 FPGrowth 算法,但不知道该怎么做:
set.seed(123)
# make fake data
li <- list()
for(i in 1:10) li[[i]] <- make.unique(letters[sample(1:26,sample(5:20,1),rep = T)])
require(sparklyr)
sc <- spark_connect(master = "local",version = "3.0.1")
df <- copy_to(sc, **....??????what should be here??????...** )
fp_growth_model <- ml_fpgrowth(df)
有一个类似问题的答案,但按它运行时出现了以下错误:
sc <- spark_connect(master = "local", version = "2.3")
tb <- tibble::tibble(items=c("a b c", "a b", "c f g", "b c"))
df <- copy_to(sc, tb) %>%
mutate(items = split(items, "\\s+"))
Error in mutate(., items = split(items, "\\s+")) :
could not find function "mutate"
/// plyr::mutate
df <- copy_to(sc, tb) %>%
plyr::mutate(items = split(items, "\\s+"))
Error in sdf_import.default(x, sc, name, memory, repartition, overwrite, :
table tb already exists (pass overwrite = TRUE to overwrite)
/// SparkR::mutate
df <- copy_to(sc, tb) %>%
SparkR::mutate(items = split(items, "\\s+"))
Error in sdf_import.default(x, sc, name, memory, repartition, overwrite, :
table tb already exists (pass overwrite = TRUE to overwrite)
上述答案中的代码示例本身是有效的。第一个错误是因为提供 mutate 函数的 dplyr 包没有加载;第二个错误是因为名为 tb 的表已经存在于 Spark 中(如错误信息所提示,可以传入 overwrite = TRUE 来覆盖它)。
请在新会话中尝试运行以下代码:
library(tidyverse)
library(sparklyr)
sc <- spark_connect(master = "local")
tb <- tibble::tibble(items=c("a b c", "a b", "c f g", "b c"))
df <- copy_to(sc, tb) %>%
mutate(items = split(items, "\\s+"))
fp_growth_model <- ml_fpgrowth(df)
ml_association_rules(fp_growth_model)
ml_freq_itemsets(fp_growth_model)
要对您的数据集 li 执行 FP-growth,需要先转换它的格式。
函数 ml_fpgrowth 需要一个 Spark DataFrame,其中有一列的每个元素都是一个由项目组成的列表。带有列表列的 R data frame 不能直接传输到 Spark,因此要先把每个序列拼接成一个字符串并据此创建 Spark DataFrame,然后再使用 mutate 和 split 函数把字符串拆分成列表。
这是应用于您的数据的代码。
> tb_li <- tibble(items=sapply(li, function(x) paste(x, collapse=" ")))
> tb_li
# A tibble: 10 x 1
items
<chr>
1 o s n c j r v k e t n.1 v.1 y z e.1 s.1 y.1 y.2 i
2 c h z g j i s d n q k g.1 u l o j.1 m
3 i i.1 j w u g u.1 f y b e
4 l m r a y y.1 f u o i o.1 z
5 p t f k h v v.1 g p.1 q v.2 r q.1 b d m
6 v s y t v.1 y.1 n y.2 w
7 h p l y n c n.1
8 g c w v z o u e h s j r j.1 l b j.2 v.1
9 l t n q n.1 v c h n.2 s o x q.1 w k g o.1 w.1 z
10 n g j e f p x u w k
将数据传输到 Spark 并生成列表:
> df_li <- copy_to(sc, tb_li, overwrite = TRUE) %>%
+ mutate(items = split(items, "\\s+"))
> df_li
# Source: spark<?> [?? x 1]
items
<list>
1 <list [19]>
2 <list [17]>
3 <list [11]>
4 <list [12]>
5 <list [16]>
6 <list [9]>
7 <list [7]>
8 <list [17]>
9 <list [19]>
10 <list [10]>
现在数据已准备好,可以像上面的示例一样传给模型:
> fp_growth_model_li <- ml_fpgrowth(df_li)
> ml_association_rules(fp_growth_model_li)
# Source: spark<?> [?? x 4]
antecedent consequent confidence lift
<list> <list> <dbl> <dbl>
1 <list [4]> <list [1]> 1 2
2 <list [3]> <list [1]> 1 2
3 <list [3]> <list [1]> 1 2
4 <list [3]> <list [1]> 1 2
5 <list [5]> <list [1]> 1 2
6 <list [5]> <list [1]> 1 2
7 <list [3]> <list [1]> 1 2
8 <list [3]> <list [1]> 1 2
9 <list [3]> <list [1]> 1 2
10 <list [3]> <list [1]> 1 2
# ... with more rows
> ml_freq_itemsets(fp_growth_model_li)
# Source: spark<?> [?? x 2]
items freq
<list> <dbl>
1 <list [1]> 3
2 <list [2]> 3
3 <list [3]> 3
4 <list [2]> 3
5 <list [1]> 5
6 <list [2]> 3
7 <list [3]> 3
8 <list [3]> 3
9 <list [4]> 3
10 <list [2]> 4
# ... with more rows