通过匹配 sparklyr 中的字符串创建新变量
Creating new variable by matching on strings in sparklyr
我是第一次使用 sparklyr,我在匹配两个向量的字符串以大规模创建新变量时遇到了问题。我的问题具有以下一般结构:
我有一个大型的 url 数据集:
# Example URL data: a numeric id (col1) and one URL string per row (col2).
df_1 <- data.frame(
  col1 = as.numeric(1:10),
  col2 = c(
    "john.com/abcd", "ringo.com/defg", "paul.com/hijk", "george.com/lmno",
    "rob.com/pqrs", "sam.com/tuvw", "matt.com/xyza", "lenny.com/bcde",
    "bob.com/fghi", "tom.com/jklm"
  )
)
col1 col2
1 john.com/abcd
2 ringo.com/defg
3 paul.com/hijk
4 george.com/lmno
5 rob.com/pqrs
6 sam.com/tuvw
7 matt.com/xyza
8 lenny.com/bcde
9 bob.com/fghi
10 tom.com/jklm
还有另一个较小的常见域名数据集:
# Lookup data: a numeric id (col1) and one bare domain per row (col2).
df_2 <- data.frame(
  col1 = as.numeric(1:7),
  col2 = c(
    "john.com", "jake.com", "tim.com", "paul.com",
    "rob.com", "harry.com", "chris.com"
  )
)
col1 col2
1 john.com
2 jake.com
3 tim.com
4 paul.com
5 rob.com
6 harry.com
7 chris.com
我想使用 df_2 中的域名向量 (df_2$col2) 为 df_1 创建一个虚拟变量,指示该域名是否出现在 df_1 的 URL (df_1$col2) 中。生成的数据框应类似于 df_3。
# Desired result: df_1 plus a 0/1 indicator (col3) that is 1 for rows 1, 3
# and 5, whose domains appear in df_2.
df_3 <- data.frame(
  col1 = as.numeric(1:10),
  col2 = c(
    "john.com/abcd", "ringo.com/defg", "paul.com/hijk", "george.com/lmno",
    "rob.com/pqrs", "sam.com/tuvw", "matt.com/xyza", "lenny.com/bcde",
    "bob.com/fghi", "tom.com/jklm"
  ),
  col3 = as.numeric(seq_len(10) %in% c(1, 3, 5))
)
col1 col2 col3
1 john.com/abcd 1
2 ringo.com/defg 0
3 paul.com/hijk 1
4 george.com/lmno 0
5 rob.com/pqrs 1
6 sam.com/tuvw 0
7 matt.com/xyza 0
8 lenny.com/bcde 0
9 bob.com/fghi 0
10 tom.com/jklm 0
我读过这个post:
并尝试为 df_2 的每个单独观察结果编码,例如
# Asker's attempt (pseudo-code; `etc.` stands for the remaining domains): one
# hand-written test per domain, OR-ed together inside ifelse().
df_3 <- df_1 %>%
mutate(col3 =
ifelse(like(df_1$col2, "john.com") | df_1$col2, "jake.com" | etc.,1,0))
但到目前为止,我要么遇到堆栈限制,要么遇到 R 无法识别 like 函数的问题。一定有更简单的方法来做到这一点。感谢您的任何帮助。
如果您正在寻找像这里这样定义明确的前缀,您可以提取它:
# Copy both local data frames to Spark (`sc` is assumed to be an open Spark
# connection created elsewhere).
sdf_1 <- copy_to(sc, df_1)
sdf_2 <- copy_to(sc, df_2)
# Key each URL by the text before its first "/". `[^/]+` stops at the first
# slash, so "john.com/a/b" yields "john.com" (a greedy `.*` would run to the
# last slash and yield "john.com/a"), and a bare "john.com" still gets a key.
sdf_1_keyed <- sdf_1 %>%
  mutate(key = regexp_extract(col2, "^([^/]+)", 1))
应用左等值连接:
# Left equi-join on the extracted key: every URL row is kept, and `id` is
# missing for rows whose key has no counterpart in sdf_2.
matched <- left_join(
  sdf_1_keyed,
  transmute(sdf_2, key = col2, id = col1),
  by = "key"
)
然后汇总:
# Collapse to one row per URL; col3 is 1 when at least one joined row has a
# non-missing domain id, otherwise 0.
matched %>%
  mutate(hit = as.numeric(!is.na(id))) %>%
  group_by(col1, col2) %>%
  summarise(col3 = as.numeric(sum(hit, na.rm = TRUE) > 0))
# Source: lazy query [?? x 3]
# Database: spark_connection
# Groups: col1
col1 col2 col3
<dbl> <chr> <dbl>
1 1 john.com/abcd 1
2 5 rob.com/pqrs 1
3 6 sam.com/tuvw 0
4 9 bob.com/fghi 0
5 3 paul.com/hijk 1
6 4 george.com/lmno 0
7 8 lenny.com/bcde 0
8 10 tom.com/jklm 0
9 2 ringo.com/defg 0
10 7 matt.com/xyza 0
# ... with more rows
类似的事情可以用 RLIKE 条件来完成:
# Pair every URL row with every domain (renamed to `target`) by invoking
# crossJoin on the underlying Spark DataFrames, then register the result so
# it can be used with dplyr verbs again.
candidates <- sdf_register(
  sparklyr::invoke(
    spark_dataframe(sdf_1),
    "crossJoin",
    spark_dataframe(transmute(sdf_2, target = col2))
  )
)
# One row per URL; col3 is 1 when any candidate domain matches the URL via
# rlike (regular-expression match), otherwise 0.
candidates %>%
  group_by(col1, col2) %>%
  summarise(
    col3 = as.numeric(
      sum(as.numeric(rlike(col2, target)), na.rm = TRUE) > 0
    )
  )
# Source: lazy query [?? x 3]
# Database: spark_connection
# Groups: col1
col1 col2 col3
<dbl> <chr> <dbl>
1 1 john.com/abcd 1
2 5 rob.com/pqrs 1
3 6 sam.com/tuvw 0
4 9 bob.com/fghi 0
5 3 paul.com/hijk 1
6 4 george.com/lmno 0
7 8 lenny.com/bcde 0
8 10 tom.com/jklm 0
9 2 ringo.com/defg 0
10 7 matt.com/xyza 0
# ... with more rows
最后,您还可以提取唯一值:
# Distinct domains from the local lookup table, as a character vector.
targets <- as.character(unique(df_2[["col2"]]))
并创建 SQL 表达式:
library(glue)
# Build one `col2 rlike '<domain>'` predicate per target and OR them together.
# The domains are used as regular expressions as-is.
expr <- glue_collapse(glue("col2 rlike '{targets}'"), sep = " OR ")
# Evaluate the predicate with selectExpr on the underlying Spark DataFrame,
# keeping all existing columns, then convert the result to 0/1.
sdf_register(
  sparklyr::invoke(
    spark_dataframe(sdf_1),
    "selectExpr",
    list("*", as.character(glue("{expr} as col3")))
  )
) %>%
  mutate(col3 = as.numeric(col3))
# Source: lazy query [?? x 3]
# Database: spark_connection
col1 col2 col3
<dbl> <chr> <dbl>
1 1 john.com/abcd 1
2 2 ringo.com/defg 0
3 3 paul.com/hijk 1
4 4 george.com/lmno 0
5 5 rob.com/pqrs 1
6 6 sam.com/tuvw 0
7 7 matt.com/xyza 0
8 8 lenny.com/bcde 0
9 9 bob.com/fghi 0
10 10 tom.com/jklm 0
# ... with more rows
或R表达式:
library(rlang)
# Build the R source text `rlike(col2, '<d1>') | rlike(col2, '<d2>') | ...`.
rexpr <- glue_collapse(glue("rlike(col2, '{targets}')"), " | ")
# Parse the text into an expression and splice it into mutate(). parse_expr()
# replaces the deprecated parse_quosure(); no environment needs capturing
# because col2 and rlike are resolved when the query is translated to SQL.
sdf_1 %>%
  mutate(col3 = !!parse_expr(as.character(glue("as.numeric({rexpr})"))))
# Source: lazy query [?? x 3]
# Database: spark_connection
col1 col2 col3
<dbl> <chr> <dbl>
1 1 john.com/abcd 1
2 2 ringo.com/defg 0
3 3 paul.com/hijk 1
4 4 george.com/lmno 0
5 5 rob.com/pqrs 1
6 6 sam.com/tuvw 0
7 7 matt.com/xyza 0
8 8 lenny.com/bcde 0
9 9 bob.com/fghi 0
10 10 tom.com/jklm 0
# ... with more rows
我是第一次使用 sparklyr,我在匹配两个向量的字符串以大规模创建新变量时遇到了问题。我的问题具有以下一般结构:
我有一个大型的 url 数据集:
# Example URL data: a numeric id (col1) and one URL string per row (col2).
df_1 <- data.frame(
  col1 = as.numeric(1:10),
  col2 = c(
    "john.com/abcd", "ringo.com/defg", "paul.com/hijk", "george.com/lmno",
    "rob.com/pqrs", "sam.com/tuvw", "matt.com/xyza", "lenny.com/bcde",
    "bob.com/fghi", "tom.com/jklm"
  )
)
col1 col2
1 john.com/abcd
2 ringo.com/defg
3 paul.com/hijk
4 george.com/lmno
5 rob.com/pqrs
6 sam.com/tuvw
7 matt.com/xyza
8 lenny.com/bcde
9 bob.com/fghi
10 tom.com/jklm
还有另一个较小的常见域名数据集:
# Lookup data: a numeric id (col1) and one bare domain per row (col2).
df_2 <- data.frame(
  col1 = as.numeric(1:7),
  col2 = c(
    "john.com", "jake.com", "tim.com", "paul.com",
    "rob.com", "harry.com", "chris.com"
  )
)
col1 col2
1 john.com
2 jake.com
3 tim.com
4 paul.com
5 rob.com
6 harry.com
7 chris.com
我想使用 df_2 中的域名向量 (df_2$col2) 为 df_1 创建一个虚拟变量,指示该域名是否出现在 df_1 的 URL (df_1$col2) 中。生成的数据框应类似于 df_3。
# Desired result: df_1 plus a 0/1 indicator (col3) that is 1 for rows 1, 3
# and 5, whose domains appear in df_2.
df_3 <- data.frame(
  col1 = as.numeric(1:10),
  col2 = c(
    "john.com/abcd", "ringo.com/defg", "paul.com/hijk", "george.com/lmno",
    "rob.com/pqrs", "sam.com/tuvw", "matt.com/xyza", "lenny.com/bcde",
    "bob.com/fghi", "tom.com/jklm"
  ),
  col3 = as.numeric(seq_len(10) %in% c(1, 3, 5))
)
col1 col2 col3
1 john.com/abcd 1
2 ringo.com/defg 0
3 paul.com/hijk 1
4 george.com/lmno 0
5 rob.com/pqrs 1
6 sam.com/tuvw 0
7 matt.com/xyza 0
8 lenny.com/bcde 0
9 bob.com/fghi 0
10 tom.com/jklm 0
我读过这个post:
并尝试为 df_2 的每个单独观察结果编码,例如
# Asker's attempt (pseudo-code; `etc.` stands for the remaining domains): one
# hand-written test per domain, OR-ed together inside ifelse().
df_3 <- df_1 %>%
mutate(col3 =
ifelse(like(df_1$col2, "john.com") | df_1$col2, "jake.com" | etc.,1,0))
但到目前为止,我要么遇到堆栈限制,要么遇到 R 无法识别 like 函数的问题。一定有更简单的方法来做到这一点。感谢您的任何帮助。
如果您正在寻找像这里这样定义明确的前缀,您可以提取它:
# Copy both local data frames to Spark (`sc` is assumed to be an open Spark
# connection created elsewhere).
sdf_1 <- copy_to(sc, df_1)
sdf_2 <- copy_to(sc, df_2)
# Key each URL by the text before its first "/". `[^/]+` stops at the first
# slash, so "john.com/a/b" yields "john.com" (a greedy `.*` would run to the
# last slash and yield "john.com/a"), and a bare "john.com" still gets a key.
sdf_1_keyed <- sdf_1 %>%
  mutate(key = regexp_extract(col2, "^([^/]+)", 1))
应用左等值连接:
# Left equi-join on the extracted key: every URL row is kept, and `id` is
# missing for rows whose key has no counterpart in sdf_2.
matched <- left_join(
  sdf_1_keyed,
  transmute(sdf_2, key = col2, id = col1),
  by = "key"
)
然后汇总:
# Collapse to one row per URL; col3 is 1 when at least one joined row has a
# non-missing domain id, otherwise 0.
matched %>%
  mutate(hit = as.numeric(!is.na(id))) %>%
  group_by(col1, col2) %>%
  summarise(col3 = as.numeric(sum(hit, na.rm = TRUE) > 0))
# Source: lazy query [?? x 3]
# Database: spark_connection
# Groups: col1
col1 col2 col3
<dbl> <chr> <dbl>
1 1 john.com/abcd 1
2 5 rob.com/pqrs 1
3 6 sam.com/tuvw 0
4 9 bob.com/fghi 0
5 3 paul.com/hijk 1
6 4 george.com/lmno 0
7 8 lenny.com/bcde 0
8 10 tom.com/jklm 0
9 2 ringo.com/defg 0
10 7 matt.com/xyza 0
# ... with more rows
类似的事情可以用 RLIKE 条件来完成:
# Pair every URL row with every domain (renamed to `target`) by invoking
# crossJoin on the underlying Spark DataFrames, then register the result so
# it can be used with dplyr verbs again.
candidates <- sdf_register(
  sparklyr::invoke(
    spark_dataframe(sdf_1),
    "crossJoin",
    spark_dataframe(transmute(sdf_2, target = col2))
  )
)
# One row per URL; col3 is 1 when any candidate domain matches the URL via
# rlike (regular-expression match), otherwise 0.
candidates %>%
  group_by(col1, col2) %>%
  summarise(
    col3 = as.numeric(
      sum(as.numeric(rlike(col2, target)), na.rm = TRUE) > 0
    )
  )
# Source: lazy query [?? x 3]
# Database: spark_connection
# Groups: col1
col1 col2 col3
<dbl> <chr> <dbl>
1 1 john.com/abcd 1
2 5 rob.com/pqrs 1
3 6 sam.com/tuvw 0
4 9 bob.com/fghi 0
5 3 paul.com/hijk 1
6 4 george.com/lmno 0
7 8 lenny.com/bcde 0
8 10 tom.com/jklm 0
9 2 ringo.com/defg 0
10 7 matt.com/xyza 0
# ... with more rows
最后,您还可以提取唯一值:
# Distinct domains from the local lookup table, as a character vector.
targets <- as.character(unique(df_2[["col2"]]))
并创建 SQL 表达式:
library(glue)
# Build one `col2 rlike '<domain>'` predicate per target and OR them together.
# The domains are used as regular expressions as-is.
expr <- glue_collapse(glue("col2 rlike '{targets}'"), sep = " OR ")
# Evaluate the predicate with selectExpr on the underlying Spark DataFrame,
# keeping all existing columns, then convert the result to 0/1.
sdf_register(
  sparklyr::invoke(
    spark_dataframe(sdf_1),
    "selectExpr",
    list("*", as.character(glue("{expr} as col3")))
  )
) %>%
  mutate(col3 = as.numeric(col3))
# Source: lazy query [?? x 3]
# Database: spark_connection
col1 col2 col3
<dbl> <chr> <dbl>
1 1 john.com/abcd 1
2 2 ringo.com/defg 0
3 3 paul.com/hijk 1
4 4 george.com/lmno 0
5 5 rob.com/pqrs 1
6 6 sam.com/tuvw 0
7 7 matt.com/xyza 0
8 8 lenny.com/bcde 0
9 9 bob.com/fghi 0
10 10 tom.com/jklm 0
# ... with more rows
或R表达式:
library(rlang)
# Build the R source text `rlike(col2, '<d1>') | rlike(col2, '<d2>') | ...`.
rexpr <- glue_collapse(glue("rlike(col2, '{targets}')"), " | ")
# Parse the text into an expression and splice it into mutate(). parse_expr()
# replaces the deprecated parse_quosure(); no environment needs capturing
# because col2 and rlike are resolved when the query is translated to SQL.
sdf_1 %>%
  mutate(col3 = !!parse_expr(as.character(glue("as.numeric({rexpr})"))))
# Source: lazy query [?? x 3]
# Database: spark_connection
col1 col2 col3
<dbl> <chr> <dbl>
1 1 john.com/abcd 1
2 2 ringo.com/defg 0
3 3 paul.com/hijk 1
4 4 george.com/lmno 0
5 5 rob.com/pqrs 1
6 6 sam.com/tuvw 0
7 7 matt.com/xyza 0
8 8 lenny.com/bcde 0
9 9 bob.com/fghi 0
10 10 tom.com/jklm 0
# ... with more rows