在 dbplyr 管道中使用类似 grepl 的字符串匹配
Using string matching like grepl in a dbplyr pipeline
dbplyr 非常方便,因为它将 dplyr 代码转换为 SQL。这非常有效,除非它没有。例如,我试图通过将字符串与列中的值部分匹配来对行进行子集化。除了 postgres 之外,似乎这还没有在 dbplyr 中实现。我是否缺少一些可以完成以下操作的 {stringr} 函数:
library(dplyr, warn.conflicts = FALSE)
library(DBI)
data("flights", package = "nycflights13")
con <- dbConnect(RSQLite::SQLite(), ":memory:")
dbWriteTable(con, "flights", flights)
## works in dplyr
flights %>%
filter(grepl("N", tailnum))
#> # A tibble: 334,264 × 19
#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
#> <int> <int> <int> <int> <int> <dbl> <int> <int>
#> 1 2013 1 1 517 515 2 830 819
#> 2 2013 1 1 533 529 4 850 830
#> 3 2013 1 1 542 540 2 923 850
#> 4 2013 1 1 544 545 -1 1004 1022
#> 5 2013 1 1 554 600 -6 812 837
#> 6 2013 1 1 554 558 -4 740 728
#> 7 2013 1 1 555 600 -5 913 854
#> 8 2013 1 1 557 600 -3 709 723
#> 9 2013 1 1 557 600 -3 838 846
#> 10 2013 1 1 558 600 -2 753 745
#> # … with 334,254 more rows, and 11 more variables: arr_delay <dbl>,
#> # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
#> # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
## no function translation to grepl
tbl(con, "flights") %>%
filter(grepl("N", tailnum)) %>%
collect()
#> Error: no such function: grepl
## Also not implemented for stringr
library(stringr)
tbl(con, "flights") %>%
filter(str_detect(tailnum, "N")) %>%
collect()
#> Error: str_detect() is not available in this SQL variant
dbDisconnect(con)
我们可以用%like%
tbl(con, "flights") %>%
dplyr::filter(tailnum %like% "%N%") %>%
collect()
-输出
# A tibble: 334,264 × 19
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time
<int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr> <chr> <dbl>
1 2013 1 1 517 515 2 830 819 11 UA 1545 N14228 EWR IAH 227
2 2013 1 1 533 529 4 850 830 20 UA 1714 N24211 LGA IAH 227
3 2013 1 1 542 540 2 923 850 33 AA 1141 N619AA JFK MIA 160
4 2013 1 1 544 545 -1 1004 1022 -18 B6 725 N804JB JFK BQN 183
5 2013 1 1 554 600 -6 812 837 -25 DL 461 N668DN LGA ATL 116
6 2013 1 1 554 558 -4 740 728 12 UA 1696 N39463 EWR ORD 150
7 2013 1 1 555 600 -5 913 854 19 B6 507 N516JB EWR FLL 158
8 2013 1 1 557 600 -3 709 723 -14 EV 5708 N829AS LGA IAD 53
9 2013 1 1 557 600 -3 838 846 -8 B6 79 N593JB JFK MCO 140
10 2013 1 1 558 600 -2 753 745 8 AA 301 N3ALAA LGA ORD 138
# … with 334,254 more rows, and 4 more variables: distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dbl>
dbplyr 非常方便,因为它将 dplyr 代码转换为 SQL。这非常有效,除非它没有。例如,我试图通过将字符串与列中的值部分匹配来对行进行子集化。除了 postgres 之外,似乎这还没有在 dbplyr 中实现。我是否缺少一些可以完成以下操作的 {stringr} 函数:
library(dplyr, warn.conflicts = FALSE)
library(DBI)
data("flights", package = "nycflights13")
con <- dbConnect(RSQLite::SQLite(), ":memory:")
dbWriteTable(con, "flights", flights)
## works in dplyr
flights %>%
filter(grepl("N", tailnum))
#> # A tibble: 334,264 × 19
#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
#> <int> <int> <int> <int> <int> <dbl> <int> <int>
#> 1 2013 1 1 517 515 2 830 819
#> 2 2013 1 1 533 529 4 850 830
#> 3 2013 1 1 542 540 2 923 850
#> 4 2013 1 1 544 545 -1 1004 1022
#> 5 2013 1 1 554 600 -6 812 837
#> 6 2013 1 1 554 558 -4 740 728
#> 7 2013 1 1 555 600 -5 913 854
#> 8 2013 1 1 557 600 -3 709 723
#> 9 2013 1 1 557 600 -3 838 846
#> 10 2013 1 1 558 600 -2 753 745
#> # … with 334,254 more rows, and 11 more variables: arr_delay <dbl>,
#> # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
#> # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
## no function translation to grepl
tbl(con, "flights") %>%
filter(grepl("N", tailnum)) %>%
collect()
#> Error: no such function: grepl
## Also not implemented for stringr
library(stringr)
tbl(con, "flights") %>%
filter(str_detect(tailnum, "N")) %>%
collect()
#> Error: str_detect() is not available in this SQL variant
dbDisconnect(con)
我们可以用%like%
tbl(con, "flights") %>%
dplyr::filter(tailnum %like% "%N%") %>%
collect()
-输出
# A tibble: 334,264 × 19
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time
<int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr> <chr> <dbl>
1 2013 1 1 517 515 2 830 819 11 UA 1545 N14228 EWR IAH 227
2 2013 1 1 533 529 4 850 830 20 UA 1714 N24211 LGA IAH 227
3 2013 1 1 542 540 2 923 850 33 AA 1141 N619AA JFK MIA 160
4 2013 1 1 544 545 -1 1004 1022 -18 B6 725 N804JB JFK BQN 183
5 2013 1 1 554 600 -6 812 837 -25 DL 461 N668DN LGA ATL 116
6 2013 1 1 554 558 -4 740 728 12 UA 1696 N39463 EWR ORD 150
7 2013 1 1 555 600 -5 913 854 19 B6 507 N516JB EWR FLL 158
8 2013 1 1 557 600 -3 709 723 -14 EV 5708 N829AS LGA IAD 53
9 2013 1 1 557 600 -3 838 846 -8 B6 79 N593JB JFK MCO 140
10 2013 1 1 558 600 -2 753 745 8 AA 301 N3ALAA LGA ORD 138
# … with 334,254 more rows, and 4 more variables: distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dbl>