bigrquery:使用 dplyr 动词在 5-95 个分位数内提取均值、最大值、最小值和 sd 不起作用

bigrquery: mean, max, min and sd extraction inside 5-95 quantiles using dplyr verbs doesn't work

我尝试在 BigQuery 服务器中提取 5-95 个分位数内的均值、最大值、最小值和标准偏差,但 dplyr 动词不起作用,输出错误为:x Syntax error: Expected ")" but got keyword AS at [1:117] [invalidQuery]

在我的例子中:

library(tidyverse)
library(bigrquery)
library(DBI)
library(googleAuthR)
bq_auth(email = "xxxxxxxxx@gmail.com") 

projectid<-'helical-client-323918'
datasetid<-'spectra_calibration'
bq_conn <-  dbConnect(bigquery(), 
                      project = projectid,
                      dataset = datasetid, 
                      use_legacy_sql = FALSE
)
bigrquery::dbListTables(bq_conn) # List all the tables in BigQuery data set
#[1] "CMPC"
#

raw_spectra_CMPC <- dplyr::tbl(bq_conn, 
                               "CMPC") # connects to a table

glimpse(raw_spectra_CMPC) 
Rows: ??
Columns: 27
Database: BigQueryConnection
$ x          <dbl> -52.5502, -52.5501, -52.5501, -52.5501, -52.5501, -52.5500, -52.5500, -52.5500, -52.5500, -52.5500, -5~
$ y          <dbl> -30.8295, -30.8297, -30.8296, -30.8295, -30.8294, -30.8298, -30.8297, -30.8296, -30.8295, -30.8294, -3~
$ stand      <chr> "ABRANJO001A", "ABRANJO001A", "ABRANJO001A", "ABRANJO001A", "ABRANJO001A", "ABRANJO001A", "ABRANJO001A~
$ date       <chr> "2019-01-28", "2019-01-28", "2019-01-28", "2019-01-28", "2019-01-28", "2019-01-28", "2019-01-28", "201~
$ B2         <dbl> 213, 205, 181, 207, 216, 205, 165, 161, 173, 182, 181, 259, 227, 190, 153, 147, 160, 164, 194, 210, 18~
$ B3         <dbl> 361.0, 362.0, 346.0, 352.0, 369.0, 330.0, 290.0, 326.0, 334.0, 332.0, 325.0, 375.0, 352.0, 307.0, 281.~
$ B4         <dbl> 227.0, 233.0, 198.0, 207.0, 209.0, 227.0, 178.0, 164.0, 180.0, 207.0, 209.0, 267.0, 269.0, 194.0, 163.~
$ B8         <dbl> 3033.0, 3307.0, 3322.0, 3232.0, 3241.0, 3065.0, 3306.0, 3422.0, 3427.0, 3392.0, 3165.0, 3206.0, 2984.0~
$ NDVI       <dbl> 0.86074, 0.86836, 0.88750, 0.87962, 0.87884, 0.86209, 0.89782, 0.90853, 0.90019, 0.88497, 0.87611, 0.8~
$ SAVI       <dbl> 4549.379, 4960.386, 4982.905, 4847.897, 4861.397, 4597.380, 4958.915, 5132.925, 5140.417, 5087.903, 47~
$ SIPI       <dbl> 1.00499, 1.00911, 1.00544, 1.00000, 0.99769, 1.00775, 1.00416, 1.00092, 1.00216, 1.00785, 1.00947, 1.0~
$ SR         <dbl> 13.36123, 14.19313, 16.77778, 15.61353, 15.50718, 13.50220, 18.57303, 20.86585, 19.03889, 16.38647, 15~
$ RGI        <dbl> 0.62881, 0.64365, 0.57225, 0.58807, 0.56640, 0.68788, 0.61379, 0.50307, 0.53892, 0.62349, 0.64308, 0.7~
$ TVI        <int> 173720, 189600, 193360, 187300, 188320, 174400, 192160, 201960, 200980, 196100, 182000, 180660, 166220~
$ MSR        <dbl> 3.65530, 3.76738, 4.09607, 3.95140, 3.93792, 3.67453, 4.30964, 4.56792, 4.36336, 4.04802, 3.89147, 3.4~
$ PRI        <dbl> -0.25784, -0.27690, -0.31309, -0.25939, -0.26154, -0.23364, -0.27473, -0.33881, -0.31755, -0.29183, -0~
$ GNDVI      <dbl> 0.78727, 0.80267, 0.81134, 0.80357, 0.79557, 0.80560, 0.83871, 0.82604, 0.82239, 0.82170, 0.81375, 0.7~
$ PSRI       <dbl> -0.04418, -0.03901, -0.04455, -0.04486, -0.04937, -0.03361, -0.03388, -0.04734, -0.04494, -0.03685, -0~
$ GCI        <dbl> 7.40166, 8.13536, 8.60116, 8.18182, 7.78320, 8.28788, 10.40000, 9.49693, 9.26048, 9.21687, 8.73846, 7.~
$ ID_PROJETO <int> 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 24~
$ PROJETO    <chr> "ABRANJO", "ABRANJO", "ABRANJO", "ABRANJO", "ABRANJO", "ABRANJO", "ABRANJO", "ABRANJO", "ABRANJO", "AB~
$ CD_TALHAO  <chr> "001A", "001A", "001A", "001A", "001A", "001A", "001A", "001A", "001A", "001A", "001A", "001A", "001A"~
$ DATA_PLANT <chr> "2008-07-15", "2008-07-15", "2008-07-15", "2008-07-15", "2008-07-15", "2008-07-15", "2008-07-15", "200~
$ ESPECIE    <chr> "SALIGNA", "SALIGNA", "SALIGNA", "SALIGNA", "SALIGNA", "SALIGNA", "SALIGNA", "SALIGNA", "SALIGNA", "SA~
$ ESPAC      <chr> "3.5x2.14", "3.5x2.14", "3.5x2.14", "3.5x2.14", "3.5x2.14", "3.5x2.14", "3.5x2.14", "3.5x2.14", "3.5x2~
$ AGE_1      <dbl> 10.5, 10.5, 10.5, 10.5, 10.5, 10.5, 10.5, 10.5, 10.5, 10.5, 10.5, 10.5, 10.5, 10.5, 10.5, 10.5, 10.5, ~
$ AGE        <int> 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11~

但是当我尝试使用我的函数时:

quantis <- function(x) x > quantile(x, 0.05) & x < quantile(x, 0.95)
data_Q95 <- raw_spectra_CMPC  %>% 
  filter (across(B2:GCI, .fns = quantis)) %>%
  group_by(AGE, ESPAC) %>%
  summarise(across(B2:GCI, list(mean = ~mean(.x, na.rm = TRUE), max = ~ max(.x, na.rm = TRUE), min = ~ min(.x, na.rm = TRUE), sd = ~ sd(.x, na.rm = TRUE))))
data_Q95
# <error/rlang_error>
# Job 'fc-vm-v1.job_41KNdnsYP_gws6avquMVeqjOC6_V.US' failed
# x Syntax error: Expected ")" but got keyword AS at [1:117] [invalidQuery]
# Backtrace:
#   1. (function (x, ...) ...
#   2. dbplyr:::print.tbl_sql(x)
#   7. pillar:::format.tbl(x, ..., n = n, width = width, n_extra = n_extra)
#   8. pillar:::format_tbl(...)
#   9. pillar::tbl_format_setup(...)
#  11. pillar:::tbl_format_setup.tbl(...)
#  12. pillar:::df_head(x, n + 1)
#  14. dbplyr:::as.data.frame.tbl_sql(head(x, n))
#  17. bigrquery:::collect.tbl_BigQueryConnection(x, n = n)
#  18. bigrquery::bq_dataset_query(...)
#  19. bigrquery::bq_job_wait(job, quiet = quiet)
# Run `rlang::last_trace()` to see the full context.

拜托,有什么好的方法可以帮助我进行这些提取吗?是否也需要使用 SQL 查询?在导出到 BigQuery 之前,我在 *csv 文件中使用我的函数进行了一些测试并且运行良好:

raw_spectra_CMPC_csv <- read.csv("https://raw.githubusercontent.com/Leprechault/trash/main/my_ds_CSV.csv")
quantis <- function(x) x > quantile(x, 0.05) & x < quantile(x, 0.95)
data_Q95 <- raw_spectra_CMPC_csv  %>% 
  filter (across(B2:GCI, .fns = quantis)) %>%
  group_by(AGE, ESPAC) %>%
  summarise(across(B2:GCI, list(mean = ~mean(.x, na.rm = TRUE), max = ~ max(.x, na.rm = TRUE), min = ~ min(.x, na.rm = TRUE), sd = ~ sd(.x, na.rm = TRUE))))
data_Q95
# # A tibble: 7 x 62
# # Groups:   AGE [7]
#     AGE ESPAC    B2_mean B2_max B2_min B2_sd B3_mean B3_max B3_min B3_sd B4_mean B4_max B4_min B4_sd B8_mean B8_max B8_min
#   <int> <chr>      <dbl>  <dbl>  <dbl> <dbl>   <dbl>  <dbl>  <dbl> <dbl>   <dbl>  <dbl>  <dbl> <dbl>   <dbl>  <dbl>  <dbl>
# 1    -2 4X1.85      125.   175    75    14.2    236.    312   199   18.6    129.    345   108.  12.6   3590.   4052   2189
# 2    -1 4X1.85      153.   300    67    34.0    282.    436   198.  34.3    166.    379   108.  30.7   3346.   4059   2208
# 3     0 4X1.85      419.   928.   71   274.     610.   1208   199  328.     730.   1668   109  535.    2933.   4069   2095
# 4     1 4X1.85      344.   683   129    83.4    510.    944   286   97.0    544.   1180   256  129.    2871.   3451   2115
# 5    11 3.5x2.14    137.   259    70    29.8    276.    467   199   38.4    160.    361   109   26.4   3665.   4069   2688
# 6    12 3.5x2.14    150.   298    67.5  23.6    267.    485   200   32.1    169.    421   109   26.2   3354.   4067   2293
# 7    13 3.5x2.14    130.   302    70    35.3    247.    482   200   30.0    144.    465   111   29.7   3833.   4069   3116
# # ... with 45 more variables: B8_sd <dbl>, NDVI_mean <dbl>, NDVI_max <dbl>, NDVI_min <dbl>, NDVI_sd <dbl>,
# #   SAVI_mean <dbl>, SAVI_max <dbl>, SAVI_min <dbl>, SAVI_sd <dbl>, SIPI_mean <dbl>, SIPI_max <dbl>, SIPI_min <dbl>,
# #   SIPI_sd <dbl>, SR_mean <dbl>, SR_max <dbl>, SR_min <dbl>, SR_sd <dbl>, RGI_mean <dbl>, RGI_max <dbl>, RGI_min <dbl>,
# #   RGI_sd <dbl>, TVI_mean <dbl>, TVI_max <dbl>, TVI_min <dbl>, TVI_sd <dbl>, MSR_mean <dbl>, MSR_max <dbl>,
# #   MSR_min <dbl>, MSR_sd <dbl>, PRI_mean <dbl>, PRI_max <dbl>, PRI_min <dbl>, PRI_sd <dbl>, GNDVI_mean <dbl>,
# #   GNDVI_max <dbl>, GNDVI_min <dbl>, GNDVI_sd <dbl>, PSRI_mean <dbl>, PSRI_max <dbl>, PSRI_min <dbl>, PSRI_sd <dbl>,

提前致谢!

我尝试重现您的代码,但发现 R 代码未正确转换为 BQ 查询。

  1. 使用 sd(.x) 因为默认情况下 BQ (STTD_DEV) 会忽略空值。
  2. 函数 quantis 不是在 BQ 中创建的,因此它没有完成它的工作并出错。我不确定 R 是否支持对 BQ 使用函数。

我可以建议的是,不要使用本机 R 操作,而是使用 SQL 语句来防止将 R 错误地转换为 BigQuery 操作。您可以尝试创建一个 user defined function (your quantis function) in BQ. In your select statement perform mean, max and sd on your fields. Filter using your UDF(quantis), group by age and espac. You can also try creating a VIEW that achieves most of your goal including WHERE clause on quantile. You can refer to this document 以参考如何在 R 中使用 BigQuery。