pyspark sql 报错 pyspark.sql.utils.ParseException：在 'in' 处缺少 ')'

Question

您好，我在 pyspark sql 中运行下面的查询时出现错误。请帮我找出遗漏 ')' 的位置。

查询-

`with cte1 as (select `Project Number`, indication,rank() over (partition by `Project Number`,REGEXP_REPLACE(indication,'[^a-zA-Z0-9]+', '') order by `Project Number`,indication) as rnk from (select distinct `Project Number`, indication from vw_onco_pharma onco_pharma union select distinct `Project Number`, indication from vw_onco_cell_gene cell_gene union select distinct `Project Number`, indication from vw_non_onco_cell_gene onco_cell_gene union select distinct `Project Number`, indication from vw_non_onco_pharma non_onco_pharma union select distinct `Project Number`, indication from vw_plasma_protein plasma_protein)),y as (select max(cast(project_id as integer)) as max_prj_id from vw_project_id) select nvl(max_prj_id,0)+ROW_NUMBER () OVER (ORDER BY `Project Number`,indication) as project_id,`Project Number`,indication,date_format(current_timestamp(),'yyyy-MM-dd hh:mm:ss') as HTA_INSERT_DT from (select cte1.`Project Number`, cte1.indication,max_prj_id from cte1 left join vw_project_id prj on cte1.`Project Number` = prj.`Project Number` and REGEXP_REPLACE(cte1.indicatio,'[^a-zA-Z0-9]+', '') = REGEXP_REPLACE(prj.indication,'[^a-zA-Z0-9]+', '') left join  y on 1 = 1 where rnk = 1 and prj.project_id is null and cte1.`project number` in (select `project number` from cte1 group by `project number` having count(*) > 1) union select cte1.`Project Number`, null as indication,max_prj_id from cte1 left join vw_project_id prj on cte1.`Project Number` = prj.`Project Number` left join y on 1 = 1 where rnk = 1 and prj.project_id is null and cte1.`project number` in (select `project number` from cte1 groupby `project number` having count(*) = 1))`

错误-

pyspark.sql.utils.ParseException:
missing ')' at 'in'(line 1, pos 1575)

Answer 1

最后一个 `groupby` 应该写成 `group by`（中间要有空格）。

另外，建议把查询格式化一下，使其更易读：

-- Produces new project_id values for (Project Number, indication) rows that
-- have no matching row in vw_project_id. New ids continue from the current
-- maximum project_id (or from 0 when vw_project_id has no ids).
with cte1 as (
  -- All (Project Number, indication) pairs found in the five source views.
  -- rnk ranks the rows inside each (Project Number, normalised indication)
  -- group, where "normalised" means every non-alphanumeric run is removed,
  -- so rnk = 1 keeps a single spelling per normalised indication.
  select
    `Project Number`,
    indication,
    rank() over (
      partition by
        `Project Number`,
        regexp_replace(indication, '[^a-zA-Z0-9]+', '')
      order by
        `Project Number`,
        indication
    ) as rnk
  from
    (
      select distinct
        `Project Number`,
        indication
      from
        vw_onco_pharma onco_pharma
      union
      select distinct
        `Project Number`,
        indication
      from
        vw_onco_cell_gene cell_gene
      union
      select distinct
        `Project Number`,
        indication
      from
        vw_non_onco_cell_gene onco_cell_gene
      union
      select distinct
        `Project Number`,
        indication
      from
        vw_non_onco_pharma non_onco_pharma
      union
      select distinct
        `Project Number`,
        indication
      from
        vw_plasma_protein plasma_protein
    ) src
),
y as (
  -- Single-row CTE holding the highest project_id already assigned.
  select
    max(cast(project_id as integer)) as max_prj_id
  from
    vw_project_id
)
select
  nvl(max_prj_id, 0) + row_number() over (
    order by
      `Project Number`,
      indication
  ) as project_id,
  `Project Number`,
  indication,
  -- NOTE(review): 'hh' is the 12-hour clock pattern and there is no am/pm
  -- marker in this format; confirm whether 'HH' (24-hour) was intended.
  date_format(current_timestamp(), 'yyyy-MM-dd hh:mm:ss') as HTA_INSERT_DT
from
  (
    -- Branch 1: projects that appear more than once in cte1. A row counts as
    -- new when no vw_project_id row matches on both project number and
    -- normalised indication.
    select
      cte1.`Project Number`,
      cte1.indication,
      max_prj_id
    from
      cte1
      left join vw_project_id prj
        on cte1.`Project Number` = prj.`Project Number`
        and regexp_replace(cte1.indication, '[^a-zA-Z0-9]+', '')
          = regexp_replace(prj.indication, '[^a-zA-Z0-9]+', '')
      left join y on 1 = 1  -- attach the single max_prj_id row to every row
    where
      rnk = 1
      and prj.project_id is null  -- anti-join: keep only unmatched rows
      and cte1.`Project Number` in (
        select
          `Project Number`
        from
          cte1
        group by
          `Project Number`
        having
          count(*) > 1
      )
    union
    -- Branch 2: projects that appear exactly once in cte1. These are matched
    -- on project number only and are emitted with a null indication.
    select
      cte1.`Project Number`,
      null as indication,
      max_prj_id
    from
      cte1
      left join vw_project_id prj
        on cte1.`Project Number` = prj.`Project Number`
      left join y on 1 = 1  -- attach the single max_prj_id row to every row
    where
      rnk = 1
      and prj.project_id is null  -- anti-join: keep only unmatched rows
      and cte1.`Project Number` in (
        select
          `Project Number`
        from
          cte1
        group by
          `Project Number`
        having
          count(*) = 1
      )
  ) new_projects

获取 pyspark.sql.utils.ParseException：在 pyspark sql 的 'in' 处缺少 ')'

Getting pyspark.sql.utils.ParseException: missing ')' at 'in' in pyspark sql

apache-spark-sql

pyspark