在 AWS Athena 中使用 AND 条件查询 JSON 数组

AWS Athena query JSON array with AND Condition

我有如下所示的 JSON 数据保存在 S3 中,正在使用 Athena 编写 select 语句。

  {
   "sample_data":{
      "people":[
         {
            "firstName":"Emily",
            "address":{
               "streetAddress":"101",
               "city":"abc",
               "state":"",
               "phoneNumbers":[
                  {
                     "type":"home",
                     "number":"3"
                  },
                  {
                     "type":"city",
                     "number":"4"
                  }
               ]
            }
         },
          {
            "firstName":"Smily",
            "address":{
               "streetAddress":"102",
               "city":"def",
               "state":"",
               "phoneNumbers":[
                  {
                     "type":"home",
                     "number":"1"
                  },
                  {
                     "type":"city",
                     "number":"1"
                  }
               ]
            }
         }
      ]
   }
}

如何编写一条 select 语句,查询出 home 号码 > 2 且 city 号码 = 4 的记录的 streetAddress 和 city?

我试过 UNNEST 但没有用。

预期输出:

streetAddress  city
101            abc   

我尝试了下面这个 UNNEST 查询,但它把电话号码展开成了多行,因此无法同时按 home 和 city 进行过滤,因为它们现在位于不同的行中。

-- Asker's attempt: expand every person, then every phone entry of that person,
-- producing one output row per (person, phone number) pair.
-- Because the "home" and "city" entries end up on separate rows, a single-row
-- predicate cannot test both of them at once.
-- NOTE(review): the JSON paths below use all-lowercase keys, unlike the
-- camelCase keys in the sample document; presumably the table's SerDe
-- lowercases keys -- confirm against the table definition.
SELECT
    ph.idx,
    JSON_EXTRACT_SCALAR(p.person, '$.address.streetaddress') AS streetaddress,
    JSON_EXTRACT_SCALAR(p.person, '$.address.city') AS city,
    JSON_EXTRACT_SCALAR(ph.phone, '$.type') AS type,
    JSON_EXTRACT_SCALAR(ph.phone, '$.number') AS value
FROM sample_data1
-- one row per element of the top-level "people" array
CROSS JOIN UNNEST(
    CAST(JSON_EXTRACT(sample_data, '$.people') AS ARRAY<JSON>)
) AS p (person)
-- one row per phone entry; idx is the 1-based position supplied by WITH ORDINALITY
CROSS JOIN UNNEST(
    CAST(JSON_EXTRACT(p.person, '$.address.phonenumbers') AS ARRAY<JSON>)
) WITH ORDINALITY AS ph (phone, idx);

unnest 会将数据展平为多行,因此您可以改用数组函数直接处理数组,而无需取消嵌套。Athena 当前使用的 Presto 版本不支持 any_match,因此需要使用 cardinality + filter 的组合(并且它也不支持通过 JSON 路径进行过滤):

-- Sample data: two inline person documents, one row per person.
WITH dataset (json_str) AS (
    VALUES
        (JSON '{
            "firstName":"Emily",
            "address":{
               "streetAddress":"101",
               "city":"abc",
               "state":"",
               "phoneNumbers":[
                  {
                     "type":"home",
                     "number":"11"
                  },
                  {
                     "type":"city",
                     "number":"4"
                  }
               ]
            }
         }'),
        (JSON '{
            "firstName":"Smily",
            "address":{
               "streetAddress":"102",
               "city":"def",
               "state":"",
               "phoneNumbers":[
                  {
                     "type":"home",
                     "number":"1"
                  },
                  {
                     "type":"city",
                     "number":"1"
                  }
               ]
            }
         }')
),
-- Extract the scalar address fields and keep the phone list as an array of
-- JSON values, so it can be inspected per person without UNNEST.
people AS (
    SELECT
        JSON_EXTRACT_SCALAR(json_str, '$.address.streetAddress') AS street_address,
        JSON_EXTRACT_SCALAR(json_str, '$.address.city') AS city,
        CAST(JSON_EXTRACT(json_str, '$.address.phoneNumbers') AS ARRAY(JSON)) AS phones
    FROM dataset
)
-- Query: keep a person only when BOTH phone conditions hold. Each condition
-- counts the phone entries that satisfy it and requires at least one.
SELECT
    street_address,
    city
FROM people
WHERE
    -- at least one "home" entry whose number, read as an integer, exceeds 2
    CARDINALITY(
        FILTER(
            phones,
            phone -> JSON_EXTRACT_SCALAR(phone, '$.type') = 'home'
                AND TRY_CAST(JSON_EXTRACT_SCALAR(phone, '$.number') AS INTEGER) > 2
        )
    ) > 0
    -- at least one "city" entry whose number is exactly the string '4'
    AND CARDINALITY(
        FILTER(
            phones,
            phone -> JSON_EXTRACT_SCALAR(phone, '$.type') = 'city'
                AND JSON_EXTRACT_SCALAR(phone, '$.number') = '4'
        )
    ) > 0

输出:

street_address city
101 abc