如何使用 Hive (get_json_object) 查询结构数组?

How to query struct array with Hive (get_json_object)?

我将以下 JSON 对象存储在 Hive 表中:

{
  "main_id": "qwert",
  "features": [
    {
      "scope": "scope1",
      "name": "foo",
      "value": "ab12345",
      "age": 50,
      "somelist": ["abcde","fghij"]
    },
    {
      "scope": "scope2",
      "name": "bar",
      "value": "cd67890"
    },
    {
      "scope": "scope3",
      "name": "baz",
      "value": [
        "A",
        "B",
        "C"
      ]
    }
  ]
}

"features" 是一个可变长度的数组,即所有对象都是可选的。对象具有任意元素,但它们都包含 "scope"、"name" 和 "value".

这是我创建的 Hive table:

CREATE TABLE tbl(
main_id STRING,features array<struct<scope:STRING,name:STRING,value:array<STRING>,age:INT,somelist:array<STRING>>>
)

我需要一个 Hive 查询,返回 main_id 以及名称为 "baz" 的结构的 value,即

main_id baz_value
qwert ["A","B","C"]

我的问题是 Hive UDF“get_json_object”仅支持 JSONPath 的有限版本。它不支持像 get_json_object(features, '$.features[?(@.name='baz')]') 这样的路径。

如何用Hive查询想要的结果?使用另一个 Hive table 结构可能更容易吗?

我找到了解决方案:

使用 Hive explode UDTF 分解结构数组,即创建第二个(临时)table,为数组 "features" 中的每个结构创建一条记录。

-- Materialize one row per element of the "features" array: LATERAL VIEW
-- explode(features) pairs every row of tbl with each struct in its array
-- (aliased as f), so a struct can later be selected by name with a plain
-- WHERE clause. Only main_id, f.name and f.value are kept.
CREATE TABLE tbl_exploded as
select main_id, 
f.name as f_name,
f.value as f_value
from tbl
LATERAL VIEW explode(features) exploded_table as f
-- optionally filter here instead of in 2nd query:
-- where f.name = 'baz'; 

结果是:

qwert, foo, ab12345
qwert, bar, cd67890
qwert, baz, ["A","B","C"]

现在您可以 select main_id 和这样的值:

select main_id, f_value from tbl_exploded where f_name = 'baz';

这个应该可以吧

ParseJsonWithPath

ADD JAR your-path/ParseJsonWithPath.jar;
CREATE TEMPORARY FUNCTION parseJsonWithPath AS 'com.ntc.hive.udf.ParseJsonWithPath';

SELECT parseJsonWithPath(jsonStr, xpath) FROM ....

要解析的字段可以是json字符串(jsonStr),给定xpath,可以得到你想要的。

例如

jsonStr
{ "book": [
    {
        "category": "reference",
        "author": "Nigel Rees",
        "title": "Sayings of the Century",
        "price": 8.95
    },
    {
        "category": "fiction",
        "author": "Evelyn Waugh",
        "title": "Sword of Honour",
        "price": 12.99
    }
  ]
}

xpath
"$.book" 
        returns the inner JSON string [....]
"$.book[?(@.price < 10)]" 
        return the [8.95]

more detail

下面贴的UDF我觉得很接近你的需求。它需要 array<struct>、一个字符串和一个整数。字符串是字段名称,在您的例子中是 "name",第三个参数是要匹配的值。目前它需要一个整数,但为了您的目的将其更改为字符串/文本应该相对容易。

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.lazy.LazyString;
import org.apache.hadoop.hive.serde2.lazy.LazyLong;
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableConstantIntObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableConstantStringObjectInspector;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.LongWritable;
import java.util.ArrayList;
import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyLongObjectInspector;

@Description(name = "extract_value",
    value = "_FUNC_( array< struct<value:string> > ) - Collect all \"value\" field values inside an array of struct(s), and return the results in an array<string>",
    extended = "Example:\n SELECT _FUNC_(array_of_structs_with_value_field)")
public class StructFromArrayStructDynamicInt
        extends GenericUDF
{
    private ArrayList ret;

    private ListObjectInspector listOI;
    private StructObjectInspector structOI;
    private ObjectInspector indOI;
    private ObjectInspector valOI;
    private ObjectInspector arg1OI;
    private ObjectInspector arg2OI;

    private String indexName;

    WritableConstantStringObjectInspector element1OI;
    WritableConstantIntObjectInspector element2OI;

    @Override
    public ObjectInspector initialize(ObjectInspector[] args)
            throws UDFArgumentException
    {
        if (args.length != 3) {
            throw new UDFArgumentLengthException("The function extract_value() requires exactly three arguments.");
        }

        if (args[0].getCategory() != Category.LIST) {
            throw new UDFArgumentTypeException(0, "Type array<struct> is expected to be the argument for extract_value but " + args[0].getTypeName() + " is found instead");
        }
        if (args[1].getCategory() != Category.PRIMITIVE) {
            throw new UDFArgumentTypeException(0, "Second argument is expected to be primitive but " + args[1].getTypeName() + " is found instead");
        }
        if (args[2].getCategory() != Category.PRIMITIVE) {
            throw new UDFArgumentTypeException(0, "Second argument is expected to be primitive but " + args[2].getTypeName() + " is found instead");
        }

        listOI = ((ListObjectInspector) args[0]);
        structOI = ((StructObjectInspector) listOI.getListElementObjectInspector());
        arg1OI = (StringObjectInspector) args[1];
        arg2OI = args[2];

        this.element1OI = (WritableConstantStringObjectInspector) arg1OI;
        this.element2OI = (WritableConstantIntObjectInspector) arg2OI;

        indexName = element1OI.getWritableConstantValue().toString();

//        if (structOI.getAllStructFieldRefs().size() != 2) {
//            throw new UDFArgumentTypeException(0, "Incorrect number of fields in the struct, should be one");
//        }

//        StructField valueField = structOI.getStructFieldRef("value");
        StructField indexField = structOI.getStructFieldRef(indexName);
        //If not, throw exception
//        if (valueField == null) {
//            throw new UDFArgumentTypeException(0, "NO \"value\" field in input structure");
//        }

        if (indexField == null) {
            throw new UDFArgumentTypeException(0, "Index field not in input structure");
        }

        //Are they of the correct types?
        //We store these object inspectors for use in the evaluate() method
//        valOI = valueField.getFieldObjectInspector();
        indOI = indexField.getFieldObjectInspector();

        //First are they primitives
//        if (valOI.getCategory() != Category.PRIMITIVE) {
//            throw new UDFArgumentTypeException(0, "value field must be of primitive type");
//        }
        if (indOI.getCategory() != Category.PRIMITIVE) {
            throw new UDFArgumentTypeException(0, "index field must be of primitive type");
        }
        if (arg1OI.getCategory() != Category.PRIMITIVE) {
            throw new UDFArgumentTypeException(0, "second argument must be primitive type");
        }
        if (arg2OI.getCategory() != Category.PRIMITIVE) {
            throw new UDFArgumentTypeException(0, "third argument must be primitive type");
        }

        //Are they of the correct primitives?
//        if (((PrimitiveObjectInspector)valOI).getPrimitiveCategory() != PrimitiveObjectInspector.PrimitiveCategory.STRING) {
//            throw new UDFArgumentTypeException(0, "value field must be of string type");
//        }
        if (((PrimitiveObjectInspector)indOI).getPrimitiveCategory() != PrimitiveObjectInspector.PrimitiveCategory.LONG) {
            throw new UDFArgumentTypeException(0, "index field must be of long type");
        }
        if (((PrimitiveObjectInspector)arg1OI).getPrimitiveCategory() != PrimitiveObjectInspector.PrimitiveCategory.STRING) {
            throw new UDFArgumentTypeException(0, "second arg must be of string type");
        }
        if (((PrimitiveObjectInspector)arg2OI).getPrimitiveCategory() != PrimitiveObjectInspector.PrimitiveCategory.INT) {
            throw new UDFArgumentTypeException(0, "third arg must be of int type");
        }

//        ret = new ArrayList();
        return listOI.getListElementObjectInspector();
//        return PrimitiveObjectInspectorFactory.javaStringObjectInspector;
//        return ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
    }

    @Override
    public Object evaluate(DeferredObject[] arguments)
            throws HiveException
    {
//        ret.clear();

        if (arguments.length != 3) {
            return null;
        }

        if (arguments[0].get() == null) {
        return null;
        }

        int numElements = listOI.getListLength(arguments[0].get());
//        long xl = argOI.getPrimitiveJavaObject(arguments[1].get());
//        long xl = arguments[1].get(); //9;
        long xl2 = element2OI.get(arguments[2].get());
//        String xl1 = element1OI.getPrimitiveJavaObject(arguments[2].get());

//        long xl = 9;

        for (int i = 0; i < numElements; i++) {
//            LazyString valDataObject = (LazyString) (structOI.getStructFieldData(listOI.getListElement(arguments[0].get(), i), structOI.getStructFieldRef("value")));
            long indValue = (Long) (structOI.getStructFieldData(listOI.getListElement(arguments[0].get(), i), structOI.getStructFieldRef(indexName)));
//            throw new HiveException("second arg must be of string type");
//            LazyString indDataObject = (LazyString) (structOI.getStructFieldData(listOI.getListElement(arguments[0].get(), i), structOI.getStructFieldRef("index")));
//            Text valueValue = ((StringObjectInspector) valOI).getPrimitiveWritableObject(valDataObject);
//            LongWritable indValue = ((LazyLongObjectInspector) indOI).getPrimitiveWritableObject(indDataObject);

            if(indValue == xl2) {
                return listOI.getListElement(arguments[0].get(), i);
            }

//            ret.add(valueValue);
       }
        return null;
    }

    @Override
    public String getDisplayString(String[] strings) {
        assert (strings.length > 0);
        StringBuilder sb = new StringBuilder();
        sb.append("extract_value(");
        sb.append(strings[0]);
        sb.append(")");
        return sb.toString();
    }
}

这里是这段代码,以及其他几个可用于 array<struct> 的 UDF 的代码。

假设您有一个类型为 array<struct<...>> 的列 my_data(结构中包含 id 字段),并且您想按 id 查询,您可以这样做

select * from <table_name> where my_data[n].id = 10;

这里n是你要搜索的索引,这样就省去了横向展开查询。