如何使用 Hive (get_json_object) 查询结构数组?
How to query struct array with Hive (get_json_object)?
我将以下 JSON 对象存储在 Hive 表中:
{
"main_id": "qwert",
"features": [
{
"scope": "scope1",
"name": "foo",
"value": "ab12345",
"age": 50,
"somelist": ["abcde","fghij"]
},
{
"scope": "scope2",
"name": "bar",
"value": "cd67890"
},
{
"scope": "scope3",
"name": "baz",
"value": [
"A",
"B",
"C"
]
}
]
}
"features" 是一个可变长度的数组,即所有对象都是可选的。对象具有任意元素,但它们都包含 "scope"、"name" 和 "value".
这是我创建的 Hive table:
CREATE TABLE tbl(
main_id STRING,features array<struct<scope:STRING,name:STRING,value:array<STRING>,age:INT,somelist:array<STRING>>>
)
我需要一个 Hive 查询,返回 main_id 以及名称为 "baz" 的结构的 value,即
main_id baz_value
qwert ["A","B","C"]
我的问题是 Hive UDF "get_json_object" 仅支持 JSONPath 的一个受限子集,不支持像 get_json_object(features, '$.features[?(@.name='baz')]') 这样带过滤表达式的路径。
如何用Hive查询想要的结果?使用另一个 Hive table 结构可能更容易吗?
我找到了解决方案:
使用 Hive explode UDTF 分解结构数组,即创建第二个(临时)table,为数组 "features" 中的每个结构创建一条记录。
-- Materialize one row per element of the "features" array: each output row
-- carries main_id plus the name and value of a single struct from that array.
CREATE TABLE tbl_exploded as
select main_id,
f.name as f_name,
f.value as f_value
from tbl
LATERAL VIEW explode(features) exploded_table as f
-- optionally filter here instead of in 2nd query:
-- where f.name = 'baz';
结果是:
qwert, foo, ab12345
qwert, bar, cd67890
qwert, baz, ["A","B","C"]
现在您可以像这样查询 main_id 和对应的 value:
select main_id, f_value from tbl_exploded where f_name = 'baz';
这个应该可以吧
ADD JAR your-path/ParseJsonWithPath.jar;
CREATE TEMPORARY FUNCTION parseJsonWithPath AS 'com.ntc.hive.udf.ParseJsonWithPath';
SELECT parseJsonWithPath(jsonStr, xpath) FROM ....
要解析的字段可以是json字符串(jsonStr),给定xpath,可以得到你想要的。
例如
jsonStr
{ "book": [
{
"category": "reference",
"author": "Nigel Rees",
"title": "Sayings of the Century",
"price": 8.95
},
{
"category": "fiction",
"author": "Evelyn Waugh",
"title": "Sword of Honour",
"price": 12.99
}
]
}
xpath
"$.book"
返回 book 对应的内层 JSON 字符串 [....]
"$.book[?(@.price < 10)]"
return the [8.95]
我觉得下面贴的 UDF 很接近你的需求。它接受三个参数:一个 array<struct>、一个字符串和一个整数。字符串是要匹配的字段名称(在您的例子中是 "name"),第三个参数是要匹配的值。目前该值必须是整数,但改成字符串/文本以满足您的需求应该相对容易。
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.lazy.LazyString;
import org.apache.hadoop.hive.serde2.lazy.LazyLong;
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableConstantIntObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableConstantStringObjectInspector;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.LongWritable;
import java.util.ArrayList;
import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyLongObjectInspector;
@Description(name = "extract_value",
value = "_FUNC_( array< struct<value:string> > ) - Collect all \"value\" field values inside an array of struct(s), and return the results in an array<string>",
extended = "Example:\n SELECT _FUNC_(array_of_structs_with_value_field)")
public class StructFromArrayStructDynamicInt
extends GenericUDF
{
private ArrayList ret;
private ListObjectInspector listOI;
private StructObjectInspector structOI;
private ObjectInspector indOI;
private ObjectInspector valOI;
private ObjectInspector arg1OI;
private ObjectInspector arg2OI;
private String indexName;
WritableConstantStringObjectInspector element1OI;
WritableConstantIntObjectInspector element2OI;
@Override
public ObjectInspector initialize(ObjectInspector[] args)
throws UDFArgumentException
{
if (args.length != 3) {
throw new UDFArgumentLengthException("The function extract_value() requires exactly three arguments.");
}
if (args[0].getCategory() != Category.LIST) {
throw new UDFArgumentTypeException(0, "Type array<struct> is expected to be the argument for extract_value but " + args[0].getTypeName() + " is found instead");
}
if (args[1].getCategory() != Category.PRIMITIVE) {
throw new UDFArgumentTypeException(0, "Second argument is expected to be primitive but " + args[1].getTypeName() + " is found instead");
}
if (args[2].getCategory() != Category.PRIMITIVE) {
throw new UDFArgumentTypeException(0, "Second argument is expected to be primitive but " + args[2].getTypeName() + " is found instead");
}
listOI = ((ListObjectInspector) args[0]);
structOI = ((StructObjectInspector) listOI.getListElementObjectInspector());
arg1OI = (StringObjectInspector) args[1];
arg2OI = args[2];
this.element1OI = (WritableConstantStringObjectInspector) arg1OI;
this.element2OI = (WritableConstantIntObjectInspector) arg2OI;
indexName = element1OI.getWritableConstantValue().toString();
// if (structOI.getAllStructFieldRefs().size() != 2) {
// throw new UDFArgumentTypeException(0, "Incorrect number of fields in the struct, should be one");
// }
// StructField valueField = structOI.getStructFieldRef("value");
StructField indexField = structOI.getStructFieldRef(indexName);
//If not, throw exception
// if (valueField == null) {
// throw new UDFArgumentTypeException(0, "NO \"value\" field in input structure");
// }
if (indexField == null) {
throw new UDFArgumentTypeException(0, "Index field not in input structure");
}
//Are they of the correct types?
//We store these object inspectors for use in the evaluate() method
// valOI = valueField.getFieldObjectInspector();
indOI = indexField.getFieldObjectInspector();
//First are they primitives
// if (valOI.getCategory() != Category.PRIMITIVE) {
// throw new UDFArgumentTypeException(0, "value field must be of primitive type");
// }
if (indOI.getCategory() != Category.PRIMITIVE) {
throw new UDFArgumentTypeException(0, "index field must be of primitive type");
}
if (arg1OI.getCategory() != Category.PRIMITIVE) {
throw new UDFArgumentTypeException(0, "second argument must be primitive type");
}
if (arg2OI.getCategory() != Category.PRIMITIVE) {
throw new UDFArgumentTypeException(0, "third argument must be primitive type");
}
//Are they of the correct primitives?
// if (((PrimitiveObjectInspector)valOI).getPrimitiveCategory() != PrimitiveObjectInspector.PrimitiveCategory.STRING) {
// throw new UDFArgumentTypeException(0, "value field must be of string type");
// }
if (((PrimitiveObjectInspector)indOI).getPrimitiveCategory() != PrimitiveObjectInspector.PrimitiveCategory.LONG) {
throw new UDFArgumentTypeException(0, "index field must be of long type");
}
if (((PrimitiveObjectInspector)arg1OI).getPrimitiveCategory() != PrimitiveObjectInspector.PrimitiveCategory.STRING) {
throw new UDFArgumentTypeException(0, "second arg must be of string type");
}
if (((PrimitiveObjectInspector)arg2OI).getPrimitiveCategory() != PrimitiveObjectInspector.PrimitiveCategory.INT) {
throw new UDFArgumentTypeException(0, "third arg must be of int type");
}
// ret = new ArrayList();
return listOI.getListElementObjectInspector();
// return PrimitiveObjectInspectorFactory.javaStringObjectInspector;
// return ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
}
@Override
public Object evaluate(DeferredObject[] arguments)
throws HiveException
{
// ret.clear();
if (arguments.length != 3) {
return null;
}
if (arguments[0].get() == null) {
return null;
}
int numElements = listOI.getListLength(arguments[0].get());
// long xl = argOI.getPrimitiveJavaObject(arguments[1].get());
// long xl = arguments[1].get(); //9;
long xl2 = element2OI.get(arguments[2].get());
// String xl1 = element1OI.getPrimitiveJavaObject(arguments[2].get());
// long xl = 9;
for (int i = 0; i < numElements; i++) {
// LazyString valDataObject = (LazyString) (structOI.getStructFieldData(listOI.getListElement(arguments[0].get(), i), structOI.getStructFieldRef("value")));
long indValue = (Long) (structOI.getStructFieldData(listOI.getListElement(arguments[0].get(), i), structOI.getStructFieldRef(indexName)));
// throw new HiveException("second arg must be of string type");
// LazyString indDataObject = (LazyString) (structOI.getStructFieldData(listOI.getListElement(arguments[0].get(), i), structOI.getStructFieldRef("index")));
// Text valueValue = ((StringObjectInspector) valOI).getPrimitiveWritableObject(valDataObject);
// LongWritable indValue = ((LazyLongObjectInspector) indOI).getPrimitiveWritableObject(indDataObject);
if(indValue == xl2) {
return listOI.getListElement(arguments[0].get(), i);
}
// ret.add(valueValue);
}
return null;
}
@Override
public String getDisplayString(String[] strings) {
assert (strings.length > 0);
StringBuilder sb = new StringBuilder();
sb.append("extract_value(");
sb.append(strings[0]);
sb.append(")");
return sb.toString();
}
}
这里有这段代码以及其他几个可正常工作的、处理 array<struct> 的 UDF 的代码。
假设您有一个类型为 array<struct<...>> 的列 my_data(结构中含有 id 字段),并且您想按 id 查询,您可以这样做:
select * from <table_name> where my_data[n].id = 10;
这里n是你要搜索的索引,这样就省去了横向展开查询。
我将以下 JSON 对象存储在 Hive 表中:
{
"main_id": "qwert",
"features": [
{
"scope": "scope1",
"name": "foo",
"value": "ab12345",
"age": 50,
"somelist": ["abcde","fghij"]
},
{
"scope": "scope2",
"name": "bar",
"value": "cd67890"
},
{
"scope": "scope3",
"name": "baz",
"value": [
"A",
"B",
"C"
]
}
]
}
"features" 是一个可变长度的数组,即所有对象都是可选的。对象具有任意元素,但它们都包含 "scope"、"name" 和 "value".
这是我创建的 Hive table:
CREATE TABLE tbl(
main_id STRING,features array<struct<scope:STRING,name:STRING,value:array<STRING>,age:INT,somelist:array<STRING>>>
)
我需要一个 Hive 查询,返回 main_id 以及名称为 "baz" 的结构的 value,即
main_id baz_value
qwert ["A","B","C"]
我的问题是 Hive UDF "get_json_object" 仅支持 JSONPath 的一个受限子集,不支持像 get_json_object(features, '$.features[?(@.name='baz')]') 这样带过滤表达式的路径。
如何用Hive查询想要的结果?使用另一个 Hive table 结构可能更容易吗?
我找到了解决方案:
使用 Hive explode UDTF 分解结构数组,即创建第二个(临时)table,为数组 "features" 中的每个结构创建一条记录。
-- Materialize one row per element of the "features" array: each output row
-- carries main_id plus the name and value of a single struct from that array.
CREATE TABLE tbl_exploded as
select main_id,
f.name as f_name,
f.value as f_value
from tbl
LATERAL VIEW explode(features) exploded_table as f
-- optionally filter here instead of in 2nd query:
-- where f.name = 'baz';
结果是:
qwert, foo, ab12345
qwert, bar, cd67890
qwert, baz, ["A","B","C"]
现在您可以像这样查询 main_id 和对应的 value:
select main_id, f_value from tbl_exploded where f_name = 'baz';
这个应该可以吧
ADD JAR your-path/ParseJsonWithPath.jar;
CREATE TEMPORARY FUNCTION parseJsonWithPath AS 'com.ntc.hive.udf.ParseJsonWithPath';
SELECT parseJsonWithPath(jsonStr, xpath) FROM ....
要解析的字段可以是json字符串(jsonStr),给定xpath,可以得到你想要的。
例如
jsonStr
{ "book": [
{
"category": "reference",
"author": "Nigel Rees",
"title": "Sayings of the Century",
"price": 8.95
},
{
"category": "fiction",
"author": "Evelyn Waugh",
"title": "Sword of Honour",
"price": 12.99
}
]
}
xpath
"$.book"
返回 book 对应的内层 JSON 字符串 [....]
"$.book[?(@.price < 10)]"
return the [8.95]
我觉得下面贴的 UDF 很接近你的需求。它接受三个参数:一个 array<struct>、一个字符串和一个整数。字符串是要匹配的字段名称(在您的例子中是 "name"),第三个参数是要匹配的值。目前该值必须是整数,但改成字符串/文本以满足您的需求应该相对容易。
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.lazy.LazyString;
import org.apache.hadoop.hive.serde2.lazy.LazyLong;
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableConstantIntObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableConstantStringObjectInspector;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.LongWritable;
import java.util.ArrayList;
import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyLongObjectInspector;
@Description(name = "extract_value",
value = "_FUNC_( array< struct<value:string> > ) - Collect all \"value\" field values inside an array of struct(s), and return the results in an array<string>",
extended = "Example:\n SELECT _FUNC_(array_of_structs_with_value_field)")
public class StructFromArrayStructDynamicInt
extends GenericUDF
{
private ArrayList ret;
private ListObjectInspector listOI;
private StructObjectInspector structOI;
private ObjectInspector indOI;
private ObjectInspector valOI;
private ObjectInspector arg1OI;
private ObjectInspector arg2OI;
private String indexName;
WritableConstantStringObjectInspector element1OI;
WritableConstantIntObjectInspector element2OI;
@Override
public ObjectInspector initialize(ObjectInspector[] args)
throws UDFArgumentException
{
if (args.length != 3) {
throw new UDFArgumentLengthException("The function extract_value() requires exactly three arguments.");
}
if (args[0].getCategory() != Category.LIST) {
throw new UDFArgumentTypeException(0, "Type array<struct> is expected to be the argument for extract_value but " + args[0].getTypeName() + " is found instead");
}
if (args[1].getCategory() != Category.PRIMITIVE) {
throw new UDFArgumentTypeException(0, "Second argument is expected to be primitive but " + args[1].getTypeName() + " is found instead");
}
if (args[2].getCategory() != Category.PRIMITIVE) {
throw new UDFArgumentTypeException(0, "Second argument is expected to be primitive but " + args[2].getTypeName() + " is found instead");
}
listOI = ((ListObjectInspector) args[0]);
structOI = ((StructObjectInspector) listOI.getListElementObjectInspector());
arg1OI = (StringObjectInspector) args[1];
arg2OI = args[2];
this.element1OI = (WritableConstantStringObjectInspector) arg1OI;
this.element2OI = (WritableConstantIntObjectInspector) arg2OI;
indexName = element1OI.getWritableConstantValue().toString();
// if (structOI.getAllStructFieldRefs().size() != 2) {
// throw new UDFArgumentTypeException(0, "Incorrect number of fields in the struct, should be one");
// }
// StructField valueField = structOI.getStructFieldRef("value");
StructField indexField = structOI.getStructFieldRef(indexName);
//If not, throw exception
// if (valueField == null) {
// throw new UDFArgumentTypeException(0, "NO \"value\" field in input structure");
// }
if (indexField == null) {
throw new UDFArgumentTypeException(0, "Index field not in input structure");
}
//Are they of the correct types?
//We store these object inspectors for use in the evaluate() method
// valOI = valueField.getFieldObjectInspector();
indOI = indexField.getFieldObjectInspector();
//First are they primitives
// if (valOI.getCategory() != Category.PRIMITIVE) {
// throw new UDFArgumentTypeException(0, "value field must be of primitive type");
// }
if (indOI.getCategory() != Category.PRIMITIVE) {
throw new UDFArgumentTypeException(0, "index field must be of primitive type");
}
if (arg1OI.getCategory() != Category.PRIMITIVE) {
throw new UDFArgumentTypeException(0, "second argument must be primitive type");
}
if (arg2OI.getCategory() != Category.PRIMITIVE) {
throw new UDFArgumentTypeException(0, "third argument must be primitive type");
}
//Are they of the correct primitives?
// if (((PrimitiveObjectInspector)valOI).getPrimitiveCategory() != PrimitiveObjectInspector.PrimitiveCategory.STRING) {
// throw new UDFArgumentTypeException(0, "value field must be of string type");
// }
if (((PrimitiveObjectInspector)indOI).getPrimitiveCategory() != PrimitiveObjectInspector.PrimitiveCategory.LONG) {
throw new UDFArgumentTypeException(0, "index field must be of long type");
}
if (((PrimitiveObjectInspector)arg1OI).getPrimitiveCategory() != PrimitiveObjectInspector.PrimitiveCategory.STRING) {
throw new UDFArgumentTypeException(0, "second arg must be of string type");
}
if (((PrimitiveObjectInspector)arg2OI).getPrimitiveCategory() != PrimitiveObjectInspector.PrimitiveCategory.INT) {
throw new UDFArgumentTypeException(0, "third arg must be of int type");
}
// ret = new ArrayList();
return listOI.getListElementObjectInspector();
// return PrimitiveObjectInspectorFactory.javaStringObjectInspector;
// return ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
}
@Override
public Object evaluate(DeferredObject[] arguments)
throws HiveException
{
// ret.clear();
if (arguments.length != 3) {
return null;
}
if (arguments[0].get() == null) {
return null;
}
int numElements = listOI.getListLength(arguments[0].get());
// long xl = argOI.getPrimitiveJavaObject(arguments[1].get());
// long xl = arguments[1].get(); //9;
long xl2 = element2OI.get(arguments[2].get());
// String xl1 = element1OI.getPrimitiveJavaObject(arguments[2].get());
// long xl = 9;
for (int i = 0; i < numElements; i++) {
// LazyString valDataObject = (LazyString) (structOI.getStructFieldData(listOI.getListElement(arguments[0].get(), i), structOI.getStructFieldRef("value")));
long indValue = (Long) (structOI.getStructFieldData(listOI.getListElement(arguments[0].get(), i), structOI.getStructFieldRef(indexName)));
// throw new HiveException("second arg must be of string type");
// LazyString indDataObject = (LazyString) (structOI.getStructFieldData(listOI.getListElement(arguments[0].get(), i), structOI.getStructFieldRef("index")));
// Text valueValue = ((StringObjectInspector) valOI).getPrimitiveWritableObject(valDataObject);
// LongWritable indValue = ((LazyLongObjectInspector) indOI).getPrimitiveWritableObject(indDataObject);
if(indValue == xl2) {
return listOI.getListElement(arguments[0].get(), i);
}
// ret.add(valueValue);
}
return null;
}
@Override
public String getDisplayString(String[] strings) {
assert (strings.length > 0);
StringBuilder sb = new StringBuilder();
sb.append("extract_value(");
sb.append(strings[0]);
sb.append(")");
return sb.toString();
}
}
这里有这段代码以及其他几个可正常工作的、处理 array<struct> 的 UDF 的代码。
假设您有一个类型为 array<struct<...>> 的列 my_data(结构中含有 id 字段),并且您想按 id 查询,您可以这样做:
select * from <table_name> where my_data[n].id = 10;
这里n是你要搜索的索引,这样就省去了横向展开查询。