在 JSON 文件上使用 Hive
Hive on JSON File
我在使用 Hive 解析 JSON 文件时需要帮助。该文件包含嵌套数组,当我尝试使用 Hive UDF 解析或查询该文件时,只能向下钻取一层,再下一层的数组在结果中显示为 NULL。我在下面给出了示例。文件包含若干部分(数组),下面给出的是其中最复杂的部分。我尝试使用 get_json_object 来解析,但只能取到第一层的数据,完全取不出嵌套数组。如果有人能指导我如何解析嵌套的 JSON 数组,
会很有帮助。
"section": {
"moodCode": "xxx",
"classCode": "xxx",
"templateId": {
"root": "2.xx.840"
},
"code": {
"codeSystemName": "LOINC",
},
"title": "problems",
"text": {
"mediaType": "text/x-hl7-text+xml",
"list": [{
"caption": "Recorded",
"item": {
"ID": "pr101",
"content": [{
"ID": "pr101-desc",
"text": "Salm"
},
"003.1"],
"text": " "
},
"text": " "
},
{
"caption": "Reported",
"item": "None Reported",
"text": " "
}],
"text": " "
},
"entry": {
"typeCode": "DRIV",
"act": {
"moodCode": "EVN",
"classCode": "ACT",
"templateId": [{
"root": "2.16.840.1.0.1.27"
},
{
"root": "1.3.6.1.4..1"
},
{
"root": "1.3.6.1.4.4.5.2"
}],
"id": {
"root": "068fd4d4-dfa2-48190768f"
},
"code": {
"nullFlavor": "NA"
},
"statusCode": {
"code": "completed"
},
"effectiveTime": {
"low": {
"value": "20140428144743+0100"
},
"high": {
"value": "20140428144743+0100"
},
"text": " "
},
"entryRelationship": {
"typeCode": "SUBJ",
"observation": {
"moodCode": "EVN",
"classCode": "OBS",
"templateId": [{
"root": "2.16.840.1.20.1.28"
},
{
"root": "1.3.6.1.4.1.5.3.1.4.5"
}],
"id": {
"root": "fa34b4da4dbb-b090-01bd4d6ef62b"
},
"code": {
"codeSystemName": "SNOMED CT",
"code": "282009",
"displayName": "diagnosis",
"codeSystem": "2.16.840.1.6.96"
},
"text": {
"reference": {
"value": "#pr101"
},
"text": " "
},
"statusCode": {
"code": "completed"
},
"effectiveTime": {
"low": {
"value": "20140428144743+0100"
},
"high": {
"nullFlavor": "UNK"
},
"text": " "
},
"value": {
"codeSystemName": "ICD-9",
"xsi:type": "CD",
"code": "003.1",
"displayName": "Sla sia",
"codeSystem": "2.16.840.1.103",
"originalText": {
"reference": {
"value": "#pr101-desc"
},
"text": " "
},
"text": " "
},
"entryRelationship": {
"typeCode": "REFR",
"observation": {
"moodCode": "EVN",
"classCode": "OBS",
"templateId": [{
"root": "2.16.840..1.50"
},
{
"root": "2.16.8410.20.1.57"
},
{
"root": "1.3.6.13.1.4.1.1"
}],
"code": {
"codeSystemName": "LOINC",
"code": "33999-4",
"displayName": "Status",
"codeSystem": "2.16.840.1.113883.6.1"
},
"statusCode": {
"code": "completed"
},
"value": {
"codeSystemName": " CT",
"xsi:type": "CE",
"code": "55563",
"displayName": "active",
"codeSystem": "2.16.8406.96"
},
"text": " "
},
"text": " "
},
"text": " "
},
"text": " "
},
"text": " "
},
"text": " "
},
"text": " "
},
"text": " "
},
当我使用 get_json_object 时,下面提到的这部分数据返回 null。
"title": "problems",
"text": {
"mediaType": "text/x-hl7-text+xml",
"list": [{
"caption": "Recorded",
"item": {
"ID": "pr101",
"content": [{
"ID": "pr101-desc",
"text": "Salm"
},
"003.1"],
更新:您必须把 JSON 文件格式化为每条记录只占一行,例如:
'{"section":{"moodCode":"xxx","classCode":"xxx","templateId":{"root":"2.xx.840"},"code":{"codeSystemName":"LOINC"},"title":"problems","text":{"mediaType":"text/x-hl7-text+xml","list":[{"caption":"Recorded","item":{"ID":"pr101","content":[{"ID":"pr101-desc","text":"Salm"},"003.1"],"text":" "},"text":" "},{"caption":"Reported","item":"None Reported","text":" "}],"text":" "},"entry":{"typeCode":"DRIV","act":{"moodCode":"EVN","classCode":"ACT","templateId":[{"root":"2.16.840.1.0.1.27"},{"root":"1.3.6.1.4..1"},{"root":"1.3.6.1.4.4.5.2"}],"id":{"root":"068fd4d4-dfa2-48190768f"},"code":{"nullFlavor":"NA"},"statusCode":{"code":"completed"},"effectiveTime":{"low":{"value":"20140428144743+0100"},"high":{"value":"20140428144743+0100"},"text":""},"entryRelationship":{"typeCode":"SUBJ","observation":{"moodCode":"EVN","classCode":"OBS","templateId":[{"root":"2.16.840.1.20.1.28"},{"root":"1.3.6.1.4.1.5.3.1.4.5"}],"id":{"root":"fa34b4da4dbb-b090-01bd4d6ef62b"},"code":{"codeSystemName":"SNOMEDCT","code":"282009","displayName":"diagnosis","codeSystem":"2.16.840.1.6.96"},"text":"","statusCode":{"code":"completed"},"effectiveTime":{"low":{"value":"20140428144743+0100"},"high":{"nullFlavor":"UNK"},"text":""},"value":{"codeSystemName":"ICD-9","xsi: type":"CD","code":"003.1","displayName":"Slasia","codeSystem":"2.16.840.1.103","originalText":{"reference":{"value":"#pr101-desc"},"text":""},"text":""},"entryRelationship":{"typeCode":"REFR","observation":{"moodCode":"EVN","classCode":"OBS","templateId":[{"root":"2.16.840..1.50"},{"root":"2.16.8410.20.1.57"},{"root":"1.3.6.13.1.4.1.1"}],"code":{"codeSystemName":"LOINC","code":"33999-4","displayName":"Status","codeSystem":"2.16.840.1.113883.6.1"},"statusCode":{"code":"completed"},"value":{"codeSystemName":"CT","xsi: type":"CE","code":"55563","displayName":"active","codeSystem":"2.16.8406.96"},"text":""},"text":""}},"text":""},"text":""},"text":""}},"text":""}'
现在创建一个表,将整个字符串视为一列:
'DROP TABLE IF EXISTS json_test;
-- IF EXISTS keeps the drop above from depending on the table already being present.
-- The raw JSON text is kept in a single string column named value;
-- 'path' is a placeholder for the directory that holds the one-record-per-line JSON file.
CREATE EXTERNAL TABLE json_test (value string)
LOCATION 'path';'
现在您可以使用 LATERAL VIEW 配合 json_tuple 来获取所需的字段。
'set hive.cli.print.header=true;
-- Walk the JSON one level per LATERAL VIEW: each json_tuple call takes the
-- string produced by the previous step and exposes the listed keys as columns.
-- b: the top-level "section" object
-- c: fields of section (templateId and text are passed on to d and e)
-- d: "root" inside section.templateId
-- e: "mediaType" inside section.text
SELECT
    c.moodCode,
    c.classCode,
    d.root,
    c.title,
    e.mediaType
FROM json_test a
LATERAL VIEW json_tuple(a.value, 'section') b
    AS section
LATERAL VIEW json_tuple(b.section, 'moodCode', 'classCode', 'templateId', 'title', 'text') c
    AS moodCode, classCode, templateId, title, text
LATERAL VIEW json_tuple(c.templateId, 'root') d
    AS root
LATERAL VIEW json_tuple(c.text, 'mediaType') e
    AS mediaType;'
结果
'c.moodcode|c.classcode|d.root |c.title |e.mediatype
xxx | xxx |2.xx.840 |problems|text/x-hl7-text+xml'
我在使用 Hive 解析 JSON 文件时需要帮助。该文件包含嵌套数组,当我尝试使用 Hive UDF 解析或查询该文件时,只能向下钻取一层,再下一层的数组在结果中显示为 NULL。我在下面给出了示例。文件包含若干部分(数组),下面给出的是其中最复杂的部分。我尝试使用 get_json_object 来解析,但只能取到第一层的数据,完全取不出嵌套数组。如果有人能指导我如何解析嵌套的 JSON 数组,
会很有帮助。
"section": {
"moodCode": "xxx",
"classCode": "xxx",
"templateId": {
"root": "2.xx.840"
},
"code": {
"codeSystemName": "LOINC",
},
"title": "problems",
"text": {
"mediaType": "text/x-hl7-text+xml",
"list": [{
"caption": "Recorded",
"item": {
"ID": "pr101",
"content": [{
"ID": "pr101-desc",
"text": "Salm"
},
"003.1"],
"text": " "
},
"text": " "
},
{
"caption": "Reported",
"item": "None Reported",
"text": " "
}],
"text": " "
},
"entry": {
"typeCode": "DRIV",
"act": {
"moodCode": "EVN",
"classCode": "ACT",
"templateId": [{
"root": "2.16.840.1.0.1.27"
},
{
"root": "1.3.6.1.4..1"
},
{
"root": "1.3.6.1.4.4.5.2"
}],
"id": {
"root": "068fd4d4-dfa2-48190768f"
},
"code": {
"nullFlavor": "NA"
},
"statusCode": {
"code": "completed"
},
"effectiveTime": {
"low": {
"value": "20140428144743+0100"
},
"high": {
"value": "20140428144743+0100"
},
"text": " "
},
"entryRelationship": {
"typeCode": "SUBJ",
"observation": {
"moodCode": "EVN",
"classCode": "OBS",
"templateId": [{
"root": "2.16.840.1.20.1.28"
},
{
"root": "1.3.6.1.4.1.5.3.1.4.5"
}],
"id": {
"root": "fa34b4da4dbb-b090-01bd4d6ef62b"
},
"code": {
"codeSystemName": "SNOMED CT",
"code": "282009",
"displayName": "diagnosis",
"codeSystem": "2.16.840.1.6.96"
},
"text": {
"reference": {
"value": "#pr101"
},
"text": " "
},
"statusCode": {
"code": "completed"
},
"effectiveTime": {
"low": {
"value": "20140428144743+0100"
},
"high": {
"nullFlavor": "UNK"
},
"text": " "
},
"value": {
"codeSystemName": "ICD-9",
"xsi:type": "CD",
"code": "003.1",
"displayName": "Sla sia",
"codeSystem": "2.16.840.1.103",
"originalText": {
"reference": {
"value": "#pr101-desc"
},
"text": " "
},
"text": " "
},
"entryRelationship": {
"typeCode": "REFR",
"observation": {
"moodCode": "EVN",
"classCode": "OBS",
"templateId": [{
"root": "2.16.840..1.50"
},
{
"root": "2.16.8410.20.1.57"
},
{
"root": "1.3.6.13.1.4.1.1"
}],
"code": {
"codeSystemName": "LOINC",
"code": "33999-4",
"displayName": "Status",
"codeSystem": "2.16.840.1.113883.6.1"
},
"statusCode": {
"code": "completed"
},
"value": {
"codeSystemName": " CT",
"xsi:type": "CE",
"code": "55563",
"displayName": "active",
"codeSystem": "2.16.8406.96"
},
"text": " "
},
"text": " "
},
"text": " "
},
"text": " "
},
"text": " "
},
"text": " "
},
"text": " "
},
"text": " "
},
当我使用 get_json_object 时,下面提到的这部分数据返回 null。
"title": "problems",
"text": {
"mediaType": "text/x-hl7-text+xml",
"list": [{
"caption": "Recorded",
"item": {
"ID": "pr101",
"content": [{
"ID": "pr101-desc",
"text": "Salm"
},
"003.1"],
更新:您必须把 JSON 文件格式化为每条记录只占一行,例如:
'{"section":{"moodCode":"xxx","classCode":"xxx","templateId":{"root":"2.xx.840"},"code":{"codeSystemName":"LOINC"},"title":"problems","text":{"mediaType":"text/x-hl7-text+xml","list":[{"caption":"Recorded","item":{"ID":"pr101","content":[{"ID":"pr101-desc","text":"Salm"},"003.1"],"text":" "},"text":" "},{"caption":"Reported","item":"None Reported","text":" "}],"text":" "},"entry":{"typeCode":"DRIV","act":{"moodCode":"EVN","classCode":"ACT","templateId":[{"root":"2.16.840.1.0.1.27"},{"root":"1.3.6.1.4..1"},{"root":"1.3.6.1.4.4.5.2"}],"id":{"root":"068fd4d4-dfa2-48190768f"},"code":{"nullFlavor":"NA"},"statusCode":{"code":"completed"},"effectiveTime":{"low":{"value":"20140428144743+0100"},"high":{"value":"20140428144743+0100"},"text":""},"entryRelationship":{"typeCode":"SUBJ","observation":{"moodCode":"EVN","classCode":"OBS","templateId":[{"root":"2.16.840.1.20.1.28"},{"root":"1.3.6.1.4.1.5.3.1.4.5"}],"id":{"root":"fa34b4da4dbb-b090-01bd4d6ef62b"},"code":{"codeSystemName":"SNOMEDCT","code":"282009","displayName":"diagnosis","codeSystem":"2.16.840.1.6.96"},"text":"","statusCode":{"code":"completed"},"effectiveTime":{"low":{"value":"20140428144743+0100"},"high":{"nullFlavor":"UNK"},"text":""},"value":{"codeSystemName":"ICD-9","xsi: type":"CD","code":"003.1","displayName":"Slasia","codeSystem":"2.16.840.1.103","originalText":{"reference":{"value":"#pr101-desc"},"text":""},"text":""},"entryRelationship":{"typeCode":"REFR","observation":{"moodCode":"EVN","classCode":"OBS","templateId":[{"root":"2.16.840..1.50"},{"root":"2.16.8410.20.1.57"},{"root":"1.3.6.13.1.4.1.1"}],"code":{"codeSystemName":"LOINC","code":"33999-4","displayName":"Status","codeSystem":"2.16.840.1.113883.6.1"},"statusCode":{"code":"completed"},"value":{"codeSystemName":"CT","xsi: type":"CE","code":"55563","displayName":"active","codeSystem":"2.16.8406.96"},"text":""},"text":""}},"text":""},"text":""},"text":""}},"text":""}'
现在创建一个表,将整个字符串视为一列:
'DROP TABLE IF EXISTS json_test;
-- IF EXISTS keeps the drop above from depending on the table already being present.
-- The raw JSON text is kept in a single string column named value;
-- 'path' is a placeholder for the directory that holds the one-record-per-line JSON file.
CREATE EXTERNAL TABLE json_test (value string)
LOCATION 'path';'
现在您可以使用 LATERAL VIEW 配合 json_tuple 来获取所需的字段。
'set hive.cli.print.header=true;
-- Walk the JSON one level per LATERAL VIEW: each json_tuple call takes the
-- string produced by the previous step and exposes the listed keys as columns.
-- b: the top-level "section" object
-- c: fields of section (templateId and text are passed on to d and e)
-- d: "root" inside section.templateId
-- e: "mediaType" inside section.text
SELECT
    c.moodCode,
    c.classCode,
    d.root,
    c.title,
    e.mediaType
FROM json_test a
LATERAL VIEW json_tuple(a.value, 'section') b
    AS section
LATERAL VIEW json_tuple(b.section, 'moodCode', 'classCode', 'templateId', 'title', 'text') c
    AS moodCode, classCode, templateId, title, text
LATERAL VIEW json_tuple(c.templateId, 'root') d
    AS root
LATERAL VIEW json_tuple(c.text, 'mediaType') e
    AS mediaType;'
结果
'c.moodcode|c.classcode|d.root |c.title |e.mediatype
xxx | xxx |2.xx.840 |problems|text/x-hl7-text+xml'