在 Hive 中使用 XmlSerDe 解析 XML 文件时遇到问题
Issue in XML file parsing using XmlSerDe in Hive
以下是我的输入XML
<entity>
<link idType="ProviderId">AEY000977645</link>
<link idType="PAID">000977645</link>
<link idType="PID">AEY</link>
<message reason="Not Currently In TMS Database" status="Unmappable"/>
</entity>
我需要解析这些数据,并使用 hivexmlserde 创建一张包含 4 列(ProviderID、PAID、PID、message_reason)的 Hive 表。由于所有值都位于同名的 <link> 标签内(只能靠 idType 属性区分),因此很难解析数据。下面是我的建表 DDL。
-- Question's attempt: external Hive table over <entity> XML records read
-- through the IBM XmlSerDe, with one "column.xpath.<column>" property per column.
-- The three <link>-based XPaths below are placeholders and are not yet correct.
CREATE EXTERNAL TABLE xml_testing
(
provider_id String,
paid String,
pid String,
message_reason String
)
ROW FORMAT SERDE 'com.ibm.spss.hive.serde2.xml.XmlSerDe'
WITH SERDEPROPERTIES (
-- TODO: placeholder. This XPath addresses the idType attribute of every <link>,
-- not the text of the <link> whose idType is "ProviderId".
"column.xpath.provider_id"="/entity/link/@idType",
-- TODO: placeholder. Same expression as above; does not single out idType="PAID".
"column.xpath.paid"="/entity/link/@idType",
-- TODO: placeholder. Same expression as above; does not single out idType="PID".
"column.xpath.pid"="/entity/link/@idType",
-- Addresses the reason attribute of the <message> element.
"column.xpath.message_reason"="/entity/message/@reason"
)
STORED AS
INPUTFORMAT 'com.ibm.spss.hive.serde2.xml.XmlInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat'
LOCATION '/input/'
TBLPROPERTIES (
-- Opening and closing tags of one <entity> record
-- (presumably used by XmlInputFormat to split the input; confirm against the SerDe docs).
"xmlinput.start"="<entity>",
"xmlinput.end"="</entity>"
);
谁能帮我解决这个问题?
按属性值选择元素。
即 /entity/link[@idType='ProviderId']/text()
-- External Hive table over <entity> XML records read through the IBM XmlSerDe.
-- Each column is bound to an XPath expression; the three <link> elements share
-- a tag name, so each one is picked out by a predicate on its idType attribute.
CREATE EXTERNAL TABLE xml_testing (
    provider_id    STRING,
    paid           STRING,
    pid            STRING,
    message_reason STRING
)
ROW FORMAT SERDE 'com.ibm.spss.hive.serde2.xml.XmlSerDe'
WITH SERDEPROPERTIES (
    -- Text content of the <link> element whose idType matches the predicate.
    "column.xpath.provider_id"    = "/entity/link[@idType='ProviderId']/text()",
    "column.xpath.paid"           = "/entity/link[@idType='PAID']/text()",
    "column.xpath.pid"            = "/entity/link[@idType='PID']/text()",
    -- Value of the reason attribute on <message>.
    "column.xpath.message_reason" = "/entity/message/@reason"
)
STORED AS
    INPUTFORMAT  'com.ibm.spss.hive.serde2.xml.XmlInputFormat'
    OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat'
LOCATION '/input/'
TBLPROPERTIES (
    -- Opening and closing tags that delimit one record in the input files.
    "xmlinput.start" = "<entity>",
    "xmlinput.end"   = "</entity>"
);
以下是我的输入XML
<entity>
<link idType="ProviderId">AEY000977645</link>
<link idType="PAID">000977645</link>
<link idType="PID">AEY</link>
<message reason="Not Currently In TMS Database" status="Unmappable"/>
</entity>
我需要解析这些数据,并使用 hivexmlserde 创建一张包含 4 列(ProviderID、PAID、PID、message_reason)的 Hive 表。由于所有值都位于同名的 <link> 标签内(只能靠 idType 属性区分),因此很难解析数据。下面是我的建表 DDL。
-- Question's attempt: external Hive table over <entity> XML records read
-- through the IBM XmlSerDe, with one "column.xpath.<column>" property per column.
-- The three <link>-based XPaths below are placeholders and are not yet correct.
CREATE EXTERNAL TABLE xml_testing
(
provider_id String,
paid String,
pid String,
message_reason String
)
ROW FORMAT SERDE 'com.ibm.spss.hive.serde2.xml.XmlSerDe'
WITH SERDEPROPERTIES (
-- TODO: placeholder. This XPath addresses the idType attribute of every <link>,
-- not the text of the <link> whose idType is "ProviderId".
"column.xpath.provider_id"="/entity/link/@idType",
-- TODO: placeholder. Same expression as above; does not single out idType="PAID".
"column.xpath.paid"="/entity/link/@idType",
-- TODO: placeholder. Same expression as above; does not single out idType="PID".
"column.xpath.pid"="/entity/link/@idType",
-- Addresses the reason attribute of the <message> element.
"column.xpath.message_reason"="/entity/message/@reason"
)
STORED AS
INPUTFORMAT 'com.ibm.spss.hive.serde2.xml.XmlInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat'
LOCATION '/input/'
TBLPROPERTIES (
-- Opening and closing tags of one <entity> record
-- (presumably used by XmlInputFormat to split the input; confirm against the SerDe docs).
"xmlinput.start"="<entity>",
"xmlinput.end"="</entity>"
);
谁能帮我解决这个问题?
按属性值选择元素。
即 /entity/link[@idType='ProviderId']/text()
-- External Hive table over <entity> XML records read through the IBM XmlSerDe.
-- Each column is bound to an XPath expression; the three <link> elements share
-- a tag name, so each one is picked out by a predicate on its idType attribute.
CREATE EXTERNAL TABLE xml_testing (
    provider_id    STRING,
    paid           STRING,
    pid            STRING,
    message_reason STRING
)
ROW FORMAT SERDE 'com.ibm.spss.hive.serde2.xml.XmlSerDe'
WITH SERDEPROPERTIES (
    -- Text content of the <link> element whose idType matches the predicate.
    "column.xpath.provider_id"    = "/entity/link[@idType='ProviderId']/text()",
    "column.xpath.paid"           = "/entity/link[@idType='PAID']/text()",
    "column.xpath.pid"            = "/entity/link[@idType='PID']/text()",
    -- Value of the reason attribute on <message>.
    "column.xpath.message_reason" = "/entity/message/@reason"
)
STORED AS
    INPUTFORMAT  'com.ibm.spss.hive.serde2.xml.XmlInputFormat'
    OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat'
LOCATION '/input/'
TBLPROPERTIES (
    -- Opening and closing tags that delimit one record in the input files.
    "xmlinput.start" = "<entity>",
    "xmlinput.end"   = "</entity>"
);