如何使用 USQL 将 JSON 扁平化为 CSV
How do I flatten JSON to CSV using USQL
我可以使用 Microsoft.Analytics.Samples.Formats.Json.JsonFunctions.JsonTuple 获取部分数据，但无法将整个文件展平。
这是我正在使用的文件格式:
{
"SourceUrl":"http://www.unittest.org/test.html",
"Title":"Unit Test File",
"Guest":"Unit Test Guest",
"PublishDate":"2017-05-15T00:00:00",
"TranscriptionSections":[
{
"SectionStartTime":"00:00:03",
"Sentences":[
{
"Text":"Intro."
},
{
"Text":"Sentence one"
},
{
"Text":"Sentence two"
}
]
},
{
"SectionStartTime":"00:04:46",
"Sentences":[
{
"Text":"Sentence three"
},
{
"Text":"Sentence four"
}
]
}
],
"Categories":null
}
我想要的结果是每条文本（Text）各占一行（共 5 行），每行都带有它所属的 'SectionStartTime' 以及所有顶级属性（'PublishDate'、'Guest' 等）。
到目前为止，我用下面的脚本只能为每个 'SectionStartTime' 得到一行：
// Partial attempt: emits one CSV row per transcription section
// (top-level properties + SectionStartTime). The Sentences array of each
// section is not expanded here, so individual "Text" values are not output.

// Run in the context of the econosphere database.
USE econosphere;

// Assemblies that supply the JSON extractor and the JsonTuple helper.
REFERENCE ASSEMBLY [Newtonsoft.Json];
REFERENCE ASSEMBLY [Microsoft.Analytics.Samples.Formats];

// Source JSON document and destination CSV file.
DECLARE @in string = "adl://abc.azuredatalakestore.net/data/20170515UnitTest.json";
DECLARE @out string = "adl://abc.azuredatalakestore.net/processed/20170515UnitTest.csv";

// Read the top-level properties; TranscriptionSections is extracted as a
// string so it can be parsed further with JsonTuple below.
@episode =
    EXTRACT Title string,
            SourceUrl string,
            Guest string,
            PublishDate DateTime,
            TranscriptionSections string
    FROM @in
    USING new Microsoft.Analytics.Samples.Formats.Json.JsonExtractor();

// Parse the TranscriptionSections string and keep the parsed values
// (one per section) as a collection column.
@episodeWithSections =
    SELECT Title,
           SourceUrl,
           Guest,
           PublishDate,
           Microsoft.Analytics.Samples.Formats.Json.JsonFunctions.JsonTuple(TranscriptionSections).Values AS TranscriptionSections_arr
    FROM @episode;

// Expand the collection into one row per section and read that section's
// SectionStartTime.
@sectionRows =
    SELECT Title,
           SourceUrl,
           Guest,
           PublishDate,
           Microsoft.Analytics.Samples.Formats.Json.JsonFunctions.JsonTuple(sects)["SectionStartTime"] AS TranscriptionSectionTimes
    FROM @episodeWithSections
         CROSS APPLY
             EXPLODE(TranscriptionSections_arr) AS t(sects);

// Write the result as CSV (no header row requested).
OUTPUT @sectionRows
TO @out
USING Outputters.Csv();
这是对我有用的解决方案:
// Flattens the JSON document into one CSV row per sentence: every row carries
// the top-level properties, the SectionStartTime of the enclosing section and
// the sentence Text.

// Assemblies (referenced from the JSONBlog database) that supply
// JsonExtractor and JsonFunctions.
REFERENCE ASSEMBLY JSONBlog.[Newtonsoft.Json];
REFERENCE ASSEMBLY JSONBlog.[Microsoft.Analytics.Samples.Formats];

// Bring the sample JSON namespace into scope so the types below can be used
// without their full namespace prefix.
USING Microsoft.Analytics.Samples.Formats.Json;

// Input JSON file.
DECLARE @input string = "/input/data.json";

// Step 1: read the top-level properties; TranscriptionSections is extracted
// as a string so it can be parsed with JsonTuple in the next step.
@documents =
    EXTRACT SourceUrl string,
            Title string,
            Guest string,
            PublishDate DateTime,
            TranscriptionSections string,
            Categories string
    FROM @input
    USING new JsonExtractor();

// Step 2: one row per section. Each section is parsed into a map (ts_map)
// that is indexed by property name in step 3.
@sections =
    SELECT SourceUrl,
           Title,
           Guest,
           PublishDate,
           Categories,
           JsonFunctions.JsonTuple(transcription_section) AS ts_map
    FROM @documents
         CROSS APPLY
             EXPLODE(JsonFunctions.JsonTuple(TranscriptionSections).Values) AS T(transcription_section);

// Step 3: one row per sentence. The section's "Sentences" entry is parsed and
// expanded, and the "Text" property of each sentence is read.
@sentences =
    SELECT SourceUrl,
           Title,
           Guest,
           PublishDate,
           Categories,
           ts_map["SectionStartTime"] AS SectionStartTime,
           JsonFunctions.JsonTuple(text_item)["Text"] AS text
    FROM @sections
         CROSS APPLY
             EXPLODE(JsonFunctions.JsonTuple(ts_map["Sentences"]).Values) AS S(text_item);

// Write the flattened rows as CSV, including a header row.
OUTPUT @sentences
TO "/output/jsondata.csv"
USING Outputters.Csv(outputHeader : true);
我可以使用 Microsoft.Analytics.Samples.Formats.Json.JsonFunctions.JsonTuple 获取部分数据，但无法将整个文件展平。
这是我正在使用的文件格式:
{
"SourceUrl":"http://www.unittest.org/test.html",
"Title":"Unit Test File",
"Guest":"Unit Test Guest",
"PublishDate":"2017-05-15T00:00:00",
"TranscriptionSections":[
{
"SectionStartTime":"00:00:03",
"Sentences":[
{
"Text":"Intro."
},
{
"Text":"Sentence one"
},
{
"Text":"Sentence two"
}
]
},
{
"SectionStartTime":"00:04:46",
"Sentences":[
{
"Text":"Sentence three"
},
{
"Text":"Sentence four"
}
]
}
],
"Categories":null
}
我想要的结果是每条文本（Text）各占一行（共 5 行），每行都带有它所属的 'SectionStartTime' 以及所有顶级属性（'PublishDate'、'Guest' 等）。
到目前为止，我用下面的脚本只能为每个 'SectionStartTime' 得到一行：
// Partial attempt: emits one CSV row per transcription section
// (top-level properties + SectionStartTime). The Sentences array of each
// section is not expanded here, so individual "Text" values are not output.

// Run in the context of the econosphere database.
USE econosphere;

// Assemblies that supply the JSON extractor and the JsonTuple helper.
REFERENCE ASSEMBLY [Newtonsoft.Json];
REFERENCE ASSEMBLY [Microsoft.Analytics.Samples.Formats];

// Source JSON document and destination CSV file.
DECLARE @in string = "adl://abc.azuredatalakestore.net/data/20170515UnitTest.json";
DECLARE @out string = "adl://abc.azuredatalakestore.net/processed/20170515UnitTest.csv";

// Read the top-level properties; TranscriptionSections is extracted as a
// string so it can be parsed further with JsonTuple below.
@episode =
    EXTRACT Title string,
            SourceUrl string,
            Guest string,
            PublishDate DateTime,
            TranscriptionSections string
    FROM @in
    USING new Microsoft.Analytics.Samples.Formats.Json.JsonExtractor();

// Parse the TranscriptionSections string and keep the parsed values
// (one per section) as a collection column.
@episodeWithSections =
    SELECT Title,
           SourceUrl,
           Guest,
           PublishDate,
           Microsoft.Analytics.Samples.Formats.Json.JsonFunctions.JsonTuple(TranscriptionSections).Values AS TranscriptionSections_arr
    FROM @episode;

// Expand the collection into one row per section and read that section's
// SectionStartTime.
@sectionRows =
    SELECT Title,
           SourceUrl,
           Guest,
           PublishDate,
           Microsoft.Analytics.Samples.Formats.Json.JsonFunctions.JsonTuple(sects)["SectionStartTime"] AS TranscriptionSectionTimes
    FROM @episodeWithSections
         CROSS APPLY
             EXPLODE(TranscriptionSections_arr) AS t(sects);

// Write the result as CSV (no header row requested).
OUTPUT @sectionRows
TO @out
USING Outputters.Csv();
这是对我有用的解决方案:
// Flattens the JSON document into one CSV row per sentence: every row carries
// the top-level properties, the SectionStartTime of the enclosing section and
// the sentence Text.

// Assemblies (referenced from the JSONBlog database) that supply
// JsonExtractor and JsonFunctions.
REFERENCE ASSEMBLY JSONBlog.[Newtonsoft.Json];
REFERENCE ASSEMBLY JSONBlog.[Microsoft.Analytics.Samples.Formats];

// Bring the sample JSON namespace into scope so the types below can be used
// without their full namespace prefix.
USING Microsoft.Analytics.Samples.Formats.Json;

// Input JSON file.
DECLARE @input string = "/input/data.json";

// Step 1: read the top-level properties; TranscriptionSections is extracted
// as a string so it can be parsed with JsonTuple in the next step.
@documents =
    EXTRACT SourceUrl string,
            Title string,
            Guest string,
            PublishDate DateTime,
            TranscriptionSections string,
            Categories string
    FROM @input
    USING new JsonExtractor();

// Step 2: one row per section. Each section is parsed into a map (ts_map)
// that is indexed by property name in step 3.
@sections =
    SELECT SourceUrl,
           Title,
           Guest,
           PublishDate,
           Categories,
           JsonFunctions.JsonTuple(transcription_section) AS ts_map
    FROM @documents
         CROSS APPLY
             EXPLODE(JsonFunctions.JsonTuple(TranscriptionSections).Values) AS T(transcription_section);

// Step 3: one row per sentence. The section's "Sentences" entry is parsed and
// expanded, and the "Text" property of each sentence is read.
@sentences =
    SELECT SourceUrl,
           Title,
           Guest,
           PublishDate,
           Categories,
           ts_map["SectionStartTime"] AS SectionStartTime,
           JsonFunctions.JsonTuple(text_item)["Text"] AS text
    FROM @sections
         CROSS APPLY
             EXPLODE(JsonFunctions.JsonTuple(ts_map["Sentences"]).Values) AS S(text_item);

// Write the flattened rows as CSV, including a header row.
OUTPUT @sentences
TO "/output/jsondata.csv"
USING Outputters.Csv(outputHeader : true);