使用 U-SQL 通过 Azure Data Lake 将嵌套 JSON 转换为 CSV
Nested JSON to CSV with U-SQL via Azure Data Lake
问题
是否有更优雅的方法来解析下面的嵌套 JSON 文档示例?
特别是 @middle 步骤:根据我查阅的资料,我原以为在步骤 3 中写 JsonFunctions.JsonTuple(part_b, "rating") AS rating 就足够了,但这似乎不起作用,因此我添加了 @middle。
sample.json
{
"listings": [
{
"part_a": {
"random": "x"
},
"part_b": {
"listing_id": "001",
"rating": {
"text": "four",
"numeric": "4.0"
}
}
},
{
"part_a": {
"random": "y"
},
"part_b": {
"listing_id": "002",
"rating": {
"text": "seven",
"numeric": "7.0"
}
}
},
{
"part_a": {
"random": "z"
},
"part_b": {
"listing_id": "003",
"rating": {
"text": "two",
"numeric": "2.0"
}
}
}
]
}
sample.usql
/*
 * Flattens every element of the "listings" array in sample.json into one CSV row
 * with the columns random, listing_id, rating_text and rating_numeric.
 */

// Register the two assemblies (skipped when they already exist), then reference them
// and import the namespace that the JsonExtractor / JsonFunctions calls below rely on.
CREATE ASSEMBLY IF NOT EXISTS [Newtonsoft.Json]
    FROM @"adl://ADL_NAME.azuredatalakestore.net/Newtonsoft.Json.dll";
CREATE ASSEMBLY IF NOT EXISTS [Microsoft.Analytics.Samples.Formats]
    FROM @"adl://ADL_NAME.azuredatalakestore.net/Microsoft.Analytics.Samples.Formats.dll";

REFERENCE ASSEMBLY [Newtonsoft.Json];
REFERENCE ASSEMBLY [Microsoft.Analytics.Samples.Formats];

USING Microsoft.Analytics.Samples.Formats.Json;

// 1. Input JSON document and output CSV location
DECLARE @InputFile string = @"adl://ADL_NAME.azuredatalakestore.net/sample.json";
DECLARE @OutputFile string = @"adl://ADL_NAME.azuredatalakestore.net/sample.csv";

// 2. Read the rows selected by "$.listings[*]"; part_a and part_b are extracted
//    as plain string columns and are parsed further in the next steps
@listings_raw =
    EXTRACT part_a string,
            part_b string
    FROM @InputFile
    USING new JsonExtractor("$.listings[*]");

// 3. Run each string column through JsonTuple so its members can be looked up by key
@listing_dict =
    SELECT JsonFunctions.JsonTuple(part_a) AS part_a_map,
           JsonFunctions.JsonTuple(part_b) AS part_b_map
    FROM @listings_raw;

// 4. "rating" is nested one level deeper inside part_b, so it is passed through
//    JsonTuple a second time in its own rowset before the values are read out
@middle =
    SELECT part_a_map,
           part_b_map,
           JsonFunctions.JsonTuple(part_b_map["rating"]) AS rating_map
    FROM @listing_dict;

@flat_listing =
    SELECT part_a_map["random"] AS random,
           part_b_map["listing_id"] AS listing_id,
           rating_map["text"] AS rating_text,
           rating_map["numeric"] AS rating_numeric
    FROM @middle;

// 5. Emit the flattened rows as CSV with a header row and quoted values
OUTPUT @flat_listing
TO @OutputFile
USING Outputters.Csv(outputHeader: true, quoting: true);
sample.csv
"random","listing_id","rating_text","rating_numeric"
"x","001","four","4.0"
"y","002","seven","7.0"
"z","003","two","2.0"
通过使用 MultiLevelJsonExtractor(属于同一个库),您的代码可以简化为:
// MultiLevelJsonExtractor ships in the same Microsoft.Analytics.Samples.Formats library
// as JsonExtractor, so the script still has to reference the assemblies and import the
// namespace. The assemblies must already be registered (see the CREATE ASSEMBLY
// statements in the original script).
REFERENCE ASSEMBLY [Newtonsoft.Json];
REFERENCE ASSEMBLY [Microsoft.Analytics.Samples.Formats];
USING Microsoft.Analytics.Samples.Formats.Json;

// 1. Define Input and Output
DECLARE @InputFile string = @".../sample.json";
DECLARE @OutputFile string = @".../sample.csv";

// 2. Extract JSON (schema on read)
//    Constructor arguments, per the extractor's source on GitHub:
//      "listings[*]"  - row path: one output row per element of the listings array
//      true           - bypassWarning: presumably suppresses the error raised when a
//                       path cannot be resolved -- confirm against MultiLevelJsonExtractor.cs
//      remaining args - JSON paths, matched by position to the EXTRACT columns below
@json =
    EXTRACT random string,
            listing_id string,
            rating_text string,
            rating_numeric string
    FROM @InputFile
    USING new MultiLevelJsonExtractor(
        "listings[*]",
        true,
        "part_a.random",
        "part_b.listing_id",
        "part_b.rating.text",
        "part_b.rating.numeric");

// 3. Write output to CSV (header row, quoted values)
OUTPUT @json
TO @OutputFile
USING Outputters.Csv(outputHeader: true, quoting: true);
在 GitHub 上查看有关此提取器的详细信息:
MultiLevelJsonExtractor.cs
问题
是否有更优雅的方法来解析下面的嵌套 JSON 文档示例?
特别是 @middle 步骤:根据我查阅的资料,我原以为在步骤 3 中写 JsonFunctions.JsonTuple(part_b, "rating") AS rating 就足够了,但这似乎不起作用,因此我添加了 @middle。
sample.json
{
"listings": [
{
"part_a": {
"random": "x"
},
"part_b": {
"listing_id": "001",
"rating": {
"text": "four",
"numeric": "4.0"
}
}
},
{
"part_a": {
"random": "y"
},
"part_b": {
"listing_id": "002",
"rating": {
"text": "seven",
"numeric": "7.0"
}
}
},
{
"part_a": {
"random": "z"
},
"part_b": {
"listing_id": "003",
"rating": {
"text": "two",
"numeric": "2.0"
}
}
}
]
}
sample.usql
/*
 * Flattens every element of the "listings" array in sample.json into one CSV row
 * with the columns random, listing_id, rating_text and rating_numeric.
 */

// Register the two assemblies (skipped when they already exist), then reference them
// and import the namespace that the JsonExtractor / JsonFunctions calls below rely on.
CREATE ASSEMBLY IF NOT EXISTS [Newtonsoft.Json]
    FROM @"adl://ADL_NAME.azuredatalakestore.net/Newtonsoft.Json.dll";
CREATE ASSEMBLY IF NOT EXISTS [Microsoft.Analytics.Samples.Formats]
    FROM @"adl://ADL_NAME.azuredatalakestore.net/Microsoft.Analytics.Samples.Formats.dll";

REFERENCE ASSEMBLY [Newtonsoft.Json];
REFERENCE ASSEMBLY [Microsoft.Analytics.Samples.Formats];

USING Microsoft.Analytics.Samples.Formats.Json;

// 1. Input JSON document and output CSV location
DECLARE @InputFile string = @"adl://ADL_NAME.azuredatalakestore.net/sample.json";
DECLARE @OutputFile string = @"adl://ADL_NAME.azuredatalakestore.net/sample.csv";

// 2. Read the rows selected by "$.listings[*]"; part_a and part_b are extracted
//    as plain string columns and are parsed further in the next steps
@listings_raw =
    EXTRACT part_a string,
            part_b string
    FROM @InputFile
    USING new JsonExtractor("$.listings[*]");

// 3. Run each string column through JsonTuple so its members can be looked up by key
@listing_dict =
    SELECT JsonFunctions.JsonTuple(part_a) AS part_a_map,
           JsonFunctions.JsonTuple(part_b) AS part_b_map
    FROM @listings_raw;

// 4. "rating" is nested one level deeper inside part_b, so it is passed through
//    JsonTuple a second time in its own rowset before the values are read out
@middle =
    SELECT part_a_map,
           part_b_map,
           JsonFunctions.JsonTuple(part_b_map["rating"]) AS rating_map
    FROM @listing_dict;

@flat_listing =
    SELECT part_a_map["random"] AS random,
           part_b_map["listing_id"] AS listing_id,
           rating_map["text"] AS rating_text,
           rating_map["numeric"] AS rating_numeric
    FROM @middle;

// 5. Emit the flattened rows as CSV with a header row and quoted values
OUTPUT @flat_listing
TO @OutputFile
USING Outputters.Csv(outputHeader: true, quoting: true);
sample.csv
"random","listing_id","rating_text","rating_numeric"
"x","001","four","4.0"
"y","002","seven","7.0"
"z","003","two","2.0"
通过使用 MultiLevelJsonExtractor(属于同一个库),您的代码可以简化为:
// MultiLevelJsonExtractor ships in the same Microsoft.Analytics.Samples.Formats library
// as JsonExtractor, so the script still has to reference the assemblies and import the
// namespace. The assemblies must already be registered (see the CREATE ASSEMBLY
// statements in the original script).
REFERENCE ASSEMBLY [Newtonsoft.Json];
REFERENCE ASSEMBLY [Microsoft.Analytics.Samples.Formats];
USING Microsoft.Analytics.Samples.Formats.Json;

// 1. Define Input and Output
DECLARE @InputFile string = @".../sample.json";
DECLARE @OutputFile string = @".../sample.csv";

// 2. Extract JSON (schema on read)
//    Constructor arguments, per the extractor's source on GitHub:
//      "listings[*]"  - row path: one output row per element of the listings array
//      true           - bypassWarning: presumably suppresses the error raised when a
//                       path cannot be resolved -- confirm against MultiLevelJsonExtractor.cs
//      remaining args - JSON paths, matched by position to the EXTRACT columns below
@json =
    EXTRACT random string,
            listing_id string,
            rating_text string,
            rating_numeric string
    FROM @InputFile
    USING new MultiLevelJsonExtractor(
        "listings[*]",
        true,
        "part_a.random",
        "part_b.listing_id",
        "part_b.rating.text",
        "part_b.rating.numeric");

// 3. Write output to CSV (header row, quoted values)
OUTPUT @json
TO @OutputFile
USING Outputters.Csv(outputHeader: true, quoting: true);
在 GitHub 上查看有关此提取器的详细信息: MultiLevelJsonExtractor.cs