如何使用 PySpark 将来自 Web API 的复杂 JSON 对象转换为 Azure Databricks 数据框(DataFrame)中的多行?
How to transform a complex JSON object from a web API into multiple rows in a DataFrame in Azure Databricks using PySpark?
我有一个从 REST API 接收到的 JSON 文件。返回内容的一个示例如下:
{
"d": {
"results": [
{
"__metadata": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')",
"type": "EmpEmployment"
},
"personIdExternal": "60000033",
"userId": "60000033",
"hiringNotCompleted": false,
"isECRecord": true,
"lastModifiedDateTime": "/Date(1642917586000+0000)/",
"endDate": "/Date(1675123200000)/",
"createdDateTime": "/Date(1641473919000+0000)/",
"createdOn": "/Date(1641473919000)/",
"originalStartDate": "/Date(1501545600000)/",
"customDate1": "/Date(1501545600000)/",
"customString17": null,
"customString18": null,
"customString19": null,
"assignmentClass": "ST",
"lastModifiedBy": "This Dude",
"okToRehire": true,
"customString4": null,
"customString3": "3",
"customString2": null,
"assignmentIdExternal": "60000033",
"customString16": null,
"lastModifiedOn": "/Date(1642917586000)/",
"customString1": null,
"createdBy": "This Dudette",
"seniorityDate": "/Date(1501545600000)/",
"startDate": "/Date(1659398400000)/",
"customString16Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/customString16Nav"
}
},
"customString1Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/customString1Nav"
}
},
"customString18Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/customString18Nav"
}
},
"customString3Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/customString3Nav"
}
},
"paymentInformationNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/paymentInformationNav"
}
},
"empJobRelationshipNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/empJobRelationshipNav"
}
},
"personNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/personNav"
}
},
"empWorkPermitNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/empWorkPermitNav"
}
},
"photoNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/photoNav"
}
},
"compInfoNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/compInfoNav"
}
},
"userNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/userNav"
}
},
"customString2Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/customString2Nav"
}
},
"customString19Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/customString19Nav"
}
},
"jobInfoNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/jobInfoNav"
}
},
"wfRequestNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/wfRequestNav"
}
},
"costDistributionNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/costDistributionNav"
}
},
"empPayCompNonRecurringNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/empPayCompNonRecurringNav"
}
}
},
{
"__metadata": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')",
"type": "EmpEmployment"
},
"personIdExternal": "100003",
"userId": "100003",
"hiringNotCompleted": false,
"isECRecord": true,
"lastModifiedDateTime": "/Date(1638051713000+0000)/",
"endDate": null,
"createdDateTime": "/Date(1638051713000+0000)/",
"createdOn": "/Date(1638051713000)/",
"originalStartDate": "/Date(1635724800000)/",
"customDate1": null,
"customString17": null,
"customString18": null,
"customString19": null,
"assignmentClass": "ST",
"lastModifiedBy": "This Dudette",
"okToRehire": null,
"customString4": null,
"customString3": null,
"customString2": null,
"assignmentIdExternal": "100003",
"customString16": null,
"lastModifiedOn": "/Date(1638051713000)/",
"customString1": null,
"createdBy": "This Dude",
"seniorityDate": "/Date(1635724800000)/",
"startDate": "/Date(1635724800000)/",
"customString16Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/customString16Nav"
}
},
"customString1Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/customString1Nav"
}
},
"customString18Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/customString18Nav"
}
},
"customString3Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/customString3Nav"
}
},
"paymentInformationNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/paymentInformationNav"
}
},
"empJobRelationshipNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/empJobRelationshipNav"
}
},
"personNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/personNav"
}
},
"empWorkPermitNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/empWorkPermitNav"
}
},
"photoNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/photoNav"
}
},
"compInfoNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/compInfoNav"
}
},
"userNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/userNav"
}
},
"customString2Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/customString2Nav"
}
},
"customString19Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/customString19Nav"
}
},
"jobInfoNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/jobInfoNav"
}
},
"wfRequestNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/wfRequestNav"
}
},
"costDistributionNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/costDistributionNav"
}
},
"empPayCompNonRecurringNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/empPayCompNonRecurringNav"
}
}
}
]
}
}
我目前只想从 JSON 中提取 userId 和 startDate。我已尝试使用本示例中所示的 explode 命令。
https://adatis.co.uk/parsing-nested-json-lists-in-databricks-using-python/
但我目前似乎只能把整个 results 放进数据框的单独一列中,或者使用如下写法(原文此处的代码片段缺失):
希望有人能为我指明正确的方向。我是否最好构建一个自定义模式(schema),并尝试按该模式解析 JSON?
我想要做的就是返回图片中的结果,但每一对相互关联的 userId 和 startDate 各自占据一行。
使用 explode 展开 results 数组,使其中的每个元素成为单独的一行:
# explode() and col() are not built-ins; without this import the snippet
# fails with NameError. `spark` is assumed to be an already-created
# SparkSession (it is not defined in this snippet).
from pyspark.sql.functions import col, explode

# multiLine=True: the sample file is a single pretty-printed JSON document
# spanning many lines, not one JSON record per line.
df = spark.read.json("./sample.json", multiLine=True)

# explode() produces one output row per element of the d.results array; the
# original struct column `d` is replaced by that single element.
df2 = df.withColumn('d', explode(col('d.results')))

# Pick the two fields of interest from each element; show at most 10 rows
# with truncation of long cell values disabled.
df2.select(df2.d.userId, df2.d.startDate).show(10, False)
+--------+---------------------+
|d.userId|d.startDate |
+--------+---------------------+
|60000033|/Date(1659398400000)/|
|100003 |/Date(1635724800000)/|
+--------+---------------------+
您可以根据需要选取任意数量的属性,例如:
# explode() and col() must be imported; they are not available by default.
from pyspark.sql.functions import col, explode

# `df` is the DataFrame read from ./sample.json in the previous snippet.
# explode() without an alias names its output column `col`, which is why the
# fields are addressed as col.<field> inside selectExpr.
(
    df.select(explode(col('d.results')))
    .selectExpr("col.userId", "col.startDate", "col.lastModifiedBy")
    .show(10, False)  # at most 10 rows, no truncation of cell values
)
+--------+---------------------+--------------+
|userId |startDate |lastModifiedBy|
+--------+---------------------+--------------+
|60000033|/Date(1659398400000)/|This Dude |
|100003 |/Date(1635724800000)/|This Dudette |
+--------+---------------------+--------------+
我有一个从 REST API 接收到的 JSON 文件。返回内容的一个示例如下:
{
"d": {
"results": [
{
"__metadata": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')",
"type": "EmpEmployment"
},
"personIdExternal": "60000033",
"userId": "60000033",
"hiringNotCompleted": false,
"isECRecord": true,
"lastModifiedDateTime": "/Date(1642917586000+0000)/",
"endDate": "/Date(1675123200000)/",
"createdDateTime": "/Date(1641473919000+0000)/",
"createdOn": "/Date(1641473919000)/",
"originalStartDate": "/Date(1501545600000)/",
"customDate1": "/Date(1501545600000)/",
"customString17": null,
"customString18": null,
"customString19": null,
"assignmentClass": "ST",
"lastModifiedBy": "This Dude",
"okToRehire": true,
"customString4": null,
"customString3": "3",
"customString2": null,
"assignmentIdExternal": "60000033",
"customString16": null,
"lastModifiedOn": "/Date(1642917586000)/",
"customString1": null,
"createdBy": "This Dudette",
"seniorityDate": "/Date(1501545600000)/",
"startDate": "/Date(1659398400000)/",
"customString16Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/customString16Nav"
}
},
"customString1Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/customString1Nav"
}
},
"customString18Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/customString18Nav"
}
},
"customString3Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/customString3Nav"
}
},
"paymentInformationNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/paymentInformationNav"
}
},
"empJobRelationshipNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/empJobRelationshipNav"
}
},
"personNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/personNav"
}
},
"empWorkPermitNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/empWorkPermitNav"
}
},
"photoNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/photoNav"
}
},
"compInfoNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/compInfoNav"
}
},
"userNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/userNav"
}
},
"customString2Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/customString2Nav"
}
},
"customString19Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/customString19Nav"
}
},
"jobInfoNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/jobInfoNav"
}
},
"wfRequestNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/wfRequestNav"
}
},
"costDistributionNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/costDistributionNav"
}
},
"empPayCompNonRecurringNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/empPayCompNonRecurringNav"
}
}
},
{
"__metadata": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')",
"type": "EmpEmployment"
},
"personIdExternal": "100003",
"userId": "100003",
"hiringNotCompleted": false,
"isECRecord": true,
"lastModifiedDateTime": "/Date(1638051713000+0000)/",
"endDate": null,
"createdDateTime": "/Date(1638051713000+0000)/",
"createdOn": "/Date(1638051713000)/",
"originalStartDate": "/Date(1635724800000)/",
"customDate1": null,
"customString17": null,
"customString18": null,
"customString19": null,
"assignmentClass": "ST",
"lastModifiedBy": "This Dudette",
"okToRehire": null,
"customString4": null,
"customString3": null,
"customString2": null,
"assignmentIdExternal": "100003",
"customString16": null,
"lastModifiedOn": "/Date(1638051713000)/",
"customString1": null,
"createdBy": "This Dude",
"seniorityDate": "/Date(1635724800000)/",
"startDate": "/Date(1635724800000)/",
"customString16Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/customString16Nav"
}
},
"customString1Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/customString1Nav"
}
},
"customString18Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/customString18Nav"
}
},
"customString3Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/customString3Nav"
}
},
"paymentInformationNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/paymentInformationNav"
}
},
"empJobRelationshipNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/empJobRelationshipNav"
}
},
"personNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/personNav"
}
},
"empWorkPermitNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/empWorkPermitNav"
}
},
"photoNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/photoNav"
}
},
"compInfoNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/compInfoNav"
}
},
"userNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/userNav"
}
},
"customString2Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/customString2Nav"
}
},
"customString19Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/customString19Nav"
}
},
"jobInfoNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/jobInfoNav"
}
},
"wfRequestNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/wfRequestNav"
}
},
"costDistributionNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/costDistributionNav"
}
},
"empPayCompNonRecurringNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/empPayCompNonRecurringNav"
}
}
}
]
}
}
我目前只想从 JSON 中提取 userId 和 startDate。我已尝试使用本示例中所示的 explode 命令。
https://adatis.co.uk/parsing-nested-json-lists-in-databricks-using-python/
但我目前似乎只能把整个 results 放进数据框的单独一列中,或者使用如下写法(原文此处的代码片段缺失):
希望有人能为我指明正确的方向。我是否最好构建一个自定义模式(schema),并尝试按该模式解析 JSON?
我想要做的就是返回图片中的结果,但每一对相互关联的 userId 和 startDate 各自占据一行。
使用 explode 展开 results 数组,使其中的每个元素成为单独的一行:
# explode() and col() are not built-ins; without this import the snippet
# fails with NameError. `spark` is assumed to be an already-created
# SparkSession (it is not defined in this snippet).
from pyspark.sql.functions import col, explode

# multiLine=True: the sample file is a single pretty-printed JSON document
# spanning many lines, not one JSON record per line.
df = spark.read.json("./sample.json", multiLine=True)

# explode() produces one output row per element of the d.results array; the
# original struct column `d` is replaced by that single element.
df2 = df.withColumn('d', explode(col('d.results')))

# Pick the two fields of interest from each element; show at most 10 rows
# with truncation of long cell values disabled.
df2.select(df2.d.userId, df2.d.startDate).show(10, False)
+--------+---------------------+
|d.userId|d.startDate |
+--------+---------------------+
|60000033|/Date(1659398400000)/|
|100003 |/Date(1635724800000)/|
+--------+---------------------+
您可以根据需要选取任意数量的属性,例如:
# explode() and col() must be imported; they are not available by default.
from pyspark.sql.functions import col, explode

# `df` is the DataFrame read from ./sample.json in the previous snippet.
# explode() without an alias names its output column `col`, which is why the
# fields are addressed as col.<field> inside selectExpr.
(
    df.select(explode(col('d.results')))
    .selectExpr("col.userId", "col.startDate", "col.lastModifiedBy")
    .show(10, False)  # at most 10 rows, no truncation of cell values
)
+--------+---------------------+--------------+
|userId |startDate |lastModifiedBy|
+--------+---------------------+--------------+
|60000033|/Date(1659398400000)/|This Dude |
|100003 |/Date(1635724800000)/|This Dudette |
+--------+---------------------+--------------+