如何使用 PySpark 将来自 Web API 的复杂 JSON 对象转换为 Azure Databricks 数据框中的多行?

How to transform a complex JSON object from a web API into multiple rows in a DataFrame in Azure Databricks using PySpark?

我有一个从 REST API 接收到的 JSON 文件。返回结果的一个示例如下:

{
    "d": {
        "results": [
            {
                "__metadata": {
                    "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')",
                    "type": "EmpEmployment"
                },
                "personIdExternal": "60000033",
                "userId": "60000033",
                "hiringNotCompleted": false,
                "isECRecord": true,
                "lastModifiedDateTime": "/Date(1642917586000+0000)/",
                "endDate": "/Date(1675123200000)/",
                "createdDateTime": "/Date(1641473919000+0000)/",
                "createdOn": "/Date(1641473919000)/",
                "originalStartDate": "/Date(1501545600000)/",
                "customDate1": "/Date(1501545600000)/",
                "customString17": null,
                "customString18": null,
                "customString19": null,
                "assignmentClass": "ST",
                "lastModifiedBy": "This Dude",
                "okToRehire": true,
                "customString4": null,
                "customString3": "3",
                "customString2": null,
                "assignmentIdExternal": "60000033",
                "customString16": null,
                "lastModifiedOn": "/Date(1642917586000)/",
                "customString1": null,
                "createdBy": "This Dudette",
                "seniorityDate": "/Date(1501545600000)/",
                "startDate": "/Date(1659398400000)/",
                "customString16Nav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/customString16Nav"
                    }
                },
                "customString1Nav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/customString1Nav"
                    }
                },
                "customString18Nav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/customString18Nav"
                    }
                },
                "customString3Nav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/customString3Nav"
                    }
                },
                "paymentInformationNav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/paymentInformationNav"
                    }
                },
                "empJobRelationshipNav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/empJobRelationshipNav"
                    }
                },
                "personNav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/personNav"
                    }
                },
                "empWorkPermitNav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/empWorkPermitNav"
                    }
                },
                "photoNav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/photoNav"
                    }
                },
                "compInfoNav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/compInfoNav"
                    }
                },
                "userNav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/userNav"
                    }
                },
                "customString2Nav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/customString2Nav"
                    }
                },
                "customString19Nav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/customString19Nav"
                    }
                },
                "jobInfoNav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/jobInfoNav"
                    }
                },
                "wfRequestNav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/wfRequestNav"
                    }
                },
                "costDistributionNav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/costDistributionNav"
                    }
                },
                "empPayCompNonRecurringNav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/empPayCompNonRecurringNav"
                    }
                }
            },
            {
                "__metadata": {
                    "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')",
                    "type": "EmpEmployment"
                },
                "personIdExternal": "100003",
                "userId": "100003",
                "hiringNotCompleted": false,
                "isECRecord": true,
                "lastModifiedDateTime": "/Date(1638051713000+0000)/",
                "endDate": null,
                "createdDateTime": "/Date(1638051713000+0000)/",
                "createdOn": "/Date(1638051713000)/",
                "originalStartDate": "/Date(1635724800000)/",
                "customDate1": null,
                "customString17": null,
                "customString18": null,
                "customString19": null,
                "assignmentClass": "ST",
                "lastModifiedBy": "This Dudette",
                "okToRehire": null,
                "customString4": null,
                "customString3": null,
                "customString2": null,
                "assignmentIdExternal": "100003",
                "customString16": null,
                "lastModifiedOn": "/Date(1638051713000)/",
                "customString1": null,
                "createdBy": "This Dude",
                "seniorityDate": "/Date(1635724800000)/",
                "startDate": "/Date(1635724800000)/",
                "customString16Nav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/customString16Nav"
                    }
                },
                "customString1Nav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/customString1Nav"
                    }
                },
                "customString18Nav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/customString18Nav"
                    }
                },
                "customString3Nav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/customString3Nav"
                    }
                },
                "paymentInformationNav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/paymentInformationNav"
                    }
                },
                "empJobRelationshipNav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/empJobRelationshipNav"
                    }
                },
                "personNav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/personNav"
                    }
                },
                "empWorkPermitNav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/empWorkPermitNav"
                    }
                },
                "photoNav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/photoNav"
                    }
                },
                "compInfoNav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/compInfoNav"
                    }
                },
                "userNav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/userNav"
                    }
                },
                "customString2Nav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/customString2Nav"
                    }
                },
                "customString19Nav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/customString19Nav"
                    }
                },
                "jobInfoNav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/jobInfoNav"
                    }
                },
                "wfRequestNav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/wfRequestNav"
                    }
                },
                "costDistributionNav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/costDistributionNav"
                    }
                },
                "empPayCompNonRecurringNav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/empPayCompNonRecurringNav"
                    }
                }
            }
        ]
    }
}

我目前只想从 JSON 中提取 userId 和 startDate。我已尝试使用本示例中所示的 explode 命令。

https://adatis.co.uk/parsing-nested-json-lists-in-databricks-using-python/

但我似乎只能把整个结果放进数据框的一列中,或者在我使用以下内容时:

希望有人能帮忙指明正确的方向。我是否应该构建一个自定义 schema,并尝试按照该 schema 来解析 JSON?

我想要做的就是返回图片中的结果,但每个 userId 及其对应的 startDate 各占一行,因为它们是相互关联的。

对 results 数组使用 explode,将其中的每个元素展开为单独的一行:

# `explode` and `col` are used below; without this import the snippet
# fails with NameError.
from pyspark.sql.functions import col, explode

# NOTE: `spark` is assumed to be the SparkSession the notebook provides.
# multiLine=True is needed because the sample file is one pretty-printed
# JSON document spread over many lines rather than one record per line.
df = spark.read.json("./sample.json", multiLine=True)
# Overwrite column `d` with one row per element of the d.results array.
df2 = df.withColumn('d', explode(col('d.results')))
# Pull the two wanted fields out of each exploded element; print up to
# 10 rows with truncation disabled.
df2.select(df2.d.userId, df2.d.startDate).show(10,False)

+--------+---------------------+
|d.userId|d.startDate          |
+--------+---------------------+
|60000033|/Date(1659398400000)/|
|100003  |/Date(1635724800000)/|
+--------+---------------------+

您可以根据需要添加任意数量的属性,例如:

# Explode d.results into one row per element (the exploded column is
# referenced as "col" below), project the wanted fields out of each
# element, and print up to 10 rows with truncation disabled.
(
    df.select(explode(col('d.results')))
    .selectExpr("col.userId", "col.startDate", "col.lastModifiedBy")
    .show(10, False)
)

+--------+---------------------+--------------+
|userId  |startDate            |lastModifiedBy|
+--------+---------------------+--------------+
|60000033|/Date(1659398400000)/|This Dude     |
|100003  |/Date(1635724800000)/|This Dudette  |
+--------+---------------------+--------------+