如何动态访问 json 文件中的所有项目

How to access all items in a json file dynamically

我有一个 JSON 文件,我需要在其中访问所有级别并在数据框中记录,但有些字段名称是动态的,我如何访问这些我不知道其名称的字段?

"Data": [
        {
            "Origem": "1",
            "SubOrigem": "2",
            "Protocolo": "1",
            "Status": "8",
            "WorkflowStatus": "0",
            "CategoryID": "1",
            "filelife": {
                "224698742": {
                    "Link": "teste",
                    "FileName": "teste",
                    "FileSize": "2602614",
                    "Data": "2021-10-07 12:18:58",
                    "Sender": "2",
                    "FileOrigin": "50",
                    "SenderName": "teste"
                },
                "224698764": {
                    "Link": "teste",
                    "FileName": "teste",
                    "FileSize": "805227",
                    "Data": "2021-10-07 12:19:05",
                    "Sender": "2",
                    "FileOrigin": "50",
                    "SenderName": "teste"
                },
                "224698782": {
                    "Link": "teste",
                    "FileName": "teste",
                    "FileSize": "2",
                    "Data": "2021-10-07 12:19:12",
                    "Sender": "2",
                    "FileOrigin": "50",
                    "SenderName": "teste"
                }
        }   }]

因为是StructType,所以可以使用通配符structcol.*

from pyspark.sql import functions as F

(df
    .withColumn('data', F.explode('Data')) # must explode first because `Data` is an array
    .select('data.CategoryID', 'data.Origem', 'data.Protocolo', 'data.Status', 'data.SubOrigem', 'data.WorkflowStatus', 'data.filelife.*')
    .show()
)

# +----------+------+---------+------+---------+--------------+--------------------+--------------------+--------------------+
# |CategoryID|Origem|Protocolo|Status|SubOrigem|WorkflowStatus|           224698742|           224698764|           224698782|
# +----------+------+---------+------+---------+--------------+--------------------+--------------------+--------------------+
# |         1|     1|        1|     8|        2|             0|{2021-10-07 12:18...|{2021-10-07 12:19...|{2021-10-07 12:19...|
# +----------+------+---------+------+---------+--------------+--------------------+--------------------+--------------------+

编辑 #1

循环使用半动态方法来绕过嵌套的 JSON 结构。不过每个级别都有手册

from pyspark.sql import functions as F

base_df = df.withColumn('data', F.explode('Data')) # must explode first because `Data` is an array

name_lv1 = base_df.select('data.*').columns
cols_lv1 = [F.col(f'data.{c}') for c in name_lv1 if c != 'filelife']
# ['CategoryID', 'Origem', 'Protocolo', 'Status', 'SubOrigem', 'WorkflowStatus', 'filelife']

name_lv2 = base_df.select('data.filelife.*').columns
cols_lv2 = [F.col(f'data.filelife.{c}') for c in name_lv2]
# ['224698742', '224698764', '224698782']

name_lv3 = base_df.select(f'data.filelife.{name_lv2[0]}.*').columns
cols_lv3 = [F.col(f'data.filelife.{c}.{cc}').alias(f'{c}_{cc}') for cc in name_lv3 for c in name_lv2]
# ['Data', 'FileName', 'FileOrigin', 'FileSize', 'Link', 'Sender', 'SenderName']

base_df.select(cols_lv1 + cols_lv3).show()
# +----------+------+---------+------+---------+--------------+-------------------+-------------------+-------------------+------------------+------------------+------------------+--------------------+--------------------+--------------------+------------------+------------------+------------------+--------------+--------------+--------------+----------------+----------------+----------------+--------------------+--------------------+--------------------+
# |CategoryID|Origem|Protocolo|Status|SubOrigem|WorkflowStatus|     224698742_Data|     224698764_Data|     224698782_Data|224698742_FileName|224698764_FileName|224698782_FileName|224698742_FileOrigin|224698764_FileOrigin|224698782_FileOrigin|224698742_FileSize|224698764_FileSize|224698782_FileSize|224698742_Link|224698764_Link|224698782_Link|224698742_Sender|224698764_Sender|224698782_Sender|224698742_SenderName|224698764_SenderName|224698782_SenderName|
# +----------+------+---------+------+---------+--------------+-------------------+-------------------+-------------------+------------------+------------------+------------------+--------------------+--------------------+--------------------+------------------+------------------+------------------+--------------+--------------+--------------+----------------+----------------+----------------+--------------------+--------------------+--------------------+
# |         1|     1|        1|     8|        2|             0|2021-10-07 12:18:58|2021-10-07 12:19:05|2021-10-07 12:19:12|             teste|             teste|             teste|                  50|                  50|                  50|           2602614|            805227|                 2|         teste|         teste|         teste|               2|               2|               2|               teste|               teste|               teste|
# +----------+------+---------+------+---------+--------------+-------------------+-------------------+-------------------+------------------+------------------+------------------+--------------------+--------------------+--------------------+------------------+------------------+------------------+--------------+--------------+--------------+----------------+----------------+----------------+--------------------+--------------------+--------------------+