AWS Glue DynamicFrame:由数组生成列
AWS Glue Dynamic Frame columns from array
我有一个嵌套的 json,其结构如下例所示:
{'A':[{'key':'B','value':'C'},{'key':'D','value':'E'}]}
现在我想将其映射到以下模式:
|--A
|--|--B
|--|--D
也就是说,希望从 json 中还原出如下结构:
{'A':{'B':'C','D':'E'}}
'A' 中数组的条目数量不固定,但其中每个字典始终包含 'key' 和 'value' 两个键。
脚本如下:
from pyspark.sql.functions import lit, col, explode, create_map, collect_list
from itertools import chain
>>> sample.printSchema()
root
|-- A: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- key: string (nullable = true)
| | |-- value: string (nullable = true)
>>> final_df = (sample
... .select(explode('A').alias("A"))
... .withColumn("A",create_map("A.key", "A.value"))
... .groupby().agg(collect_list("A").alias("A"))
... )
>>> final_df.printSchema()
root
|-- A: array (nullable = true)
| |-- element: map (containsNull = false)
| | |-- key: string
| | |-- value: string (valueContainsNull = true)
>>> final_df.show(truncate=False)
+--------------------+
|A |
+--------------------+
|[[B -> C], [D -> E]]|
+--------------------+
>>> (final_df
... .write
... .format("json")
... .mode("overwrite")
... .save("sample_files/2020-09-29/out")
... )
我有一个嵌套的 json,其结构如下例所示:
{'A':[{'key':'B','value':'C'},{'key':'D','value':'E'}]}
现在我想将其映射到以下模式:
|--A
|--|--B
|--|--D
也就是说,希望从 json 中还原出如下结构:
{'A':{'B':'C','D':'E'}}
'A' 中数组的条目数量不固定,但其中每个字典始终包含 'key' 和 'value' 两个键。
脚本如下:
from pyspark.sql.functions import lit, col, explode, create_map, collect_list
from itertools import chain
>>> sample.printSchema()
root
|-- A: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- key: string (nullable = true)
| | |-- value: string (nullable = true)
>>> final_df = (sample
... .select(explode('A').alias("A"))
... .withColumn("A",create_map("A.key", "A.value"))
... .groupby().agg(collect_list("A").alias("A"))
... )
>>> final_df.printSchema()
root
|-- A: array (nullable = true)
| |-- element: map (containsNull = false)
| | |-- key: string
| | |-- value: string (valueContainsNull = true)
>>> final_df.show(truncate=False)
+--------------------+
|A |
+--------------------+
|[[B -> C], [D -> E]]|
+--------------------+
>>> (final_df
... .write
... .format("json")
... .mode("overwrite")
... .save("sample_files/2020-09-29/out")
... )