如何将多个列转换为一个单元格中的键值列表 - Pyspark?
How to convert multiple columns to a list of key values in one cell - Pyspark?
我有一个 spark 数据框,如下所示:
+-------------+-------+--------+------+-----------+----+----+----+-------------------------+----+
|uniqueKey |channel|division|gender|category |W1 |W3 |W4 |W2 |W5 |
+-------------+-------+--------+------+-----------+----+----+----+-------------------------+----+
|key1 |Digital|APPAREL |KIDS |COLLECTIONS|null|null|null|[7000, 181] |null|
|key2 |Digital|APPAREL |KIDS |CRICKET |null|null|null|[10000.3, 699] |null|
|key3 |Digital|FOOTWEAR|MENS |COLLECTIONS|null|null|null|[4567, 34] |null|
+-------------+-------+--------+------+-----------+----+----+----+-------------------------+----+
我需要创建一个如下所示的 Json:
{
"uniqueKey": "key1",
"division": "APPAREL",
"gender": "KIDS",
"category": "CRICKET",
"channel": "DIGITAL",
"dataRows": [
{
"rowId": "Net Sales",
"dataRow": {
"W1": 0,
"W2": 10000,
"W3": 0,
"W4": 0,
"W5": 0
}
},
{
"rowId": "Sales Units",
"dataRow": {
"W1": 0,
"W2": 699,
"W3": 0,
"W4": 0,
"W5": 0
}
}
]
}
我试过 pivot,但我不确定我做的是否正确。你能帮我解决这个问题吗?
添加了内嵌注释,检查下面的代码。
对于周列(Week Columns),这里我用的是 range(1,5),只覆盖 W1 到 W4;
如果要覆盖全部 52 周,可以把它改为 range(1,53)。
# Build one struct holding every week column; a null week is replaced by 0 and
# values are cast to int (so 10000.3 becomes 10000).
# A list comprehension is used instead of map(): on Python 3 map() returns an
# iterator, which F.struct does not unpack into individual columns.
# The struct fields are unnamed here, so to_json emits them as col1..colN;
# appending .alias(c) to each coalesce(...) should keep the W* names instead.
weekCols = F.struct([
    F.coalesce(F.col(c).cast("int"), F.lit(0))
    for c in ["W" + str(x) for x in range(1, 5)]
])

(
    df
    # W2 is an array column; explode produces one row per array element.
    .withColumn("W2", F.explode(F.col("W2")))
    # Regroup the exploded rows under their original identifying columns.
    .groupBy(F.col("uniqueKey"), F.col("channel"), F.col("division"), F.col("gender"), F.col("category"))
    # Collect one week-struct per exploded row into the dataRows array.
    # NOTE(review): this assumes collect_list returns the elements in their
    # original array order (index 0 first); Spark does not guarantee that order
    # after a shuffle - confirm, or carry the position explicitly via posexplode.
    .agg(F.collect_list(weekCols.alias("dataRow")).alias("dataRows"))
    # Label the element at index 0 as 'Net Sales' and every other one as 'Sales Units'.
    .withColumn("dataRows", F.expr("transform(dataRows,(v,i) -> if(i=0,struct('Net Sales' as rowId,v as dataRow),struct('Sales Units' as rowId,v as dataRow)))"))
    # Serialize each row to a JSON string and gather all of them into one array cell.
    .select(F.collect_list(F.to_json(F.struct("*"))).alias("data"))
    # truncate=False prints the whole JSON string; the default cuts cells at 20 characters.
    .show(truncate=False)
)
最终输出
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|data |

|[{"uniqueKey":"key1","channel":"Digital","division":"APPAREL","gender":"KIDS","category":"COLLECTIONS","dataRows":[{"rowId":"Net Sales","dataRow":{"col1":0,"col2":7000,"col3":0,"col4":0}},{"rowId":"Sales Units","dataRow":{"col1":0,"col2":181,"col3":0,"col4":0}}]}, {"uniqueKey":"key3","channel":"Digital","division":"FOOTWEAR","gender":"MENS","category":"COLLECTIONS","dataRows":[{"rowId":"Net Sales","dataRow":{"col1":0,"col2":4567,"col3":0,"col4":0}},{"rowId":"Sales Units","dataRow":{"col1":0,"col2":34,"col3":0,"col4":0}}]}, {"uniqueKey":"key2","channel":"Digital","division":"APPAREL","gender":"KIDS","category":"CRICKET","dataRows":[{"rowId":"Net Sales","dataRow":{"col1":0,"col2":10000,"col3":0,"col4":0}},{"rowId":"Sales Units","dataRow":{"col1":0,"col2":699,"col3":0,"col4":0}}]}]|

格式化Json输出
[
{
"uniqueKey": "key1",
"channel": "Digital",
"division": "APPAREL",
"gender": "KIDS",
"category": "COLLECTIONS",
"dataRows": [
{
"rowId": "Net Sales",
"dataRow": {
"col1": 0,
"col2": 7000,
"col3": 0,
"col4": 0
}
},
{
"rowId": "Sales Units",
"dataRow": {
"col1": 0,
"col2": 181,
"col3": 0,
"col4": 0
}
}
]
},
{
"uniqueKey": "key3",
"channel": "Digital",
"division": "FOOTWEAR",
"gender": "MENS",
"category": "COLLECTIONS",
"dataRows": [
{
"rowId": "Net Sales",
"dataRow": {
"col1": 0,
"col2": 4567,
"col3": 0,
"col4": 0
}
},
{
"rowId": "Sales Units",
"dataRow": {
"col1": 0,
"col2": 34,
"col3": 0,
"col4": 0
}
}
]
},
{
"uniqueKey": "key2",
"channel": "Digital",
"division": "APPAREL",
"gender": "KIDS",
"category": "CRICKET",
"dataRows": [
{
"rowId": "Net Sales",
"dataRow": {
"col1": 0,
"col2": 10000,
"col3": 0,
"col4": 0
}
},
{
"rowId": "Sales Units",
"dataRow": {
"col1": 0,
"col2": 699,
"col3": 0,
"col4": 0
}
}
]
}
]
我有一个 spark 数据框,如下所示:
+-------------+-------+--------+------+-----------+----+----+----+-------------------------+----+
|uniqueKey |channel|division|gender|category |W1 |W3 |W4 |W2 |W5 |
+-------------+-------+--------+------+-----------+----+----+----+-------------------------+----+
|key1 |Digital|APPAREL |KIDS |COLLECTIONS|null|null|null|[7000, 181] |null|
|key2 |Digital|APPAREL |KIDS |CRICKET |null|null|null|[10000.3, 699] |null|
|key3 |Digital|FOOTWEAR|MENS |COLLECTIONS|null|null|null|[4567, 34] |null|
+-------------+-------+--------+------+-----------+----+----+----+-------------------------+----+
我需要创建一个如下所示的 Json:
{
"uniqueKey": "key1",
"division": "APPAREL",
"gender": "KIDS",
"category": "CRICKET",
"channel": "DIGITAL",
"dataRows": [
{
"rowId": "Net Sales",
"dataRow": {
"W1": 0,
"W2": 10000,
"W3": 0,
"W4": 0,
"W5": 0
}
},
{
"rowId": "Sales Units",
"dataRow": {
"W1": 0,
"W2": 699,
"W3": 0,
"W4": 0,
"W5": 0
}
}
]
}
我试过 pivot,但我不确定我做的是否正确。你能帮我解决这个问题吗?
添加了内嵌注释,检查下面的代码。
对于周列(Week Columns),这里我用的是 range(1,5),只覆盖 W1 到 W4;
如果要覆盖全部 52 周,可以把它改为 range(1,53)。
# Build one struct holding every week column; a null week is replaced by 0 and
# values are cast to int (so 10000.3 becomes 10000).
# A list comprehension is used instead of map(): on Python 3 map() returns an
# iterator, which F.struct does not unpack into individual columns.
# The struct fields are unnamed here, so to_json emits them as col1..colN;
# appending .alias(c) to each coalesce(...) should keep the W* names instead.
weekCols = F.struct([
    F.coalesce(F.col(c).cast("int"), F.lit(0))
    for c in ["W" + str(x) for x in range(1, 5)]
])

(
    df
    # W2 is an array column; explode produces one row per array element.
    .withColumn("W2", F.explode(F.col("W2")))
    # Regroup the exploded rows under their original identifying columns.
    .groupBy(F.col("uniqueKey"), F.col("channel"), F.col("division"), F.col("gender"), F.col("category"))
    # Collect one week-struct per exploded row into the dataRows array.
    # NOTE(review): this assumes collect_list returns the elements in their
    # original array order (index 0 first); Spark does not guarantee that order
    # after a shuffle - confirm, or carry the position explicitly via posexplode.
    .agg(F.collect_list(weekCols.alias("dataRow")).alias("dataRows"))
    # Label the element at index 0 as 'Net Sales' and every other one as 'Sales Units'.
    .withColumn("dataRows", F.expr("transform(dataRows,(v,i) -> if(i=0,struct('Net Sales' as rowId,v as dataRow),struct('Sales Units' as rowId,v as dataRow)))"))
    # Serialize each row to a JSON string and gather all of them into one array cell.
    .select(F.collect_list(F.to_json(F.struct("*"))).alias("data"))
    # truncate=False prints the whole JSON string; the default cuts cells at 20 characters.
    .show(truncate=False)
)
最终输出

|data |

|[{"uniqueKey":"key1","channel":"Digital","division":"APPAREL","gender":"KIDS","category":"COLLECTIONS","dataRows":[{"rowId":"Net Sales","dataRow":{"col1":0,"col2":7000,"col3":0,"col4":0}},{"rowId":"Sales Units","dataRow":{"col1":0,"col2":181,"col3":0,"col4":0}}]}, {"uniqueKey":"key3","channel":"Digital","division":"FOOTWEAR","gender":"MENS","category":"COLLECTIONS","dataRows":[{"rowId":"Net Sales","dataRow":{"col1":0,"col2":4567,"col3":0,"col4":0}},{"rowId":"Sales Units","dataRow":{"col1":0,"col2":34,"col3":0,"col4":0}}]}, {"uniqueKey":"key2","channel":"Digital","division":"APPAREL","gender":"KIDS","category":"CRICKET","dataRows":[{"rowId":"Net Sales","dataRow":{"col1":0,"col2":10000,"col3":0,"col4":0}},{"rowId":"Sales Units","dataRow":{"col1":0,"col2":699,"col3":0,"col4":0}}]}]|

格式化Json输出
[
{
"uniqueKey": "key1",
"channel": "Digital",
"division": "APPAREL",
"gender": "KIDS",
"category": "COLLECTIONS",
"dataRows": [
{
"rowId": "Net Sales",
"dataRow": {
"col1": 0,
"col2": 7000,
"col3": 0,
"col4": 0
}
},
{
"rowId": "Sales Units",
"dataRow": {
"col1": 0,
"col2": 181,
"col3": 0,
"col4": 0
}
}
]
},
{
"uniqueKey": "key3",
"channel": "Digital",
"division": "FOOTWEAR",
"gender": "MENS",
"category": "COLLECTIONS",
"dataRows": [
{
"rowId": "Net Sales",
"dataRow": {
"col1": 0,
"col2": 4567,
"col3": 0,
"col4": 0
}
},
{
"rowId": "Sales Units",
"dataRow": {
"col1": 0,
"col2": 34,
"col3": 0,
"col4": 0
}
}
]
},
{
"uniqueKey": "key2",
"channel": "Digital",
"division": "APPAREL",
"gender": "KIDS",
"category": "CRICKET",
"dataRows": [
{
"rowId": "Net Sales",
"dataRow": {
"col1": 0,
"col2": 10000,
"col3": 0,
"col4": 0
}
},
{
"rowId": "Sales Units",
"dataRow": {
"col1": 0,
"col2": 699,
"col3": 0,
"col4": 0
}
}
]
}
]