pyspark-连接两列数组元素
pyspark- connect two columns array elements
我对 pyspark 还很陌生。
我有一个包含两列的数据框,每列都有数组格式的字符串:
如何将第一列的数组元素连接到其他列数组中相同位置的值。
如果我将 Dataframe 转换为 Databricks 中的 Pandas Dataframe,下面的代码可以工作,但它不会使数组保持正确的格式。
# For each material name in `list_x`, create an output column and fill it by
# matching every header in `materialTextPart` to the value at the same
# position in `materialText`.
# NOTE(review): `list_x`, `df_head` (a pandas DataFrame) and `np` are defined
# elsewhere in the notebook — confirm before running standalone.
for item in list_x:
    # 'x' is a sentinel meaning "no value matched yet" for this column.
    df_head[item] = "x"
    value = df_head['materialText'].values
    headName = df_head['materialTextPart'].values
    value_list = []
    for k in range(len(df_head)):
        # Skip NaN cells. `np.float` was removed in NumPy >= 1.24 (the old
        # `type(value[k]) == np.float` raises AttributeError there); pandas
        # NaN is a plain Python float, so isinstance(..., float) is the
        # portable check.
        if isinstance(value[k], float):
            continue
        # Split the stringified arrays on commas; the header string also has
        # its surrounding bracket characters sliced off.
        value_array = value[k][0:].split(',')
        headName_array = headName[k][1:-2].split(',')
        for m in range(len(headName_array)):
            # Match the header against the item name, tolerating a leading
            # space and a singular form (item without any 's').
            if (headName_array[m] == item
                    or headName_array[m] == ' ' + item
                    or headName_array[m] == ' ' + item.replace('s', '')):
                columnValue = df_head.loc[k, item]
                if columnValue == 'x':
                    # First match for this row: overwrite the sentinel.
                    df_head.loc[k, item] = value_array[m]
                else:
                    # Subsequent match: append, comma-separated.
                    df_head.loc[k, item] = columnValue + ',' + value_array[m]
    # Rows that never matched keep the sentinel — convert it back to NaN.
    df_head[item] = df_head[item].replace('x', np.nan)
列示例:
[“织物:”,“壁挂支架:”,“顶轨/底轨:”]
[“100% 聚酯(100% 回收),PET 塑料”,“钢,Polycarbonate/ABS 塑料,粉末涂层”,“铝,粉末涂层”]
materialTextPart
materialText
["Fabric:", "Wall bracket:", "Top rail/ Bottom rail:"]
["100 % polyester (100% recycled), PET plastic", "Steel, Polycarbonate/ABS plastic, Powder coating", "Aluminium, Powder coating"]
["Ticking:", "Filling:", "Ticking, underside:", "Comfort filling:", "Ticking:"]
["100 % polyester (100% recycled)", "100 % polyester", "100% polypropylene", "Polyurethane foam 28 kg/cu.m.", "100% polyester"]
正如我在评论中提到的那样 -
# Demo: element-wise concatenation of two array<string> columns.
# zip_with pairs xs[i] with ys[i] and applies the lambda to each pair,
# so no explode/groupBy round-trip is needed.
from pyspark.sql.functions import *
from pyspark.sql.types import *
# One-row DataFrame with two array<string> columns matching the question's
# example data.  NOTE(review): relies on an active `spark` session
# (Databricks / pyspark shell provides one).
df = spark.createDataFrame( data = [
(["Fabric:", "Wall bracket:", "Top rail/ Bottom rail:"],
["100 % polyester (100% recycled), PET plastic", "Steel, Polycarbonate/ABS plastic, Powder coating", "Aluminium, Powder coating"]
)
],
schema = StructType([StructField("xs", ArrayType(StringType())), StructField("ys", ArrayType(StringType()))])
)
# concat(x, y) joins each header string with the value at the same index;
# truncate=False prints the full array content.
df.select(zip_with("xs", "ys", lambda x, y: concat(x,y)).alias("Array_Elements_Concat")).show(truncate=False)
输出
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Array_Elements_Concat |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[Fabric:100 % polyester (100% recycled), PET plastic, Wall bracket:Steel, Polycarbonate/ABS plastic, Powder coating, Top rail/ Bottom rail:Aluminium, Powder coating]|
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------+
我对 pyspark 还很陌生。 我有一个包含两列的数据框,每列都有数组格式的字符串: 如何将第一列的数组元素连接到其他列数组中相同位置的值。
如果我将 Dataframe 转换为 Databricks 中的 Pandas Dataframe,下面的代码可以工作,但它不会使数组保持正确的格式。
# Quoted question code (duplicate of the snippet above). Indentation was
# lost when pasting; `list_x`, `df_head` and `np` come from the notebook.
for item in list_x:
# 'x' is a sentinel meaning "no value matched yet" for this column.
df_head[item] = "x"
value = df_head['materialText'].values
headName = df_head['materialTextPart'].values
value_list = []
for k in range(len(df_head)):
# print(k)
# NOTE(review): np.float was removed in NumPy >= 1.24, so this line raises
# AttributeError there; isinstance(value[k], float) is the modern check
# for the NaN cells this is presumably skipping — confirm.
if type(value[k]) == np.float:
continue;
else:
# Split the stringified arrays on commas; the header string also has
# surrounding bracket characters sliced off.
value_array =value[k][0:].split(',')
# print(value_array)
headName_array = headName[k][1:-2].split(',')
for m in range(len(headName_array)):
# Match the header to the item, tolerating a leading space or an 's'.
if (headName_array[m] == item) or (headName_array[m] ==' '+item) or (headName_array[m] ==' '+item.replace('s','')):
columnName = item
columnValue = df_head.loc[k,columnName]
if columnValue == 'x':
# First match: overwrite the sentinel.
df_head.loc[k,columnName] = value_array[m]
else:
# Later matches: append, comma-separated.
df_head.loc[k,columnName]= df_head.loc[k,columnName]+ ',' + value_array[m]
# Rows that never matched keep the sentinel — convert it back to NaN.
df_head[item] = df_head[item].replace('x', np.nan)
列示例: [“织物:”,“壁挂支架:”,“顶轨/底轨:”] [“100% 聚酯(100% 回收),PET 塑料”,“钢,Polycarbonate/ABS 塑料,粉末涂层”,“铝,粉末涂层”]
materialTextPart | materialText |
---|---|
["Fabric:", "Wall bracket:", "Top rail/ Bottom rail:"] | |
["100 % polyester (100% recycled), PET plastic", "Steel, Polycarbonate/ABS plastic, Powder coating", "Aluminium, Powder coating"] | |
["Ticking:", "Filling:", "Ticking, underside:", "Comfort filling:", "Ticking:"] | ["100 % polyester (100% recycled)", "100 % polyester", "100% polypropylene", "Polyurethane foam 28 kg/cu.m.", "100% polyester"] |
正如我在评论中提到的那样 -
# Answer code (duplicate of the snippet above): element-wise concatenation
# of two array<string> columns via zip_with.
from pyspark.sql.functions import *
from pyspark.sql.types import *
# One-row demo DataFrame.  NOTE(review): relies on an active `spark`
# session (Databricks / pyspark shell provides one).
df = spark.createDataFrame( data = [
(["Fabric:", "Wall bracket:", "Top rail/ Bottom rail:"],
["100 % polyester (100% recycled), PET plastic", "Steel, Polycarbonate/ABS plastic, Powder coating", "Aluminium, Powder coating"]
)
],
schema = StructType([StructField("xs", ArrayType(StringType())), StructField("ys", ArrayType(StringType()))])
)
# zip_with pairs xs[i] with ys[i]; concat(x, y) joins header and value.
df.select(zip_with("xs", "ys", lambda x, y: concat(x,y)).alias("Array_Elements_Concat")).show(truncate=False)
输出
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Array_Elements_Concat |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[Fabric:100 % polyester (100% recycled), PET plastic, Wall bracket:Steel, Polycarbonate/ABS plastic, Powder coating, Top rail/ Bottom rail:Aluminium, Powder coating]|
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------+