如何通过pyspark更改数组结构中的列类型
how to change a column type in array struct by pyspark
如何通过 pyspark 更改数组结构中的列类型,例如,我想将 userid
从 int 更改为 long
root
|-- id: string (nullable = true)
|-- numbers: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- m1: long (nullable = true)
| | |-- m2: long (nullable = true)
| | |-- m3: struct (nullable = true)
| | | |-- userid: integer (nullable = true)
如果您也提供可重现的 df 将会很有用。
根据下方评论,请参阅以下可重现的示例代码:
# Reproducible example: a DataFrame whose schema nests an integer
# `userid` inside a struct, inside an array of structs.
element_struct = StructType([
    StructField('m1', LongType(), True),
    StructField('m2', LongType(), True),
    StructField('m3', StructType([StructField('userid', IntegerType(), True)]), True),
])
sch = StructType([
    StructField('id', StringType(), False),
    StructField('numbers', ArrayType(element_struct), True),
])
rows = [
    ('21', [(1234567, 9876543, (1,))]),
    ('34', [(63467892345, 19523789, (2,))]),
]
df = spark.createDataFrame(rows, schema=sch)
df.printSchema()
root
|-- id: string (nullable = false)
|-- numbers: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- m1: long (nullable = true)
| | |-- m2: long (nullable = true)
| | |-- m3: struct (nullable = true)
| | | |-- userid: integer (nullable = true)
解决方案
# Cast the whole array-of-structs to a new schema string; this is the
# simplest way to retype a deeply nested field in Spark SQL.
# NOTE: the question asks for int -> long, so the nested field must be
# cast to `long` (the original answer mistakenly used `double`).
df1 = df.selectExpr(
    "id",
    "CAST(numbers AS array<struct<m1:long,m2:long, m3:struct<userid:long>>>) numbers"
)
df1.printSchema()
root
|-- id: string (nullable = false)
|-- numbers: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- m1: long (nullable = true)
| | |-- m2: long (nullable = true)
| | |-- m3: struct (nullable = true)
| | | |-- userid: long (nullable = true)
如何通过 pyspark 更改数组结构中的列类型,例如,我想将 userid
从 int 更改为 long
root
|-- id: string (nullable = true)
|-- numbers: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- m1: long (nullable = true)
| | |-- m2: long (nullable = true)
| | |-- m3: struct (nullable = true)
| | | |-- userid: integer (nullable = true)
如果您也提供可重现的 df 将会很有用。
根据下方评论,请参阅以下可重现的示例代码:
# Reproducible example: a DataFrame whose schema nests an integer
# `userid` inside a struct, inside an array of structs.
element_struct = StructType([
    StructField('m1', LongType(), True),
    StructField('m2', LongType(), True),
    StructField('m3', StructType([StructField('userid', IntegerType(), True)]), True),
])
sch = StructType([
    StructField('id', StringType(), False),
    StructField('numbers', ArrayType(element_struct), True),
])
rows = [
    ('21', [(1234567, 9876543, (1,))]),
    ('34', [(63467892345, 19523789, (2,))]),
]
df = spark.createDataFrame(rows, schema=sch)
df.printSchema()
root
|-- id: string (nullable = false)
|-- numbers: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- m1: long (nullable = true)
| | |-- m2: long (nullable = true)
| | |-- m3: struct (nullable = true)
| | | |-- userid: integer (nullable = true)
解决方案
# Cast the whole array-of-structs to a new schema string; this is the
# simplest way to retype a deeply nested field in Spark SQL.
# NOTE: the question asks for int -> long, so the nested field must be
# cast to `long` (the original answer mistakenly used `double`).
df1 = df.selectExpr(
    "id",
    "CAST(numbers AS array<struct<m1:long,m2:long, m3:struct<userid:long>>>) numbers"
)
df1.printSchema()
root
|-- id: string (nullable = false)
|-- numbers: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- m1: long (nullable = true)
| | |-- m2: long (nullable = true)
| | |-- m3: struct (nullable = true)
| | | |-- userid: long (nullable = true)