将字符串列表转换为数组类型
Convert string list to array type
我有一个数据框,其中一列的数据类型是字符串,但其内容实际上表示的是一个数组。
import pyspark
from pyspark.sql import Row
# Sample data: one (item, geography) pair per row.
# NOTE(review): geography is given here as Python lists, which presumably makes
# Spark infer an array column rather than the string column shown in the schema
# comment further down - confirm against the real data.
_samples = [
    ('fish', ['london', 'a', 'b', 'hyd']),
    ('chicken', ['a', 'hyd', 'c']),
    ('rice', ['a', 'b', 'c', 'blr']),
    ('soup', ['a', 'kol', 'simla']),
    ('pav', ['a', 'del']),
    ('kachori', ['a', 'guj']),
    ('fries', ['a', 'chen']),
    ('noodles', ['a', 'mum']),
]
item = spark.createDataFrame(
    [Row(item=name, geography=places) for name, places in _samples]
)
item.show()
# +-------+-------------------+
# | item| geography|
# +-------+-------------------+
# | fish|[london, a, b, hyd]|
# |chicken| [a, hyd, c]|
# | rice| [a, b, c, blr]|
# | soup| [a, kol, simla]|
# | pav| [a, del]|
# |kachori| [a, guj]|
# | fries| [a, chen]|
# |noodles| [a, mum]|
# +-------+-------------------+
# printSchema() writes the schema to stdout itself and returns None, so wrapping
# it in print() would emit an extra "None" line after the schema.
item.printSchema()
# root
# |-- item: string (nullable = true)
# |-- geography: string (nullable = true)
如何将上述数据集中的 geography 列转换为数组类型?
使用拆分
选项 1
from pyspark.sql.functions import regexp_replace, split

# Option 1: drop every character that is not a word character or a comma
# (brackets, quotes, spaces), then split what is left on commas.
# Raw strings pass the same regex text to Spark without relying on Python
# keeping unrecognised escapes such as "\w" and "\," intact.
new = item.withColumn(
    'geography',
    split(regexp_replace('geography', r'[^\w\,]', ''), r'\,'),
)
# printSchema() returns None, so it is called separately to keep `new` bound
# to the transformed DataFrame instead of None.
new.printSchema()
选项 2
from pyspark.sql.functions import col, split

# Option 2: make sure the column is a string, then split it on commas.
# NOTE(review): nothing here strips brackets or quotes, so the resulting
# elements presumably still contain them - confirm this is acceptable.
new1 = (
    item.withColumn('geography', col('geography').cast('string'))
    .withColumn('geography', split('geography', r'\,'))
)
# printSchema() returns None, so it is called separately to keep `new1` bound
# to the transformed DataFrame instead of None.
new1.printSchema()
# Raw string: Spark's SQL parser unescapes string literals, so the SQL text
# must contain '(\\w+)' for the regex engine to receive (\w+).
F.expr(r"regexp_extract_all(geography, '(\\w+)', 1)")
regexp_extract_all 自 Spark 3.1 起可用。
regexp_extract_all(str, regexp[, idx])
- Extract all strings in the `str` that match the `regexp` expression and corresponding to the regex group index.
from pyspark.sql import Row, functions as F
# Two sample rows whose geography value is a single string that only looks
# like a list literal.
_string_rows = [
    Row(item='fish', geography="['london','a','b','hyd']"),
    Row(item='noodles', geography="['a','mum']"),
]
item = spark.createDataFrame(_string_rows)
item.printSchema()
# root
# |-- item: string (nullable = true)
# |-- geography: string (nullable = true)
# Replace the string column with an array of every run of word characters.
# The SQL text is a raw Python string so that Spark's SQL parser receives
# '(\\w+)'; the parser unescapes string literals, so a single backslash would
# reach the regex engine as a plain "w".
item = item.withColumn(
    'geography',
    F.expr(r"regexp_extract_all(geography, '(\\w+)', 1)"),
)
item.printSchema()
# root
# |-- item: string (nullable = true)
# |-- geography: array (nullable = true)
# | |-- element: string (containsNull = true)
我有一个数据框,其中一列的数据类型是字符串,但其内容实际上表示的是一个数组。
import pyspark
from pyspark.sql import Row
# Sample data: one (item, geography) pair per row.
# NOTE(review): geography is given here as Python lists, which presumably makes
# Spark infer an array column rather than the string column shown in the schema
# comment further down - confirm against the real data.
_samples = [
    ('fish', ['london', 'a', 'b', 'hyd']),
    ('chicken', ['a', 'hyd', 'c']),
    ('rice', ['a', 'b', 'c', 'blr']),
    ('soup', ['a', 'kol', 'simla']),
    ('pav', ['a', 'del']),
    ('kachori', ['a', 'guj']),
    ('fries', ['a', 'chen']),
    ('noodles', ['a', 'mum']),
]
item = spark.createDataFrame(
    [Row(item=name, geography=places) for name, places in _samples]
)
item.show()
# +-------+-------------------+
# | item| geography|
# +-------+-------------------+
# | fish|[london, a, b, hyd]|
# |chicken| [a, hyd, c]|
# | rice| [a, b, c, blr]|
# | soup| [a, kol, simla]|
# | pav| [a, del]|
# |kachori| [a, guj]|
# | fries| [a, chen]|
# |noodles| [a, mum]|
# +-------+-------------------+
# printSchema() writes the schema to stdout itself and returns None, so wrapping
# it in print() would emit an extra "None" line after the schema.
item.printSchema()
# root
# |-- item: string (nullable = true)
# |-- geography: string (nullable = true)
如何将上述数据集中的 geography 列转换为数组类型?
使用拆分
选项 1
from pyspark.sql.functions import regexp_replace, split

# Option 1: drop every character that is not a word character or a comma
# (brackets, quotes, spaces), then split what is left on commas.
# Raw strings pass the same regex text to Spark without relying on Python
# keeping unrecognised escapes such as "\w" and "\," intact.
new = item.withColumn(
    'geography',
    split(regexp_replace('geography', r'[^\w\,]', ''), r'\,'),
)
# printSchema() returns None, so it is called separately to keep `new` bound
# to the transformed DataFrame instead of None.
new.printSchema()
选项 2
from pyspark.sql.functions import col, split

# Option 2: make sure the column is a string, then split it on commas.
# NOTE(review): nothing here strips brackets or quotes, so the resulting
# elements presumably still contain them - confirm this is acceptable.
new1 = (
    item.withColumn('geography', col('geography').cast('string'))
    .withColumn('geography', split('geography', r'\,'))
)
# printSchema() returns None, so it is called separately to keep `new1` bound
# to the transformed DataFrame instead of None.
new1.printSchema()
# Raw string: Spark's SQL parser unescapes string literals, so the SQL text
# must contain '(\\w+)' for the regex engine to receive (\w+).
F.expr(r"regexp_extract_all(geography, '(\\w+)', 1)")
regexp_extract_all 自 Spark 3.1 起可用。
regexp_extract_all(str, regexp[, idx])
- Extract all strings in the `str` that match the `regexp` expression and corresponding to the regex group index.
from pyspark.sql import Row, functions as F
# Two sample rows whose geography value is a single string that only looks
# like a list literal.
_string_rows = [
    Row(item='fish', geography="['london','a','b','hyd']"),
    Row(item='noodles', geography="['a','mum']"),
]
item = spark.createDataFrame(_string_rows)
item.printSchema()
# root
# |-- item: string (nullable = true)
# |-- geography: string (nullable = true)
# Replace the string column with an array of every run of word characters.
# The SQL text is a raw Python string so that Spark's SQL parser receives
# '(\\w+)'; the parser unescapes string literals, so a single backslash would
# reach the regex engine as a plain "w".
item = item.withColumn(
    'geography',
    F.expr(r"regexp_extract_all(geography, '(\\w+)', 1)"),
)
item.printSchema()
# root
# |-- item: string (nullable = true)
# |-- geography: array (nullable = true)
# | |-- element: string (containsNull = true)