如何在 PySpark 中遍历结构体数组并返回我想要的元素

How to iterate through an array struct and return the element I want in pyspark

这是我的示例 json 文件:

{"data":"example1","data2":"example2","register":[{"name":"John","last_name":"Travolta","age":68},{"name":"Nicolas","last_name":"Cage","age":58}], "data3":"example3","data4":"example4"}

我有一个类似于此的数据模式(完全是说明性的):

root
 |-- register: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- age: long (nullable = true)
 |    |    |-- last_name: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- data: string (nullable = true)
 |-- data2: string (nullable = true)
 |-- data3: string (nullable = true)
 |-- data4: string (nullable = true)

我想要的是遍历这个 register 数组,检查 name 字段是否等于某个值(例如 John Travolta),并创建一个新的结构体(例如 new_register),其中包含与该 name 位于同一索引处的所有字段。

我尝试过使用 Spark 自带的一些函数,例如 filter、when、contains,但没有一个能给出我想要的结果。

我也尝试过实现一个 UDF,但我找不到把该函数应用到我想要的字段上的方法。

如何解决上述问题?

首先展开(explode)数组字段,然后使用点号表示法访问结构体字段,并过滤出所需的值。代码如下:

# Inspect the input: print its schema, then up to 10 rows with untruncated cells.
df.printSchema()
df.show(10, truncate=False)

# Predicate on the exploded struct: dot notation reaches into the struct's
# fields, and both the first and last name must match.
is_john_travolta = (col("new_struct.last_name") == 'Travolta') & (col("new_struct.name") == 'John')

# explode() yields one output row per element of the `register` array, placing
# that element in `new_struct`; the predicate then keeps only the matching rows.
df1 = df.withColumn("new_struct", explode("register")).where(is_john_travolta)

# Inspect the result the same way as the input.
df1.show(10, truncate=False)
df1.printSchema()


 root
 |-- data: string (nullable = true)
 |-- data2: string (nullable = true)
 |-- data3: string (nullable = true)
 |-- data4: string (nullable = true)
 |-- register: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- age: long (nullable = true)
 |    |    |-- last_name: string (nullable = true)
 |    |    |-- name: string (nullable = true)

+--------+--------+--------+--------+-------------------------------------------+
|data    |data2   |data3   |data4   |register                                   |
+--------+--------+--------+--------+-------------------------------------------+
|example1|example2|example3|example4|[{68, Travolta, John}, {58, Cage, Nicolas}]|
+--------+--------+--------+--------+-------------------------------------------+

+--------+--------+--------+--------+-------------------------------------------+--------------------+
|data    |data2   |data3   |data4   |register                                   |new_struct          |
+--------+--------+--------+--------+-------------------------------------------+--------------------+
|example1|example2|example3|example4|[{68, Travolta, John}, {58, Cage, Nicolas}]|{68, Travolta, John}|
+--------+--------+--------+--------+-------------------------------------------+--------------------+

root
 |-- data: string (nullable = true)
 |-- data2: string (nullable = true)
 |-- data3: string (nullable = true)
 |-- data4: string (nullable = true)
 |-- register: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- age: long (nullable = true)
 |    |    |-- last_name: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- new_struct: struct (nullable = true)
 |    |-- age: long (nullable = true)
 |    |-- last_name: string (nullable = true)
 |    |-- name: string (nullable = true)