使用 UDF 时忽略条件
Using UDF ignores condition in when
假设您有以下 pyspark DataFrame:
data= [('foo',), ('123',), (None,), ('bar',)]
df = sqlCtx.createDataFrame(data, ["col"])
df.show()
#+----+
#| col|
#+----+
#| foo|
#| 123|
#|null|
#| bar|
#+----+
接下来的两个代码块应该做同样的事情——也就是说,如果该列不是 null
,则返回该列的大写形式。但是,第二种方法(使用 udf
)会产生错误。
方法一:使用pyspark.sql.functions.upper()
import pyspark.sql.functions as f
df.withColumn(
'upper',
f.when(
f.isnull(f.col('col')),
f.col('col')
).otherwise(f.upper(f.col('col')))
).show()
#+----+-----+
#| col|upper|
#+----+-----+
#| foo| FOO|
#| 123| 123|
#|null| null|
#| bar| BAR|
#+----+-----+
方法 2:在 udf
中使用 str.upper()
df.withColumn(
'upper',
f.when(
f.isnull(f.col('col')),
f.col('col')
).otherwise(f.udf(lambda x: x.upper(), StringType())(f.col('col')))
).show()
这给了我 AttributeError: 'NoneType' object has no attribute 'upper'
。为什么 when
调用中的 f.isnull()
检查似乎被忽略了?
我知道我可以将我的 udf
更改为 f.udf(lambda x: x.upper() if x else x, StringType())
以避免此错误,但我想了解为什么会这样。
完整追溯:
Py4JJavaErrorTraceback (most recent call last)
<ipython-input-38-cbf0ffe73538> in <module>()
4 f.isnull(f.col('col')),
5 f.col('col')
----> 6 ).otherwise(f.udf(lambda x: x.upper(), StringType())(f.col('col')))
7 ).show()
/opt/SPARK2/lib/spark2/python/pyspark/sql/dataframe.py in show(self, n, truncate)
316 """
317 if isinstance(truncate, bool) and truncate:
--> 318 print(self._jdf.showString(n, 20))
319 else:
320 print(self._jdf.showString(n, int(truncate)))
/opt/SPARK2/lib/spark2/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py in __call__(self, *args)
1131 answer = self.gateway_client.send_command(command)
1132 return_value = get_return_value(
-> 1133 answer, self.gateway_client, self.target_id, self.name)
1134
1135 for temp_arg in temp_args:
/opt/SPARK2/lib/spark2/python/pyspark/sql/utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
/opt/SPARK2/lib/spark2/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
317 raise Py4JJavaError(
318 "An error occurred while calling {0}{1}{2}.\n".
--> 319 format(target_id, ".", name), value)
320 else:
321 raise Py4JError(
Py4JJavaError: An error occurred while calling o642.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 51 in stage 77.0 failed 4 times, most recent failure: Lost task 51.3 in stage 77.0 (TID 5101, someserver.prod.somecompany.net, executor 99): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/opt/SPARK2/lib/spark2/python/lib/pyspark.zip/pyspark/worker.py", line 174, in main
process()
File "/opt/SPARK2/lib/spark2/python/lib/pyspark.zip/pyspark/worker.py", line 169, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/opt/SPARK2/lib/spark2/python/lib/pyspark.zip/pyspark/worker.py", line 106, in <lambda>
func = lambda _, it: map(mapper, it)
File "/opt/SPARK2/lib/spark2/python/lib/pyspark.zip/pyspark/worker.py", line 92, in <lambda>
mapper = lambda a: udf(*a)
File "/opt/SPARK2/lib/spark2/python/lib/pyspark.zip/pyspark/worker.py", line 70, in <lambda>
return lambda *a: f(*a)
File "<ipython-input-38-cbf0ffe73538>", line 6, in <lambda>
AttributeError: 'NoneType' object has no attribute 'upper'
你必须记住 Spark SQL(不像 RDD)不是 what-you-see-is-what-you-get。优化器/规划器可以自由地以任意顺序安排操作,甚至可以多次重复阶段。
Python udfs
不是在 Row
基础上应用,而是以批处理模式执行。这里 when
并非真的被忽略,只是它没有被用于优化 UDF 的批量求值——从物理执行计划可以看出:
== Physical Plan ==
*Project [col#0, CASE WHEN isnull(col#0) THEN col#0 ELSE pythonUDF0#21 END AS upper#17]
+- BatchEvalPython [<lambda>(col#0)], [col#0, pythonUDF0#21]
+- Scan ExistingRDD[col#0]
因此与 udf
一起使用的函数必须对 None
输入具有鲁棒性,例如:
df.withColumn(
'upper',
f.udf(
lambda x: x.upper() if x is not None else None,
StringType()
)(f.col('col'))
).show()
假设您有以下 pyspark DataFrame:
data= [('foo',), ('123',), (None,), ('bar',)]
df = sqlCtx.createDataFrame(data, ["col"])
df.show()
#+----+
#| col|
#+----+
#| foo|
#| 123|
#|null|
#| bar|
#+----+
接下来的两个代码块应该做同样的事情——也就是说,如果该列不是 null
,则返回该列的大写形式。但是,第二种方法(使用 udf
)会产生错误。
方法一:使用pyspark.sql.functions.upper()
import pyspark.sql.functions as f
df.withColumn(
'upper',
f.when(
f.isnull(f.col('col')),
f.col('col')
).otherwise(f.upper(f.col('col')))
).show()
#+----+-----+
#| col|upper|
#+----+-----+
#| foo| FOO|
#| 123| 123|
#|null| null|
#| bar| BAR|
#+----+-----+
方法 2:在 udf
中使用 str.upper()
df.withColumn(
'upper',
f.when(
f.isnull(f.col('col')),
f.col('col')
).otherwise(f.udf(lambda x: x.upper(), StringType())(f.col('col')))
).show()
这给了我 AttributeError: 'NoneType' object has no attribute 'upper'
。为什么 when
调用中的 f.isnull()
检查似乎被忽略了?
我知道我可以将我的 udf
更改为 f.udf(lambda x: x.upper() if x else x, StringType())
以避免此错误,但我想了解为什么会这样。
完整追溯:
Py4JJavaErrorTraceback (most recent call last)
<ipython-input-38-cbf0ffe73538> in <module>()
4 f.isnull(f.col('col')),
5 f.col('col')
----> 6 ).otherwise(f.udf(lambda x: x.upper(), StringType())(f.col('col')))
7 ).show()
/opt/SPARK2/lib/spark2/python/pyspark/sql/dataframe.py in show(self, n, truncate)
316 """
317 if isinstance(truncate, bool) and truncate:
--> 318 print(self._jdf.showString(n, 20))
319 else:
320 print(self._jdf.showString(n, int(truncate)))
/opt/SPARK2/lib/spark2/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py in __call__(self, *args)
1131 answer = self.gateway_client.send_command(command)
1132 return_value = get_return_value(
-> 1133 answer, self.gateway_client, self.target_id, self.name)
1134
1135 for temp_arg in temp_args:
/opt/SPARK2/lib/spark2/python/pyspark/sql/utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
/opt/SPARK2/lib/spark2/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
317 raise Py4JJavaError(
318 "An error occurred while calling {0}{1}{2}.\n".
--> 319 format(target_id, ".", name), value)
320 else:
321 raise Py4JError(
Py4JJavaError: An error occurred while calling o642.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 51 in stage 77.0 failed 4 times, most recent failure: Lost task 51.3 in stage 77.0 (TID 5101, someserver.prod.somecompany.net, executor 99): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/opt/SPARK2/lib/spark2/python/lib/pyspark.zip/pyspark/worker.py", line 174, in main
process()
File "/opt/SPARK2/lib/spark2/python/lib/pyspark.zip/pyspark/worker.py", line 169, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/opt/SPARK2/lib/spark2/python/lib/pyspark.zip/pyspark/worker.py", line 106, in <lambda>
func = lambda _, it: map(mapper, it)
File "/opt/SPARK2/lib/spark2/python/lib/pyspark.zip/pyspark/worker.py", line 92, in <lambda>
mapper = lambda a: udf(*a)
File "/opt/SPARK2/lib/spark2/python/lib/pyspark.zip/pyspark/worker.py", line 70, in <lambda>
return lambda *a: f(*a)
File "<ipython-input-38-cbf0ffe73538>", line 6, in <lambda>
AttributeError: 'NoneType' object has no attribute 'upper'
你必须记住 Spark SQL(不像 RDD)不是 what-you-see-is-what-you-get。优化器/规划器可以自由地以任意顺序安排操作,甚至可以多次重复阶段。
Python udfs
不是在 Row
基础上应用,而是以批处理模式执行。这里 when
并非真的被忽略,只是它没有被用于优化 UDF 的批量求值——从物理执行计划可以看出:
== Physical Plan ==
*Project [col#0, CASE WHEN isnull(col#0) THEN col#0 ELSE pythonUDF0#21 END AS upper#17]
+- BatchEvalPython [<lambda>(col#0)], [col#0, pythonUDF0#21]
+- Scan ExistingRDD[col#0]
因此与 udf
一起使用的函数必须对 None
输入具有鲁棒性,例如:
df.withColumn(
'upper',
f.udf(
lambda x: x.upper() if x is not None else None,
StringType()
)(f.col('col'))
).show()