AnalysisException,pyspark 无法解析数据框查询中的变量
AnalysisException, pyspark cannot resolve variables inside dataframe query
我这里有一个 pyspark 脚本行,
df_output = df.select("*",$checkcol)
df_output.show()
通过对变量进行硬编码可以很好地工作,
但是当参数化时它会抛出一个错误说,
pyspark.sql.utils.AnalysisException: 'cannot resolve \'`"*", F.....
其中 checkcol
是一个变量,其值如下所示,
checkcol -
F.when(F.col("colA")=='null',"Yes").otherwise(date_validation_udf("colA")).alias("colA_DateCheck"),
F.when(F.col("colB")=='null',"Yes").otherwise(date_validation_udf("colB")).alias("colB_DateCheck"),F.when(F.col("colC")=='null',"Yes").otherwise(date_validation_udf("colC")).alias("colC_DateCheck"),
F.when(F.col("colD")=='null',"Yes").otherwise(num_check_udf("colD")).alias("colD_NumCheck"),F.when(F.col("colE")=='null',"Yes").otherwise(num_check_udf("colE")).alias("colE_NumCheck"),
F.when(F.col("colF")=='null',"Yes").otherwise(num_check_udf("colF")).alias("colF_NumCheck"),F.when(F.col("colG")=='null',"Yes").otherwise(num_check_udf("colG")).alias("colG_NumCheck")
试试这个:
import pyspark.sql.functions as F

# NOTE: withColumn accepts exactly ONE column expression per call, and a
# chain that starts each line with ".withColumn" must be wrapped in
# parentheses, otherwise it is a SyntaxError. The new column's name is the
# first argument, so .alias(...) inside withColumn would be ignored —
# name the check column directly instead of overwriting the source column.
df_output = (
    df.withColumn(
        "colA_DateCheck",
        F.when(F.col("colA") == 'null', "Yes").otherwise(date_validation_udf("colA")),
    )
    .withColumn(
        "colB_DateCheck",
        F.when(F.col("colB") == 'null', "Yes").otherwise(date_validation_udf("colB")),
    )
    # ... repeat one withColumn per remaining column (colC ... colG) ...
)
df_output.show()
编辑:
出现该错误的原因是 `$checkcol` 是以字符串替换的方式拼进语句的, Spark 会把整段文本当作一个列名去解析, 所以报 cannot resolve。正确的做法是把这些表达式保存为一个 Column 对象的元组变量, 再用 `*` 解包传给 select, 试试这个:
# Build the check columns programmatically rather than spelling each one out:
# every date column yields "<col>_DateCheck" and every numeric column yields
# "<col>_NumCheck" — "Yes" when the source value equals the string 'null',
# otherwise the result of the corresponding validation UDF.
_date_cols = ("colA", "colB", "colC")
_num_cols = ("colD", "colE", "colF", "colG")

checkcol = tuple(
    F.when(F.col(name) == 'null', "Yes")
     .otherwise(date_validation_udf(name))
     .alias(name + "_DateCheck")
    for name in _date_cols
) + tuple(
    F.when(F.col(name) == 'null', "Yes")
     .otherwise(num_check_udf(name))
     .alias(name + "_NumCheck")
    for name in _num_cols
)

# Keep every original column and append the check columns by unpacking.
df_output = df.select('*', *checkcol)
我这里有一个 pyspark 脚本行,
df_output = df.select("*",$checkcol)
df_output.show()
通过对变量进行硬编码可以很好地工作, 但是当参数化时它会抛出一个错误说,
pyspark.sql.utils.AnalysisException: 'cannot resolve \'`"*", F.....
其中 checkcol
是一个变量,其值如下所示,
checkcol -
F.when(F.col("colA")=='null',"Yes").otherwise(date_validation_udf("colA")).alias("colA_DateCheck"),
F.when(F.col("colB")=='null',"Yes").otherwise(date_validation_udf("colB")).alias("colB_DateCheck"),F.when(F.col("colC")=='null',"Yes").otherwise(date_validation_udf("colC")).alias("colC_DateCheck"),
F.when(F.col("colD")=='null',"Yes").otherwise(num_check_udf("colD")).alias("colD_NumCheck"),F.when(F.col("colE")=='null',"Yes").otherwise(num_check_udf("colE")).alias("colE_NumCheck"),
F.when(F.col("colF")=='null',"Yes").otherwise(num_check_udf("colF")).alias("colF_NumCheck"),F.when(F.col("colG")=='null',"Yes").otherwise(num_check_udf("colG")).alias("colG_NumCheck")
试试这个:
import pyspark.sql.functions as F

# NOTE: withColumn accepts exactly ONE column expression per call, and a
# chain that starts each line with ".withColumn" must be wrapped in
# parentheses, otherwise it is a SyntaxError. The new column's name is the
# first argument, so .alias(...) inside withColumn would be ignored —
# name the check column directly instead of overwriting the source column.
df_output = (
    df.withColumn(
        "colA_DateCheck",
        F.when(F.col("colA") == 'null', "Yes").otherwise(date_validation_udf("colA")),
    )
    .withColumn(
        "colB_DateCheck",
        F.when(F.col("colB") == 'null', "Yes").otherwise(date_validation_udf("colB")),
    )
    # ... repeat one withColumn per remaining column (colC ... colG) ...
)
df_output.show()
编辑:
出现该错误的原因是 `$checkcol` 是以字符串替换的方式拼进语句的, Spark 会把整段文本当作一个列名去解析, 所以报 cannot resolve。正确的做法是把这些表达式保存为一个 Column 对象的元组变量, 再用 `*` 解包传给 select, 试试这个:
# Build the check columns programmatically rather than spelling each one out:
# every date column yields "<col>_DateCheck" and every numeric column yields
# "<col>_NumCheck" — "Yes" when the source value equals the string 'null',
# otherwise the result of the corresponding validation UDF.
_date_cols = ("colA", "colB", "colC")
_num_cols = ("colD", "colE", "colF", "colG")

checkcol = tuple(
    F.when(F.col(name) == 'null', "Yes")
     .otherwise(date_validation_udf(name))
     .alias(name + "_DateCheck")
    for name in _date_cols
) + tuple(
    F.when(F.col(name) == 'null', "Yes")
     .otherwise(num_check_udf(name))
     .alias(name + "_NumCheck")
    for name in _num_cols
)

# Keep every original column and append the check columns by unpacking.
df_output = df.select('*', *checkcol)