Pyspark 中的多个 WHEN 条件实现
Multiple WHEN condition implementation in Pyspark
我有我的 T-SQL 代码,我在 Pyspark 中转换了下面的代码,但给我错误
CASE
WHEN time_on_site.eventaction = 'IN' AND time_on_site.next_action = 'OUT' AND time_on_site.timespent_sec < 72000 THEN 1 -- 20 hours
WHEN time_on_site.eventaction = 'IN' AND time_on_site.next_action = 'OUT' AND time_on_site.timespent_sec >= 72000 THEN 0
WHEN time_on_site.eventaction = 'IN' AND time_on_site.next_action = 'IN' AND time_on_site.timespent_sec <= 28800 THEN 2 -- 8 hours
WHEN time_on_site.eventaction = 'IN' AND time_on_site.next_action = 'IN' AND time_on_site.timespent_sec > 28800 THEN 3
WHEN time_on_site.type_flag = 'TYPE4' THEN 4
ELSE NULL
END AS "type"
下面是我的 Pyspark 脚本,它抛出了一个错误
from pyspark.sql.functions import when
TOS=TOS.withColumn('type', F.when( (col('eventaction') == 'IN') & (col('next_action') == 'OUT') & ("timespent_sec < 72000") , 1).
when( (col('eventaction') == 'IN') & (col('next_action') == 'OUT') & ("timespent_sec >= 72000") , 0).
when( (col('eventaction') == 'IN') & (col('next_action') == 'IN') & ("timespent_sec <= 28800") , 2).
when( (col('eventaction') == 'IN') & (col('next_action') == 'IN') & ("timespent_sec > 28800") , 3).
when(col('type_flag')=='TYPE4', 4).otherwise('NULL')
)
我哪里错了!?
我不知道 T-SQL 语法,但是如果你想做 if:.. elif: ...elif.... else
,那么下面的代码就可以了。
from pyspark.sql.functions import when, col
TOS=TOS.withColumn('type', when( (col('eventaction') == 'IN') & (col('next_action') == 'OUT') & ("timespent_sec < 72000") , 1).
otherwise( when( (col('eventaction') == 'IN') & (col('next_action') == 'OUT') & ("timespent_sec >= 72000") , 0).
otherwise( when( (col('eventaction') == 'IN') & (col('next_action') == 'IN') & ("timespent_sec <= 28800") , 2).
otherwise( when( (col('eventaction') == 'IN') & (col('next_action') == 'IN') & ("timespent_sec > 28800") , 3).
otherwise( when( col('type_flag')=='TYPE4', 4).otherwise('NULL'))))))
我已经正确实现了,如下所示
import pyspark.sql.functions as F
TOS=TOS.withColumn('type', F.when( (F.col("eventaction") == 'IN') & (F.col("next_action") == 'OUT') & (F.col("timespent_sec") < 72000) , 1).
when( (F.col("eventaction") == 'IN') & (F.col("next_action") == 'OUT') & (F.col("timespent_sec") >= 72000) , 0).
when( (F.col("eventaction") == 'IN') & (F.col("next_action") == 'IN') & (F.col("timespent_sec") <= 28800) , 2).
when( (F.col("eventaction") == 'IN') & (F.col("next_action") == 'IN') & (F.col("timespent_sec") > 28800) , 3).
when(F.col('type_flag')=='TYPE4', 4).otherwise('NULL'))
我有我的 T-SQL 代码,我在 Pyspark 中转换了下面的代码,但给我错误
CASE
WHEN time_on_site.eventaction = 'IN' AND time_on_site.next_action = 'OUT' AND time_on_site.timespent_sec < 72000 THEN 1 -- 20 hours
WHEN time_on_site.eventaction = 'IN' AND time_on_site.next_action = 'OUT' AND time_on_site.timespent_sec >= 72000 THEN 0
WHEN time_on_site.eventaction = 'IN' AND time_on_site.next_action = 'IN' AND time_on_site.timespent_sec <= 28800 THEN 2 -- 8 hours
WHEN time_on_site.eventaction = 'IN' AND time_on_site.next_action = 'IN' AND time_on_site.timespent_sec > 28800 THEN 3
WHEN time_on_site.type_flag = 'TYPE4' THEN 4
ELSE NULL
END AS "type"
下面是我的 Pyspark 脚本,它抛出了一个错误
from pyspark.sql.functions import when
TOS=TOS.withColumn('type', F.when( (col('eventaction') == 'IN') & (col('next_action') == 'OUT') & ("timespent_sec < 72000") , 1).
when( (col('eventaction') == 'IN') & (col('next_action') == 'OUT') & ("timespent_sec >= 72000") , 0).
when( (col('eventaction') == 'IN') & (col('next_action') == 'IN') & ("timespent_sec <= 28800") , 2).
when( (col('eventaction') == 'IN') & (col('next_action') == 'IN') & ("timespent_sec > 28800") , 3).
when(col('type_flag')=='TYPE4', 4).otherwise('NULL')
)
我哪里错了!?
我不知道 T-SQL 语法,但是如果你想做 if:.. elif: ...elif.... else
,那么下面的代码就可以了。
from pyspark.sql.functions import when, col
TOS=TOS.withColumn('type', when( (col('eventaction') == 'IN') & (col('next_action') == 'OUT') & ("timespent_sec < 72000") , 1).
otherwise( when( (col('eventaction') == 'IN') & (col('next_action') == 'OUT') & ("timespent_sec >= 72000") , 0).
otherwise( when( (col('eventaction') == 'IN') & (col('next_action') == 'IN') & ("timespent_sec <= 28800") , 2).
otherwise( when( (col('eventaction') == 'IN') & (col('next_action') == 'IN') & ("timespent_sec > 28800") , 3).
otherwise( when( col('type_flag')=='TYPE4', 4).otherwise('NULL'))))))
我已经正确实现了,如下所示
import pyspark.sql.functions as F
TOS=TOS.withColumn('type', F.when( (F.col("eventaction") == 'IN') & (F.col("next_action") == 'OUT') & (F.col("timespent_sec") < 72000) , 1).
when( (F.col("eventaction") == 'IN') & (F.col("next_action") == 'OUT') & (F.col("timespent_sec") >= 72000) , 0).
when( (F.col("eventaction") == 'IN') & (F.col("next_action") == 'IN') & (F.col("timespent_sec") <= 28800) , 2).
when( (F.col("eventaction") == 'IN') & (F.col("next_action") == 'IN') & (F.col("timespent_sec") > 28800) , 3).
when(F.col('type_flag')=='TYPE4', 4).otherwise('NULL'))