Quick way to delete empty column [PySpark]
Is there a simple way to drop empty columns from a huge dataset (300+ columns, >100k rows) in PySpark, similar to df.dropna(axis=1, how='all') in Python/pandas?
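(For context, the pandas call mentioned above behaves roughly like this; a small illustrative sketch, not part of the PySpark solution:)
import pandas as pd
# dropna(axis=1, how='all') removes a column only when *every* value is NaN/None
pdf = pd.DataFrame({'a': [1, 2], 'b': [None, None]})
pdf = pdf.dropna(axis=1, how='all')   # only column 'a' remains
print(pdf.columns.tolist())           # ['a']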
Yes, you can simply use that answer. I added a threshold
parameter to it:
import pandas as pd
import pyspark.sql.functions as F

# Sample data (built in pandas, then converted to a Spark DataFrame;
# assumes an existing SQLContext/SparkSession bound to `sqlContext`)
df = pd.DataFrame({'x1': ['a', '1', '2'],
                   'x2': ['b', None, '2'],
                   'x3': ['c', '0', '3']})
df = sqlContext.createDataFrame(df)
df.show()

def drop_null_columns(df, threshold=0):
    """
    Drops every column whose null count exceeds `threshold`.

    :param df: A PySpark DataFrame
    :param threshold: maximum number of nulls a column may contain before it is dropped
    """
    # Count the nulls in each column in a single pass over the data
    null_counts = df.select([F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in df.columns]).collect()[0].asDict()
    to_drop = [k for k, v in null_counts.items() if v > threshold]
    df = df.drop(*to_drop)
    return df

# Drops column x2, because it contains a null value
drop_null_columns(df).show()
Output:
+---+---+
| x1| x3|
+---+---+
| a| c|
| 1| 0|
| 2| 3|
+---+---+
Column x2 has been dropped. You can also play around with the threshold, e.g. threshold=df.count().
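To make the threshold semantics concrete (a small sketch, assuming the df and drop_null_columns defined above): the drop condition is null_count > threshold, so threshold=0 drops any column with at least one null, while dropping only fully-null columns needs threshold=df.count() - 1.
# threshold=0 (default): drop any column that contains at least one null
drop_null_columns(df, threshold=0).show()
# drop only columns that are entirely null: the null count must exceed row_count - 1
drop_null_columns(df, threshold=df.count() - 1).show()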
Here is an extension of @pissall's function:
def drop_null_columns(df, threshold=-1):
    """
    This function drops all columns which contain null values.
    If threshold is negative (default), drop only the columns whose values are all null.
    If threshold is >= 0, drop the columns whose null count is bigger than threshold.
    The latter may be computationally expensive!
    Returns a PySpark DataFrame.
    """
    if threshold < 0:
        # F.max ignores nulls, so a column's max is None only when the whole column is null
        max_per_column = df.select([F.max(c).alias(c) for c in df.columns]).collect()[0].asDict()
        to_drop = [k for k, v in max_per_column.items() if v is None]
    else:
        # Count the nulls per column and drop the ones above the threshold
        null_counts = df.select([F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in df.columns]).collect()[0].asDict()
        to_drop = [k for k, v in null_counts.items() if v > threshold]
    df = df.drop(*to_drop)
    return df
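A quick usage sketch (assuming the same sample df as above); the negative-threshold branch relies on F.max ignoring nulls, so a column's max is None only when the entire column is null:
# Default threshold=-1: drop only columns whose values are all null
drop_null_columns(df).show()
# threshold=0: fall back to counting nulls and drop any column containing a null
drop_null_columns(df, threshold=0).show()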