Get name / alias of column in PySpark
I am defining a column object like this:
column = F.col('foo').alias('bar')
I know I can get the full expression using str(column). But how can I get just the column's alias?
In the example, I'm looking for a function get_column_name, where get_column_name(column) returns the string bar.
One way is through regular expressions:
from pyspark.sql.functions import col
column = col('foo').alias('bar')
print(column)
#Column<foo AS `bar`>
import re
print(re.findall(r"(?<=AS `)\w+(?=`>$)", str(column))[0])
# bar
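Wrapped into the get_column_name helper the question asks for, here is a minimal sketch that assumes the Spark 2.x repr format Column<foo AS `bar`> shown above and returns None when the column has no alias:
import re

def get_column_name(column):
    # Text between "AS `" and the closing "`>"; empty list if unaliased
    matches = re.findall(r"(?<=AS `)\w+(?=`>$)", str(column))
    return matches[0] if matches else None

print(get_column_name(column))
# bar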
Alternatively, we can adjust the behavior of the Column.alias and Column.name methods with a wrapper function so that the alias is stored in an AS attribute:
from pyspark.sql import Column, SparkSession
from pyspark.sql.functions import col, explode, array, struct, lit

SparkSession.builder.getOrCreate()  # make sure an active session exists before creating columns

def alias_wrapper(self, *alias, **kwargs):
    renamed_col = Column._alias(self, *alias, **kwargs)
    renamed_col.AS = alias[0] if len(alias) == 1 else alias  # remember the alias (tuple for multi-alias)
    return renamed_col

# Keep the original method as Column._alias, route both alias and name through
# the wrapper, and default AS to None for columns that were never aliased.
Column._alias, Column.alias, Column.name, Column.AS = Column.alias, alias_wrapper, alias_wrapper, None
This then guarantees:
assert(col("foo").alias("bar").AS == "bar")
# `name` should act like `alias`
assert(col("foo").name("bar").AS == "bar")
# column without alias should have None in `AS`
assert(col("foo").AS is None)
# multialias should be handled
assert(explode(array(struct(lit(1), lit("a")))).alias("foo", "bar").AS == ("foo", "bar"))
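With this patch in place, the requested helper reduces to reading that attribute (get_column_name is the name from the question, not part of the PySpark API):
def get_column_name(column):
    # The alias stored by alias_wrapper, or None for columns never aliased
    return column.AS

print(get_column_name(col("foo").alias("bar")))
# bar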
I have noticed that on some systems there may be backticks around the column names. The following options work both with and without backticks.
Option 1 (no regex): str(col).replace("`", "").split("'")[-2].split(" AS ")[-1]
from pyspark.sql.functions import col
col_1 = col('foo')
col_2 = col('foo').alias('bar')
col_3 = col('foo').alias('bar').alias('baz')
s = str(col_1)
print(col_1)
print(s.replace("`", "").split("'")[-2].split(" AS ")[-1])
# Column<'foo'>
# foo
s = str(col_2)
print(col_2)
print(s.replace("`", "").split("'")[-2].split(" AS ")[-1])
# Column<'foo AS bar'>
# bar
s = str(col_3)
print(col_3)
print(s.replace("`", "").split("'")[-2].split(" AS ")[-1])
# Column<'foo AS bar AS baz'>
# baz
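One caveat, illustrated below with a hypothetical column name: both options key on the literal " AS " separator, so a column whose real name contains " AS " (legal, if unusual) is mis-parsed:
tricky = col('a AS b')  # hypothetical: a column literally named "a AS b"
print(str(tricky).replace("`", "").split("'")[-2].split(" AS ")[-1])
# b   (wrong: the real name is 'a AS b')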
Option 2 (regex): the pattern r"'.*?`?(\w+)`?'" looks safe enough:
re.search(r"'.*?`?(\w+)`?'", str(col)).group(1)
from pyspark.sql.functions import col
col_1 = col('foo')
col_2 = col('foo').alias('bar')
col_3 = col('foo').alias('bar').alias('baz')
import re
print(col_1)
print(re.search(r"'.*?`?(\w+)`?'", str(col_1)).group(1))
# Column<'foo'>
# foo
print(col_2)
print(re.search(r"'.*?`?(\w+)`?'", str(col_2)).group(1))
# Column<'foo AS bar'>
# bar
print(col_3)
print(re.search(r"'.*?`?(\w+)`?'", str(col_3)).group(1))
# Column<'foo AS bar AS baz'>
# baz
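Either way, the pattern can be packaged into the get_column_name function the question asks for. A minimal sketch based on Option 2, returning None instead of raising when the repr does not match:
import re

def get_column_name(column):
    # Last word inside the quoted expression, ignoring optional backticks
    match = re.search(r"'.*?`?(\w+)`?'", str(column))
    return match.group(1) if match else None

print(get_column_name(col('foo').alias('bar')))
# bar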