How to append multiple columns to one column?
I am working with addresses spread across several columns, and I need to combine them into a single column, then apply a count and a sort.
Input
Address1 Address2 Address3
a1 b1 c1
b2 a2 e2
Required output
Address4
a1
b2
b1
a2
c1
e2
You should be able to solve this with a UNION in Spark SQL:
spark.sql(
  """
    |SELECT Address4
    |FROM (
    |  SELECT Address1 AS Address4 FROM table
    |  UNION
    |  SELECT Address2 FROM table
    |  UNION
    |  SELECT Address3 FROM table
    |) t
  """.stripMargin).show()
You can also do it with the approach below:
import spark.implicits._
import org.apache.spark.sql.functions.{col, count}

val input_rdd = spark.sparkContext.parallelize(
  List(("a1", "b1", "c1"), ("a1", "b2", "c1"), ("a1", "b1", "c2"), ("a2", "b2", "c2")))
val input_df = input_rdd.toDF("Address1", "Address2", "Address3")
input_df.show()
+--------+--------+--------+
|Address1|Address2|Address3|
+--------+--------+--------+
| a1| b1| c1|
| a1| b2| c1|
| a1| b1| c2|
| a2| b2| c2|
+--------+--------+--------+
val out_address1_df = input_df.groupBy("Address1")
  .agg(count(input_df("Address1")).as("count_address1"))
  .select(input_df("Address1").as("ADDRESS"), col("count_address1").as("COUNT"))
//out_address1_df.show()
val out_address2_df = input_df.groupBy("Address2")
  .agg(count(input_df("Address2")).as("count_address2"))
  .select(input_df("Address2").as("ADDRESS"), col("count_address2").as("COUNT"))
//out_address2_df.show()
val out_address3_df = input_df.groupBy("Address3")
  .agg(count(input_df("Address3")).as("count_address3"))
  .select(input_df("Address3").as("ADDRESS"), col("count_address3").as("COUNT"))
val output_df = out_address1_df.unionAll(out_address2_df).unionAll(out_address3_df)
output_df.show()
+-------+-----+
|ADDRESS|COUNT|
+-------+-----+
| a2| 1|
| a1| 3|
| b2| 2|
| b1| 2|
| c1| 2|
| c2| 2|
+-------+-----+
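A note on the approach above: in Spark 2.x, unionAll is deprecated in favour of union. A possible alternative, sketched below under the same assumptions (the input_df built above), is to union the raw address columns first and aggregate once, instead of running a separate groupBy per column:

import org.apache.spark.sql.functions.{col, count}

// Stack the three address columns into a single ADDRESS column,
// then count each distinct value once
val all_addresses_df = input_df.select(col("Address1").as("ADDRESS"))
  .union(input_df.select(col("Address2").as("ADDRESS")))
  .union(input_df.select(col("Address3").as("ADDRESS")))

val counted_df = all_addresses_df
  .groupBy("ADDRESS")
  .agg(count(col("ADDRESS")).as("COUNT"))
  .orderBy(col("COUNT").desc)

counted_df.show()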
The same implementation as the one given by @Blokje5, but using the higher-level Dataset API:
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.functions;

// Select each address column, renaming the first so the unioned column is Address4
Dataset<Row> Ad1 = df.select(functions.col("Address1").as("Address4"));
Dataset<Row> Ad2 = df.select("Address2");
Dataset<Row> Ad3 = df.select("Address3");

Dataset<Row> Union_DS = Ad1.union(Ad2).union(Ad3);
Union_DS.show();

// Count each address and sort by the count, descending
Dataset<Row> Union_Sorted = Union_DS
    .groupBy("Address4")
    .agg(functions.count(functions.col("Address4")).as("Count"))
    .sort(functions.desc("Count"));
Union_Sorted.show();
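One detail worth calling out: Dataset.union resolves columns by position rather than by name, so Ad2 and Ad3 simply inherit the Address4 column name from Ad1; that is why only the first select needs the alias.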