如何将多列附加到一列?

How to append multiple columns to one column?

我正在对地址进行分组,其中有多种类型的地址,但我需要对它们进行分组并应用计数和排序。

输入

Address1  Address2   Address3
    a1       b1        c1 
    b2       a2        e2 

输出要求

Address4
a1
b2
b1
a2
c1
e2

你应该能够用 Spark SQL 中的 UNION 解决这个问题:

// Stack the three address columns into one. The first SELECT of a UNION
// names the result column, so alias it to Address4 there — otherwise the
// outer `SELECT Address4` fails because the union's column is `Address1`.
// The derived table also needs an alias (`t`).
// NOTE: UNION removes duplicate rows; use UNION ALL to keep every value.
spark.sql(
  """
    |SELECT Address4
    |FROM (
    | SELECT Address1 AS Address4 FROM table
    | UNION
    | SELECT Address2 FROM table
    | UNION
    | SELECT Address3 FROM table
    | ) t
  """.stripMargin).show()
# You can also do it with the approach below:
// Sample rows: each tuple holds (Address1, Address2, Address3).
val sampleRows = List(
  ("a1", "b1", "c1"),
  ("a1", "b2", "c1"),
  ("a1", "b1", "c2"),
  ("a2", "b2", "c2"))
val input_rdd = spark.sparkContext.parallelize(sampleRows)
    // Name the tuple fields so downstream groupBy calls can refer to them.
    val input_df = input_rdd.toDF("Address1", "Address2", "Address3")
    input_df.show()
+--------+--------+--------+
|Address1|Address2|Address3|
+--------+--------+--------+
|      a1|      b1|      c1|
|      a1|      b2|      c1|
|      a1|      b1|      c2|
|      a2|      b2|      c2|
+--------+--------+--------+
    // Count how many times each value appears in one address column and
    // normalise the result to the shared (ADDRESS, COUNT) shape so the three
    // per-column results can be unioned.
    def addressCounts(addressCol: String) =
      input_df.groupBy(addressCol)
        .agg(count(col(addressCol)).as("COUNT"))
        .select(col(addressCol).as("ADDRESS"), col("COUNT"))

    val out_address1_df = addressCounts("Address1")
    //out_address1_df.show()
    val out_address2_df = addressCounts("Address2")
    //out_address2_df.show()
    val out_address3_df = addressCounts("Address3")

    // unionAll is deprecated since Spark 2.0; union is its replacement and,
    // at the DataFrame API level, likewise keeps duplicate rows.
    val output_df = out_address1_df.union(out_address2_df).union(out_address3_df)
    output_df.show()
+-------+-----+
|ADDRESS|COUNT|
+-------+-----+
|     a2|    1|
|     a1|    3|
|     b2|    2|
|     b1|    2|
|     c1|    2|
|     c2|    2|
+-------+-----+

与 @Blokje5 给出的实现相同,但使用更高层的 Dataset API。


        // Project each address column; alias the first one so the union's
        // single result column is named Address4.
        Dataset<Row> addresses1 = df.select(functions.col("Address1").as("Address4"));
        Dataset<Row> addresses2 = df.select("Address2");
        Dataset<Row> addresses3 = df.select("Address3");

        // Stack the three single-column datasets (Dataset.union keeps duplicates).
        Dataset<Row> allAddresses = addresses1.union(addresses2).union(addresses3);
        allAddresses.show();

        // Count each distinct address, then sort by frequency, highest first.
        Dataset<Row> sortedCounts = allAddresses
                .groupBy("Address4")
                .agg(functions.count(functions.col("Address4")).as("Count"))
                .sort(functions.desc("Count"));
        sortedCounts.show();