处理每一行以获取日期
Process each row to get date
我有一个文件,其中包含年份列(Year)以及 Mon01、Mon02 等月份列。
月份取自列名的最后两个字符(例如 MON01 中的 01)。
各月份列(MON01、MON02……)中文本值的长度与该月的天数相同。
需要找出值中每个 1 出现的位置所对应的日期。
例如:2018-01-02(日为 02,因为 1 出现在第 2 位)
2018-01-03
2018-01-07
我们在spark-scala中如何做才能得到结果?
获取列名列表:
// Snippet from the question. It is a fragment: `...` elides the fold's arguments, and
// `df`, `colWhere`, `pDYF`, `mutable` and `lit` must be defined/imported elsewhere.
// NOTE(review): the brace block after `cols.foldLeft(...)` is presumably meant to be the
// fold's function argument — confirm against the full program.
// Inside it, `filterDF` is bound as the last statement and is not referenced again here.
val cols = df.columns.toList
val res = cols.foldLeft(...)
{ val filterDF = colWhere.foldLeft(pDYF){(tmpDF, colName) =>
{
// Name of the literal column added for this input column: "<colName>_<colName>".
val cn = s"${colName}_${colName}"
// v is an " or "-joined list of "<colName> = <value>" terms built from the first row's
// array for this column, or None when that array is empty.
val v: Option[String] = if (colName == "countries"
|| colName == "states"
|| colName == "zipCodes"
|| colName == "genders"
|| colName == "providers") {
// These five columns are cast to arrays of String; each value is wrapped in single quotes.
// NOTE(review): getAs[String] is immediately cast to WrappedArray[String], so the type
// argument looks inconsistent with the actual value — confirm the column's real type.
val vc = tmpDF.first().getAs[String](colName).asInstanceOf[mutable.WrappedArray[String]].map(x=>x).toArray
val vc1: Option[String] = if(vc.length == 0) None else Some(vc.map(i=> s"$colName" + " = '" + i.toString + "'").mkString(" or "))
vc1
} else {
// Every other column is cast to an array of Long; values are rendered without quotes.
val vc = tmpDF.first().getAs[Long](colName).asInstanceOf[mutable.WrappedArray[Long]].map(x=>x).toArray
val vc1: Option[String] = if(vc.length == 0) None else Some(vc.map(i=> s"$colName" + " = " + i.toString).mkString(" or "))
vc1
}
// Attach the predicate text (or "" when there is none) as a constant column.
tmpDF.withColumn(cn, lit(v.getOrElse("")))
}}}
UDF:
/**
 * Builds the function behind the date UDF: it turns one month's day-flag string into dates.
 *
 * The returned function takes `(month, year, value)`:
 *  - `month` is the month column name; its ASCII letters are removed to obtain the month
 *    number (e.g. "Mon01" becomes "01");
 *  - `year` is the year text, used as-is;
 *  - `value` is the flag string whose n-th character describes day n of the month.
 *
 * Note: the day used to be emitted without padding ("2018-01-2"); any sample output
 * captured before this change shows that older format.
 *
 * @return a function producing one "year-month-day" string per character of `value` equal
 *         to '1', in day order, with the day zero-padded to two digits (e.g. "2018-01-02");
 *         a null `value` produces an empty sequence
 */
def myudf: (String, String, String) => Seq[String] =
  (month: String, year: String, value: String) => {
    // Keep only the non-letter part of the column name, e.g. "Mon01" -> "01".
    val monthNumber = month.replaceAll("[A-Za-z]+", "")
    // A missing cell reaches the function as null; treat it as "no day flagged"
    // instead of failing with a NullPointerException.
    Option(value).fold(Seq.empty[String]) { flags =>
      flags.zipWithIndex.collect {
        // Positions are 0-based while days of the month start at 1.
        case ('1', index) => f"$year-$monthNumber-${index + 1}%02d"
      }
    }
  }
// Main method: load data.csv, taking the column names from its header line, and print it.
val data = spark.read
  .options(Map("header" -> "true"))
  .csv("data.csv")
data.show()
+----+-----+-----+
|Year|Mon01|Mon02|
+----+-----+-----+
|2018|01110|00111|
|2019|01100|00001|
+----+-----+-----+
// Expose the plain Scala function as a Spark UDF.
val myCostumeudf = udf(myudf)
// Every column except the first one is handled as a month column.
val monthCols = data.columns.drop(1)
// Append one "Date_<month column>" column per month column, computed by the UDF from the
// column name, the Year column and the month column's value.
val requiredDF = monthCols.foldLeft(data) { (accumulated, monthColumn) =>
  val datesForMonth = myCostumeudf(lit(monthColumn), data("Year"), data(monthColumn))
  accumulated.withColumn("Date_" + monthColumn, datesForMonth)
}
// Print the result without truncating the cell contents.
requiredDF.show(false)
+----+-----+-----+---------------------------------+---------------------------------+
|Year|Mon01|Mon02|Date_Mon01 |Date_Mon02 |
+----+-----+-----+---------------------------------+---------------------------------+
|2018|01110|00111|[2018-01-2, 2018-01-3, 2018-01-4]|[2018-02-3, 2018-02-4, 2018-02-5]|
|2019|01100|00001|[2019-01-2, 2019-01-3] |[2019-02-5] |
+----+-----+-----+---------------------------------+---------------------------------+
希望对您有所帮助..
我有一个文件,其中包含年份列(Year)以及 Mon01、Mon02 等月份列。
月份取自列名的最后两个字符(例如 MON01 中的 01)。各月份列(MON01、MON02……)中文本值的长度与该月的天数相同。需要找出值中每个 1 出现的位置所对应的日期。
例如:2018-01-02(日为 02,因为 1 出现在第 2 位)
2018-01-03
2018-01-07
我们在spark-scala中如何做才能得到结果?
获取列名列表:
// Snippet from the question, collapsed onto two lines. It is a fragment: `...` elides the
// fold's arguments, and `df`, `colWhere`, `pDYF`, `mutable` and `lit` come from elsewhere.
// The second line folds over colWhere starting from pDYF. For each colName it adds a
// literal column named "<colName>_<colName>" holding an " or "-joined list of
// "<colName> = <value>" terms built from the first row's array for that column
// (values single-quoted for countries/states/zipCodes/genders/providers, unquoted for the
// rest, which are cast to arrays of Long), or "" when that array is empty.
// NOTE(review): getAs[String]/getAs[Long] are immediately cast to WrappedArray, so the type
// arguments look inconsistent with the actual values — confirm the columns' real types.
// NOTE(review): the statements on each line have no separators, so the snippet must be
// split back into separate lines before it can compile.
val cols = df.columns.toList val res = cols.foldLeft(...)
{ val filterDF = colWhere.foldLeft(pDYF){(tmpDF, colName) => { val cn = s"${colName}_${colName}" val v: Option[String] = if (colName == "countries" || colName == "states" || colName == "zipCodes" || colName == "genders" || colName == "providers") { val vc = tmpDF.first().getAs[String](colName).asInstanceOf[mutable.WrappedArray[String]].map(x=>x).toArray val vc1: Option[String] = if(vc.length == 0) None else Some(vc.map(i=> s"$colName" + " = '" + i.toString + "'").mkString(" or ")) vc1 } else { val vc = tmpDF.first().getAs[Long](colName).asInstanceOf[mutable.WrappedArray[Long]].map(x=>x).toArray val vc1: Option[String] = if(vc.length == 0) None else Some(vc.map(i=> s"$colName" + " = " + i.toString).mkString(" or ")) vc1 } tmpDF.withColumn(cn, lit(v.getOrElse(""))) }}}
UDF:
/**
 * Builds the function behind the date UDF: it turns one month's day-flag string into dates.
 *
 * The returned function takes `(month, year, value)`:
 *  - `month` is the month column name; its ASCII letters are removed to obtain the month
 *    number (e.g. "Mon01" becomes "01");
 *  - `year` is the year text, used as-is;
 *  - `value` is the flag string whose n-th character describes day n of the month.
 *
 * Note: the day used to be emitted without padding ("2018-01-2"); any sample output
 * captured before this change shows that older format.
 *
 * @return a function producing one "year-month-day" string per character of `value` equal
 *         to '1', in day order, with the day zero-padded to two digits (e.g. "2018-01-02");
 *         a null `value` produces an empty sequence
 */
def myudf: (String, String, String) => Seq[String] =
  (month: String, year: String, value: String) => {
    // Keep only the non-letter part of the column name, e.g. "Mon01" -> "01".
    val monthNumber = month.replaceAll("[A-Za-z]+", "")
    // A missing cell reaches the function as null; treat it as "no day flagged"
    // instead of failing with a NullPointerException.
    Option(value).fold(Seq.empty[String]) { flags =>
      flags.zipWithIndex.collect {
        // Positions are 0-based while days of the month start at 1.
        case ('1', index) => f"$year-$monthNumber-${index + 1}%02d"
      }
    }
  }
// Main method: load data.csv, taking the column names from its header line, and print it.
val data = spark.read
  .options(Map("header" -> "true"))
  .csv("data.csv")
data.show()
+----+-----+-----+
|Year|Mon01|Mon02|
+----+-----+-----+
|2018|01110|00111|
|2019|01100|00001|
+----+-----+-----+
// Expose the plain Scala function as a Spark UDF.
val myCostumeudf = udf(myudf)
// Every column except the first one is handled as a month column.
val monthCols = data.columns.drop(1)
// Append one "Date_<month column>" column per month column, computed by the UDF from the
// column name, the Year column and the month column's value.
val requiredDF = monthCols.foldLeft(data) { (accumulated, monthColumn) =>
  val datesForMonth = myCostumeudf(lit(monthColumn), data("Year"), data(monthColumn))
  accumulated.withColumn("Date_" + monthColumn, datesForMonth)
}
// Print the result without truncating the cell contents.
requiredDF.show(false)
+----+-----+-----+---------------------------------+---------------------------------+
|Year|Mon01|Mon02|Date_Mon01 |Date_Mon02 |
+----+-----+-----+---------------------------------+---------------------------------+
|2018|01110|00111|[2018-01-2, 2018-01-3, 2018-01-4]|[2018-02-3, 2018-02-4, 2018-02-5]|
|2019|01100|00001|[2019-01-2, 2019-01-3] |[2019-02-5] |
+----+-----+-----+---------------------------------+---------------------------------+
希望对您有所帮助..