org.apache.avro.UnresolvedUnionException:不在联合 ["long","null"]
org.apache.avro.UnresolvedUnionException: Not in union ["long","null"]
我编写了这段代码,使用 Spark (1.3.0)、Scala (2.10.4) 和 Spark Avro (1.0.0) 将 Spark 数据帧保存到 avro 文件中
// Parses one log line; returns the three captured fields as a Row, or None
// when the line does not match the expected format.
def getMatchingLine(line: String) : Option[Row] = {
  // Use a triple-quoted (raw) string: in a plain "..." literal, \w and \d
  // are invalid string escape sequences and do not compile.
  val regex = """^.*&50=(\w+)&.*&62-\d=8&63-\d=(\w+)&.*timestamp=(\d+).*$""".r
  line match {
    // The schema declares field "c" as LongType, but a regex capture group is
    // always a String. Avro's ["long","null"] union rejects a String value —
    // that is the UnresolvedUnionException. Convert explicitly with toLong.
    case regex(a, b, c) => Some(Row(a, b, c.toLong))
    case _              => None
  }
}
// Schema for the extracted rows: two strings and a (nullable) long timestamp.
val schema = StructType(List(
  StructField("a", StringType, true),
  StructField("b", StringType, true),
  StructField("c", LongType, true)))
val lines = sc.textFile(inputPath).cache()
// flatMap over Option already drops the Nones, so filter(_.isDefined) is
// redundant. Note: the original declared `val rdd` twice, which does not
// compile — use distinct names for each stage.
val rows = lines.flatMap(getMatchingLine)
val df = sqlSc.createDataFrame(rows, schema)
// Write the DataFrame out as Avro via the spark-avro data source.
df.save("/user/foo/", "com.databricks.spark.avro")
错误消息的堆栈跟踪是
Exception: org.apache.avro.UnresolvedUnionException: Not in union ["long","null"]: 1426647603695
at org.apache.avro.file.DataFileWriter.append(DataFileWriter.java:296)
at org.apache.avro.mapred.AvroOutputFormat.write(AvroOutputFormat.java:169)
at org.apache.avro.mapred.AvroOutputFormat.write(AvroOutputFormat.java:166)
at org.apache.spark.SparkHadoopWriter.write(SparkHadoopWriter.scala:96)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun.apply(PairRDDFunctions.scala:1073)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun.apply(PairRDDFunctions.scala:1059)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:61)
at org.apache.spark.scheduler.Task.run(Task.scala:64)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:203)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
请尝试把 Row 中的第 3 个元素显式转换为 Long。正则捕获组得到的是 String,而 schema 将该字段声明为 LongType,String 不会被自动转换为 Long,因此 Avro 在 ["long","null"] 联合类型中找不到匹配项:
Some(Row(a, b, c.toLong))
我编写了这段代码,使用 Spark (1.3.0)、Scala (2.10.4) 和 Spark Avro (1.0.0) 将 Spark 数据帧保存到 avro 文件中
// Parses one log line; returns the three captured fields as a Row, or None
// when the line does not match the expected format.
def getMatchingLine(line: String) : Option[Row] = {
  // Use a triple-quoted (raw) string: in a plain "..." literal, \w and \d
  // are invalid string escape sequences and do not compile.
  val regex = """^.*&50=(\w+)&.*&62-\d=8&63-\d=(\w+)&.*timestamp=(\d+).*$""".r
  line match {
    // The schema declares field "c" as LongType, but a regex capture group is
    // always a String. Avro's ["long","null"] union rejects a String value —
    // that is the UnresolvedUnionException. Convert explicitly with toLong.
    case regex(a, b, c) => Some(Row(a, b, c.toLong))
    case _              => None
  }
}
// Schema for the extracted rows: two strings and a (nullable) long timestamp.
val schema = StructType(List(
  StructField("a", StringType, true),
  StructField("b", StringType, true),
  StructField("c", LongType, true)))
val lines = sc.textFile(inputPath).cache()
// flatMap over Option already drops the Nones, so filter(_.isDefined) is
// redundant. Note: the original declared `val rdd` twice, which does not
// compile — use distinct names for each stage.
val rows = lines.flatMap(getMatchingLine)
val df = sqlSc.createDataFrame(rows, schema)
// Write the DataFrame out as Avro via the spark-avro data source.
df.save("/user/foo/", "com.databricks.spark.avro")
错误消息的堆栈跟踪是
Exception: org.apache.avro.UnresolvedUnionException: Not in union ["long","null"]: 1426647603695
at org.apache.avro.file.DataFileWriter.append(DataFileWriter.java:296)
at org.apache.avro.mapred.AvroOutputFormat.write(AvroOutputFormat.java:169)
at org.apache.avro.mapred.AvroOutputFormat.write(AvroOutputFormat.java:166)
at org.apache.spark.SparkHadoopWriter.write(SparkHadoopWriter.scala:96)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun.apply(PairRDDFunctions.scala:1073)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun.apply(PairRDDFunctions.scala:1059)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:61)
at org.apache.spark.scheduler.Task.run(Task.scala:64)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:203)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
请尝试把 Row 中的第 3 个元素显式转换为 Long。正则捕获组得到的是 String,而 schema 将该字段声明为 LongType,String 不会被自动转换为 Long,因此 Avro 在 ["long","null"] 联合类型中找不到匹配项:
Some(Row(a, b, c.toLong))