为什么保存到超过 10000 列的镶木地板文件会导致 JaninoRuntimeException?
Why saving to parquet file with over 10000 columns lead to JaninoRuntimeException?
我有这段代码可以生成随机 df 并将其作为 spark 2.1 中的 parquet 文件写入磁盘。当列数超过 10000 时,这会遇到问题,但对于 10000 列,它似乎工作正常。
在 100000 列的情况下,spark 只是在屏幕上打印了一堆代码并抛出如下错误。
我怎样才能在没有错误的情况下将它写入 parquet?
// Reproduction script for the error below. Intended for spark-shell, where
// `sc` (SparkContext) and `spark` (SparkSession) are provided by the shell.
import org.apache.spark.sql.types.{StructType,StructField,IntegerType,DoubleType}
import org.apache.spark.ml.Pipeline
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import scala.util.Random
import scala.math
// Dimensions of the generated DataFrame. Per the question text, nCols = 10000
// works but nCols = 100000 fails with the JaninoRuntimeException quoted below.
val nRows = 10000
val nCols = 100000
// One Row per element of 0..nRows-1; each row holds nCols random doubles
// (math.ceil of 1000 * a value in [0, 1), i.e. integral values in 0..1000).
val rD = sc.parallelize(0 to nRows-1).map { _ => Row.fromSeq(Seq.fill(nCols)(math.ceil(1000*Random.nextDouble()))) }
// Schema: nullable DoubleType columns named "C0" .. "C<nCols-1>".
val schema = StructType((0 to nCols-1).map { i => StructField("C" + i, DoubleType, true) } )
val df = spark.createDataFrame(rD, schema)
// The write forces codegen of a single UnsafeProjection over every column;
// the per-column integer literals in the generated class (visible in the
// quoted output below) overflow the JVM's 0xFFFF constant-pool limit.
df.select("*").write.format("parquet").save("df.parquet")
/* 379357 */ private void apply_22702(InternalRow i) {
/* 379358 */
/* 379359 */
/* 379360 */ boolean isNull90808 = i.isNullAt(90808);
/* 379361 */ double value90808 = isNull90808 ? -1.0 : (i.getDouble(90808));
/* 379362 */ if (isNull90808) {
/* 379363 */ rowWriter.setNullAt(90808);
/* 379364 */ } else {
/* 379365 */ rowWriter.write(90808, value90808);
/* 379366 */ }
/* 379367 */
/* 379368 */
/* 379369 */ boolean isNull90809 = i.isNullAt(90809);
/* 379370 */ double value90809 = isNull90809 ? -1.0 : (i.getDouble(90809));
/* 379371 */ if (isNull90809) {
/* 379372 */ rowWriter.setNullAt(90809);
/* 379373 */ } else {
/* 379374 */ rowWriter.write(90809, value90809);
/* 379375 */ }
/* 379376 */
/* 379377 */
/* 379378 */ boolean isNull90810 = i.isNullAt(90810);
/* 379379 */ double value90810 = isNull90810 ? -1.0 : (i.getDouble(90810));
/* 379380 */ if (isNull90810) {
/* 379381 */ rowWriter.setNullAt(90810);
/* 379382 */ } else {
/* 379383 */ rowWriter.write(90810, value90810);
/* 379384 */ }
/* 379385 */
.
.
.
at org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator$.org$apache$spark$sql$catalyst$expressions$codegen$CodeGenerator$$doCompile(CodeGenerator.scala:941)
at org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator$$anon.load(CodeGenerator.scala:998)
at org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator$$anon.load(CodeGenerator.scala:995)
at org.spark_project.guava.cache.LocalCache$LoadingValueReference.loadFuture(LocalCache.java:3599)
at org.spark_project.guava.cache.LocalCache$Segment.loadSync(LocalCache.java:2379)
at org.spark_project.guava.cache.LocalCache$Segment.lockedGetOrLoad(LocalCache.java:2342)
... 25 more
Caused by: org.codehaus.janino.JaninoRuntimeException: Constant pool for class org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection has grown past JVM limit of 0xFFFF
at org.codehaus.janino.util.ClassFile.addToConstantPool(ClassFile.java:499)
at org.codehaus.janino.util.ClassFile.addConstantIntegerInfo(ClassFile.java:395)
at org.codehaus.janino.UnitCompiler.addConstantIntegerInfo(UnitCompiler.java:11137)
at org.codehaus.janino.UnitCompiler.pushConstant(UnitCompiler.java:9681)
at org.codehaus.janino.UnitCompiler.compileGet2(UnitCompiler.java:4911)
at org.codehaus.janino.UnitCompiler.access00(UnitCompiler.java:206)
at org.codehaus.janino.UnitCompiler.visitIntegerLiteral(UnitCompiler.java:3776)
at org.codehaus.janino.UnitCompiler.visitIntegerLiteral(UnitCompiler.java:3762)
at org.codehaus.janino.Java$IntegerLiteral.accept(Java.java:4635)
at org.codehaus.janino.UnitCompiler.compileGet(UnitCompiler.java:3762)
at org.codehaus.janino.UnitCompiler.fakeCompile(UnitCompiler.java:3128)
at org.codehaus.janino.UnitCompiler.compileGetValue(UnitCompiler.java:4927)
at org.codehaus.janino.UnitCompiler.compileGet2(UnitCompiler.java:4526)
at org.codehaus.janino.UnitCompiler.access00(UnitCompiler.java:206)
at org.codehaus.janino.UnitCompiler.visitMethodInvocation(UnitCompiler.java:3774)
at org.codehaus.janino.UnitCompiler.visitMethodInvocation(UnitCompiler.java:3762)
at org.codehaus.janino.Java$MethodInvocation.accept(Java.java:4328)
at org.codehaus.janino.UnitCompiler.compileGet(UnitCompiler.java:3762)
at org.codehaus.janino.UnitCompiler.compileGetValue(UnitCompiler.java:4933)
at org.codehaus.janino.UnitCompiler.compile2(UnitCompiler.java:2330)
at org.codehaus.janino.UnitCompiler.access00(UnitCompiler.java:206)
at org.codehaus.janino.UnitCompiler.visitLocalVariableDeclarationStatement(UnitCompiler.java:1386)
at org.codehaus.janino.UnitCompiler.visitLocalVariableDeclarationStatement(UnitCompiler.java:1370)
at org.codehaus.janino.Java$LocalVariableDeclarationStatement.accept(Java.java:2974)
at org.codehaus.janino.UnitCompiler.compile(UnitCompiler.java:1370)
at org.codehaus.janino.UnitCompiler.compileStatements(UnitCompiler.java:1450)
at org.codehaus.janino.UnitCompiler.compile(UnitCompiler.java:2811)
at org.codehaus.janino.UnitCompiler.compileDeclaredMethods(UnitCompiler.java:1262)
at org.codehaus.janino.UnitCompiler.compileDeclaredMethods(UnitCompiler.java:1234)
at org.codehaus.janino.UnitCompiler.compile2(UnitCompiler.java:538)
at org.codehaus.janino.UnitCompiler.compile2(UnitCompiler.java:890)
at org.codehaus.janino.UnitCompiler.compile2(UnitCompiler.java:894)
at org.codehaus.janino.UnitCompiler.access0(UnitCompiler.java:206)
at org.codehaus.janino.UnitCompiler.visitMemberClassDeclaration(UnitCompiler.java:377)
at org.codehaus.janino.UnitCompiler.visitMemberClassDeclaration(UnitCompiler.java:369)
at org.codehaus.janino.Java$MemberClassDeclaration.accept(Java.java:1128)
at org.codehaus.janino.UnitCompiler.compile(UnitCompiler.java:369)
at org.codehaus.janino.UnitCompiler.compileDeclaredMemberTypes(UnitCompiler.java:1209)
at org.codehaus.janino.UnitCompiler.compile2(UnitCompiler.java:564)
at org.codehaus.janino.UnitCompiler.compile2(UnitCompiler.java:420)
at org.codehaus.janino.UnitCompiler.access0(UnitCompiler.java:206)
at org.codehaus.janino.UnitCompiler.visitPackageMemberClassDeclaration(UnitCompiler.java:374)
at org.codehaus.janino.UnitCompiler.visitPackageMemberClassDeclaration(UnitCompiler.java:369)
at org.codehaus.janino.Java$AbstractPackageMemberClassDeclaration.accept(Java.java:1309)
at org.codehaus.janino.UnitCompiler.compile(UnitCompiler.java:369)
at org.codehaus.janino.UnitCompiler.compileUnit(UnitCompiler.java:345)
at org.codehaus.janino.SimpleCompiler.compileToClassLoader(SimpleCompiler.java:396)
at org.codehaus.janino.ClassBodyEvaluator.compileToClass(ClassBodyEvaluator.java:311)
at org.codehaus.janino.ClassBodyEvaluator.cook(ClassBodyEvaluator.java:229)
at org.codehaus.janino.SimpleCompiler.cook(SimpleCompiler.java:196)
at org.codehaus.commons.compiler.Cookable.cook(Cookable.java:91)
at org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator$.org$apache$spark$sql$catalyst$expressions$codegen$CodeGenerator$$doCompile(CodeGenerator.scala:935)
... 30 more
这看起来像是 codegen 超出 64k 方法限制的那些讨厌的问题之一(如 SPARK-18492 和 SPARK-16845 中所报告)。
您可能想试用 Nightly Packages and Artifacts 中的 2.2.0-SNAPSHOT 每晚构建版本之一,看看该问题在未来的版本正式发布时是否已得到解决。
我有这段代码可以生成随机 df 并将其作为 spark 2.1 中的 parquet 文件写入磁盘。当列数超过 10000 时,这会遇到问题,但对于 10000 列,它似乎工作正常。
在 100000 列的情况下,spark 只是在屏幕上打印了一堆代码并抛出如下错误。
我怎样才能在没有错误的情况下将它写入 parquet?
// Reproduction script for the error below. Intended for spark-shell, where
// `sc` (SparkContext) and `spark` (SparkSession) are provided by the shell.
import org.apache.spark.sql.types.{StructType,StructField,IntegerType,DoubleType}
import org.apache.spark.ml.Pipeline
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import scala.util.Random
import scala.math
// Dimensions of the generated DataFrame. Per the question text, nCols = 10000
// works but nCols = 100000 fails with the JaninoRuntimeException quoted below.
val nRows = 10000
val nCols = 100000
// One Row per element of 0..nRows-1; each row holds nCols random doubles
// (math.ceil of 1000 * a value in [0, 1), i.e. integral values in 0..1000).
val rD = sc.parallelize(0 to nRows-1).map { _ => Row.fromSeq(Seq.fill(nCols)(math.ceil(1000*Random.nextDouble()))) }
// Schema: nullable DoubleType columns named "C0" .. "C<nCols-1>".
val schema = StructType((0 to nCols-1).map { i => StructField("C" + i, DoubleType, true) } )
val df = spark.createDataFrame(rD, schema)
// The write forces codegen of a single UnsafeProjection over every column;
// the per-column integer literals in the generated class (visible in the
// quoted output below) overflow the JVM's 0xFFFF constant-pool limit.
df.select("*").write.format("parquet").save("df.parquet")
/* 379357 */ private void apply_22702(InternalRow i) {
/* 379358 */
/* 379359 */
/* 379360 */ boolean isNull90808 = i.isNullAt(90808);
/* 379361 */ double value90808 = isNull90808 ? -1.0 : (i.getDouble(90808));
/* 379362 */ if (isNull90808) {
/* 379363 */ rowWriter.setNullAt(90808);
/* 379364 */ } else {
/* 379365 */ rowWriter.write(90808, value90808);
/* 379366 */ }
/* 379367 */
/* 379368 */
/* 379369 */ boolean isNull90809 = i.isNullAt(90809);
/* 379370 */ double value90809 = isNull90809 ? -1.0 : (i.getDouble(90809));
/* 379371 */ if (isNull90809) {
/* 379372 */ rowWriter.setNullAt(90809);
/* 379373 */ } else {
/* 379374 */ rowWriter.write(90809, value90809);
/* 379375 */ }
/* 379376 */
/* 379377 */
/* 379378 */ boolean isNull90810 = i.isNullAt(90810);
/* 379379 */ double value90810 = isNull90810 ? -1.0 : (i.getDouble(90810));
/* 379380 */ if (isNull90810) {
/* 379381 */ rowWriter.setNullAt(90810);
/* 379382 */ } else {
/* 379383 */ rowWriter.write(90810, value90810);
/* 379384 */ }
/* 379385 */
.
.
.
at org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator$.org$apache$spark$sql$catalyst$expressions$codegen$CodeGenerator$$doCompile(CodeGenerator.scala:941)
at org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator$$anon.load(CodeGenerator.scala:998)
at org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator$$anon.load(CodeGenerator.scala:995)
at org.spark_project.guava.cache.LocalCache$LoadingValueReference.loadFuture(LocalCache.java:3599)
at org.spark_project.guava.cache.LocalCache$Segment.loadSync(LocalCache.java:2379)
at org.spark_project.guava.cache.LocalCache$Segment.lockedGetOrLoad(LocalCache.java:2342)
... 25 more
Caused by: org.codehaus.janino.JaninoRuntimeException: Constant pool for class org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection has grown past JVM limit of 0xFFFF
at org.codehaus.janino.util.ClassFile.addToConstantPool(ClassFile.java:499)
at org.codehaus.janino.util.ClassFile.addConstantIntegerInfo(ClassFile.java:395)
at org.codehaus.janino.UnitCompiler.addConstantIntegerInfo(UnitCompiler.java:11137)
at org.codehaus.janino.UnitCompiler.pushConstant(UnitCompiler.java:9681)
at org.codehaus.janino.UnitCompiler.compileGet2(UnitCompiler.java:4911)
at org.codehaus.janino.UnitCompiler.access00(UnitCompiler.java:206)
at org.codehaus.janino.UnitCompiler.visitIntegerLiteral(UnitCompiler.java:3776)
at org.codehaus.janino.UnitCompiler.visitIntegerLiteral(UnitCompiler.java:3762)
at org.codehaus.janino.Java$IntegerLiteral.accept(Java.java:4635)
at org.codehaus.janino.UnitCompiler.compileGet(UnitCompiler.java:3762)
at org.codehaus.janino.UnitCompiler.fakeCompile(UnitCompiler.java:3128)
at org.codehaus.janino.UnitCompiler.compileGetValue(UnitCompiler.java:4927)
at org.codehaus.janino.UnitCompiler.compileGet2(UnitCompiler.java:4526)
at org.codehaus.janino.UnitCompiler.access00(UnitCompiler.java:206)
at org.codehaus.janino.UnitCompiler.visitMethodInvocation(UnitCompiler.java:3774)
at org.codehaus.janino.UnitCompiler.visitMethodInvocation(UnitCompiler.java:3762)
at org.codehaus.janino.Java$MethodInvocation.accept(Java.java:4328)
at org.codehaus.janino.UnitCompiler.compileGet(UnitCompiler.java:3762)
at org.codehaus.janino.UnitCompiler.compileGetValue(UnitCompiler.java:4933)
at org.codehaus.janino.UnitCompiler.compile2(UnitCompiler.java:2330)
at org.codehaus.janino.UnitCompiler.access00(UnitCompiler.java:206)
at org.codehaus.janino.UnitCompiler.visitLocalVariableDeclarationStatement(UnitCompiler.java:1386)
at org.codehaus.janino.UnitCompiler.visitLocalVariableDeclarationStatement(UnitCompiler.java:1370)
at org.codehaus.janino.Java$LocalVariableDeclarationStatement.accept(Java.java:2974)
at org.codehaus.janino.UnitCompiler.compile(UnitCompiler.java:1370)
at org.codehaus.janino.UnitCompiler.compileStatements(UnitCompiler.java:1450)
at org.codehaus.janino.UnitCompiler.compile(UnitCompiler.java:2811)
at org.codehaus.janino.UnitCompiler.compileDeclaredMethods(UnitCompiler.java:1262)
at org.codehaus.janino.UnitCompiler.compileDeclaredMethods(UnitCompiler.java:1234)
at org.codehaus.janino.UnitCompiler.compile2(UnitCompiler.java:538)
at org.codehaus.janino.UnitCompiler.compile2(UnitCompiler.java:890)
at org.codehaus.janino.UnitCompiler.compile2(UnitCompiler.java:894)
at org.codehaus.janino.UnitCompiler.access0(UnitCompiler.java:206)
at org.codehaus.janino.UnitCompiler.visitMemberClassDeclaration(UnitCompiler.java:377)
at org.codehaus.janino.UnitCompiler.visitMemberClassDeclaration(UnitCompiler.java:369)
at org.codehaus.janino.Java$MemberClassDeclaration.accept(Java.java:1128)
at org.codehaus.janino.UnitCompiler.compile(UnitCompiler.java:369)
at org.codehaus.janino.UnitCompiler.compileDeclaredMemberTypes(UnitCompiler.java:1209)
at org.codehaus.janino.UnitCompiler.compile2(UnitCompiler.java:564)
at org.codehaus.janino.UnitCompiler.compile2(UnitCompiler.java:420)
at org.codehaus.janino.UnitCompiler.access0(UnitCompiler.java:206)
at org.codehaus.janino.UnitCompiler.visitPackageMemberClassDeclaration(UnitCompiler.java:374)
at org.codehaus.janino.UnitCompiler.visitPackageMemberClassDeclaration(UnitCompiler.java:369)
at org.codehaus.janino.Java$AbstractPackageMemberClassDeclaration.accept(Java.java:1309)
at org.codehaus.janino.UnitCompiler.compile(UnitCompiler.java:369)
at org.codehaus.janino.UnitCompiler.compileUnit(UnitCompiler.java:345)
at org.codehaus.janino.SimpleCompiler.compileToClassLoader(SimpleCompiler.java:396)
at org.codehaus.janino.ClassBodyEvaluator.compileToClass(ClassBodyEvaluator.java:311)
at org.codehaus.janino.ClassBodyEvaluator.cook(ClassBodyEvaluator.java:229)
at org.codehaus.janino.SimpleCompiler.cook(SimpleCompiler.java:196)
at org.codehaus.commons.compiler.Cookable.cook(Cookable.java:91)
at org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator$.org$apache$spark$sql$catalyst$expressions$codegen$CodeGenerator$$doCompile(CodeGenerator.scala:935)
... 30 more
这看起来像是 codegen 超出 64k 方法限制的那些讨厌的问题之一(如 SPARK-18492 和 SPARK-16845 中所报告)。
您可能想试用 Nightly Packages and Artifacts 中的 2.2.0-SNAPSHOT 每晚构建版本之一,看看该问题在未来的版本正式发布时是否已得到解决。