Glue AWS:调用 o60.getDynamicFrame 时出错
Glue AWS: error occurred while calling o60.getDynamicFrame
我编写了一个基本脚本,用来创建一个 DataFrame(DF),其数据来自我在 Redshift 中的一张表。我运行了这个脚本,但一直被一条我无法解释的错误消息所困扰。
日志中错误输出为:
"/mnt/yarn/usercache/root/appcache/application_1525803778049_0004/container_1525803778049_0004_01_000001/py4j-0.10.4-src.zip/py4j/protocol.py", line 319, in get_return_value py4j.protocol.Py4JJavaError: An error occurred while calling o60.getDynamicFrame. : java.lang.UnsupportedOperationException: empty.reduceLeft at scala.collection.
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame, DynamicFrameReader, DynamicFrameWriter, DynamicFrameCollection
from pyspark.sql.functions import lit
from awsglue.job import Job
# Question script: kept verbatim because it reproduces the reported error.
# Create the Spark context, wrap it in a GlueContext, and take the Spark
# session and a Job handle from it.
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
# Read "schema.table_name" from Redshift into a DynamicFrame.
# NOTE(review): the quoted log error ("calling o60.getDynamicFrame ...
# empty.reduceLeft") presumably originates from this read -- confirm
# against the full stack trace.
table = glueContext.create_dynamic_frame.from_options(connection_type="redshift", connection_options =
{"url": "jdbc:redshift://xxxxx.yyyyy.us-east-1.redshift.amazonaws.com:5439/db",
"user": "yyyy",
"password": "yyyyy",
"dbtable": "schema.table_name",
# S3 location, presumably used by Glue as staging for the Redshift read.
"redshiftTmpDir": "s3://aws-glue-temporary-accountnumber-us-east-1/"},
# NOTE(review): this version also passes format= and transformation_ctx=;
# the working version posted later omits both.
format="orc",
transformation_ctx="table" )
table.show()
# createOrReplaceTempView() returns None, so dfred is None here; the call is
# made only to register the temp view "table_df".
dfred = table.toDF().createOrReplaceTempView("table_df")
job.commit()
非常感谢您能为我提供的任何帮助。非常感谢
好吧,在继续努力解决这个问题之后,我查看了 DynamicFrame 类的官方代码。于是,我在代码中添加了一个 ApplyMapping 格式转换,用来映射从 Redshift 中读取的表的结果;同时,在读取该表的方法调用中,我去掉了参数 transformation_ctx。
正是这个参数导致了 o60 错误。
我的最终版本代码是:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame, DynamicFrameReader, DynamicFrameWriter, DynamicFrameCollection
from pyspark.sql.functions import lit
from awsglue.job import Job
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
# Read "schema.table_name" from Redshift into a DynamicFrame.
# Unlike the failing version, this call passes neither format= nor
# transformation_ctx=.
# NOTE(review): credentials are inline placeholders; presumably a real job
# should take them from a Glue connection or a secrets store -- confirm.
table = glueContext.create_dynamic_frame.from_options(
    connection_type="redshift",
    connection_options={
        "url": "jdbc:redshift://xxxxx.yyyyy.us-east-1.redshift.amazonaws.com:5439/db",
        "user": "yyyy",
        "password": "yyyyy",
        "dbtable": "schema.table_name",
        # S3 location, presumably used by Glue as staging for the read.
        "redshiftTmpDir": "s3://aws-glue-temporary-accountnumber-us-east-1/",
    },
)

# Map field1/field2 (string -> string) onto a new DynamicFrame.
# NOTE(review): `applyformat` is not used below -- the temp view is built
# from `table`. It is kept because the accompanying explanation refers to it.
applyformat = ApplyMapping.apply(
    frame=table,
    mappings=[
        ("field1", "string", "field1", "string"),
        ("field2", "string", "field2", "string"),
    ],
    transformation_ctx="applyformat",
)

# createOrReplaceTempView() returns None, so its result is not bound to a
# name; the call only registers the view "table_df" for the SQL below.
table.toDF().createOrReplaceTempView("table_df")

sqlDF = spark.sql("SELECT COUNT(*) FROM table_df")

# show() prints the result itself and returns None. The original
# `print sqlDF.show()` was a Python 2-only statement that also printed a
# stray "None"; a plain call works on both Python 2 and Python 3.
sqlDF.show()

job.commit()
我编写了一个基本脚本,用来创建一个 DataFrame(DF),其数据来自我在 Redshift 中的一张表。我运行了这个脚本,但一直被一条我无法解释的错误消息所困扰。
日志中错误输出为:
"/mnt/yarn/usercache/root/appcache/application_1525803778049_0004/container_1525803778049_0004_01_000001/py4j-0.10.4-src.zip/py4j/protocol.py", line 319, in get_return_value py4j.protocol.Py4JJavaError: An error occurred while calling o60.getDynamicFrame. : java.lang.UnsupportedOperationException: empty.reduceLeft at scala.collection.
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame, DynamicFrameReader, DynamicFrameWriter, DynamicFrameCollection
from pyspark.sql.functions import lit
from awsglue.job import Job
# Question script: kept verbatim because it reproduces the reported error.
# Create the Spark context, wrap it in a GlueContext, and take the Spark
# session and a Job handle from it.
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
# Read "schema.table_name" from Redshift into a DynamicFrame.
# NOTE(review): the quoted log error ("calling o60.getDynamicFrame ...
# empty.reduceLeft") presumably originates from this read -- confirm
# against the full stack trace.
table = glueContext.create_dynamic_frame.from_options(connection_type="redshift", connection_options =
{"url": "jdbc:redshift://xxxxx.yyyyy.us-east-1.redshift.amazonaws.com:5439/db",
"user": "yyyy",
"password": "yyyyy",
"dbtable": "schema.table_name",
# S3 location, presumably used by Glue as staging for the Redshift read.
"redshiftTmpDir": "s3://aws-glue-temporary-accountnumber-us-east-1/"},
# NOTE(review): this version also passes format= and transformation_ctx=;
# the working version posted later omits both.
format="orc",
transformation_ctx="table" )
table.show()
# createOrReplaceTempView() returns None, so dfred is None here; the call is
# made only to register the temp view "table_df".
dfred = table.toDF().createOrReplaceTempView("table_df")
job.commit()
非常感谢您能为我提供的任何帮助。非常感谢
好吧,在继续努力解决这个问题之后,我查看了 DynamicFrame 类的官方代码。于是,我在代码中添加了一个 ApplyMapping 格式转换,用来映射从 Redshift 中读取的表的结果;同时,在读取该表的方法调用中,我去掉了参数 transformation_ctx。
正是这个参数导致了 o60 错误。
我的最终版本代码是:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame, DynamicFrameReader, DynamicFrameWriter, DynamicFrameCollection
from pyspark.sql.functions import lit
from awsglue.job import Job
# Create the Spark context, wrap it in a GlueContext, and take the Spark
# session and a Job handle from it.
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)

# Read "schema.table_name" from Redshift into a DynamicFrame.
# Unlike the failing version, this call passes neither format= nor
# transformation_ctx=.
# NOTE(review): credentials are inline placeholders; presumably a real job
# should take them from a Glue connection or a secrets store -- confirm.
table = glueContext.create_dynamic_frame.from_options(
    connection_type="redshift",
    connection_options={
        "url": "jdbc:redshift://xxxxx.yyyyy.us-east-1.redshift.amazonaws.com:5439/db",
        "user": "yyyy",
        "password": "yyyyy",
        "dbtable": "schema.table_name",
        # S3 location, presumably used by Glue as staging for the read.
        "redshiftTmpDir": "s3://aws-glue-temporary-accountnumber-us-east-1/",
    },
)

# Map field1/field2 (string -> string) onto a new DynamicFrame.
# NOTE(review): `applyformat` is not used below -- the temp view is built
# from `table`. It is kept because the accompanying explanation refers to it.
applyformat = ApplyMapping.apply(
    frame=table,
    mappings=[
        ("field1", "string", "field1", "string"),
        ("field2", "string", "field2", "string"),
    ],
    transformation_ctx="applyformat",
)

# createOrReplaceTempView() returns None, so its result is not bound to a
# name; the call only registers the view "table_df" for the SQL below.
table.toDF().createOrReplaceTempView("table_df")

sqlDF = spark.sql("SELECT COUNT(*) FROM table_df")

# show() prints the result itself and returns None. The original
# `print sqlDF.show()` was a Python 2-only statement that also printed a
# stray "None"; a plain call works on both Python 2 and Python 3.
sqlDF.show()

job.commit()