如何将 json 写回 aws Glue 中的 s3?
how to write json back to the s3 in aws Glue?
我是 aws-glue 的新手。我正在尝试读取 csv 并转换为 json 对象。如我所见,方法是通过爬虫读取 csv 并转换为 Pyspark DF,然后转换为 json 对象。
到现在为止,我已经转换为 json 对象。现在我需要如何将这些 json 写回 s3 存储桶?
下面是代码
#########################################
### IMPORT LIBRARIES AND SET VARIABLES
#########################################
#Import python modules
from datetime import datetime
#Import pyspark modules
from pyspark.context import SparkContext
import pyspark.sql.functions as f
#Import glue modules
from awsglue.utils import getResolvedOptions
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job
import json
import boto3
#Initialize contexts and session
# Initialize the Spark context, Glue context, and Spark session for this job.
spark_context = SparkContext.getOrCreate()
glue_context = GlueContext(spark_context)
session = glue_context.spark_session
# boto3 S3 resource handle (not used further below in this snippet).
s3_source = boto3.resource('s3')

# Glue Data Catalog database / table populated by the crawler.
glue_db = "umesh-db"
glue_tbl = "read"

#########################################
### EXTRACT (READ DATA)
#########################################
# Load the crawled CSV table into a Glue DynamicFrame.
dynamic_frame_read = glue_context.create_dynamic_frame.from_catalog(
    database=glue_db, table_name=glue_tbl)

# Switch to a plain Spark DataFrame so standard PySpark functions apply.
data_frame = dynamic_frame_read.toDF()

# Preview the loaded data.
print("Showing Df data")
data_frame.show()

# Serialize each row to a JSON string and gather them on the driver.
# NOTE(review): collect() assumes the dataset fits in driver memory —
# confirm for production-sized data.
json_rdd = data_frame.toJSON()
result_payload = {}
row_json_list = []
for json_row in json_rdd.collect():
    print("Row data ", json_row)
    row_json_list.append(json_row)
print("Array Obj", row_json_list)
result_payload['Employee'] = row_json_list
print("Json object ", result_payload)

# End-time logging (left disabled, as in the original job).
# dt_end = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
# print("Start time:", dt_end)
如果有人能帮忙提供正确的方法,我将不胜感激。
谢谢
# Write the DataFrame back to S3 as JSON via the Spark DataFrameWriter.
# NOTE(review): replaced the typographic quotes (U+2018/U+2019) from the
# original snippet with ASCII quotes — curly quotes are a SyntaxError in Python.
data_frame.write.format('json').save('s3://bucket/key')
或直接从动态框架
# Write the DynamicFrame straight to S3 as JSON — no DataFrame conversion needed.
sink_options = {"path": "s3://bucket/key"}
glue_context.write_dynamic_frame.from_options(
    frame=dynamic_frame_read,
    connection_type="s3",
    connection_options=sink_options,
    format="json",
)
我是 aws-glue 的新手。我正在尝试读取 csv 并转换为 json 对象。如我所见,方法是通过爬虫读取 csv 并转换为 Pyspark DF,然后转换为 json 对象。 到现在为止,我已经转换为 json 对象。现在我需要如何将这些 json 写回 s3 存储桶?
下面是代码
#########################################
### IMPORT LIBRARIES AND SET VARIABLES
#########################################
#Import python modules
from datetime import datetime
#Import pyspark modules
from pyspark.context import SparkContext
import pyspark.sql.functions as f
#Import glue modules
from awsglue.utils import getResolvedOptions
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job
import json
import boto3
#Initialize contexts and session
# Initialize the Spark context, Glue context, and Spark session for this job.
spark_context = SparkContext.getOrCreate()
glue_context = GlueContext(spark_context)
session = glue_context.spark_session
# boto3 S3 resource handle (not used further below in this snippet).
s3_source = boto3.resource('s3')

# Glue Data Catalog database / table populated by the crawler.
glue_db = "umesh-db"
glue_tbl = "read"

#########################################
### EXTRACT (READ DATA)
#########################################
# Load the crawled CSV table into a Glue DynamicFrame.
dynamic_frame_read = glue_context.create_dynamic_frame.from_catalog(
    database=glue_db, table_name=glue_tbl)

# Switch to a plain Spark DataFrame so standard PySpark functions apply.
data_frame = dynamic_frame_read.toDF()

# Preview the loaded data.
print("Showing Df data")
data_frame.show()

# Serialize each row to a JSON string and gather them on the driver.
# NOTE(review): collect() assumes the dataset fits in driver memory —
# confirm for production-sized data.
json_rdd = data_frame.toJSON()
result_payload = {}
row_json_list = []
for json_row in json_rdd.collect():
    print("Row data ", json_row)
    row_json_list.append(json_row)
print("Array Obj", row_json_list)
result_payload['Employee'] = row_json_list
print("Json object ", result_payload)

# End-time logging (left disabled, as in the original job).
# dt_end = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
# print("Start time:", dt_end)
如果有人能帮忙提供正确的方法,我将不胜感激。谢谢!
# Write the DataFrame back to S3 as JSON via the Spark DataFrameWriter.
# NOTE(review): replaced the typographic quotes (U+2018/U+2019) from the
# original snippet with ASCII quotes — curly quotes are a SyntaxError in Python.
data_frame.write.format('json').save('s3://bucket/key')
或直接从动态框架
# Write the DynamicFrame straight to S3 as JSON — no DataFrame conversion needed.
sink_options = {"path": "s3://bucket/key"}
glue_context.write_dynamic_frame.from_options(
    frame=dynamic_frame_read,
    connection_type="s3",
    connection_options=sink_options,
    format="json",
)