如何将 JSON 模式从 Camel 大小写转换为小写
How to convert JSON Schema from Camel case to lower case
我有一个 JSON 架构,其中键采用驼峰式大小写,我正在尝试将所有数据类型转换为小写。
我遇到了 ArrayType
.
的问题
import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.types.{DataType, StructType}
import spark.implicits._
val spark: SparkSession = SparkSession.builder().enableHiveSupport().getOrCreate()
var sample_schema = spark.read.json("path").schema
def columnsToLowercase(schema: StructType): StructType = {
def recurRename(schema: StructType): Seq[StructField] =
schema.fields.map {
case StructField(name, dtype: StructType, nullable, meta) =>
StructField(name.toLowerCase, StructType(recurRename(dtype)), nullable, meta)
case StructField(name, dtype, nullable, meta) =>
StructField(name.toLowerCase, dtype, nullable, meta)
}
StructType(recurRename(schema))
}
val jsonDFrame: DataFrame = spark.read.schema(columnsToLowercase(sample_schema)).json("path")
示例架构:
root
|-- id: string (nullable = true)
|-- master: struct (nullable = true)
| |-- code: string (nullable = true)
| |-- provInfo: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- claimInfo: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- claimId: string (nullable = true)
| | | |-- demoInfo: struct (nullable = true)
| | | | |-- family: struct (nullable = true)
| | | | | |-- outOrder: struct (nullable = true)
| | | | | | |-- LocOut: boolean (nullable = true)
| | | | | | |-- found: boolean (nullable = true)
| |-- claimAddr: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- address: string (nullable = true)
|-- system: string (nullable = true)
您应该能够通过添加另一个 case 子句将嵌套在 ArrayType
中的字段小写。对于数组列,还需要检查其子元素类型:
def columnsToLowercase(schema: StructType): StructType = {
// ....
case StructField(name, dtype: ArrayType, nullable, meta) => dtype.elementType match {
case s: StructType => StructField(name.toLowerCase, ArrayType(StructType(recurRename(s)), true), nullable, meta)
case dt => StructField(name.toLowerCase, dt, nullable, meta)
}
//....
}
正在应用您的架构:
df.printSchema
//root
// |-- id: string (nullable = true)
// |-- master: struct (nullable = true)
// | |-- provInfo: struct (nullable = true)
// | | |-- claimInfo: array (nullable = true)
// | | | |-- element: struct (containsNull = true)
// | | | | |-- claimId: string (nullable = true)
// | | |-- demoInfo: struct (nullable = true)
// | | | |-- family: struct (nullable = true)
// | | | | |-- outOrder: struct (nullable = true)
// | | | | | |-- LocOut: boolean (nullable = false)
// | | | | | |-- found: boolean (nullable = false)
// | |-- claimAddr: array (nullable = true)
// | | |-- element: struct (containsNull = true)
// | | | |-- address: string (nullable = true)
// |-- system: string (nullable = true)
columnsToLowercase(df.schema).printTreeString()
//root
// |-- id: string (nullable = true)
// |-- master: struct (nullable = true)
// | |-- provinfo: struct (nullable = true)
// | | |-- claiminfo: array (nullable = true)
// | | | |-- element: struct (containsNull = true)
// | | | | |-- claimid: string (nullable = true)
// | | |-- demoinfo: struct (nullable = true)
// | | | |-- family: struct (nullable = true)
// | | | | |-- outorder: struct (nullable = true)
// | | | | | |-- locout: boolean (nullable = false)
// | | | | | |-- found: boolean (nullable = false)
// | |-- claimaddr: array (nullable = true)
// | | |-- element: struct (containsNull = true)
// | | | |-- address: string (nullable = true)
// |-- system: string (nullable = true)
我有一个 JSON 架构,其中键采用驼峰式大小写,我正在尝试将所有数据类型转换为小写。
我遇到了 ArrayType
.
import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.types.{DataType, StructType}
import spark.implicits._
val spark: SparkSession = SparkSession.builder().enableHiveSupport().getOrCreate()
var sample_schema = spark.read.json("path").schema
def columnsToLowercase(schema: StructType): StructType = {
def recurRename(schema: StructType): Seq[StructField] =
schema.fields.map {
case StructField(name, dtype: StructType, nullable, meta) =>
StructField(name.toLowerCase, StructType(recurRename(dtype)), nullable, meta)
case StructField(name, dtype, nullable, meta) =>
StructField(name.toLowerCase, dtype, nullable, meta)
}
StructType(recurRename(schema))
}
val jsonDFrame: DataFrame = spark.read.schema(columnsToLowercase(sample_schema)).json("path")
示例架构:
root
|-- id: string (nullable = true)
|-- master: struct (nullable = true)
| |-- code: string (nullable = true)
| |-- provInfo: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- claimInfo: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- claimId: string (nullable = true)
| | | |-- demoInfo: struct (nullable = true)
| | | | |-- family: struct (nullable = true)
| | | | | |-- outOrder: struct (nullable = true)
| | | | | | |-- LocOut: boolean (nullable = true)
| | | | | | |-- found: boolean (nullable = true)
| |-- claimAddr: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- address: string (nullable = true)
|-- system: string (nullable = true)
您应该能够通过添加另一个 case 子句将嵌套在 ArrayType
中的字段小写。对于数组列,还需要检查其子元素类型:
def columnsToLowercase(schema: StructType): StructType = {
// ....
case StructField(name, dtype: ArrayType, nullable, meta) => dtype.elementType match {
case s: StructType => StructField(name.toLowerCase, ArrayType(StructType(recurRename(s)), true), nullable, meta)
case dt => StructField(name.toLowerCase, dt, nullable, meta)
}
//....
}
正在应用您的架构:
df.printSchema
//root
// |-- id: string (nullable = true)
// |-- master: struct (nullable = true)
// | |-- provInfo: struct (nullable = true)
// | | |-- claimInfo: array (nullable = true)
// | | | |-- element: struct (containsNull = true)
// | | | | |-- claimId: string (nullable = true)
// | | |-- demoInfo: struct (nullable = true)
// | | | |-- family: struct (nullable = true)
// | | | | |-- outOrder: struct (nullable = true)
// | | | | | |-- LocOut: boolean (nullable = false)
// | | | | | |-- found: boolean (nullable = false)
// | |-- claimAddr: array (nullable = true)
// | | |-- element: struct (containsNull = true)
// | | | |-- address: string (nullable = true)
// |-- system: string (nullable = true)
columnsToLowercase(df.schema).printTreeString()
//root
// |-- id: string (nullable = true)
// |-- master: struct (nullable = true)
// | |-- provinfo: struct (nullable = true)
// | | |-- claiminfo: array (nullable = true)
// | | | |-- element: struct (containsNull = true)
// | | | | |-- claimid: string (nullable = true)
// | | |-- demoinfo: struct (nullable = true)
// | | | |-- family: struct (nullable = true)
// | | | | |-- outorder: struct (nullable = true)
// | | | | | |-- locout: boolean (nullable = false)
// | | | | | |-- found: boolean (nullable = false)
// | |-- claimaddr: array (nullable = true)
// | | |-- element: struct (containsNull = true)
// | | | |-- address: string (nullable = true)
// |-- system: string (nullable = true)