如何使用 Python 将 Hive 模式转换为 Bigquery 模式?
How to convert Hive schema to Bigquery schema using Python?
我从api得到的:
"name":"reports"
"col_type":"array<struct<imageUrl:string,reportedBy:string>>"
所以在 Hive 模式中我得到:
reports array<struct<imageUrl:string,reportedBy:string>>
注意:我从 API 得到的 Hive 数组模式是字符串形式的。
我的目标:
# Desired result: a "reports" RECORD field holding two STRING sub-fields.
# NOTE(review): the Hive type is array<struct<...>>; BigQuery usually models
# arrays with mode="REPEATED" -- confirm that NULLABLE is really intended.
bigquery.SchemaField("reports", "RECORD", mode="NULLABLE",
fields=(
bigquery.SchemaField('imageUrl', 'STRING'),
bigquery.SchemaField('reportedBy', 'STRING')
)
)
注意:我想创建通用代码,当我在数组中接收到任意数量的结构时可以处理。
欢迎任何提示。
我尝试编写了一个脚本来解析您的输入 reports array<struct<imageUrl:string,reportedBy:string>>。它会把您的输入转换成一个字典,可在创建表时用作 schema。这种方法的主要思路是不使用 SchemaField():对于您的示例输入,直接构造字典比逐个创建带参数的 SchemaField() 对象容易得多。
注意:该脚本仅针对您的输入做过测试;如果在 struct<...> 中添加更多字段,它也能解析。
import re
from google.cloud import bigquery
def is_even(number):
    """Return True when ``number`` leaves no remainder on division by two."""
    # A zero remainder is falsy, so negating it yields the boolean directly.
    return not number % 2
def clean_string(str_value):
    """Return ``str_value`` with all non-word characters and underscores removed.

    Only alphanumeric characters survive; punctuation such as ``<``, ``>``
    and ``_`` is dropped.
    """
    unwanted = re.compile(r'[\W_]+')
    return unwanted.sub('', str_value)
def convert_to_bqdict(api_string):
    """Convert a Hive ``array<struct<...>>`` column spec into a BigQuery schema.

    Args:
        api_string: Column description of the form
            ``"<name> array<struct<field:type,field:type,...>>"``. The column
            name and its Hive type are separated by a single space.

    Returns:
        A one-element list containing the schema dict for the column, with
        keys ``name``, ``type`` (always ``"RECORD"``), ``mode`` (always
        ``"NULLABLE"``) and ``fields``. Every struct member becomes a
        ``NULLABLE`` sub-field whose type is the Hive type name upper-cased.

    Raises:
        ValueError: If the space between column name and type is missing, or
            a struct member is not written as ``name:type``.

    Note:
        Only a flat struct of simple members is handled. Nested
        ``struct``/``array`` members and parameterised types such as
        ``decimal(10,2)`` are not parsed correctly. Type names are only
        upper-cased, not translated; see
        https://cloud.google.com/bigquery/docs/schemas#standard_sql_data_types
        for the names BigQuery accepts.
    """
    # Parse the argument itself (the previous version read a module-level
    # global here, so the parameter was silently ignored).
    parts = api_string.split(' ')
    if len(parts) < 2:
        raise ValueError(
            "expected '<name> <hive_type>' separated by a space, got %r"
            % (api_string,)
        )
    column_name, column_type = parts[0], parts[1]

    def _word_chars(text):
        # Drop punctuation such as the trailing '>>' but keep underscores,
        # which are legal (and common) in both Hive and BigQuery field names.
        return "".join(ch for ch in text if ch.isalnum() or ch == "_")

    fields = []
    # Splitting on '<' yields ["array", "struct", "<member list>>>", ...];
    # member lists sit on the even positions from index 2 onwards.
    for segment in column_type.split('<')[2::2]:
        for member in filter(None, segment.split(',')):
            pieces = [piece for piece in member.split(':') if piece]
            if len(pieces) < 2:
                raise ValueError(
                    "expected struct member as 'name:type', got %r" % (member,)
                )
            fields.append({
                "name": _word_chars(pieces[0]),
                "type": _word_chars(pieces[1]).upper(),
                "mode": "NULLABLE",
            })

    return [{
        "name": column_name,
        "type": "RECORD",
        "mode": "NULLABLE",
        "fields": fields,
    }]
# Example Hive column spec: column name, one space, then the array<struct<...>> type.
sample = "reports array<struct<imageUrl:string,reportedBy:string,newfield:bool>>"
# Build the list-of-dicts schema from the spec.
bq_dict = convert_to_bqdict(sample)
# Client() is created without arguments.
# NOTE(review): presumably credentials and project come from the environment -- confirm.
client = bigquery.Client()
project = client.project
# Dataset ID and table ID are hard-coded for this example.
dataset_ref = bigquery.DatasetReference(project, '20211228')
table_ref = dataset_ref.table("20220203")
# The dict-based schema is passed directly as the table schema.
table = bigquery.Table(table_ref, schema=bq_dict)
# Issue the create request and rebind `table` to the value returned by the client.
table = client.create_table(table)
输出:
我从api得到的:
"name":"reports"
"col_type":"array<struct<imageUrl:string,reportedBy:string>>"
所以在 Hive 模式中我得到:
reports array<struct<imageUrl:string,reportedBy:string>>
注意:我从 API 得到的 Hive 数组模式是字符串形式的。
我的目标:
# Desired result: a "reports" RECORD field holding two STRING sub-fields.
# NOTE(review): the Hive type is array<struct<...>>; BigQuery usually models
# arrays with mode="REPEATED" -- confirm that NULLABLE is really intended.
bigquery.SchemaField("reports", "RECORD", mode="NULLABLE",
fields=(
bigquery.SchemaField('imageUrl', 'STRING'),
bigquery.SchemaField('reportedBy', 'STRING')
)
)
注意:我想创建通用代码,当我在数组中接收到任意数量的结构时可以处理。
欢迎任何提示。
我尝试编写了一个脚本来解析您的输入 reports array<struct<imageUrl:string,reportedBy:string>>。它会把您的输入转换成一个字典,可在创建表时用作 schema。这种方法的主要思路是不使用 SchemaField():对于您的示例输入,直接构造字典比逐个创建带参数的 SchemaField() 对象容易得多。
注意:该脚本仅针对您的输入做过测试;如果在 struct<...> 中添加更多字段,它也能解析。
import re
from google.cloud import bigquery
def is_even(number):
    """Return True when ``number`` leaves no remainder on division by two."""
    # A zero remainder is falsy, so negating it yields the boolean directly.
    return not number % 2
def clean_string(str_value):
    """Return ``str_value`` with all non-word characters and underscores removed.

    Only alphanumeric characters survive; punctuation such as ``<``, ``>``
    and ``_`` is dropped.
    """
    unwanted = re.compile(r'[\W_]+')
    return unwanted.sub('', str_value)
def convert_to_bqdict(api_string):
    """Convert a Hive ``array<struct<...>>`` column spec into a BigQuery schema.

    Args:
        api_string: Column description of the form
            ``"<name> array<struct<field:type,field:type,...>>"``. The column
            name and its Hive type are separated by a single space.

    Returns:
        A one-element list containing the schema dict for the column, with
        keys ``name``, ``type`` (always ``"RECORD"``), ``mode`` (always
        ``"NULLABLE"``) and ``fields``. Every struct member becomes a
        ``NULLABLE`` sub-field whose type is the Hive type name upper-cased.

    Raises:
        ValueError: If the space between column name and type is missing, or
            a struct member is not written as ``name:type``.

    Note:
        Only a flat struct of simple members is handled. Nested
        ``struct``/``array`` members and parameterised types such as
        ``decimal(10,2)`` are not parsed correctly. Type names are only
        upper-cased, not translated; see
        https://cloud.google.com/bigquery/docs/schemas#standard_sql_data_types
        for the names BigQuery accepts.
    """
    # Parse the argument itself (the previous version read a module-level
    # global here, so the parameter was silently ignored).
    parts = api_string.split(' ')
    if len(parts) < 2:
        raise ValueError(
            "expected '<name> <hive_type>' separated by a space, got %r"
            % (api_string,)
        )
    column_name, column_type = parts[0], parts[1]

    def _word_chars(text):
        # Drop punctuation such as the trailing '>>' but keep underscores,
        # which are legal (and common) in both Hive and BigQuery field names.
        return "".join(ch for ch in text if ch.isalnum() or ch == "_")

    fields = []
    # Splitting on '<' yields ["array", "struct", "<member list>>>", ...];
    # member lists sit on the even positions from index 2 onwards.
    for segment in column_type.split('<')[2::2]:
        for member in filter(None, segment.split(',')):
            pieces = [piece for piece in member.split(':') if piece]
            if len(pieces) < 2:
                raise ValueError(
                    "expected struct member as 'name:type', got %r" % (member,)
                )
            fields.append({
                "name": _word_chars(pieces[0]),
                "type": _word_chars(pieces[1]).upper(),
                "mode": "NULLABLE",
            })

    return [{
        "name": column_name,
        "type": "RECORD",
        "mode": "NULLABLE",
        "fields": fields,
    }]
# Example Hive column spec: column name, one space, then the array<struct<...>> type.
sample = "reports array<struct<imageUrl:string,reportedBy:string,newfield:bool>>"
# Build the list-of-dicts schema from the spec.
bq_dict = convert_to_bqdict(sample)
# Client() is created without arguments.
# NOTE(review): presumably credentials and project come from the environment -- confirm.
client = bigquery.Client()
project = client.project
# Dataset ID and table ID are hard-coded for this example.
dataset_ref = bigquery.DatasetReference(project, '20211228')
table_ref = dataset_ref.table("20220203")
# The dict-based schema is passed directly as the table schema.
table = bigquery.Table(table_ref, schema=bq_dict)
# Issue the create request and rebind `table` to the value returned by the client.
table = client.create_table(table)
输出: