Get tables from AWS Glue using boto3
I need to get table and column names from the AWS Glue crawler metadata catalog. I'm using boto3, but I keep getting only 100 tables back even though there are more. Setting NextToken didn't help. Any help would be appreciated.
The desired result looks like this:
lst = [table_one.col_one, table_one.col_two, table_two.col_one.....table_n.col_n]
def harvest_aws_crawler():
    glue = boto3.client('glue', region_name='')
    response = glue.get_tables(DatabaseName='', NextToken='')
    #response syntax:
    #https://boto3.amazonaws.com/v1/documentation/api/1.9.42/reference/services/glue.html#Glue.Client.get_tables
    crawler_list_tables = []
    for tables in response['TableList']:
        while (response.get('NextToken') is not None):
            crawler_list_tables.append(tables['Name'])
            break
    print(len(crawler_list_tables))

harvest_aws_crawler()
Updated code; it still needs to return table name + column name:
def harvest_aws_crawler():
    glue = boto3.client('glue', region_name='')
    next_token = ""
    #response syntax:
    #https://boto3.amazonaws.com/v1/documentation/api/1.9.42/reference/services/glue.html#Glue.Client.get_tables
    response = glue.get_tables(DatabaseName='', NextToken=next_token)
    tables_from_crawler = []
    while True:
        table_list = response['TableList']
        for table_dict in table_list:
            table_name = table_dict['Name']
            #append table_name+column_name
            for columns in table_name['StorageDescriptor']['Columns']:
                tables_from_crawler.append(table_name + '.' + columns['Name'])
            #tables_from_crawler.append(table_name)
        next_token = response.get('NextToken')
        if next_token is None:
            break
    print(tables_from_crawler)

harvest_aws_crawler()
You could try the following approach using the paginator option:
def get_tables_for_database(database):
    # glue_client is assumed to be a boto3 Glue client, e.g. boto3.client("glue")
    starting_token = None
    next_page = True
    tables = []
    while next_page:
        paginator = glue_client.get_paginator(operation_name="get_tables")
        response_iterator = paginator.paginate(
            DatabaseName=database,
            PaginationConfig={"PageSize": 100, "StartingToken": starting_token},
        )
        for elem in response_iterator:
            tables += [
                {
                    "name": table["Name"],
                }
                for table in elem["TableList"]
            ]
        try:
            starting_token = elem["NextToken"]
        except KeyError:
            next_page = False
    return tables
Then call that method to list the tables of a given database:
for table in get_tables_for_database(database):
    print(f"Table: {table['name']}")
If you want to list the tables of every database in Glue, you may need an extra for loop that first retrieves the databases and then uses the snippet above as the inner loop to extract the tables of each database, as sketched below.
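A rough sketch of that outer loop, assuming glue_client is the same boto3 Glue client used above (get_all_tables is just an illustrative helper name, not part of the answer):

def get_all_tables():
    all_tables = {}
    # paginate over every database in the Glue Data Catalog
    db_paginator = glue_client.get_paginator("get_databases")
    for page in db_paginator.paginate():
        for database in page["DatabaseList"]:
            db_name = database["Name"]
            # reuse the per-database helper defined above
            all_tables[db_name] = get_tables_for_database(db_name)
    return all_tables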
Adding the sub-loop produced the table + column result:
#harvest aws crawler metadata
import boto3

next_token = ""
client = boto3.client('glue', region_name='us-east-1')
crawler_tables = []
while True:
    response = client.get_tables(DatabaseName='', NextToken=next_token)
    for tables in response['TableList']:
        for columns in tables['StorageDescriptor']['Columns']:
            crawler_tables.append(tables['Name'] + '.' + columns['Name'])
    next_token = response.get('NextToken')
    if next_token is None:
        break
print(crawler_tables)
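For reference, the same table.column list can also be built with the paginator approach from the answer above, which follows NextToken internally; a minimal sketch (DatabaseName left as a placeholder, as in the snippets above):

import boto3

glue_client = boto3.client('glue', region_name='us-east-1')
paginator = glue_client.get_paginator('get_tables')

crawler_tables = []
# each page is one get_tables response; the paginator handles NextToken for us
for page in paginator.paginate(DatabaseName=''):
    for table in page['TableList']:
        for column in table['StorageDescriptor']['Columns']:
            crawler_tables.append(table['Name'] + '.' + column['Name'])

print(crawler_tables)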