在 Python 中使用 cx_Oracle 的 executemany() 将分页 API 的 JSON 数据加载到 Oracle 数据库

Question

# Names of the tables to load; each name is appended to the API URL and is
# also used as the target table of the INSERT statement below.
tbls = ['tbl1' , 'tbl2', 'tbl3']

            # For every table, page through the API `limit` rows at a time and
            # insert each page into the table of the same name.
            for tbl in tbls:
                apidata = []
                offset = 0
                limit = 3000
                
                while True:
                    print("----")
                    baseurl = f'someurl'
                    url = baseurl + tbl + '?sysparm_offset=' + '{}'.format(offset) + '&sysparm_limit=' + '{}'.format(limit)
                    print("Requesting", url)
                    response = requests.get(url, auth=(apiuser, apipwd), headers=headers )
                    data = response.json()
                    # The whole page (including the outer 'result' wrapper) is
                    # serialized into a single JSON string.
                    json_string = json.dumps(data)
                    # df is only referenced by the commented-out executemany attempt below.
                    df = pd.DataFrame(data)

                    try:
                        # NOTE: a new database connection is opened for every page.
                        with cx_Oracle.connect(dbuser, dbpass, dsn, encoding='UTF-8') as connection:
                            cursor = connection.cursor()
                            sql = 'insert into ' + tbl + '(req_data) values (:req_data)'
                            # Inserts one row per page: the bind value is the entire page as one string.
                            cursor.execute(sql, [json_string]) #works fine with this sql but super slow
                            
                            # The two attempts below fail as their trailing comments describe:
                            # the first passes a generator instead of a list, and both pass
                            # dict values where a bindable value (e.g. a JSON string) is needed.
                            #cursor.executemany(sql, ((i,) for i in data['result'])) #TypeError: parameters should be a list of sequences/dictionaries or an integer specifying the number of times to execute the statement
                            #cursor.executemany(sql, df.values.tolist()) #Python value of type dict not supported.
                            connection.commit()
                    except cx_Oracle.Error as error:
                        print('CX_Oracle Error occurred:')
                        print(error)
                    # An empty 'result' list ends the paging for this table. The insert
                    # above has already run for this final, empty page.
                    if len(data['result']) == 0:
                        break
                    else:
                        apidata.extend(data['result'])
                        # Advance to the next page (hard-coded 3000 rather than `limit`).
                        offset = offset + 3000
                # No effect: this is already the last statement of the for-loop body.
                continue

我正在尝试从 API 提取 JSON 格式的数据，然后将其插入 Oracle 中的 JSON 表。我的代码遍历 tbls 列表中的每个 tbl，每次从 API 分页提取 3000 行，并循环直到该表的数据全部提取完毕；完成后，再对 tbls 中其余的 tbl 执行相同的操作。使用 cursor.execute 时可以正常工作，但非常慢（API 表有超过 15 万行）。

当我尝试使用 executemany 时，它似乎不起作用。我知道 executemany 需要传入一个列表，我尝试过 ((i,) for i in data['result'])，也尝试过 pandas 数据框的 df.values.tolist()。我还是个 Python 新手，如果您能指出我哪里做错了，我将不胜感激。我看了很多关于 executemany 的文章和示例，但仍然没有弄明白。

api数据样本 {'result': [{'owner': {'link': 'https://someurl.com/api/now/table/sys_user/39aec4146fedda00f3ab4ecbbb3ee4ec', 'value': '39aec4146fedda00f3ab4ecbbb3ee4ec'}, 'sys_id': '00c67bbbdbbcb2c01e05fb541d96196b', 'sys_updated_by': 'xa0380', 'sys_created_on': '2017-02-14 22:25:04', 'document': {'link': 'https://someurl.com/api/now/table/vtb_task/b1b6bb7bdbbcb2c01e05fb541d961923', 'value': 'b1b6bb7bdbbcb2c01e05fb541d961923'}, 'name': '', 'sys_mod_count': '0', 'sys_updated_on': '2017-02-14 22:25:04', 'sys_tags': '', 'sys_created_by': 'xa0380', 'table': 'vtb_task'}]}

api数据的长度因 table 的不同而不同，但由于我在 oracle 中将其作为 json 插入，所以这无关紧要。

Oracle JSON 表的 DDL：create table tbl1 ( req_data blob check (req_data is json) );

Oracle 版本 - 19c CX_Oracle 版本 - 8.3

提前谢谢你。如果您需要任何其他信息，请告诉我。

Answer 1

尝试像这样为 executemany() 绑定数据：

import cx_Oracle as oracledb
import os
import sys

# On macOS ('darwin'), initialize the Oracle client with the Instant Client
# directory located under $HOME/Downloads.
if sys.platform.startswith('darwin'):
    instant_client_dir = os.environ.get('HOME') + '/Downloads/instantclient_19_8'
    oracledb.init_oracle_client(lib_dir=instant_client_dir)

# User name, password and connect string are read from the environment.
un, pw, cs = (
    os.environ.get(key)
    for key in ('PYTHON_USERNAME', 'PYTHON_PASSWORD', 'PYTHON_CONNECTSTRING')
)

connection = oracledb.connect(user=un, password=pw, dsn=cs)

# For the SELECT statement
def output_type_handler(cursor, name, default_type, size, precision, scale):
    """Choose the fetch variable for CLOB and BLOB columns.

    Returns a ``LONG_STRING`` variable when ``default_type`` is CLOB and a
    ``LONG_BINARY`` variable when it is BLOB, each sized to the cursor's
    ``arraysize``. Returns ``None`` for every other type.
    """
    if default_type == oracledb.CLOB:
        fetch_type = oracledb.LONG_STRING
    elif default_type == oracledb.BLOB:
        fetch_type = oracledb.LONG_BINARY
    else:
        return None
    return cursor.var(fetch_type, arraysize=cursor.arraysize)

# Register the handler on the connection.
connection.outputtypehandler = output_type_handler


# Two sample payloads, each stored as one string and inserted as one row.
# NOTE(review): the text uses single-quoted keys/strings (Python dict repr),
# which is not strict JSON; presumably the table's IS JSON check accepts it
# in lax mode - confirm against the target database.
d1 = """{'result': [{'owner': {'link': 'https://someurl.com/api/now/table/sys_user/39aec4146fedda00f3ab4ecbbb3ee4ec', 'value': '39aec4146fedda00f3ab4ecbbb3ee4ec'}, 'sys_id': '00c67bbbdbbcb2c01e05fb541d96196b', 'sys_updated_by': 'xa0380', 'sys_created_on': '2017-02-14 22:25:04', 'document': {'link': 'https://someurl.com/api/now/table/vtb_task/b1b6bb7bdbbcb2c01e05fb541d961923', 'value': 'b1b6bb7bdbbcb2c01e05fb541d961923'}, 'name': '', 'sys_mod_count': '0', 'sys_updated_on': '2017-02-14 22:25:04', 'sys_tags': '', 'sys_created_by': 'xa0380', 'table': 'vtb_task'}]}"""

d2 = """{'result': [{'owner': {'link': 'https://someurl2.com/api/now/table/sys_user/xyz', 'value': 'xyz'}, 'sys_id': 'xyz', 'sys_updated_by': 'xa0380', 'sys_created_on': '2017-02-14 22:25:04', 'document': {'link': 'https://someurl.com/api/now/table/vtb_task/b1b6bb7bdbbcb2c01e05fb541d961923', 'value': 'xyz'}, 'name': '', 'sys_mod_count': '0', 'sys_updated_on': '2017-02-14 22:25:04', 'sys_tags': '', 'sys_created_by': 'xa0380', 'table': 'vtb_task'}]}"""

# One dict per row; the key matches the :req_data bind name in the statement.
data = [{"req_data": payload} for payload in (d1, d2)]

# Named bind variable: each dict in `data` supplies :req_data for one row.
sql = 'insert into tbl1 (req_data) values (:req_data)'

# Execute the INSERT once for every entry in `data`.
with connection.cursor() as cursor:
    cursor.executemany(sql, data)

# cx_Oracle does not autocommit by default; without an explicit commit the
# inserted rows are rolled back when the connection is closed.
connection.commit()

# Read the rows back to verify the insert.
with connection.cursor() as cursor:
    cursor.execute("select * from tbl1")
    r = cursor.fetchall()
    print(r)

或者您可以更改语句，不使用命名绑定变量（改用位置绑定 :1），然后按如下方式构造数据：

# One single-element tuple per row, matched to the bind by position.
data = [(payload,) for payload in (d1, d2)]

# Positional bind variable :1 instead of a named one.
sql = 'insert into tbl1 (req_data) values (:1)'

其他可以加快应用速度的事情：

避免多次连接。而是在 while 循环之外连接。
尝试 cx_Oracle 的新主要版本，它具有 'Thin' 模式。新版本已更名为 'python-oracledb'，请参阅发布公告（release announcement）。

cx_Oracle Python 中的 executemany() 将分页 API JSON 数据加载到 Oracle 数据库

cx_Oracle executemany() in Python to load paginated API JSON data to Oracle Database

python

oracle

json

cx-oracle

executemany