PyTables: Read From Large CSV in Chunks
I have the following code that reads from CSV files and writes to PyTables. However, pd.read_csv returns a DataFrame, which PyTables does not handle directly. How can I solve this? I could build NumPy arrays instead, but that seems like overkill and possibly time consuming. (Transaction_Record is a class I created with the correct data types; if I went through NumPy I would have to duplicate it.)
import glob
import os

import pandas as pd
import tables

def get_transaction_report_in_chunks(transaction_file):
    transaction_report_data = pd.read_csv(transaction_file, index_col=None, parse_dates=False, chunksize=500000)
    return transaction_report_data

def write_to_hdf_from_multiple_csv(transaction_file_path):
    hdf = tables.open_file(filename='MyDB.h5', mode='a')
    # Transaction_Record is my tables.IsDescription subclass with the correct dtypes
    transaction_report_table = hdf.create_table(hdf.root, 'Transaction_Report_Table_x', Transaction_Record, "Transaction Report Table")
    all_files = glob.glob(os.path.join(transaction_file_path, "*.csv"))
    for transaction_file in all_files:
        for transaction_chunk in get_transaction_report_in_chunks(transaction_file):
            # this is where it breaks: Table.append() does not accept a DataFrame
            transaction_report_table.append(transaction_chunk)
        transaction_report_table.flush()
    hdf.close()
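For reference, the NumPy route the question mentions would look roughly like this: each DataFrame chunk is converted to a structured record array that PyTables' Table.append() can consume. This is a minimal sketch under the assumption that the chunk's column dtypes already line up with the Transaction_Record description:

for transaction_chunk in get_transaction_report_in_chunks(transaction_file):
    # Assumption: the chunk's dtypes match the Transaction_Record description;
    # to_records(index=False) returns a numpy.recarray without the DataFrame index.
    records = transaction_chunk.to_records(index=False)
    transaction_report_table.append(records)
transaction_report_table.flush()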
I would use the Pandas HDFStore, which is a very convenient wrapper around the underlying PyTables API:
def write_to_hdf_from_multiple_csv(csv_file_path,
                                   hdf_fn='/default_path/to/MyDB.h5',
                                   hdf_key='Transaction_Report_Table_x',
                                   df_cols_to_index=True):
    # df_cols_to_index: you can specify here a list of columns that must be
    # indexed, i.e.: ['name', 'department']
    files = glob.glob(os.path.join(csv_file_path, '*.csv'))

    # create HDF file (AKA '.h5' or PyTables)
    store = pd.HDFStore(hdf_fn)

    for f in files:
        for chunk in pd.read_csv(f, chunksize=500000):
            # don't index data columns in each iteration - we'll do it later ...
            store.append(hdf_key, chunk, data_columns=df_cols_to_index, index=False)

    # index data columns in HDFStore
    store.create_table_index(hdf_key, columns=df_cols_to_index, optlevel=9, kind='full')
    store.close()
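Once everything is written, the indexed data columns can be queried straight from the store. A hypothetical usage sketch ('department' is a placeholder column name, not one taken from the original CSVs):

# Hypothetical usage: query the finished store on an indexed data column.
with pd.HDFStore('/default_path/to/MyDB.h5', mode='r') as store:
    subset = store.select('Transaction_Report_Table_x',
                          where="department == 'Sales'")
    print(subset.head())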