Can't open HDF5 file bigger than memory... ValueError
I have many .csv files of NYC taxi data from nyc.gov, one .csv = one year-month. I take circa 15 of those csvs and build HDF5 files from them:
import h5py
import pandas as pd
import os
import glob
import numpy as np
import vaex
from tqdm import tqdm_notebook as tqdm

#hdf = pd.HDFStore('c:/Projekty/H5Edu/NYCTaxi/NYCTaxi.hp')
#df1 = pd.read_csv('path to some csv')
#hdf.put('DF1', df1, format = 'table', data_columns = True)

csv_list = np.sort(np.array(glob.glob('G:\\NYCTaxi\\*.csv')))[::-1]
csv_list = csv_list[20:39]
output_dir = 'c:\\Datasety\\YelowTaxi\\DataH5\\'

for file in tqdm(csv_list, leave=False, desc='Converting to hdf5...'):
    # Setting up the files and directories
    #zip_file = ZipFile(file)
    output_file = file.split('\\')[-1][:-3] + 'hdf5'
    output = output_dir + output_file
    #output = output_file

    # Check if a converted file already exists: if it does, skip it,
    # otherwise read in the raw csv and convert it
    if os.path.exists(output) and os.path.isfile(output):
        pass
    else:
        # Importing the data into pandas
        #pandas_df = pd.read_csv(file, index_col=None, header=0)
        pandas_df = pd.read_csv(file, index_col=None, header=0, low_memory=False)
        # Rename some columns to match the more well known dataset from
        # http://stat-computing.org/dataexpo/2009/the-data.html
        # Importing the data from pandas into vaex
        vaex_df = vaex.from_pandas(pandas_df, copy_index=False)
        # Export the data with vaex to hdf5
        vaex_df.export_hdf5(path=output, progress=False)
Next I make one big HDF5 out of them:
import re
import glob
import vaex
import numpy as np

def tryint(s):
    try:
        return int(s)
    except ValueError:
        return s

def alphanum_key(s):
    """ Turn a string into a list of string and number chunks.
        "z23a" -> ["z", 23, "a"]
    """
    return [tryint(c) for c in re.split('([0-9]+)', s)]

hdf5_list = glob.glob('c:\\Datasety\\YelowTaxi\\DataH5\\*.hdf5')
hdf5_list.sort(key=alphanum_key)
hdf5_list = np.array(hdf5_list)

#assert len(hdf5_list) == 3, "Incorrect number of files"

# This is an important step
master_df = vaex.open_many(hdf5_list)

# exporting
#master_df.export_hdf5(path='c:\\Datasety\\YelowTaxi\\DataH5\\Spojene.hd5', progress=True)
master_df.export_hdf5(path='c:\\Datasety\\YelowTaxi\\DataH5\\Spojene.hdf5', progress=True)
Up to this point everything works and I can open the output file Spojene.hdf5.
Next, I append new .csv files to Spojene.hdf5:
for file in csv_list:
    #file = csv_list[0]
    df2 = pd.read_csv(file, index_col=None, header=0, low_memory=False)
    filename = 'c:\\Datasety\\YelowTaxi\\DataH5\\Spojene.hdf5'
    df2.to_hdf(filename, 'data', append=True)
But after I append the new .csv files to Spojene.hdf5, I can no longer open it:
df = vaex.open('c:\\Datasety\\YelowTaxi\\DataH5\\Spojene.hdf5')
ValueError: First columns has length 289184484, while column table has length 60107988
Please, what can I do?
I think this is related to the way pandas creates HDF5 files. According to the vaex documentation, an HDF5 file cannot be opened with vaex if it was created with the pandas to_hdf method. I believe the same applies when you append to an existing HDF5 file.
To avoid this error, you can reuse your logic of converting the pandas dataframe to a vaex dataframe, exporting it to HDF5, and then reading everything back with open_many. Something like this should work:
main_hdf5_file_path = "c:\\Datasety\\YelowTaxi\\DataH5\\Spojene.hdf5"
hdf5_files_created = []

for file in csv_list:
    hdf5_file = file.replace(".csv", ".hdf5")
    # from_csv can take additional parameters to forward to pd.read_csv
    # You can also use convert=True to convert it automatically to hdf5 without the export_hdf5
    # Refer to https://vaex.readthedocs.io/en/docs/api.html#vaex.from_csv
    df = vaex.from_csv(file)
    df.export_hdf5(hdf5_file)
    hdf5_files_created.append(hdf5_file)

hdf5_to_read = hdf5_files_created + [main_hdf5_file_path]

final_df = vaex.open_many(hdf5_to_read)
final_df.export_hdf5(main_hdf5_file_path)
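As a small illustration of the convert shortcut mentioned in the comments above: this is only a sketch based on the linked from_csv API, and the output paths here are examples, not your exact layout. Passing convert to vaex.from_csv lets vaex write the HDF5 file for you, so the per-csv export_hdf5 call is not needed, and exporting the combined result under a new name avoids overwriting a file that is still open.

import vaex

main_hdf5_file_path = "c:\\Datasety\\YelowTaxi\\DataH5\\Spojene.hdf5"
hdf5_files_created = []

for file in csv_list:
    hdf5_file = file.replace(".csv", ".hdf5")
    # convert accepts True or an explicit output path; an explicit path is
    # used here so the converted file gets a predictable .hdf5 name
    vaex.from_csv(file, convert=hdf5_file)
    hdf5_files_created.append(hdf5_file)

# Combine the freshly converted files with the existing big file
final_df = vaex.open_many(hdf5_files_created + [main_hdf5_file_path])

# Export under a new (example) name instead of overwriting the
# memory-mapped Spojene.hdf5 that is currently open
final_df.export_hdf5("c:\\Datasety\\YelowTaxi\\DataH5\\Spojene_v2.hdf5")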