如何正确记录HDF5文件下的数据
How to record data under HDF5 file correctly
我收到 `TypeError: Object dtype dtype('O') has no native HDF5 equivalent`。
这是我的 Python 代码;`mel_train`、`mfcc_train` 和 `y_train` 的 dtype 都是 float32。
数组形状为:`mfcc_train`: (6398,);`mel_train`: (6398,);`y_train`: (6398, 16)。
# Write the three training arrays into one HDF5 file. This is the statement
# that raises the reported TypeError when an array has dtype=object.
with h5py.File(train_file, 'w') as f:
    f['mfcc_train'] = mfcc_train
    f['mel_train'] = mel_train
    f['y_train'] = y_train
import os
import pickle

import h5py
import numpy as np
import pandas as pd
from scipy.io import wavfile  # presumably the source of wavfile.read -- confirm

from utils import change_label, make_dir

# NOTE(review): pad_input, normalise_feature, extract_mel and extract_mfcc are
# called below but are neither defined nor imported in this excerpt; they
# presumably live elsewhere in the asker's project -- confirm.

# open training, validation, and testing sets from csv files
train_csv = pd.read_csv("training.csv", index_col=0)
valid_csv = pd.read_csv("validation.csv", index_col=0)
test_csv = pd.read_csv("testing.csv", index_col=0)

feature_dir = 'features'  # directory to store the extracted features
make_dir(feature_dir)     # create directory if it does not exist

mel_train = []
mel_valid = []
mel_test = []
mfcc_train = []
mfcc_valid = []
mfcc_test = []

# to encode string label (class name) into binary matrix or vice versa
change = change_label()

# Extract one mel and one MFCC feature array per audio file listed in the
# first column of the training CSV.
for wav_path in train_csv.iloc[:, 0]:
    _sr, audio = wavfile.read(wav_path)
    audio = pad_input(audio)
    mel = normalise_feature(extract_mel(audio))
    mfcc = normalise_feature(extract_mfcc(audio))
    mel_train.append(mel.T)
    mfcc_train.append(mfcc.T)

# If the per-file arrays do not all share one shape, np.asarray cannot stack
# them: the result is a 1-D array of dtype=object with shape (n_files,)
# instead of (n_files, m, n).
mel_train = np.asarray(mel_train)
print(mel_train.shape)
mfcc_train = np.asarray(mfcc_train)
print(mfcc_train.shape)

y = train_csv.iloc[:, 1].to_list()
y_train = change.str2bin(y)
print(y_train.shape)

train_file = os.path.join(feature_dir, 'mel_mfcc_train.h5')
print("Storing extracted features and associated label "
      "from training set into a file: " + train_file)

# h5py raises "Object dtype dtype('O') has no native HDF5 equivalent" here
# when mel_train / mfcc_train ended up as object arrays (see note above).
with h5py.File(train_file, 'w') as f:
    f['mel_train'] = mel_train
    f['mfcc_train'] = mfcc_train
    f['y_train'] = y_train
好的,我认为我知道发生了什么(有根据的猜测)。您将音频数据提取到数组 `mel` 和 `mfcc`,然后添加到列表 `mel_train` 和 `mfcc_train`(循环 6398 个音频文件)。退出循环后,将列表转换为数组。如果每个 `mel` 和 `mfcc` 数组具有相同的形状(例如 `(m,n)`),则新数组的形状将为 `(6398,m,n)`,其中 6398 为 `len(mel_train)`。但是,我怀疑各个 `mel` 和 `mfcc` 数组的形状并不相同。因此,当您将不同形状的数组列表转换为单个数组时,您将得到形状为 `(6398,)`、`dtype=object` 的数组(其中每个对象都是一个 float32 数组)。
为了证明差异,我创建了 2 个几乎相同的示例:
- 创建 5 个形状**相同**的二维数组 `(10,2)`,添加到列表中,然后将列表转换为数组。请注意最终数组的形状是 `(5,10,2)`,dtype 是 `float64`。您可以直接从此数组创建 HDF5 数据集。
- 创建 5 个形状**各不相同**的二维数组,添加到列表中,然后将列表转换为数组。请注意最终数组的形状是 `(5,)`,dtype 是 `object`。您**不能**直接从该数组创建 HDF5 数据集。这就是您得到 `TypeError: Object dtype dtype('O') has no native HDF5 equivalent` 的原因。
注意:我在第二个示例的 `np.asarray()` 调用中添加了 `dtype=object`,以避免 `VisibleDeprecationWarning`。
示例 2 展示了 2 种写入数据的方法。它接着示例 1 继续,并将数据写入同一个 HDF5 文件。运行之后,您可以比较数据集 `mel_train1`、组 `mel_train2` 和数据集 `mel_train3`。每个都有一个 “Note”(注释)属性来描述数据。
代码如下:
示例 1 - 常量形状数组:
import h5py
import numpy as np

train_file = 'mel_mfcc_train.h5'

## Example 1 - Create arrays of constant shape
# a0 x a1 is the shape of every array; n is the number of arrays.
a0, a1, n = 10, 2, 5
mel_train = [np.random.random(a0 * a1).reshape(a0, a1) for _ in range(n)]

print('\nFor mel_train arrays of constant size:')
print(f'Size of mel_train list: {len(mel_train)}')
# Equal shapes stack into one numeric array of shape (n, a0, a1).
mel_train = np.asarray(mel_train)
print(f'For mel_train array: Dtype: {mel_train.dtype}; Shape: {mel_train.shape}')

# Mode 'w' creates (or truncates) the file; later examples append to it.
with h5py.File(train_file, 'w') as f:
    f['mel_train1'] = mel_train
    f['mel_train1'].attrs['Note'] = f'{n} Constant shaped arrays: {a0} x {a1}'
示例 2 - 可变形状数组:
## Example 2 - Create arrays of random shape
# Continues from Example 1: reuses n and a1 defined there.
mel_train = []
for _ in range(n):
    # Only the first dimension is randomized (6 to 9 rows); a1 stays fixed.
    # It could be varied as well, e.g. a1 = np.random.randint(3, 6).
    a0 = np.random.randint(6, 10)
    mel_train.append(np.random.random(a0 * a1).reshape(a0, a1))

print('\nFor mel_train arrays of random size:')
print(f'Size of mel_train list: {len(mel_train)}')
# dtype=object is passed explicitly: a plain np.asarray(mel_train) on arrays
# of differing shape triggers VisibleDeprecationWarning.
mel_train = np.asarray(mel_train, dtype=object)
print(f'For mel_train array: Dtype: {mel_train.dtype}; Shape: {mel_train.shape}')
for i, arr in enumerate(mel_train):
    # The "a0=" label prints the index of the array within mel_train.
    print(f'\tFor a0= {i}; shape: {arr.shape}')
按原样写入示例 2 的数据会抛出异常:
# Creating a dataset directly from the object array (arrays of different
# sizes) fails; the exception is trapped and printed so the script continues.
try:
    with h5py.File(train_file, 'a') as f:
        f['mel_train2'] = mel_train
except TypeError as e:
    # h5py reports: Object dtype dtype('O') has no native HDF5 equivalent
    print(f'\nh5py Exception: {e}\n')
加载示例 2 数据的推荐方法
## Example 2A
# To avoid the exception, write each array to its own dataset inside one
# group; every dataset keeps its array's own shape.
with h5py.File(train_file, 'a') as f:
    grp = f.create_group('mel_train2')
    grp.attrs['Note'] = f'1 group and {n} datasets for variable shaped arrays'
    for i, arr in enumerate(mel_train):
        # Zero-padded names: mel_train2/dataset_0000, dataset_0001, ...
        grp[f'dataset_{i:04}'] = arr
加载示例 2 数据的替代方法(不推荐)
## Example 2B - for completeness; NOT recommended
# Alternately, size one dataset to hold the largest array.
# The dataset will have zeros where smaller arrays are loaded.
ds_dtype = mel_train[0].dtype
ds_a0 = mel_train.shape[0]
# Largest extent along each of the two array dimensions.
ds_a1 = max(arr.shape[0] for arr in mel_train)
ds_a2 = max(arr.shape[1] for arr in mel_train)

with h5py.File(train_file, 'a') as f:
    # Named 'mel_train3': 'mel_train2' is already taken by the group that
    # Example 2A creates in this same file.
    ds2 = f.create_dataset('mel_train3', dtype=ds_dtype,
                           shape=(ds_a0, ds_a1, ds_a2))
    ds2.attrs['Note'] = (f'1 dataset sized for the largest of {ds_a0} '
                         'variable shaped arrays (zero padded)')
    for i, arr in enumerate(mel_train):
        j, k = arr.shape
        # Fill only the top-left j x k corner of slot i.
        ds2[i, 0:j, 0:k] = arr
运行上述代码的典型输出:
For mel_train arrays of constant size:
Size of mel_train list: 5
For mel_train array: Dtype: float64; Shape: (5, 10, 2)
For mel_train arrays of random size:
Size of mel_train list: 5
For mel_train array: Dtype: object; Shape: (5,)
For a0= 0; shape: (6, 2)
For a0= 1; shape: (7, 2)
For a0= 2; shape: (8, 2)
For a0= 3; shape: (6, 2)
For a0= 4; shape: (9, 2)
h5py Exception: Object dtype dtype('O') has no native HDF5 equivalent
我收到 `TypeError: Object dtype dtype('O') has no native HDF5 equivalent`。
这是我的 Python 代码;`mel_train`、`mfcc_train` 和 `y_train` 的 dtype 都是 float32。
数组形状为:`mfcc_train`: (6398,);`mel_train`: (6398,);`y_train`: (6398, 16)。
# Write the three training arrays into one HDF5 file. This is the statement
# that raises the reported TypeError when an array has dtype=object.
with h5py.File(train_file, 'w') as f:
    f['mfcc_train'] = mfcc_train
    f['mel_train'] = mel_train
    f['y_train'] = y_train
import os
import pickle

import h5py
import numpy as np
import pandas as pd
from scipy.io import wavfile  # presumably the source of wavfile.read -- confirm

from utils import change_label, make_dir

# NOTE(review): pad_input, normalise_feature, extract_mel and extract_mfcc are
# called below but are neither defined nor imported in this excerpt; they
# presumably live elsewhere in the asker's project -- confirm.

# open training, validation, and testing sets from csv files
train_csv = pd.read_csv("training.csv", index_col=0)
valid_csv = pd.read_csv("validation.csv", index_col=0)
test_csv = pd.read_csv("testing.csv", index_col=0)

feature_dir = 'features'  # directory to store the extracted features
make_dir(feature_dir)     # create directory if it does not exist

mel_train = []
mel_valid = []
mel_test = []
mfcc_train = []
mfcc_valid = []
mfcc_test = []

# to encode string label (class name) into binary matrix or vice versa
change = change_label()

# Extract one mel and one MFCC feature array per audio file listed in the
# first column of the training CSV.
for wav_path in train_csv.iloc[:, 0]:
    _sr, audio = wavfile.read(wav_path)
    audio = pad_input(audio)
    mel = normalise_feature(extract_mel(audio))
    mfcc = normalise_feature(extract_mfcc(audio))
    mel_train.append(mel.T)
    mfcc_train.append(mfcc.T)

# If the per-file arrays do not all share one shape, np.asarray cannot stack
# them: the result is a 1-D array of dtype=object with shape (n_files,)
# instead of (n_files, m, n).
mel_train = np.asarray(mel_train)
print(mel_train.shape)
mfcc_train = np.asarray(mfcc_train)
print(mfcc_train.shape)

y = train_csv.iloc[:, 1].to_list()
y_train = change.str2bin(y)
print(y_train.shape)

train_file = os.path.join(feature_dir, 'mel_mfcc_train.h5')
print("Storing extracted features and associated label "
      "from training set into a file: " + train_file)

# h5py raises "Object dtype dtype('O') has no native HDF5 equivalent" here
# when mel_train / mfcc_train ended up as object arrays (see note above).
with h5py.File(train_file, 'w') as f:
    f['mel_train'] = mel_train
    f['mfcc_train'] = mfcc_train
    f['y_train'] = y_train
好的,我认为我知道发生了什么(有根据的猜测)。您将音频数据提取到数组 `mel` 和 `mfcc`,然后添加到列表 `mel_train` 和 `mfcc_train`(循环 6398 个音频文件)。退出循环后,将列表转换为数组。如果每个 `mel` 和 `mfcc` 数组具有相同的形状(例如 `(m,n)`),则新数组的形状将为 `(6398,m,n)`,其中 6398 为 `len(mel_train)`。但是,我怀疑各个 `mel` 和 `mfcc` 数组的形状并不相同。因此,当您将不同形状的数组列表转换为单个数组时,您将得到形状为 `(6398,)`、`dtype=object` 的数组(其中每个对象都是一个 float32 数组)。
为了证明差异,我创建了 2 个几乎相同的示例:
- 创建 5 个形状**相同**的二维数组 `(10,2)`,添加到列表中,然后将列表转换为数组。请注意最终数组的形状是 `(5,10,2)`,dtype 是 `float64`。您可以直接从此数组创建 HDF5 数据集。
- 创建 5 个形状**各不相同**的二维数组,添加到列表中,然后将列表转换为数组。请注意最终数组的形状是 `(5,)`,dtype 是 `object`。您**不能**直接从该数组创建 HDF5 数据集。这就是您得到 `TypeError: Object dtype dtype('O') has no native HDF5 equivalent` 的原因。
注意:我在第二个示例的 `np.asarray()` 调用中添加了 `dtype=object`,以避免 `VisibleDeprecationWarning`。
示例 2 展示了 2 种写入数据的方法。它接着示例 1 继续,并将数据写入同一个 HDF5 文件。运行之后,您可以比较数据集 `mel_train1`、组 `mel_train2` 和数据集 `mel_train3`。每个都有一个 “Note”(注释)属性来描述数据。
代码如下:
示例 1 - 常量形状数组:
import h5py
import numpy as np

train_file = 'mel_mfcc_train.h5'

## Example 1 - Create arrays of constant shape
# a0 x a1 is the shape of every array; n is the number of arrays.
a0, a1, n = 10, 2, 5
mel_train = [np.random.random(a0 * a1).reshape(a0, a1) for _ in range(n)]

print('\nFor mel_train arrays of constant size:')
print(f'Size of mel_train list: {len(mel_train)}')
# Equal shapes stack into one numeric array of shape (n, a0, a1).
mel_train = np.asarray(mel_train)
print(f'For mel_train array: Dtype: {mel_train.dtype}; Shape: {mel_train.shape}')

# Mode 'w' creates (or truncates) the file; later examples append to it.
with h5py.File(train_file, 'w') as f:
    f['mel_train1'] = mel_train
    f['mel_train1'].attrs['Note'] = f'{n} Constant shaped arrays: {a0} x {a1}'
示例 2 - 可变形状数组:
## Example 2 - Create arrays of random shape
# Continues from Example 1: reuses n and a1 defined there.
mel_train = []
for _ in range(n):
    # Only the first dimension is randomized (6 to 9 rows); a1 stays fixed.
    # It could be varied as well, e.g. a1 = np.random.randint(3, 6).
    a0 = np.random.randint(6, 10)
    mel_train.append(np.random.random(a0 * a1).reshape(a0, a1))

print('\nFor mel_train arrays of random size:')
print(f'Size of mel_train list: {len(mel_train)}')
# dtype=object is passed explicitly: a plain np.asarray(mel_train) on arrays
# of differing shape triggers VisibleDeprecationWarning.
mel_train = np.asarray(mel_train, dtype=object)
print(f'For mel_train array: Dtype: {mel_train.dtype}; Shape: {mel_train.shape}')
for i, arr in enumerate(mel_train):
    # The "a0=" label prints the index of the array within mel_train.
    print(f'\tFor a0= {i}; shape: {arr.shape}')
按原样写入示例 2 的数据会抛出异常:
# Creating a dataset directly from the object array (arrays of different
# sizes) fails; the exception is trapped and printed so the script continues.
try:
    with h5py.File(train_file, 'a') as f:
        f['mel_train2'] = mel_train
except TypeError as e:
    # h5py reports: Object dtype dtype('O') has no native HDF5 equivalent
    print(f'\nh5py Exception: {e}\n')
加载示例 2 数据的推荐方法
## Example 2A
# To avoid the exception, write each array to its own dataset inside one
# group; every dataset keeps its array's own shape.
with h5py.File(train_file, 'a') as f:
    grp = f.create_group('mel_train2')
    grp.attrs['Note'] = f'1 group and {n} datasets for variable shaped arrays'
    for i, arr in enumerate(mel_train):
        # Zero-padded names: mel_train2/dataset_0000, dataset_0001, ...
        grp[f'dataset_{i:04}'] = arr
加载示例 2 数据的替代方法(不推荐)
## Example 2B - for completeness; NOT recommended
# Alternately, size one dataset to hold the largest array.
# The dataset will have zeros where smaller arrays are loaded.
ds_dtype = mel_train[0].dtype
ds_a0 = mel_train.shape[0]
# Largest extent along each of the two array dimensions.
ds_a1 = max(arr.shape[0] for arr in mel_train)
ds_a2 = max(arr.shape[1] for arr in mel_train)

with h5py.File(train_file, 'a') as f:
    # Named 'mel_train3': 'mel_train2' is already taken by the group that
    # Example 2A creates in this same file.
    ds2 = f.create_dataset('mel_train3', dtype=ds_dtype,
                           shape=(ds_a0, ds_a1, ds_a2))
    ds2.attrs['Note'] = (f'1 dataset sized for the largest of {ds_a0} '
                         'variable shaped arrays (zero padded)')
    for i, arr in enumerate(mel_train):
        j, k = arr.shape
        # Fill only the top-left j x k corner of slot i.
        ds2[i, 0:j, 0:k] = arr
运行上述代码的典型输出:
For mel_train arrays of constant size:
Size of mel_train list: 5
For mel_train array: Dtype: float64; Shape: (5, 10, 2)
For mel_train arrays of random size:
Size of mel_train list: 5
For mel_train array: Dtype: object; Shape: (5,)
For a0= 0; shape: (6, 2)
For a0= 1; shape: (7, 2)
For a0= 2; shape: (8, 2)
For a0= 3; shape: (6, 2)
For a0= 4; shape: (9, 2)
h5py Exception: Object dtype dtype('O') has no native HDF5 equivalent