如何正确记录HDF5文件下的数据

Question

我收到 TypeError: Object dtype dtype('O') has no native HDF5 equivalent。下面是我的 Python 代码；mel_train、mfcc_train 和 y_train 的 dtype 都是 float32。数组形状为：mfcc_train: (6398,)；mel_train: (6398,)；y_train: (6398, 16)。

# Store each array as a dataset named after the variable that holds it.
with h5py.File(train_file, 'w') as h5f:
    for key, data in (
        ('mfcc_train', mfcc_train),
        ('mel_train', mel_train),
        ('y_train', y_train),
    ):
        h5f[key] = data

Answer 1

import h5py
import pickle
from utils import change_label, make_dir
# Load the training, validation, and testing splits from their CSV files.
train_csv = pd.read_csv("training.csv", index_col=0)
valid_csv = pd.read_csv("validation.csv", index_col=0)
test_csv = pd.read_csv("testing.csv", index_col=0)

feature_dir = 'features'  # directory to store the extracted features
make_dir(feature_dir)  # create directory if it does not exist

# Per-split feature accumulators; only the training lists are filled below.
mel_train = []
mel_valid = []
mel_test = []
mfcc_train = []
mfcc_valid = []
mfcc_test = []

# Encodes string labels (class names) into a binary matrix, or vice versa.
change = change_label()

# Column 0 of the CSV holds the audio file path; column 1 holds the label.
for i in range(train_csv.shape[0]):
    sr, audio = wavfile.read(train_csv.iloc[i, 0])
    audio = pad_input(audio)
    mel = normalise_feature(extract_mel(audio))
    mfcc = normalise_feature(extract_mfcc(audio))
    # Features are stored transposed.
    mel_train.append(mel.T)
    mfcc_train.append(mfcc.T)

# NOTE: np.asarray only produces a regular numeric array when every element
# of the list has the same shape. With differing shapes the result is a 1-D
# object-dtype array (or an error in newer NumPy releases), which h5py cannot
# store directly.
mel_train = np.asarray(mel_train)
print(mel_train.shape)
mfcc_train = np.asarray(mfcc_train)
print(mfcc_train.shape)

y = train_csv.iloc[:, 1].to_list()
y_train = change.str2bin(y)
print(y_train.shape)

train_file = os.path.join(feature_dir, 'mel_mfcc_train.h5')
print("Storing extracted features and associated label "
      "from training set into a file: " + train_file)
with h5py.File(train_file, 'w') as f:
    f['mel_train'] = mel_train
    f['mfcc_train'] = mfcc_train
    f['y_train'] = y_train

Answer 2

好的，我想我知道发生了什么（有根据的猜测）。您将音频数据提取到数组 mel 和 mfcc，然后添加到列表 mel_train 和 mfcc_train（循环 6398 个音频文件）。退出循环后，将列表转换为数组。如果每个 mel 和 mfcc 数组具有相同的形状（例如 (m,n)），则新数组的形状将为 (6398,m,n)，其中 6398 为 len(mel_train)。但是，我怀疑各个 mel 和 mfcc 数组的形状并不相同。因此，当您将不同形状的数组列表转换为单个数组时，得到的数组形状为 (6398,)，dtype 为 object（其中每个对象都是一个 float32 数组）。

为了证明差异，我创建了 2 个几乎相同的示例：

创建 5 个形状相同的二维数组 (10,2)，添加到列表中，然后将列表转换为数组。请注意最终数组的形状是 (5,10,2)，dtype 是 float64。您可以直接用此数组创建 HDF5 数据集。
创建 5 个形状可变的二维数组，添加到列表中，然后将列表转换为数组。请注意最终数组的形状是 (5,)，dtype 是 object。您不能直接用该数组创建 HDF5 数据集。这就是您得到 TypeError: Object dtype dtype('O') has no native HDF5 equivalent 的原因。

注意：我在第二种方法的np.asarray()函数中添加了dtype=object以避免VisibleDeprecationWarning。

示例 2 展示了 2 种加载数据的方法。它接续示例 1，并将数据加载到同一个 HDF5 文件中。全部运行之后，您可以比较数据集 mel_train1、组 mel_train2 和数据集 mel_train3。每个都带有一个 'Note' 属性来描述数据。

代码如下：

示例 1 - 常量形状数组：

train_file = 'mel_mfcc_train.h5'

## Example 1 - build a list of equally shaped 2-D arrays
a0, a1, n = 10, 2, 5
# Each entry is an (a0, a1) block of random samples.
mel_train = [np.random.random(a0*a1).reshape(a0, a1) for _ in range(n)]

print('\nFor mel_train arrays of constant size:')
print(f'Size of mel_train list: {len(mel_train)}')
# Equal shapes let NumPy stack the list into one (n, a0, a1) numeric array.
mel_train = np.asarray(mel_train)
print(f'For mel_train array: Dtype: {mel_train.dtype}; Shape: {mel_train.shape}')

# Mode 'w' starts a fresh file; the later examples append to it.
with h5py.File(train_file, 'w') as h5f:
    h5f['mel_train1'] = mel_train
    note = f'{n} Constant shaped arrays: {a0} x {a1}'
    h5f['mel_train1'].attrs['Note'] = note

示例 2 - 可变形状数组：

## Example 2 - build a list of 2-D arrays whose first dimension varies
mel_train = []

for _ in range(n):
    # Row count is drawn at random from 6..9; the column count a1 is unchanged.
    a0 = np.random.randint(6, 10)
    mel_train.append(np.random.random(a0*a1).reshape(a0, a1))

print('\nFor mel_train arrays of random size:')
print(f'Size of mel_train list: {len(mel_train)}')
# dtype=object is passed explicitly: a plain np.asarray(mel_train) on arrays
# of differing shapes triggers a VisibleDeprecationWarning.
mel_train = np.asarray(mel_train, dtype=object)
print(f'For mel_train array: Dtype: {mel_train.dtype}; Shape: {mel_train.shape}')
# Report the shape of every stored array alongside its position in the list.
for idx, item in enumerate(mel_train):
    print(f'\tFor a0= {idx}; shape: {item.shape}')

按原样加载示例 2 的数据会抛出异常

# Writing the object-dtype array as a single dataset is expected to fail.
# The exception is caught here and printed instead of being propagated.
try:
    with h5py.File(train_file, 'a') as h5f:
        h5f['mel_train2'] = mel_train
except Exception as err:
    print(f'\nh5py Exception: {err}\n')

加载示例 2 数据的推荐方法

## Example 2A
# To avoid the exception, store every array as its own dataset inside one group.
with h5py.File(train_file, 'a') as h5f:
    grp = h5f.create_group('mel_train2')
    # The note is attached to the group itself, not to the member datasets.
    grp.attrs['Note'] = f'1 group and {n} datasets for variable shaped arrays'
    for idx, item in enumerate(mel_train):
        # Zero-padded index keeps the dataset names in sorted order.
        h5f[f'mel_train2/dataset_{idx:04}'] = item

加载示例 2 数据的替代方法（不推荐）

## Example 2B - for completeness; NOT recommended
# Alternately, size one dataset to hold the largest array.
# The dataset will have zeros where smaller arrays are loaded.
# It is written as 'mel_train3' so that it does not collide with the
# 'mel_train2' group that Example 2A creates in the same file.

ds_dtype = mel_train[0].dtype
ds_a0 = mel_train.shape[0]
# Largest extent along each of the two array axes, over all arrays.
ds_a1 = max(arr.shape[0] for arr in mel_train)
ds_a2 = max(arr.shape[1] for arr in mel_train)

with h5py.File(train_file, 'a') as f:
    ds3 = f.create_dataset('mel_train3', dtype=ds_dtype, shape=(ds_a0, ds_a1, ds_a2))
    ds3.attrs['Note'] = f'1 dataset padded to {ds_a1} x {ds_a2} for {ds_a0} variable shaped arrays'
    for i, arr in enumerate(mel_train):
        # Copy each array into the top-left corner of its slot; the rest
        # of the slot is left unwritten.
        j, k = arr.shape
        ds3[i, 0:j, 0:k] = arr

运行上述代码的典型输出：

For mel_train arrays of constant size:
Size of mel_train list: 5
For mel_train array: Dtype: float64; Shape: (5, 10, 2)

For mel_train arrays of random size:
Size of mel_train list: 5
For mel_train array: Dtype: object; Shape: (5,)
    For a0= 0; shape: (6, 2)
    For a0= 1; shape: (7, 2)
    For a0= 2; shape: (8, 2)
    For a0= 3; shape: (6, 2)
    For a0= 4; shape: (9, 2)

h5py Exception: Object dtype dtype('O') has no native HDF5 equivalent

如何正确记录HDF5文件下的数据

How to record data under HDF5 file correctly

python

audio

hdf5

feature-extraction

h5py