如何使用 Keras 实现 CNN-LSTM
How to implement a CNN-LSTM using Keras
我正在尝试实现一个 CNN-LSTM,该 CNN-LSTM 对表示帕金森 Disease/Healthy 控制的人的语音的梅尔频谱图图像进行分类。我正在尝试使用 LSTM 模型实现预先存在的模型 (DenseNet-169),但是我 运行 出现以下错误:ValueError: Input 0 of layer zero_padding2d is incompatible with the layer: expected ndim=4, found ndim=3. Full shape received: [None, 216, 1].
谁能告诉我哪里出错了?
import librosa
import os
import glob
import IPython.display as ipd
from pathlib import Path
import timeit
import time, sys
%matplotlib inline
import matplotlib.pyplot as plt
import librosa.display
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import numpy as np
import cv2
import seaborn as sns
%tensorflow_version 1.x #version 1 works without problems
import tensorflow
from tensorflow.keras import models
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import TimeDistributed
import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dropout, Dense, BatchNormalization, Activation, GaussianNoise, LSTM
from sklearn.metrics import accuracy_score
DATA_DIR = Path('/content/drive/MyDrive/PhD_Project_Experiments/Spontaneous_Dialogue_PD_Dataset')
diagnosis = [x.name for x in DATA_DIR.glob('*') if x.is_dir()]
diagnosis
def create_paths_ds(paths: Path, label: str) -> list:
EXTENSION_TYPE = '.wav'
return [(x, label) for x in paths.glob('*' + EXTENSION_TYPE)]
from collections import Counter
categories_to_use = [
'Parkinsons_Disease',
'Healthy_Control',
]
NUM_CLASSES = len(categories_to_use)
print(f'Number of classes: {NUM_CLASSES}')
paths_all_labels = []
for cat in categories_to_use:
paths_all_labels += create_paths_ds(DATA_DIR / cat, cat)
X_train, X_test = train_test_split(paths_all_labels,test_size=0.1, stratify = [paths_all_labels[y][1] for y in range(len(paths_all_labels))] ) #fix stratified sampling for test data
X_train, X_val = train_test_split(X_train, test_size=0.2, stratify = [X_train[y][1] for y in range(len(X_train))] )
for i in categories_to_use:
print('Number of train samples for '+i+': '+ str([X_train[y][1] for y in range(len(X_train))].count(i))) #checks whether train samples are equally divided
print('Number of test samples for '+i+': '+ str([X_test[y][1] for y in range(len(X_test))].count(i))) #checks whether test samples are equally divided
print('Number of validation samples for '+i+': '+ str([X_val[y][1] for y in range(len(X_val))].count(i))) #checks whether val samples are equally divided
print(f'Train length: {len(X_train)}')
print(f'Validation length: {len(X_val)}')
print(f'Test length: {len(X_test)}')
def load_and_preprocess_lstm(dataset, SAMPLE_SIZE = 30):
IMG_SIZE = (216,128)
progress=0
data = []
labels = []
for (path, label) in dataset:
audio, sr = librosa.load(path)
dur = librosa.get_duration(audio, sr = sr)
sampleNum = int(dur / SAMPLE_SIZE)
offset = (dur % SAMPLE_SIZE) / 2
for i in range(sampleNum):
audio, sr = librosa.load(path, offset= offset+i, duration=SAMPLE_SIZE)
sample = librosa.feature.melspectrogram(audio, sr=sr)
# print(sample.shape)
sample = cv2.resize(sample, dsize=IMG_SIZE)
sample = np.expand_dims(sample,-1)
print(sample.shape)
data += [(sample, label)]
labels += [label]
progress +=1
print('\r Progress: '+str(round(100*progress/len(dataset))) + '%', end='')
return data, labels
def retrieve_samples(sample_size, model_type):
if model_type == 'cnn':
print("\nLoading train samples")
X_train_samples, train_labels = load_and_preprocess_cnn(X_train,sample_size)
print("\nLoading test samples")
X_test_samples, test_labels = load_and_preprocess_cnn(X_test,sample_size)
print("\nLoading val samples")
X_val_samples, val_labels = load_and_preprocess_cnn(X_val,sample_size)
print('\n')
elif model_type == 'lstm':
print("\nLoading train samples")
X_train_samples, train_labels = load_and_preprocess_lstm(X_train,sample_size)
print("\nLoading test samples")
X_test_samples, test_labels = load_and_preprocess_lstm(X_test,sample_size)
print("\nLoading val samples")
X_val_samples, val_labels = load_and_preprocess_lstm(X_val,sample_size)
print('\n')
elif model_type == "cnnlstm":
print("\nLoading train samples")
X_train_samples, train_labels = load_and_preprocess_lstm(X_train,sample_size)
print("\nLoading test samples")
X_test_samples, test_labels = load_and_preprocess_lstm(X_test,sample_size)
print("\nLoading val samples")
X_val_samples, val_labels = load_and_preprocess_lstm(X_val,sample_size)
print('\n')
print("shape: " + str(X_train_samples[0][0].shape))
print("number of training samples: "+ str(len(X_train_samples)))
print("number of validation samples: "+ str(len(X_val_samples)))
print("number of test samples: "+ str(len(X_test_samples)))
return X_train_samples, X_test_samples, X_val_samples
def create_cnn_lstm_model(input_shape):
model = Sequential()
cnn = tensorflow.keras.applications.DenseNet169(include_top=True, weights=None, input_tensor=None, input_shape=input_shape, pooling=None, classes=2)
# define LSTM model
model.add(tensorflow.keras.layers.TimeDistributed(cnn, input_shape=input_shape))
model.add(LSTM(units = 512, dropout=0.5, recurrent_dropout=0.3, return_sequences = True, input_shape = input_shape))
model.add(LSTM(units = 512, dropout=0.5, recurrent_dropout=0.3, return_sequences = False))
model.add(Dense(units=NUM_CLASSES, activation='sigmoid'))#Compile
model.compile(loss=tensorflow.keras.losses.binary_crossentropy, optimizer='adam', metrics=['accuracy'])
print(model.summary())
return model
def create_model_data_and_labels(X_train_samples, X_val_samples, X_test_samples):
#Prepare samples to work for training the model
labelizer = LabelEncoder()
#prepare training data and labels
x_train = np.array([x[0] for x in X_train_samples])
y_train = np.array([x[1] for x in X_train_samples])
y_train = labelizer.fit_transform(y_train)
y_train = to_categorical(y_train)
#prepare validation data and labels
x_val = np.array([x[0] for x in X_val_samples])
y_val = np.array([x[1] for x in X_val_samples])
y_val = labelizer.transform(y_val)
y_val = to_categorical(y_val)
#prepare test data and labels
x_test = np.array([x[0] for x in X_test_samples])
y_test = np.array([x[1] for x in X_test_samples])
y_test = labelizer.transform(y_test)
y_test = to_categorical(y_test)
return x_train, y_train, x_val, y_val, x_test, y_test, labelizer
#Main loop for testing multiple sample sizes
#choose model type: 'cnn' or 'lstm'
model_type = 'cnnlstm'
n_epochs = 20
patience= 20
es = EarlyStopping(patience=20)
fragment_sizes = [5,10]
start = timeit.default_timer()
ModelData = pd.DataFrame(columns = ['Model Type','Fragment size (s)', 'Time to Compute (s)', 'Early Stopping epoch', 'Training accuracy', 'Validation accuracy', 'Test Accuracy']) #create a DataFrame for storing the results
conf_matrix_data = []
for i in fragment_sizes:
start_per_size = timeit.default_timer()
print(f'\n---------- Model trained on fragments of size: {i} seconds ----------------')
X_train_samples, X_test_samples, X_val_samples = retrieve_samples(i,model_type)
x_train, y_train, x_val, y_val, x_test, y_test, labelizer = create_model_data_and_labels(X_train_samples, X_val_samples, X_test_samples)
if model_type == 'cnn':
model = create_cnn_model(X_train_samples[0][0].shape)
elif model_type == 'lstm':
model = create_lstm_model(X_train_samples[0][0].shape)
elif model_type == 'cnnlstm':
model = create_cnn_lstm_model(X_train_samples[0][0].shape)
history = model.fit(x_train, y_train,
batch_size = 8,
epochs=n_epochs,
verbose=1,
callbacks=[es],
validation_data=(x_val, y_val))
print('Finished training')
early_stopping_epoch = len(history.history['accuracy'])
training_accuracy = history.history['accuracy'][early_stopping_epoch-1-patience]
validation_accuracy = history.history['val_accuracy'][early_stopping_epoch-1-patience]
plot_data(history, i)
predictions = model.predict(x_test)
score = accuracy_score(labelizer.inverse_transform(y_test.argmax(axis=1)), labelizer.inverse_transform(predictions.argmax(axis=1)))
print('Fragment size = ' + str(i) + ' seconds')
print('Accuracy on test samples: ' + str(score))
conf_matrix_data += [(predictions, y_test, i)]
stop_per_size = timeit.default_timer()
time_to_compute = round(stop_per_size - start_per_size)
print ('Time to compute: '+str(time_to_compute))
ModelData.loc[len(ModelData)] = [model_type, i, time_to_compute, early_stopping_epoch, training_accuracy, validation_accuracy, score] #store particular settings configuration, early stoppping epoch and accuracies in dataframe
stop = timeit.default_timer()
print ('\ntime to compute: '+str(stop-start))
I believe the input_shape is (128, 216, 1)
这里的问题是你没有时间轴来分配你的 CNN (DenseNet169) 层。
在这一步-
tensorflow.keras.layers.TimeDistributed(cnn, input_shape=(128,216,1)))
您正在将 128 维轴作为时间轴传递。这意味着每个 CNN (DenseNet169) 都留下 (216,1)
的输入形状,这不是图像,因此会抛出错误,因为它需要 3D 张量(图像)而不是 2D 张量。
您的输入形状需要是一个 4D 张量,例如 - (10, 128, 216, 1)
,以便 10
成为时间轴(用于时间分布),(128, 216, 1)
成为图像CNN (DenseNet169) 的输入。
具有参差不齐的张量和时间分布层的解决方案
IIUC,您的数据包含 n 个音频文件,每个文件包含可变数量的梅尔频谱图图像。
- 您需要使用
tf.raggedtensors
才能使用可变张量形状作为模型的输入
- 这需要明确定义您设置的输入层
ragged=True
- 这允许您将每个音频文件作为单个样本传递,并带有可变图像,每个图像都将按时间分布。
- 定义模型时必须使用
None
作为时间分布轴形状
1。创建虚拟数据集
让我们从示例数据集开始 -
import tensorflow as tf
from tensorflow.keras import layers, Model, utils, applications
#Assuming there are 5 audio files
num_audio = 5
data = []
#Create a random number of mel-spectrograms for each audio file
for i in range(num_audio):
n_images = np.random.randint(4,10)
data.append(np.random.random((n_images,128,216,1)))
print([i.shape for i in data])
[(5, 128, 216, 1),
(5, 128, 216, 1),
(9, 128, 216, 1),
(6, 128, 216, 1),
(4, 128, 216, 1)]
所以,您的数据应该看起来像这样。在这里,我有一个包含 5 个音频文件的虚拟数据集,第一个有 5 个形状 (128,216,1)
的图像,而最后一个有 4 个相同形状的图像。
2。将它们转换为参差不齐的张量
接下来,让我们转换并存储这些参差不齐的张量。参差不齐的张量允许存储可变长度的对象,在本例中为可变数量的图像。了解更多关于它们的信息 here。
#Convert each set of images (for each audio) to tensors and then a ragged tensor
tensors = [tensorflow.convert_to_tensor(i) for i in data]
X_train = tensorflow.ragged.stack(tensors).to_tensor()
#Creating dummy y_train, one for each audio files
y_train = tensorflow.convert_to_tensor(np.random.randint(0,2,(5,2)))
3。创建模型
我正在使用 functional API
,因为我发现它更具可读性并且在显式输入层上工作得更好,但您也可以在 Sequential API
中使用输入层。随意将其转换为您的喜好。
请注意,我使用 (None,128,216,1)
作为输入形状。这将创建 5 个通道(第一个用于批次的隐式通道)作为 - (Batch, audio_files, h, w, channels)
我有一个虚拟的 LSTM 层来展示架构的工作原理,可以随意堆叠更多层。另请注意,您的 DenseNet169
仅返回 2 个功能。因此,您的 TimeDistributed
层返回 (None, None, 2)
形状的张量,其中第一个 None
是音频文件的数量,第二个 None
是图像的数量(时间轴) .因此,请相应地选择下一层,因为 512 个 LSTM 单元可能太多了:)
#Create model
inp = layers.Input((None,128,216,1), ragged=True)
cnn = tensorflow.keras.applications.DenseNet169(include_top=True,
weights=None,
input_tensor=None,
input_shape=(128,216,1), #<----- input shape for cnn is just the image
pooling=None, classes=2)
#Feel free to modify these layers!
x = layers.TimeDistributed(cnn)(inp)
x = layers.LSTM(8)(x)
out = layers.Dense(2)(x)
model = Model(inp, out)
model.compile(loss='binary_crossentropy',
optimizer='adam',
metrics='accuracy')
utils.plot_model(model, show_shapes=True, show_layer_names=False)
4。火车!
下一步就是训练。随意添加您自己的参数。
model.fit(X_train, y_train, epochs=2)
Epoch 1/2
WARNING:tensorflow:5 out of the last 5 calls to <function Model.make_train_function.<locals>.train_function at 0x7f8e55b4fe50> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings could be due to (1) creating @tf.function repeatedly in a loop, (2) passing tensors with different shapes, (3) passing Python objects instead of tensors. For (1), please define your @tf.function outside of the loop. For (2), @tf.function has experimental_relax_shapes=True option that relaxes argument shapes that can avoid unnecessary retracing. For (3), please refer to https://www.tensorflow.org/guide/function#controlling_retracing and https://www.tensorflow.org/api_docs/python/tf/function for more details.
1/1 [==============================] - 37s 37s/step - loss: 3.4057 - accuracy: 0.4000
Epoch 2/2
1/1 [==============================] - 16s 16s/step - loss: 3.3544 - accuracy: 0.4000
希望对您有所帮助。
我正在尝试实现一个 CNN-LSTM,该 CNN-LSTM 对表示帕金森 Disease/Healthy 控制的人的语音的梅尔频谱图图像进行分类。我正在尝试使用 LSTM 模型实现预先存在的模型 (DenseNet-169),但是我 运行 出现以下错误:ValueError: Input 0 of layer zero_padding2d is incompatible with the layer: expected ndim=4, found ndim=3. Full shape received: [None, 216, 1].
谁能告诉我哪里出错了?
import librosa
import os
import glob
import IPython.display as ipd
from pathlib import Path
import timeit
import time, sys
%matplotlib inline
import matplotlib.pyplot as plt
import librosa.display
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import numpy as np
import cv2
import seaborn as sns
%tensorflow_version 1.x #version 1 works without problems
import tensorflow
from tensorflow.keras import models
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import TimeDistributed
import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dropout, Dense, BatchNormalization, Activation, GaussianNoise, LSTM
from sklearn.metrics import accuracy_score
DATA_DIR = Path('/content/drive/MyDrive/PhD_Project_Experiments/Spontaneous_Dialogue_PD_Dataset')
diagnosis = [x.name for x in DATA_DIR.glob('*') if x.is_dir()]
diagnosis
def create_paths_ds(paths: Path, label: str) -> list:
EXTENSION_TYPE = '.wav'
return [(x, label) for x in paths.glob('*' + EXTENSION_TYPE)]
from collections import Counter
categories_to_use = [
'Parkinsons_Disease',
'Healthy_Control',
]
NUM_CLASSES = len(categories_to_use)
print(f'Number of classes: {NUM_CLASSES}')
paths_all_labels = []
for cat in categories_to_use:
paths_all_labels += create_paths_ds(DATA_DIR / cat, cat)
X_train, X_test = train_test_split(paths_all_labels,test_size=0.1, stratify = [paths_all_labels[y][1] for y in range(len(paths_all_labels))] ) #fix stratified sampling for test data
X_train, X_val = train_test_split(X_train, test_size=0.2, stratify = [X_train[y][1] for y in range(len(X_train))] )
for i in categories_to_use:
print('Number of train samples for '+i+': '+ str([X_train[y][1] for y in range(len(X_train))].count(i))) #checks whether train samples are equally divided
print('Number of test samples for '+i+': '+ str([X_test[y][1] for y in range(len(X_test))].count(i))) #checks whether test samples are equally divided
print('Number of validation samples for '+i+': '+ str([X_val[y][1] for y in range(len(X_val))].count(i))) #checks whether val samples are equally divided
print(f'Train length: {len(X_train)}')
print(f'Validation length: {len(X_val)}')
print(f'Test length: {len(X_test)}')
def load_and_preprocess_lstm(dataset, SAMPLE_SIZE = 30):
IMG_SIZE = (216,128)
progress=0
data = []
labels = []
for (path, label) in dataset:
audio, sr = librosa.load(path)
dur = librosa.get_duration(audio, sr = sr)
sampleNum = int(dur / SAMPLE_SIZE)
offset = (dur % SAMPLE_SIZE) / 2
for i in range(sampleNum):
audio, sr = librosa.load(path, offset= offset+i, duration=SAMPLE_SIZE)
sample = librosa.feature.melspectrogram(audio, sr=sr)
# print(sample.shape)
sample = cv2.resize(sample, dsize=IMG_SIZE)
sample = np.expand_dims(sample,-1)
print(sample.shape)
data += [(sample, label)]
labels += [label]
progress +=1
print('\r Progress: '+str(round(100*progress/len(dataset))) + '%', end='')
return data, labels
def retrieve_samples(sample_size, model_type):
if model_type == 'cnn':
print("\nLoading train samples")
X_train_samples, train_labels = load_and_preprocess_cnn(X_train,sample_size)
print("\nLoading test samples")
X_test_samples, test_labels = load_and_preprocess_cnn(X_test,sample_size)
print("\nLoading val samples")
X_val_samples, val_labels = load_and_preprocess_cnn(X_val,sample_size)
print('\n')
elif model_type == 'lstm':
print("\nLoading train samples")
X_train_samples, train_labels = load_and_preprocess_lstm(X_train,sample_size)
print("\nLoading test samples")
X_test_samples, test_labels = load_and_preprocess_lstm(X_test,sample_size)
print("\nLoading val samples")
X_val_samples, val_labels = load_and_preprocess_lstm(X_val,sample_size)
print('\n')
elif model_type == "cnnlstm":
print("\nLoading train samples")
X_train_samples, train_labels = load_and_preprocess_lstm(X_train,sample_size)
print("\nLoading test samples")
X_test_samples, test_labels = load_and_preprocess_lstm(X_test,sample_size)
print("\nLoading val samples")
X_val_samples, val_labels = load_and_preprocess_lstm(X_val,sample_size)
print('\n')
print("shape: " + str(X_train_samples[0][0].shape))
print("number of training samples: "+ str(len(X_train_samples)))
print("number of validation samples: "+ str(len(X_val_samples)))
print("number of test samples: "+ str(len(X_test_samples)))
return X_train_samples, X_test_samples, X_val_samples
def create_cnn_lstm_model(input_shape):
model = Sequential()
cnn = tensorflow.keras.applications.DenseNet169(include_top=True, weights=None, input_tensor=None, input_shape=input_shape, pooling=None, classes=2)
# define LSTM model
model.add(tensorflow.keras.layers.TimeDistributed(cnn, input_shape=input_shape))
model.add(LSTM(units = 512, dropout=0.5, recurrent_dropout=0.3, return_sequences = True, input_shape = input_shape))
model.add(LSTM(units = 512, dropout=0.5, recurrent_dropout=0.3, return_sequences = False))
model.add(Dense(units=NUM_CLASSES, activation='sigmoid'))#Compile
model.compile(loss=tensorflow.keras.losses.binary_crossentropy, optimizer='adam', metrics=['accuracy'])
print(model.summary())
return model
def create_model_data_and_labels(X_train_samples, X_val_samples, X_test_samples):
#Prepare samples to work for training the model
labelizer = LabelEncoder()
#prepare training data and labels
x_train = np.array([x[0] for x in X_train_samples])
y_train = np.array([x[1] for x in X_train_samples])
y_train = labelizer.fit_transform(y_train)
y_train = to_categorical(y_train)
#prepare validation data and labels
x_val = np.array([x[0] for x in X_val_samples])
y_val = np.array([x[1] for x in X_val_samples])
y_val = labelizer.transform(y_val)
y_val = to_categorical(y_val)
#prepare test data and labels
x_test = np.array([x[0] for x in X_test_samples])
y_test = np.array([x[1] for x in X_test_samples])
y_test = labelizer.transform(y_test)
y_test = to_categorical(y_test)
return x_train, y_train, x_val, y_val, x_test, y_test, labelizer
#Main loop for testing multiple sample sizes
#choose model type: 'cnn' or 'lstm'
model_type = 'cnnlstm'
n_epochs = 20
patience= 20
es = EarlyStopping(patience=20)
fragment_sizes = [5,10]
start = timeit.default_timer()
ModelData = pd.DataFrame(columns = ['Model Type','Fragment size (s)', 'Time to Compute (s)', 'Early Stopping epoch', 'Training accuracy', 'Validation accuracy', 'Test Accuracy']) #create a DataFrame for storing the results
conf_matrix_data = []
for i in fragment_sizes:
start_per_size = timeit.default_timer()
print(f'\n---------- Model trained on fragments of size: {i} seconds ----------------')
X_train_samples, X_test_samples, X_val_samples = retrieve_samples(i,model_type)
x_train, y_train, x_val, y_val, x_test, y_test, labelizer = create_model_data_and_labels(X_train_samples, X_val_samples, X_test_samples)
if model_type == 'cnn':
model = create_cnn_model(X_train_samples[0][0].shape)
elif model_type == 'lstm':
model = create_lstm_model(X_train_samples[0][0].shape)
elif model_type == 'cnnlstm':
model = create_cnn_lstm_model(X_train_samples[0][0].shape)
history = model.fit(x_train, y_train,
batch_size = 8,
epochs=n_epochs,
verbose=1,
callbacks=[es],
validation_data=(x_val, y_val))
print('Finished training')
early_stopping_epoch = len(history.history['accuracy'])
training_accuracy = history.history['accuracy'][early_stopping_epoch-1-patience]
validation_accuracy = history.history['val_accuracy'][early_stopping_epoch-1-patience]
plot_data(history, i)
predictions = model.predict(x_test)
score = accuracy_score(labelizer.inverse_transform(y_test.argmax(axis=1)), labelizer.inverse_transform(predictions.argmax(axis=1)))
print('Fragment size = ' + str(i) + ' seconds')
print('Accuracy on test samples: ' + str(score))
conf_matrix_data += [(predictions, y_test, i)]
stop_per_size = timeit.default_timer()
time_to_compute = round(stop_per_size - start_per_size)
print ('Time to compute: '+str(time_to_compute))
ModelData.loc[len(ModelData)] = [model_type, i, time_to_compute, early_stopping_epoch, training_accuracy, validation_accuracy, score] #store particular settings configuration, early stoppping epoch and accuracies in dataframe
stop = timeit.default_timer()
print ('\ntime to compute: '+str(stop-start))
I believe the input_shape is (128, 216, 1)
这里的问题是你没有时间轴来分配你的 CNN (DenseNet169) 层。
在这一步-
tensorflow.keras.layers.TimeDistributed(cnn, input_shape=(128,216,1)))
您正在将 128 维轴作为时间轴传递。这意味着每个 CNN (DenseNet169) 都留下 (216,1)
的输入形状,这不是图像,因此会抛出错误,因为它需要 3D 张量(图像)而不是 2D 张量。
您的输入形状需要是一个 4D 张量,例如 - (10, 128, 216, 1)
,以便 10
成为时间轴(用于时间分布),(128, 216, 1)
成为图像CNN (DenseNet169) 的输入。
具有参差不齐的张量和时间分布层的解决方案
IIUC,您的数据包含 n 个音频文件,每个文件包含可变数量的梅尔频谱图图像。
- 您需要使用
tf.raggedtensors
才能使用可变张量形状作为模型的输入 - 这需要明确定义您设置的输入层
ragged=True
- 这允许您将每个音频文件作为单个样本传递,并带有可变图像,每个图像都将按时间分布。
- 定义模型时必须使用
None
作为时间分布轴形状
1。创建虚拟数据集
让我们从示例数据集开始 -
import tensorflow as tf
from tensorflow.keras import layers, Model, utils, applications
#Assuming there are 5 audio files
num_audio = 5
data = []
#Create a random number of mel-spectrograms for each audio file
for i in range(num_audio):
n_images = np.random.randint(4,10)
data.append(np.random.random((n_images,128,216,1)))
print([i.shape for i in data])
[(5, 128, 216, 1),
(5, 128, 216, 1),
(9, 128, 216, 1),
(6, 128, 216, 1),
(4, 128, 216, 1)]
所以,您的数据应该看起来像这样。在这里,我有一个包含 5 个音频文件的虚拟数据集,第一个有 5 个形状 (128,216,1)
的图像,而最后一个有 4 个相同形状的图像。
2。将它们转换为参差不齐的张量
接下来,让我们转换并存储这些参差不齐的张量。参差不齐的张量允许存储可变长度的对象,在本例中为可变数量的图像。了解更多关于它们的信息 here。
#Convert each set of images (for each audio) to tensors and then a ragged tensor
tensors = [tensorflow.convert_to_tensor(i) for i in data]
X_train = tensorflow.ragged.stack(tensors).to_tensor()
#Creating dummy y_train, one for each audio files
y_train = tensorflow.convert_to_tensor(np.random.randint(0,2,(5,2)))
3。创建模型
我正在使用 functional API
,因为我发现它更具可读性并且在显式输入层上工作得更好,但您也可以在 Sequential API
中使用输入层。随意将其转换为您的喜好。
请注意,我使用 (None,128,216,1)
作为输入形状。这将创建 5 个通道(第一个用于批次的隐式通道)作为 - (Batch, audio_files, h, w, channels)
我有一个虚拟的 LSTM 层来展示架构的工作原理,可以随意堆叠更多层。另请注意,您的 DenseNet169
仅返回 2 个功能。因此,您的 TimeDistributed
层返回 (None, None, 2)
形状的张量,其中第一个 None
是音频文件的数量,第二个 None
是图像的数量(时间轴) .因此,请相应地选择下一层,因为 512 个 LSTM 单元可能太多了:)
#Create model
inp = layers.Input((None,128,216,1), ragged=True)
cnn = tensorflow.keras.applications.DenseNet169(include_top=True,
weights=None,
input_tensor=None,
input_shape=(128,216,1), #<----- input shape for cnn is just the image
pooling=None, classes=2)
#Feel free to modify these layers!
x = layers.TimeDistributed(cnn)(inp)
x = layers.LSTM(8)(x)
out = layers.Dense(2)(x)
model = Model(inp, out)
model.compile(loss='binary_crossentropy',
optimizer='adam',
metrics='accuracy')
utils.plot_model(model, show_shapes=True, show_layer_names=False)
4。火车!
下一步就是训练。随意添加您自己的参数。
model.fit(X_train, y_train, epochs=2)
Epoch 1/2
WARNING:tensorflow:5 out of the last 5 calls to <function Model.make_train_function.<locals>.train_function at 0x7f8e55b4fe50> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings could be due to (1) creating @tf.function repeatedly in a loop, (2) passing tensors with different shapes, (3) passing Python objects instead of tensors. For (1), please define your @tf.function outside of the loop. For (2), @tf.function has experimental_relax_shapes=True option that relaxes argument shapes that can avoid unnecessary retracing. For (3), please refer to https://www.tensorflow.org/guide/function#controlling_retracing and https://www.tensorflow.org/api_docs/python/tf/function for more details.
1/1 [==============================] - 37s 37s/step - loss: 3.4057 - accuracy: 0.4000
Epoch 2/2
1/1 [==============================] - 16s 16s/step - loss: 3.3544 - accuracy: 0.4000
希望对您有所帮助。