如何使用 Keras 实现 CNN-LSTM

Question

我正在尝试实现一个 CNN-LSTM，该 CNN-LSTM 对表示帕金森 Disease/Healthy 控制的人的语音的梅尔频谱图图像进行分类。我正在尝试使用 LSTM 模型实现预先存在的模型 (DenseNet-169)，但是我运行出现以下错误：ValueError: Input 0 of layer zero_padding2d is incompatible with the layer: expected ndim=4, found ndim=3. Full shape received: [None, 216, 1]. 谁能告诉我哪里出错了？

import librosa
import os
import glob
import IPython.display as ipd
from pathlib import Path
import timeit
import time, sys

%matplotlib inline
import matplotlib.pyplot as plt
import librosa.display

import pandas as pd
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import numpy as np
import cv2
import seaborn as sns

%tensorflow_version 1.x #version 1 works without problems
import tensorflow

from tensorflow.keras import models
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import TimeDistributed

import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dropout, Dense, BatchNormalization, Activation, GaussianNoise, LSTM
from sklearn.metrics import accuracy_score

DATA_DIR = Path('/content/drive/MyDrive/PhD_Project_Experiments/Spontaneous_Dialogue_PD_Dataset') 
diagnosis = [x.name for x in DATA_DIR.glob('*') if x.is_dir()]
diagnosis

def create_paths_ds(paths: Path, label: str) -> list:
    EXTENSION_TYPE = '.wav'
    return [(x, label) for x in paths.glob('*' + EXTENSION_TYPE)]

from collections import Counter

categories_to_use = [
    'Parkinsons_Disease',
    'Healthy_Control',
]

NUM_CLASSES = len(categories_to_use)

print(f'Number of classes: {NUM_CLASSES}')

paths_all_labels = []
for cat in categories_to_use:
    paths_all_labels += create_paths_ds(DATA_DIR / cat, cat)
 
X_train, X_test = train_test_split(paths_all_labels,test_size=0.1, stratify = [paths_all_labels[y][1] for y in range(len(paths_all_labels))] ) #fix stratified sampling for test data
X_train, X_val = train_test_split(X_train, test_size=0.2, stratify = [X_train[y][1] for y in range(len(X_train))] ) 

for i in categories_to_use:
  print('Number of train samples for '+i+': '+ str([X_train[y][1] for y in range(len(X_train))].count(i))) #checks whether train samples are equally divided
  print('Number of test samples for '+i+': '+ str([X_test[y][1] for y in range(len(X_test))].count(i))) #checks whether test samples are equally divided
  print('Number of validation samples for '+i+': '+ str([X_val[y][1] for y in range(len(X_val))].count(i))) #checks whether val samples are equally divided

print(f'Train length: {len(X_train)}')
print(f'Validation length: {len(X_val)}')
print(f'Test length: {len(X_test)}')

def load_and_preprocess_lstm(dataset, SAMPLE_SIZE = 30):
    IMG_SIZE = (216,128) 
    progress=0

    data = []
    labels = []
    for (path, label) in dataset:
        audio, sr = librosa.load(path)
        dur = librosa.get_duration(audio, sr = sr)
        sampleNum = int(dur / SAMPLE_SIZE)
        offset = (dur % SAMPLE_SIZE) / 2
        for i in range(sampleNum):
            audio, sr = librosa.load(path, offset= offset+i, duration=SAMPLE_SIZE)
            sample = librosa.feature.melspectrogram(audio, sr=sr)
            # print(sample.shape)
            sample = cv2.resize(sample, dsize=IMG_SIZE)
            sample = np.expand_dims(sample,-1)
            print(sample.shape)
            data += [(sample, label)]
            labels += [label]
        progress +=1
        print('\r Progress: '+str(round(100*progress/len(dataset))) + '%', end='')
    return data, labels

def retrieve_samples(sample_size, model_type):

    if model_type == 'cnn':
  
        print("\nLoading train samples")
        X_train_samples, train_labels = load_and_preprocess_cnn(X_train,sample_size)
        print("\nLoading test samples")
        X_test_samples, test_labels = load_and_preprocess_cnn(X_test,sample_size)
        print("\nLoading val samples")
        X_val_samples, val_labels = load_and_preprocess_cnn(X_val,sample_size)
        print('\n')

    elif model_type == 'lstm':

        print("\nLoading train samples")
        X_train_samples, train_labels = load_and_preprocess_lstm(X_train,sample_size)
        print("\nLoading test samples")
        X_test_samples, test_labels = load_and_preprocess_lstm(X_test,sample_size)
        print("\nLoading val samples")
        X_val_samples, val_labels = load_and_preprocess_lstm(X_val,sample_size)      
        print('\n')

    elif model_type == "cnnlstm":

        print("\nLoading train samples")
        X_train_samples, train_labels = load_and_preprocess_lstm(X_train,sample_size)
        print("\nLoading test samples")
        X_test_samples, test_labels = load_and_preprocess_lstm(X_test,sample_size)
        print("\nLoading val samples")
        X_val_samples, val_labels = load_and_preprocess_lstm(X_val,sample_size)      
        print('\n')

    print("shape: " + str(X_train_samples[0][0].shape))
    print("number of training samples: "+ str(len(X_train_samples)))
    print("number of validation samples: "+ str(len(X_val_samples)))
    print("number of test samples: "+ str(len(X_test_samples)))


    return X_train_samples, X_test_samples, X_val_samples

def create_cnn_lstm_model(input_shape):

    model = Sequential()
    cnn = tensorflow.keras.applications.DenseNet169(include_top=True, weights=None, input_tensor=None, input_shape=input_shape, pooling=None, classes=2)
    # define LSTM model
    model.add(tensorflow.keras.layers.TimeDistributed(cnn, input_shape=input_shape))
    model.add(LSTM(units = 512, dropout=0.5, recurrent_dropout=0.3, return_sequences = True, input_shape = input_shape))
    model.add(LSTM(units = 512, dropout=0.5, recurrent_dropout=0.3, return_sequences = False))
    model.add(Dense(units=NUM_CLASSES, activation='sigmoid'))#Compile

    model.compile(loss=tensorflow.keras.losses.binary_crossentropy, optimizer='adam', metrics=['accuracy'])
    print(model.summary())

    return model

def create_model_data_and_labels(X_train_samples, X_val_samples, X_test_samples):
    #Prepare samples to work for training the model
    labelizer = LabelEncoder()

    #prepare training data and labels
    x_train = np.array([x[0] for x in X_train_samples])
    y_train = np.array([x[1] for x in X_train_samples])
    y_train = labelizer.fit_transform(y_train) 
    y_train = to_categorical(y_train)

    #prepare validation data and labels
    x_val = np.array([x[0] for x in X_val_samples])
    y_val = np.array([x[1] for x in X_val_samples])
    y_val = labelizer.transform(y_val)
    y_val = to_categorical(y_val)

    #prepare test data and labels
    x_test = np.array([x[0] for x in X_test_samples])
    y_test = np.array([x[1] for x in X_test_samples])
    y_test = labelizer.transform(y_test)
    y_test = to_categorical(y_test)

    return x_train, y_train, x_val, y_val, x_test, y_test, labelizer


#Main loop for testing multiple sample sizes

#choose model type: 'cnn' or 'lstm'
model_type = 'cnnlstm'

n_epochs = 20
patience= 20
es = EarlyStopping(patience=20)
fragment_sizes = [5,10]
start = timeit.default_timer()

ModelData = pd.DataFrame(columns = ['Model Type','Fragment size (s)', 'Time to Compute (s)',  'Early Stopping epoch', 'Training accuracy', 'Validation accuracy', 'Test Accuracy']) #create a DataFrame for storing the results 

conf_matrix_data = []

for i in fragment_sizes:

    start_per_size = timeit.default_timer()

    print(f'\n---------- Model trained on fragments of size: {i} seconds ----------------')
    X_train_samples, X_test_samples, X_val_samples = retrieve_samples(i,model_type)
    x_train, y_train, x_val, y_val, x_test, y_test, labelizer = create_model_data_and_labels(X_train_samples, X_val_samples, X_test_samples)

    if model_type == 'cnn':
        model = create_cnn_model(X_train_samples[0][0].shape)
    elif model_type == 'lstm':
        model = create_lstm_model(X_train_samples[0][0].shape)
    elif model_type == 'cnnlstm':
        model = create_cnn_lstm_model(X_train_samples[0][0].shape)


    history = model.fit(x_train, y_train, 
              batch_size = 8, 
              epochs=n_epochs,
              verbose=1, 
              callbacks=[es],
              validation_data=(x_val, y_val))
    print('Finished training')


    early_stopping_epoch = len(history.history['accuracy'])
    training_accuracy = history.history['accuracy'][early_stopping_epoch-1-patience]
    validation_accuracy = history.history['val_accuracy'][early_stopping_epoch-1-patience]

    plot_data(history, i)

    predictions = model.predict(x_test)
    score = accuracy_score(labelizer.inverse_transform(y_test.argmax(axis=1)), labelizer.inverse_transform(predictions.argmax(axis=1)))

    print('Fragment size = ' + str(i) + ' seconds')
    print('Accuracy on test samples: ' + str(score))
    
    conf_matrix_data += [(predictions, y_test, i)]

    stop_per_size = timeit.default_timer()
    time_to_compute = round(stop_per_size - start_per_size)

    print ('Time to compute: '+str(time_to_compute))

    ModelData.loc[len(ModelData)] = [model_type, i, time_to_compute, early_stopping_epoch, training_accuracy, validation_accuracy, score] #store particular settings configuration, early stoppping epoch and accuracies in dataframe

stop = timeit.default_timer()
print ('\ntime to compute: '+str(stop-start))

Answer 1

I believe the input_shape is (128, 216, 1)

这里的问题是你没有时间轴来分配你的 CNN (DenseNet169) 层。

在这一步-

tensorflow.keras.layers.TimeDistributed(cnn, input_shape=(128,216,1)))

您正在将 128 维轴作为时间轴传递。这意味着每个 CNN (DenseNet169) 都留下 (216,1) 的输入形状，这不是图像，因此会抛出错误，因为它需要 3D 张量（图像）而不是 2D 张量。

您的输入形状需要是一个 4D 张量，例如 - (10, 128, 216, 1)，以便 10 成为时间轴（用于时间分布），(128, 216, 1) 成为图像CNN (DenseNet169) 的输入。

具有参差不齐的张量和时间分布层的解决方案

IIUC，您的数据包含 n 个音频文件，每个文件包含可变数量的梅尔频谱图图像。

您需要使用 tf.raggedtensors 才能使用可变张量形状作为模型的输入
这需要明确定义您设置的输入层 ragged=True
这允许您将每个音频文件作为单个样本传递，并带有可变图像，每个图像都将按时间分布。
定义模型时必须使用None作为时间分布轴形状

1。创建虚拟数据集

让我们从示例数据集开始 -

import tensorflow as tf
from tensorflow.keras import layers, Model, utils, applications


#Assuming there are 5 audio files
num_audio = 5

data = []

#Create a random number of mel-spectrograms for each audio file
for i in range(num_audio):
    n_images = np.random.randint(4,10)
    data.append(np.random.random((n_images,128,216,1)))
    
print([i.shape for i in data])

[(5, 128, 216, 1), 
 (5, 128, 216, 1), 
 (9, 128, 216, 1), 
 (6, 128, 216, 1), 
 (4, 128, 216, 1)]

所以，您的数据应该看起来像这样。在这里，我有一个包含 5 个音频文件的虚拟数据集，第一个有 5 个形状 (128,216,1) 的图像，而最后一个有 4 个相同形状的图像。

2。将它们转换为参差不齐的张量

接下来，让我们转换并存储这些参差不齐的张量。参差不齐的张量允许存储可变长度的对象，在本例中为可变数量的图像。了解更多关于它们的信息 here。

#Convert each set of images (for each audio) to tensors and then a ragged tensor
tensors = [tensorflow.convert_to_tensor(i) for i in data]
X_train = tensorflow.ragged.stack(tensors).to_tensor()

#Creating dummy y_train, one for each audio files
y_train = tensorflow.convert_to_tensor(np.random.randint(0,2,(5,2)))

3。创建模型

我正在使用 functional API，因为我发现它更具可读性并且在显式输入层上工作得更好，但您也可以在 Sequential API 中使用输入层。随意将其转换为您的喜好。

请注意，我使用 (None,128,216,1) 作为输入形状。这将创建 5 个通道（第一个用于批次的隐式通道）作为 - (Batch, audio_files, h, w, channels)

我有一个虚拟的 LSTM 层来展示架构的工作原理，可以随意堆叠更多层。另请注意，您的 DenseNet169 仅返回 2 个功能。因此，您的 TimeDistributed 层返回 (None, None, 2) 形状的张量，其中第一个 None 是音频文件的数量，第二个 None 是图像的数量（时间轴） .因此，请相应地选择下一层，因为 512 个 LSTM 单元可能太多了:)

#Create model
inp = layers.Input((None,128,216,1), ragged=True)

cnn = tensorflow.keras.applications.DenseNet169(include_top=True, 
                                                weights=None, 
                                                input_tensor=None, 
                                                input_shape=(128,216,1), #<----- input shape for cnn is just the image
                                                pooling=None, classes=2)


#Feel free to modify these layers!
x = layers.TimeDistributed(cnn)(inp)
x = layers.LSTM(8)(x)
out = layers.Dense(2)(x)

model = Model(inp, out)
model.compile(loss='binary_crossentropy', 
              optimizer='adam', 
              metrics='accuracy')

utils.plot_model(model, show_shapes=True, show_layer_names=False)

4。火车！

下一步就是训练。随意添加您自己的参数。

model.fit(X_train, y_train, epochs=2)

Epoch 1/2
WARNING:tensorflow:5 out of the last 5 calls to <function Model.make_train_function.<locals>.train_function at 0x7f8e55b4fe50> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings could be due to (1) creating @tf.function repeatedly in a loop, (2) passing tensors with different shapes, (3) passing Python objects instead of tensors. For (1), please define your @tf.function outside of the loop. For (2), @tf.function has experimental_relax_shapes=True option that relaxes argument shapes that can avoid unnecessary retracing. For (3), please refer to https://www.tensorflow.org/guide/function#controlling_retracing and https://www.tensorflow.org/api_docs/python/tf/function for  more details.
1/1 [==============================] - 37s 37s/step - loss: 3.4057 - accuracy: 0.4000
Epoch 2/2
1/1 [==============================] - 16s 16s/step - loss: 3.3544 - accuracy: 0.4000

希望对您有所帮助。

如何使用 Keras 实现 CNN-LSTM

How to implement a CNN-LSTM using Keras

python

conv-neural-network

lstm

keras

具有参差不齐的张量和时间分布层的解决方案

1。创建虚拟数据集

2。将它们转换为参差不齐的张量

3。创建模型

4。火车！