在我的简单 Python 代码中筛选出预测

Filter singled out predictions in my simple Python code

我有这个简单的 Python 代码可以预测面部的情绪(从 here 中获取,以防你需要 运行),并显示在相机上脸部周围的矩形。但问题是它有很多噪音。例如,Fearful -- Sad -- -- Sad 等。我想平滑预测并过滤掉单独的预测。如果连续 n 个预测数表示 Sad,我该如何更改,然后将其显示为 Sad

您只需要更改最后几行,因为初始部分都是用于预测的。

import numpy as np
import argparse
import matplotlib.pyplot as plt
import cv2
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# command line argument
ap = argparse.ArgumentParser()
ap.add_argument("--mode",help="train/display")
mode = ap.parse_args().mode

# plots accuracy and loss curves
def plot_model_history(model_history):
    """
    Plot Accuracy and Loss curves given the model_history
    """
    fig, axs = plt.subplots(1,2,figsize=(15,5))
    # summarize history for accuracy
    axs[0].plot(range(1,len(model_history.history['accuracy'])+1),model_history.history['accuracy'])
    axs[0].plot(range(1,len(model_history.history['val_accuracy'])+1),model_history.history['val_accuracy'])
    axs[0].set_title('Model Accuracy')
    axs[0].set_ylabel('Accuracy')
    axs[0].set_xlabel('Epoch')
    axs[0].set_xticks(np.arange(1,len(model_history.history['accuracy'])+1),len(model_history.history['accuracy'])/10)
    axs[0].legend(['train', 'val'], loc='best')
    # summarize history for loss
    axs[1].plot(range(1,len(model_history.history['loss'])+1),model_history.history['loss'])
    axs[1].plot(range(1,len(model_history.history['val_loss'])+1),model_history.history['val_loss'])
    axs[1].set_title('Model Loss')
    axs[1].set_ylabel('Loss')
    axs[1].set_xlabel('Epoch')
    axs[1].set_xticks(np.arange(1,len(model_history.history['loss'])+1),len(model_history.history['loss'])/10)
    axs[1].legend(['train', 'val'], loc='best')
    fig.savefig('plot.png')
    plt.show()

# Define data generators
train_dir = 'data/train'
val_dir = 'data/test'

num_train = 28709
num_val = 7178
batch_size = 64
num_epoch = 50

train_datagen = ImageDataGenerator(rescale=1./255)
val_datagen = ImageDataGenerator(rescale=1./255)

train_generator = train_datagen.flow_from_directory(
        train_dir,
        target_size=(48,48),
        batch_size=batch_size,
        color_mode="grayscale",
        class_mode='categorical')

validation_generator = val_datagen.flow_from_directory(
        val_dir,
        target_size=(48,48),
        batch_size=batch_size,
        color_mode="grayscale",
        class_mode='categorical')

# Create the model
model = Sequential()

model.add(Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(48,48,1)))
model.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))

model.add(Conv2D(128, kernel_size=(3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(128, kernel_size=(3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))

model.add(Flatten())
model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(7, activation='softmax'))

# If you want to train the same model or try other models, go for this
if mode == "train":
    model.compile(loss='categorical_crossentropy',optimizer=Adam(lr=0.0001, decay=1e-6),metrics=['accuracy'])
    model_info = model.fit_generator(
            train_generator,
            steps_per_epoch=num_train // batch_size,
            epochs=num_epoch,
            validation_data=validation_generator,
            validation_steps=num_val // batch_size)
    plot_model_history(model_info)
    model.save_weights('model.h5')

# emotions will be displayed on your face from the webcam feed
elif mode == "display":
    model.load_weights('model.h5')

    # prevents openCL usage and unnecessary logging messages
    cv2.ocl.setUseOpenCL(False)

    # dictionary which assigns each label an emotion (alphabetical order)
    emotion_dict = {0: "Angry", 1: "Disgusted", 2: "Fearful", 3: "Happy", 4: "Neutral", 5: "Sad", 6: "Surprised"}

    # start the webcam feed
    cap = cv2.VideoCapture(1)
    while True:
        # Find haar cascade to draw bounding box around face
        ret, frame = cap.read()
        if not ret:
            break
        facecasc = cv2.CascadeClassifier('haarcascade_frontalface_default.xml')
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        faces = facecasc.detectMultiScale(gray,scaleFactor=1.3, minNeighbors=5)

        for (x, y, w, h) in faces:
            cv2.rectangle(frame, (x, y-50), (x+w, y+h+10), (255, 0, 0), 2)
            roi_gray = gray[y:y + h, x:x + w]
            cropped_img = np.expand_dims(np.expand_dims(cv2.resize(roi_gray, (48, 48)), -1), 0)
            prediction = model.predict(cropped_img)
            maxindex = int(np.argmax(prediction))
            text = emotion_dict[maxindex]
            if ("Sad" in text) or ("Angry" in text) or ("Disgusted" in text):
                text = "Sad"
            if ("Happy" in text) or ("Sad" in text):
                cv2.putText(frame, text, (x+20, y-60), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

        cv2.imshow('Video', cv2.resize(frame,(1600,960),interpolation = cv2.INTER_CUBIC))
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()

我会列出预测并采用模式,如下所示:

prediction_history = []
LOOKBACK = 5 # how far you want to look back

# in loop:
prediction_history.append(maxindex)
most_common_index = max(set(prediction_history[-LOOKBACK:][::-1]), key = prediction_history.count)
text = emotion_dict[most_common_index]

具体在您的代码中:

import os
...
prediction_history = []
LOOKBACK = 5 # how far you want to look back
...

    cap = cv2.VideoCapture(1)
    while True:
        # Find haar cascade to draw bounding box around face
        ret, frame = cap.read()
        if not ret:
            break
        facecasc = cv2.CascadeClassifier('haarcascade_frontalface_default.xml')
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        faces = facecasc.detectMultiScale(gray,scaleFactor=1.3, minNeighbors=5)

        for (x, y, w, h) in faces:
            cv2.rectangle(frame, (x, y-50), (x+w, y+h+10), (255, 0, 0), 2)
            roi_gray = gray[y:y + h, x:x + w]
            cropped_img = np.expand_dims(np.expand_dims(cv2.resize(roi_gray, (48, 48)), -1), 0)
            prediction = model.predict(cropped_img)
            
            # updates
            prediction_history.append(int(np.argmax(prediction)))
            most_common_index = max(set(prediction_history[-LOOKBACK:][::-1]), key = prediction_history.count)
            text = emotion_dict[most_common_index]
            
            if ("Sad" in text) or ("Angry" in text) or ("Disgusted" in text):
                text = "Sad"
            if ("Happy" in text) or ("Sad" in text):
                cv2.putText(frame, text, (x+20, y-60), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

        cv2.imshow('Video', cv2.resize(frame,(1600,960),interpolation = cv2.INTER_CUBIC))
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()

我想到的另一个选择是跳过初始 argmax 并获取后续帧的总数(或平均值)。

prediction_history.append(prediction) 而不是 prediction_history.append(int(np.argmax(prediction)))

将原始输出附加到预测历史而不是 argmax。这会给你一个这样的列表:

prediction_history = [
  [.1, .1, .1, .5, .4, .3, .1] # argmax = 3
  [.1, .2, .1, .5, .4, .3, .1] # argmax = 3
  [.1, .4, .1, .5, .4, .3, .1] # argmax = 3
  [.2, .6, .1, .5, .4, .3, .1] # argmax = 2
  [.1, .3, .1, .1, .4, .3, .1] # argmax = 2
  [.1, .6, .1, .1, .4, .3, .1] # argmax = 2
  [.1, .3, .1, .1, .4, .3, .1] # argmax = 4
]

# sum: [.8, 2.5, .7, 2.3, 2.6, 2.1, .7] --> 4
most_common_index = int(np.argmax(np.sum(prediction_history[-LOOKBACK:], 0)))
# mode (other answer) of argmax: [3,3,3,2,2,2,4] --> 3


this is also a nice framework for doing simple moving averages or other things -- I think you may end up with a more continuous state ie not [1,2,1,2,1,2,1...]

鉴于你希望之前的结果影响结果我会说如果你有机会重新设计这个东西我会考虑 RNN 或类似的东西并让 ML 算法处理选择滚动类别