如何处理不同音频文件的 MFCC 特征尺寸差异
How to handle differences in MFCC feature size for different audio files
librosa.feature.mfcc 对不同的音频文件会返回不同尺寸的结果。那么在训练或测试模型时,应该如何处理这种情况?
#test.py
import os
import pickle
import numpy as np
from scipy.io.wavfile import read
import librosa as mfcc
from sklearn import preprocessing
import warnings
warnings.filterwarnings("ignore")
def get_MFCC(sr,audio):
    """Compute standardized MFCC features for one audio clip.

    Args:
        sr: Sampling rate of ``audio``; forwarded to librosa as ``sr``.
        audio: Audio samples; forwarded to librosa as ``y``.

    Returns:
        The ``librosa.feature.mfcc`` output (20 coefficients requested,
        DCT type 2) with every row that contains a NaN removed, passed
        through ``sklearn.preprocessing.scale``. The result is always 2-D.
    """
    # Keyword arguments are required here: librosa >= 0.10 no longer accepts
    # the signal and the sampling rate positionally.
    features = mfcc.feature.mfcc(y=audio, sr=sr, n_mfcc=20, dct_type=2)

    # Drop every row containing a NaN with a single boolean mask instead of
    # rebuilding the matrix row by row with np.vstack.
    features = features[~np.isnan(features).any(axis=1)]

    # NOTE(review): librosa documents the MFCC output as (n_mfcc, n_frames),
    # so the number of columns presumably grows with clip length, while
    # scikit-learn estimators treat columns as features. That would explain
    # an "Expected ... 1800 features, but got 313 features" error when
    # scoring. Returning the transpose (and retraining the models on the
    # transposed features) is probably needed - confirm before changing the
    # return shape, since existing pickled models depend on it.
    return preprocessing.scale(features)
# Path to test data.
# Backslashes are doubled: a bare "\U" is an invalid escape in a Python 3
# string literal, and a bare trailing "\" would escape the closing quote.
# NOTE(review): this path uses "PrashuGupta" while modelpath below uses
# "Prashu Gupta" - confirm which spelling exists on disk.
source = "C:\\Users\\PrashuGupta\\Downloads\\datasets\\pygender\\test_data\\AudioSet\\female_clips\\"
# Path to the directory holding the trained models (*.gmm pickles).
modelpath = "C:\\Users\\Prashu Gupta\\Downloads\\datasets\\pygender\\"

# Sorted so that the model order is deterministic. The final print labels
# index 0 as female and index 1 as male, which presumably relies on
# female.gmm and male.gmm being the only models - TODO confirm.
gmm_files = [os.path.join(modelpath, fname)
             for fname in sorted(os.listdir(modelpath))
             if fname.endswith('.gmm')]

# Load each model inside a context manager so the file handle is closed.
# NOTE: pickle.load executes arbitrary code from the file; only load model
# files from a trusted location.
models = []
for fname in gmm_files:
    with open(fname, 'rb') as model_file:
        models.append(pickle.load(model_file))

# Model label = file name without directory and ".gmm" suffix.
genders = [os.path.basename(fname).split(".gmm")[0] for fname in gmm_files]

files = [os.path.join(source, f) for f in os.listdir(source)
         if f.endswith(".wav")]

for f in files:
    print(os.path.basename(f))
    audio, sr = mfcc.load(f, sr=16000, mono=True)
    features = get_MFCC(sr, audio)
    log_likelihood = np.zeros(len(models))
    # Score the clip against each model in turn. np.array(...).sum() accepts
    # both a scalar score and an array of scores.
    for i, gmm in enumerate(models):
        scores = np.array(gmm.score(features))
        log_likelihood[i] = scores.sum()
    # The model with the highest log-likelihood wins.
    winner = np.argmax(log_likelihood)
    print ("\tdetected as - ", genders[winner],"\n\tscores:female",log_likelihood[0],",male ", log_likelihood[1],"\n")
错误
Expected the input data X have 1800 features, but got 313 features in
scores = np.array(gmm.score(features))
你有三种选择:要么对文件进行截断或填充(truncate/pad),使它们具有相同的长度(比如 5 秒);要么将每个文件的特征汇总成一个不依赖于片段长度的固定长度向量(例如取平均值/最小值/最大值);要么让分类器在特征流的固定长度窗口(比如 1 秒)上运行。
librosa.feature.mfcc 对不同的音频文件会返回不同尺寸的结果。那么在训练或测试模型时,应该如何处理这种情况?
#test.py
import os
import pickle
import numpy as np
from scipy.io.wavfile import read
import librosa as mfcc
from sklearn import preprocessing
import warnings
warnings.filterwarnings("ignore")
def get_MFCC(sr,audio):
    """Compute standardized MFCC features for one audio clip.

    Args:
        sr: Sampling rate of ``audio``; forwarded to librosa as ``sr``.
        audio: Audio samples; forwarded to librosa as ``y``.

    Returns:
        The ``librosa.feature.mfcc`` output (20 coefficients requested,
        DCT type 2) with every row that contains a NaN removed, passed
        through ``sklearn.preprocessing.scale``. The result is always 2-D.
    """
    # Keyword arguments are required here: librosa >= 0.10 no longer accepts
    # the signal and the sampling rate positionally.
    features = mfcc.feature.mfcc(y=audio, sr=sr, n_mfcc=20, dct_type=2)

    # Drop every row containing a NaN with a single boolean mask instead of
    # rebuilding the matrix row by row with np.vstack.
    features = features[~np.isnan(features).any(axis=1)]

    # NOTE(review): librosa documents the MFCC output as (n_mfcc, n_frames),
    # so the number of columns presumably grows with clip length, while
    # scikit-learn estimators treat columns as features. That would explain
    # an "Expected ... 1800 features, but got 313 features" error when
    # scoring. Returning the transpose (and retraining the models on the
    # transposed features) is probably needed - confirm before changing the
    # return shape, since existing pickled models depend on it.
    return preprocessing.scale(features)
# Path to test data.
# Backslashes are doubled: a bare "\U" is an invalid escape in a Python 3
# string literal, and a bare trailing "\" would escape the closing quote.
# NOTE(review): this path uses "PrashuGupta" while modelpath below uses
# "Prashu Gupta" - confirm which spelling exists on disk.
source = "C:\\Users\\PrashuGupta\\Downloads\\datasets\\pygender\\test_data\\AudioSet\\female_clips\\"
# Path to the directory holding the trained models (*.gmm pickles).
modelpath = "C:\\Users\\Prashu Gupta\\Downloads\\datasets\\pygender\\"

# Sorted so that the model order is deterministic. The final print labels
# index 0 as female and index 1 as male, which presumably relies on
# female.gmm and male.gmm being the only models - TODO confirm.
gmm_files = [os.path.join(modelpath, fname)
             for fname in sorted(os.listdir(modelpath))
             if fname.endswith('.gmm')]

# Load each model inside a context manager so the file handle is closed.
# NOTE: pickle.load executes arbitrary code from the file; only load model
# files from a trusted location.
models = []
for fname in gmm_files:
    with open(fname, 'rb') as model_file:
        models.append(pickle.load(model_file))

# Model label = file name without directory and ".gmm" suffix.
genders = [os.path.basename(fname).split(".gmm")[0] for fname in gmm_files]

files = [os.path.join(source, f) for f in os.listdir(source)
         if f.endswith(".wav")]

for f in files:
    print(os.path.basename(f))
    audio, sr = mfcc.load(f, sr=16000, mono=True)
    features = get_MFCC(sr, audio)
    log_likelihood = np.zeros(len(models))
    # Score the clip against each model in turn. np.array(...).sum() accepts
    # both a scalar score and an array of scores.
    for i, gmm in enumerate(models):
        scores = np.array(gmm.score(features))
        log_likelihood[i] = scores.sum()
    # The model with the highest log-likelihood wins.
    winner = np.argmax(log_likelihood)
    print ("\tdetected as - ", genders[winner],"\n\tscores:female",log_likelihood[0],",male ", log_likelihood[1],"\n")
错误
Expected the input data X have 1800 features, but got 313 features in scores = np.array(gmm.score(features))
你有三种选择:要么对文件进行截断或填充(truncate/pad),使它们具有相同的长度(比如 5 秒);要么将每个文件的特征汇总成一个不依赖于片段长度的固定长度向量(例如取平均值/最小值/最大值);要么让分类器在特征流的固定长度窗口(比如 1 秒)上运行。