我可以信任哪个工具?

Which tool can I trust?

我似乎无法确定我可以信任哪个工具...

我一直在测试的工具是 Librosa 和 Kaldi,用它们为一个音频文件(audio file)的 40 个滤波器组能量绘制可视化图。

在 kaldi 中使用这些配置提取滤波器组能量。

fbank.conf

--htk-compat=false
--window-type=hamming
--sample-frequency=16000
--num-mel-bins=40
--use-log-fbank=true

提取的数据使用 librosa 的绘图功能绘制。Librosa 内部使用 matplotlib 的 pcolormesh,这意味着除了 librosa 提供更易用的 API 之外,应该没有任何区别。

# Report the feature array's shape, type and value range before plotting.
print static.shape
print type(static)
print np.min(static)
print np.max(static)
fig = plt.figure()
# Plot the transpose so that frames run along x and the filterbank channels along y.
librosa.display.specshow(static.T,sr=16000,x_axis='frames',y_axis='mel',hop_length=160,cmap=cm.jet)
#plt.axis('off')
plt.title("log mel power spectrum of " + name)
plt.colorbar(format='%+02.0f dB')
plt.tight_layout()
# `plot` is presumably an output directory defined elsewhere -- TODO confirm.
plt.savefig(plot+"/"+name+"_plot_static_conv.png")
plt.show()

输出:

(474, 40)
<type 'numpy.ndarray'>
-1.828067
22.70058
Got bus address:  "unix:abstract=/tmp/dbus-aYbBS1JWyw,guid=17dd413abcda54272e1d93d159174cdf" 
Connected to accessibility bus at:  "unix:abstract=/tmp/dbus-aYbBS1JWyw,guid=17dd413abcda54272e1d93d159174cdf" 
Registered DEC:  true 
Registered event listener change listener:  true 

在 Librosa 中创建的类似图:

audio_path="../../../../Dropbox/SI1392.wav"
#audio_path = librosa.util.example_audio_file()
print "Example audio found"
# NOTE(review): librosa.load resamples to 22050 Hz unless `sr` is given, while the
# Kaldi config above uses 16000 Hz -- that would explain 657 frames here versus
# 474 frames from Kaldi. Confirm against the librosa.load documentation.
y, sr = librosa.load(audio_path)
print "Example audio loaded"
# 40 mel bands, 400-sample FFT window, 160-sample hop.
specto = librosa.feature.melspectrogram(y, sr=sr, n_fft=400, hop_length=160, n_mels=40)
print "Example audio spectogram"
# Convert the mel spectrogram to a log scale.
log_specto = librosa.core.logamplitude(specto)

print "min and max"
print np.min(log_specto)
print np.max(log_specto)
print "Example audio log specto"

plt.figure(figsize=(12,4))
librosa.display.specshow(log_specto,sr=sr,x_axis='frames', y_axis='mel', hop_length=160,cmap=cm.jet)

plt.title('mel power spectrogram')

plt.colorbar(format='%+02.0f dB')

plt.tight_layout()
print "See"

print specto.shape

print log_specto.shape
plt.show()

输出这个:

libraries loaded!
Example audio found
Example audio loaded
Example audio spectogram
min and max
-84.6796661558
-4.67966615584
Example audio log specto
See
(40, 657)
(40, 657)

尽管颜色不同,两者都显示相似的图,但能量范围似乎有点不同。

Kaldi 的 min/max 为 -1.828067/22.70058

Librosa 有一个 min/max -84.6796661558/-4.67966615584

问题是我试图将这些图存储为 numpy 数组,以供进一步处理。

这样得到的图似乎有所不同……使用 Librosa 数据时,我这样创建绘图:

plt.figure()
# NOTE(review): per the scikit-learn docs, MinMaxScaler rescales each column
# separately rather than the array as a whole -- confirm this is intended.
min_max_scaled_log_specto = min_max_scaler.fit_transform(log_specto)
convert = plt.get_cmap(cm.jet)
# Colormapped version of the scaled spectrogram; the imshow call below does not use it.
numpy_static = convert(min_max_scaled_log_specto)
# Displays the raw log spectrogram (flipped vertically), not numpy_static.
plt.imshow(np.flipud(log_specto), aspect='auto')
plt.colorbar()
print "Sooo?"
plt.show()

完美...它类似于原始数据集..

但是对于 Kaldi,我用这段代码得到的却是这样的图:

convert = plt.get_cmap(cm.jet)
# The features are passed to the colormap without any normalisation
# (transposed, then flipped vertically).
# NOTE(review): a matplotlib colormap expects floats in [0, 1]; the range printed
# earlier is roughly -1.8 to 22.7, so most values would be clipped to the top
# colour -- confirm against the matplotlib Colormap documentation.
numpy_output_static = convert(np.flipud(static.T))
plt.imshow(numpy_output_static,aspect = 'auto')
plt.show()
# Wait for Enter before continuing.
raw_input("sadas")

我从之前的帖子中发现,出现红色的原因可能在于数值范围,事先进行归一化会有所帮助——但结果却变成了这样:

# Scaler that maps values into the [0, 1] range.
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0,1))
convert = plt.get_cmap(cm.jet)
# NOTE(review): per the scikit-learn docs, MinMaxScaler works column by column.
# The columns of static.T are presumably frames, so every frame would be
# stretched to its own 0..1 range instead of sharing one global range -- verify.
numpy_output_static = convert(min_max_scaler.fit_transform(np.flipud(static.T)))
plt.imshow(numpy_output_static,aspect = 'auto')
plt.show()

但这与最初的 Kaldi 图完全对不上……那么为什么它看起来会是这样呢?为什么用 Librosa 提取的能量可以这样绘制,而用 Kaldi 提取的能量却不行?

Librosa 的最小工作示例:

#
#   Minimal example of Librosa plot example.
#   Made for testing the plot, and test for accurat
#   Conversion between the two parts.
#

import os
import sys
from os import listdir
from os.path import isfile, join
import numpy as np
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.colors import Normalize
import matplotlib
from PIL import Image
import librosa
import colormaps as cmaps
import librosa.display
import ast
from scipy.misc import toimage
from matplotlib import cm
from sklearn import preprocessing

# Minimal Librosa example: compute a 40-band log mel spectrogram, plot it with
# librosa, then convert it to a colormapped array and preview that array.
print("libraries loaded!")

audio_path="../../../../Dropbox/SI1392.wav"
# Alternative input that ships with librosa:
#audio_path = librosa.util.example_audio_file()
print("Example audio found")
# Load at 16 kHz explicitly. librosa.load otherwise resamples to its 22050 Hz
# default, which changes the frame count for the 400-sample window and
# 160-sample hop used below and no longer matches the 16 kHz Kaldi setup.
y, sr = librosa.load(audio_path, sr=16000)
print("Example audio loaded")
specto = librosa.feature.melspectrogram(y, sr=sr, n_fft=400, hop_length=160, n_mels=40)
print("Example audio spectogram")
log_specto = librosa.core.logamplitude(specto)

print("min and max")
print(np.min(log_specto))
print(np.max(log_specto))
print("Example audio log specto")

# Reference plot drawn by librosa itself.
plt.figure(figsize=(12,4))
librosa.display.specshow(log_specto, sr=sr, x_axis='frames', y_axis='mel', hop_length=160, cmap=cm.jet)
plt.title('mel power spectrogram')
plt.colorbar(format='%+02.0f dB')
plt.tight_layout()
print("See")

print(specto.shape)
print(log_specto.shape)

# Colormapped copy of the spectrogram for further processing.
plt.figure()
# Use a single norm over the whole array. A column-wise scaler such as
# MinMaxScaler would stretch every frame to its own 0..1 range and distort
# the picture.
norm_specto = Normalize(vmin=log_specto.min(), vmax=log_specto.max())
convert = plt.get_cmap(cm.jet)
# Flip vertically so the lowest mel band ends up at the bottom of the image.
numpy_static = convert(norm_specto(np.flipud(log_specto)))
# Preview the converted array itself (not the raw spectrogram) so the figure
# shows exactly what would be stored. No colorbar: the image is already
# colour data, so there is no scalar range left to describe.
plt.imshow(numpy_static, aspect='auto')
print("Sooo?")
plt.show()

kaldi 的最小工作示例-(真实数据):

#
#   Extracted version:
#
#
#

import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from PIL import Image
import librosa
import librosa.display
from matplotlib import cm
from sklearn import preprocessing
import ast
import urllib
import os
import sys
from os import listdir
from os.path import isfile, join

min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0,1))

def make_plot_store_data(name,interweaved,static,delta,delta_delta,isTrain,isTest,isDev):
    """Plot the static features and preview them as a colormapped array.

    Prints the shape, type, minimum and maximum of ``static``, draws it with
    ``librosa.display.specshow`` and, when ``isTrain`` is truthy, converts it
    to a colormapped array and shows that array in a second figure.

    Args:
        name: Identifier appended to the plot title.
        interweaved: Accepted for interface compatibility; not used here.
        static: 2-D array of static features. Its transpose is plotted, so
            rows are presumably frames and columns filterbank channels.
        delta: Accepted for interface compatibility; not used here.
        delta_delta: Accepted for interface compatibility; not used here.
        isTrain: When truthy, also build and show the colormapped array.
        isTest: Accepted for interface compatibility; not used here.
        isDev: Accepted for interface compatibility; not used here.
    """
    print(static.shape)
    print(type(static))
    print(np.min(static))
    print(np.max(static))

    # Reference plot drawn by librosa.
    plt.figure()
    librosa.display.specshow(static.T, sr=16000, x_axis='frames', y_axis='mel', hop_length=160, cmap=cm.jet)
    plt.title("log mel power spectrum of " + name)
    plt.colorbar(format='%+02.0f dB')
    plt.tight_layout()

    if isTrain:
        plt.figure()
        convert = plt.get_cmap(cm.jet)
        # Normalise with one global min/max. MinMaxScaler rescales each column
        # independently, which stretches every frame of static.T to its own
        # 0..1 range and produces an image unrelated to the specshow plot.
        norm_static = matplotlib.colors.Normalize(vmin=static.min(), vmax=static.max())
        # Flip vertically so the lowest channel ends up at the bottom.
        numpy_output_static = convert(norm_static(np.flipud(static.T)))
        plt.imshow(numpy_output_static, aspect='auto')
        plt.show()
        # Keep the window open until Enter is pressed.
        raw_input("sadas")

# Download the feature dump, parse it into a (frames, 120) array and plot it.
link = "https://gist.githubusercontent.com/Miail/51311b34f5e5333bbddf9cb17c737ea4/raw/786b72477190023e93b9dd0cbbb43284ab59921b/feature.txt"
f = urllib.urlopen(link)

temp_list = []
try:
    for line in f:
        data_splitted = line.split()
        if len(data_splitted) == 2:
            # Two-token line: presumably the "<utterance-id> [" header of a
            # Kaldi text matrix -- keep the identifier and move on.
            file_name = data_splitted[0]
            continue
        # The closing "]" on the last data row is not a number.
        # Checking for an empty list first lets blank lines through harmlessly.
        if data_splitted and data_splitted[-1] == ']':
            data_splitted = data_splitted[:-1]
        temp_list.extend(float(token) for token in data_splitted)
finally:
    # Release the connection even if parsing fails.
    f.close()


dimension = 120
# The 120 values per frame are split into three equal parts:
# static, delta and delta-delta.
part_width = dimension // 3
# Floor division keeps the reshape arguments integral.
entries = len(temp_list) // dimension
data = np.array(temp_list)
interweaved = data.reshape(entries, dimension)
static = interweaved[:, :part_width]
delta = interweaved[:, part_width:2 * part_width]
delta_delta = interweaved[:, 2 * part_width:]
plot_interweaved = data.reshape(entries * 3, part_width)
print(static.shape)
print(delta.shape)
print(delta_delta.shape)
make_plot_store_data(file_name, plot_interweaved, static, delta, delta_delta, True, False, False)

我似乎在这篇帖子中找到了答案。问题出在我的归一化方式上。所以不要这样做:

# NOTE(review): per the scikit-learn docs, MinMaxScaler rescales each column on its own, not the whole array.
numpy_output_static = convert(min_max_scaler.fit_transform(np.flipud(static.T)))

我应该做的:

# One norm built from the global minimum and maximum of the whole array.
norm_static = matplotlib.colors.Normalize(vmin=static.min(),vmax=static.max())
# Normalise first, then apply the colormap to the flipped, transposed features.
numpy_output_static = convert(norm_static(np.flipud(static.T)))