我可以信任哪一种工具?

问题描述:

我似乎已经把问题归结为:究竟哪一种工具是可以信任的?

我一直在测试的工具是 Librosa 和 Kaldi,用它们为音频文件的 40 个滤波器组能量创建用于可视化绘图的数据集。

滤波器组能量是在 Kaldi 中使用以下配置提取的。

fbank.conf

--htk-compat=false 
--window-type=hamming 
--sample-frequency=16000 
--num-mel-bins=40 
--use-log-fbank=true 

提取出的数据使用 librosa 的绘图功能作图。Librosa 底层使用 matplotlib 的 pcolormesh,这意味着两者不应该有任何区别,只是 librosa 提供了更易用的 API。

# Report the shape, type and value range of the Kaldi features in `static`.
print static.shape 
print type(static) 
print np.min(static) 
print np.max(static) 
fig = plt.figure() 
# `static` is passed transposed; the axes are labelled as frames (x) and mel (y).
librosa.display.specshow(static.T,sr=16000,x_axis='frames',y_axis='mel',hop_length=160,cmap=cm.jet) 
#plt.axis('off') 
plt.title("log mel power spectrum of " + name) 
plt.colorbar(format='%+02.0f dB') 
plt.tight_layout() 
# Write the figure to a PNG under the path held in `plot`, then display it.
plt.savefig(plot+"/"+name+"_plot_static_conv.png") 
plt.show() 

输出:

(474, 40) 
<type 'numpy.ndarray'> 
-1.828067 
22.70058 
Got bus address: "unix:abstract=/tmp/dbus-aYbBS1JWyw,guid=17dd413abcda54272e1d93d159174cdf" 
Connected to accessibility bus at: "unix:abstract=/tmp/dbus-aYbBS1JWyw,guid=17dd413abcda54272e1d93d159174cdf" 
Registered DEC: true 
Registered event listener change listener: true 

enter image description here

在 Librosa 中,类似的图是这样创建的:

# Path of the wav file used for the Librosa side of the comparison.
audio_path="../../../../Dropbox/SI1392.wav" 
#audio_path = librosa.util.example_audio_file() 
print "Example audio found" 
# NOTE(review): no `sr` is passed to load(), so librosa's default sample rate
# applies -- confirm whether it matches the 16000 Hz used in the Kaldi config.
y, sr = librosa.load(audio_path) 
print "Example audio loaded" 
# 40-band mel spectrogram with a 400-sample FFT and a 160-sample hop.
specto = librosa.feature.melspectrogram(y, sr=sr, n_fft=400, hop_length=160, n_mels=40) 
print "Example audio spectogram" 
# Convert the mel spectrogram to a log scale.
log_specto = librosa.core.logamplitude(specto) 

# Print the value range of the log spectrogram.
print "min and max" 
print np.min(log_specto) 
print np.max(log_specto) 
print "Example audio log specto" 

# Draw the log spectrogram with librosa's display helper and the jet colormap.
plt.figure(figsize=(12,4)) 
librosa.display.specshow(log_specto,sr=sr,x_axis='frames', y_axis='mel', hop_length=160,cmap=cm.jet) 

plt.title('mel power spectrogram') 

plt.colorbar(format='%+02.0f dB') 

plt.tight_layout() 
print "See" 

# Print the shapes of the linear and the log spectrogram.
print specto.shape 

print log_specto.shape 
plt.show() 

输出如下:

libraries loaded! 
Example audio found 
Example audio loaded 
Example audio spectogram 
min and max 
-84.6796661558 
-4.67966615584 
Example audio log specto 
See 
(40, 657) 
(40, 657) 

enter image description here

尽管颜色不同,两者显示的图是相似的,但能量范围似乎有些不同。

Kaldi 的最小值/最大值为 -1.828067/22.70058,

而 Librosa 的最小值/最大值为 -84.6796661558/-4.67966615584。

问题是,我想把这些图保存为 numpy 数组,以便进一步处理。

但这样做似乎会得到不同的图……使用 Librosa 的数据时,我是这样创建图的:

# Open a new figure for the imshow-based version of the Librosa plot.
plt.figure() 
# Scale the log spectrogram with the MinMaxScaler and map it through the jet colormap.
min_max_scaled_log_specto = min_max_scaler.fit_transform(log_specto) 
convert = plt.get_cmap(cm.jet) 
numpy_static = convert(min_max_scaled_log_specto) 
# NOTE: the image shown here is the unscaled log_specto (flipped vertically);
# numpy_static is computed above but is not used in this snippet.
plt.imshow(np.flipud(log_specto), aspect='auto') 
plt.colorbar() 
print "Sooo?" 
plt.show() 

enter image description here

这很完美……它与原始数据集相似。

但使用 Kaldi 的数据时,下面这段代码得到的却是这样的图:

# Feed the flipped, transposed Kaldi features straight into the jet colormap
# (no scaling is applied in this snippet) and display the resulting array.
convert = plt.get_cmap(cm.jet) 
numpy_output_static = convert(np.flipud(static.T)) 
plt.imshow(numpy_output_static,aspect = 'auto') 
plt.show() 
# Wait for input on stdin before continuing.
raw_input("sadas") 

enter image description here

我从以前的一个帖子中了解到,出现大片红色的原因可能与数值范围有关,先做归一化会有帮助——但这样做的结果是:

# Run the flipped, transposed Kaldi features through a MinMaxScaler with
# feature_range (0, 1) before handing them to the jet colormap.
# NOTE(review): MinMaxScaler presumably rescales every column on its own rather
# than the matrix as a whole -- confirm against the scikit-learn documentation.
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0,1)) 
convert = plt.get_cmap(cm.jet) 
numpy_output_static = convert(min_max_scaler.fit_transform(np.flipud(static.T))) 
plt.imshow(numpy_output_static,aspect = 'auto') 
plt.show() 

enter image description here

但是,这与 Kaldi 的原始图完全对不上……那么,为什么它看起来是这样的?为什么用从 Librosa 提取的能量可以画出正确的图,而用从 Kaldi 提取的能量却不行?

Librosa 的最小工作示例:

# 
# Minimal example of Librosa plot example. 
# Made for testing the plot, and test for accurate 
# Conversion between the two parts. 
# 

import os 
import sys 
from os import listdir 
from os.path import isfile, join 
import numpy as np 
import matplotlib 
matplotlib.use('TkAgg') 
import matplotlib.pyplot as plt 
from mpl_toolkits.mplot3d import Axes3D 
from matplotlib.colors import Normalize 
import matplotlib 
from PIL import Image 
import librosa 
import colormaps as cmaps 
import librosa.display 
import ast 
from scipy.misc import toimage 
from matplotlib import cm 
from sklearn import preprocessing 

# Librosa side of the comparison: load a wav file, compute a 40-band log mel
# spectrogram, plot it with specshow and then again with imshow.
print "libraries loaded!" 
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0,1)) 

audio_path="../../../../Dropbox/SI1392.wav" 
#audio_path = librosa.util.example_audio_file() 
print "Example audio found" 
# NOTE(review): no `sr` is passed to load(), so librosa's default sample rate
# applies -- confirm whether it matches the 16000 Hz used in the Kaldi config.
y, sr = librosa.load(audio_path) 
print "Example audio loaded" 
# 40-band mel spectrogram with a 400-sample FFT and a 160-sample hop.
specto = librosa.feature.melspectrogram(y, sr=sr, n_fft=400, hop_length=160, n_mels=40) 
print "Example audio spectogram" 
# Convert the mel spectrogram to a log scale.
log_specto = librosa.core.logamplitude(specto) 

# Print the value range of the log spectrogram.
print "min and max" 
print np.min(log_specto) 
print np.max(log_specto) 
print "Example audio log specto" 

# First figure: librosa's own display helper with the jet colormap.
plt.figure(figsize=(12,4)) 
librosa.display.specshow(log_specto,sr=sr,x_axis='frames', y_axis='mel', hop_length=160,cmap=cm.jet) 

plt.title('mel power spectrogram') 

plt.colorbar(format='%+02.0f dB') 

plt.tight_layout() 
print "See" 
#plt.show() 

print specto.shape 

print log_specto.shape 

# Second figure: plain imshow of the log spectrogram.
plt.figure() 
min_max_scaled_log_specto = min_max_scaler.fit_transform(log_specto) 
convert = plt.get_cmap(cm.jet) 
numpy_static = convert(min_max_scaled_log_specto) 
# NOTE: the image shown here is the unscaled log_specto (flipped vertically);
# numpy_static is computed above but is not used in this script.
plt.imshow(np.flipud(log_specto), aspect='auto') 
plt.colorbar() 
print "Sooo?" 
plt.show() 

Kaldi 的最小工作示例(真实数据):

# 
# Extracted version: 
# 
# 
# 

import numpy as np 
import matplotlib.pyplot as plt 
import matplotlib 
from PIL import Image 
import librosa 
import librosa.display 
from matplotlib import cm 
from sklearn import preprocessing 
import ast 
import urllib 
import os 
import sys 
from os import listdir 
from os.path import isfile, join 

min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0,1)) 

def make_plot_store_data(name,interweaved,static,delta,delta_delta,isTrain,isTest,isDev): 
    """Plot `static` with librosa and, for training data, as a colormapped image.

    Prints the shape, type, minimum and maximum of `static`, draws `static.T`
    with `librosa.display.specshow` and, when `isTrain` is true, shows a second
    figure built from the min-max-scaled, vertically flipped `static.T` mapped
    through the jet colormap, then waits on `raw_input`.

    Parameters:
        name: string appended to the title of the first figure.
        static: array providing `.shape` and `.T`; the only data that is
            printed and plotted.
        isTrain: when equal to True, the second (imshow) figure is shown.
        interweaved, delta, delta_delta, isTest, isDev: accepted but not used
            in this body.

    Returns:
        Nothing; the function only prints and plots.
    """

    # Report shape, type and value range of the features.
    print static.shape 
    print type(static) 
    print np.min(static) 
    print np.max(static) 
    fig = plt.figure() 

    # First figure: librosa's display helper on the transposed features.
    librosa.display.specshow(static.T,sr=16000,x_axis='frames',y_axis='mel',hop_length=160,cmap=cm.jet) 
    #plt.axis('off') 
    plt.title("log mel power spectrum of " + name) 
    plt.colorbar(format='%+02.0f dB') 
    plt.tight_layout() 
    #plt.show() 
    #plt.close() 
    #raw_input("asd") 

    if isTrain == True: 
     # Second figure: scale, colormap and show the flipped, transposed features.
     # NOTE(review): MinMaxScaler presumably rescales every column on its own
     # rather than the matrix as a whole -- confirm against scikit-learn docs;
     # the follow-up in this post swaps it for matplotlib.colors.Normalize.
     plt.figure() 
     convert = plt.get_cmap(cm.jet) 
     numpy_output_static = convert(min_max_scaler.fit_transform(np.flipud(static.T))) 
     plt.imshow(numpy_output_static,aspect = 'auto') 
     plt.show() 
     # Wait for input on stdin before returning.
     raw_input("sadas") 

link = "https://gist.githubusercontent.com/Miail/51311b34f5e5333bbddf9cb17c737ea4/raw/786b72477190023e93b9dd0cbbb43284ab59921b/feature.txt" 
f = urllib.urlopen(link) 

temp_list = [] 
for line in f: 
    entries = 0 
    data_splitted = line.split() 
    if len(data_splitted) == 2: 
      file_name = data_splitted[0] 
    else: 
     entries = 1+entries 
     if data_splitted[-1] == ']': 
      temp_list.extend([ast.literal_eval(i) for i in data_splitted[:-1]]) 
     else: 
      temp_list.extend([ast.literal_eval(i) for i in data_splitted]) 


dimension = 120 
entries = len(temp_list)/dimension 
data = np.array(temp_list) 
interweaved = data.reshape(entries,dimension) 
static =interweaved[:,:-80] 
delta =interweaved[:,40:-40] 
delta_delta =interweaved[:,80:] 
plot_interweaved = data.reshape(entries*3,dimension/3) 
print static.shape 
print delta.shape 
print delta_delta.shape 
make_plot_store_data(file_name,plot_interweaved,static,delta,delta_delta,True,False,False) 

我似乎从另一个类似的帖子中找到了答案……

问题出在我的归一化方式上:MinMaxScaler 是按列分别缩放的,而不是对整个矩阵统一缩放……所以,不应该这样做:

# Problematic call: the MinMaxScaler is fitted on the flipped, transposed features.
numpy_output_static = convert(min_max_scaler.fit_transform(np.flipud(static.T))) 

而应该这样做:

# Build one normaliser spanning the global minimum and maximum of `static`.
norm_static = matplotlib.colors.Normalize(vmin=static.min(),vmax=static.max())
# Normalise the flipped, transposed features and map them through the colormap.
numpy_output_static = convert(norm_static(np.flipud(static.T)))