吴恩达Coursera深度学习课程 deeplearning.ai (5-3) 序列模型和注意力机制--编程作业(二):触发字检测
1.数据处理
可参考博客:
https://blog.csdn.net/haoyutiangang/article/details/81231887
2.模型
3.代码实践
#!/usr/bin/env python
# _*_ coding:utf-8 _*_
import numpy as np
from pydub import AudioSegment
import random
import sys
import io
import os
import glob
import IPython
from td_utils import *
from keras.callbacks import ModelCheckpoint
from keras.models import Model, load_model, Sequential
from keras.layers import Dense, Activation, Dropout, Input, Masking, TimeDistributed, LSTM, Conv1D
from keras.layers import GRU, Bidirectional, BatchNormalization, Reshape
from keras.optimizers import Adam
# 1. Load the datasets
# Training set: spectrogram inputs X and per-timestep trigger labels Y.
# BUG FIX: the original loaded './XY_train.npy' into BOTH X and Y, making the
# labels identical to the inputs. Load the separate arrays instead, mirroring
# the dev-set layout below (paths per the original course assignment).
X = np.load("./XY_train/X.npy")
Y = np.load("./XY_train/Y.npy")
# Dev (validation) set
X_dev = np.load("./XY_dev/X_dev.npy")
Y_dev = np.load("./XY_dev/Y_dev.npy")
# 2.构建模型
def model(input_shape):
    """Build the trigger-word detection network.

    Architecture: Conv1D -> BatchNorm -> ReLU -> Dropout, then two identical
    GRU blocks (GRU -> Dropout -> BatchNorm -> Dropout), and finally a
    time-distributed sigmoid unit yielding one trigger probability per
    output timestep.

    Arguments:
        input_shape -- shape of one input example (Tx, n_freq), batch
                       dimension excluded.

    Returns:
        An uncompiled Keras Model instance.
    """
    inputs = Input(shape=input_shape)

    # Step 1: convolutional layer — extracts features and shrinks the time
    # axis by the stride factor of 4.
    net = Conv1D(filters=196, kernel_size=15, strides=4)(inputs)
    net = BatchNormalization()(net)
    net = Activation('relu')(net)
    net = Dropout(0.8)(net)

    # Steps 2-3: two recurrent blocks with identical structure.
    for _ in range(2):
        net = GRU(units=128, return_sequences=True)(net)
        net = Dropout(0.8)(net)
        net = BatchNormalization()(net)
        net = Dropout(0.8)(net)

    # Step 4: time-distributed dense layer — sigmoid probability per timestep.
    outputs = TimeDistributed(Dense(1, activation='sigmoid'))(net)

    return Model(inputs=inputs, outputs=outputs)
# NOTE(review): the `model` built by the function above is discarded here —
# a pretrained network is loaded instead (training this model from scratch
# takes a long time; the course supplies a checkpoint trained on a GPU).
model = load_model('./models/tr_model.h5')
# Compile the model: Adam with a small learning rate and LR decay, suitable
# for brief fine-tuning of the pretrained weights.
opt = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, decay=0.01)
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=["accuracy"])
# Fine-tune for a single epoch on the training set.
model.fit(X, Y, batch_size=5, epochs=1)
# Evaluate on the dev set.
loss, acc = model.evaluate(X_dev, Y_dev)
print("Dev Set accuracy = ", acc)
# 预测模型
def detect_triggerword(filename):
    """Run the trained model on an audio file and plot trigger probabilities.

    Arguments:
        filename -- path to a .wav file to analyze.

    Returns:
        predictions -- model output of shape (1, Ty, 1): per-timestep
        probability that the trigger word just finished being said.
    """
    # Top subplot: graph_spectrogram (from td_utils) both computes and
    # draws the spectrogram. NOTE(review): `plt` is assumed to be exported
    # by `from td_utils import *` — confirm, it is not imported here.
    plt.subplot(2, 1, 1)
    x = graph_spectrogram(filename)
    # Spectrogram comes back as (n_freq, Tx); the model wants
    # (batch, Tx, n_freq), so swap axes and add the batch dimension.
    x = x.swapaxes(0, 1)
    x = np.expand_dims(x, axis=0)
    predictions = model.predict(x)
    # Bottom subplot: probability curve aligned under the spectrogram.
    plt.subplot(2, 1, 2)
    plt.plot(predictions[0, :, 0])
    plt.ylabel('probability')
    plt.show()
    return predictions
# When the word "activate" is detected, play a chime sound. The label y<t>
# contains many consecutive 1s per detection, but we only want one chime per
# detection — chime_on_activate handles that de-duplication.
chime_file = 'audio_examples/chime.wav'

def chime_on_activate(filename, predictions, threshold):
    """Overlay a chime on the audio at each detected trigger word.

    Arguments:
        filename    -- path to the analyzed .wav file.
        predictions -- model output of shape (1, Ty, 1).
        threshold   -- probability above which a timestep counts as a trigger.

    Writes the result to "chime_output.wav".
    """
    # BUG FIX: pydub's constructor is `from_wav`, not `from_wave`.
    audio_clip = AudioSegment.from_wav(filename)
    chime = AudioSegment.from_wav(chime_file)
    Ty = predictions.shape[1]
    # Require 75 quiet timesteps between chimes so one long run of 1s
    # (one detection) triggers only a single chime.
    consecutive_timesteps = 0
    for i in range(Ty):
        consecutive_timesteps += 1
        if predictions[0, i, 0] > threshold and consecutive_timesteps > 75:
            # Map output timestep i to a position in milliseconds.
            # BUG FIX: attribute is `duration_seconds` (was misspelled
            # `duration_senconds`, which raises AttributeError).
            audio_clip = audio_clip.overlay(
                chime, position=((i / Ty) * audio_clip.duration_seconds) * 1000)
            consecutive_timesteps = 0
    # BUG FIX: keyword is `format=` (was misspelled `foramt=`, a TypeError).
    audio_clip.export("chime_output.wav", format='wav')
# 4. Test on a dev-set example: detect the trigger word, overlay the chime,
# then play the result.
filename = "./raw_data/dev/1.wav"
prediction = detect_triggerword(filename)
chime_on_activate(filename, prediction, 0.5)
# BUG FIX: chime_on_activate exports "chime_output.wav"; the original tried
# to play the non-existent "./chime_out.wav".
IPython.display.Audio("./chime_output.wav")