使用科大讯飞麦克风阵列实现智能对话

智能时代到来，闲来无事学了点python，顺便找了个项目练手：做一个智能对话机器人玩一下，实现如下。

首先要解决语音输入问题，经过一段时间的研究，发现想要实现远处也能控制就必须用到麦克风阵列，在万能的某宝上搜了一下，发现了一个宝贝，技术支持也比较给力。

使用科大讯飞麦克风阵列实现智能对话

usb供电，自带喇叭功放，自带回声消除，方向指示，串口通信。

使用科大讯飞麦克风阵列实现智能对话

连上树莓派，由于树莓派没有音频输入接口，所以加了一个usb声卡，录音接阵列语音输出接口，播放接回声消除信号接口（可以直接通过喇叭播放）。集成的比较好，少连接好多线，不用自己搭建回声消除电路了。

使用科大讯飞麦克风阵列实现智能对话

树莓派可以识别usb声卡，怎么将usb声卡设置为默认声卡请自行搜索，有很多介绍。下面开始用python实现录音，并把语音识别成文字。

录音使用pyaudio，当有声音的时候就保存声音文件

import os
import sys
import wave
import numpy as np
from datetime import datetime
from pyaudio import PyAudio, paInt16

class GenAudio(object):
    """Voice-activated recorder: captures one utterance from the default input.

    Audio is read in fixed-size chunks of 16-bit mono samples. A chunk counts
    as "voiced" when more than ``count_num`` of its samples exceed ``level``.
    Recording starts at the first voiced chunk and ends after a short run of
    quiet chunks or when the chunk budget ``time_count`` is used up.
    """

    def __init__(self):
        self.num_samples = 8000  # frames read per chunk (also the PyAudio buffer size)
        self.sampling_rate = 16000  # sampling rate in Hz
        self.level = 1500  # amplitude threshold for a "loud" sample
        self.count_num = 20  # a chunk is voiced when more than this many samples exceed level
        self.save_length = 3  # chunks kept, counted from the most recent voiced chunk
        # Maximum number of chunks processed once recording has started.
        # NOTE: the original comment called this "seconds", but the code
        # decrements it once per chunk of num_samples frames.
        self.time_count = 20
        self.voice_string = []  # recorded chunks (raw bytes), filled by read_audio()

    def save_wav(self, filename):
        """Write the recorded chunks to *filename* as 16-bit mono WAV.

        Args:
            filename: Destination path; an existing file is overwritten.
        """
        # wave objects are context managers, so the file is closed even if
        # writing fails.
        with wave.open(filename, 'wb') as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)
            wf.setframerate(self.sampling_rate)
            # Join the raw byte chunks directly. Building a NumPy array from
            # a list of bytes and calling .tostring() pads unequal chunks
            # and no longer works on NumPy 2.0, where tostring() is removed.
            wf.writeframes(b"".join(self.voice_string))

    def read_audio(self):
        """Block until one utterance has been recorded from the microphone.

        The recorded chunks are stored in ``self.voice_string``. The call
        waits indefinitely for the first voiced chunk; the ``time_count``
        budget only starts running once recording has begun.

        Returns:
            True if at least one chunk was recorded, False if the budget ran
            out with nothing recorded.
        """
        pa = PyAudio()
        try:
            stream = pa.open(format=paInt16, channels=1,
                             rate=self.sampling_rate, input=True,
                             frames_per_buffer=self.num_samples)
            try:
                return self._capture(stream)
            finally:
                # Release the input stream on every exit path.
                stream.close()
        finally:
            pa.terminate()

    def _capture(self, stream):
        """Run the voice-activity loop on an open input *stream*."""
        hold = 0  # remaining chunks to keep after the last voiced chunk
        chunks = []
        remaining = self.time_count
        started = False

        while True:
            # Read one chunk and view it as signed 16-bit samples.
            raw = stream.read(self.num_samples)
            samples = np.frombuffer(raw, dtype=np.short)
            # Count the samples louder than the threshold.
            loud_count = np.sum(samples > self.level)

            print(np.max(samples), "large_sample_count=>", loud_count)

            if loud_count > self.count_num:
                # Voiced chunk: (re)arm the hold counter.
                hold = self.save_length
                started = True
            elif started:
                hold -= 1

            if started:
                remaining -= 1
            if hold < 0:
                hold = 0

            if hold > 0:
                chunks.append(raw)
            elif chunks:
                # The hold counter ran out after some audio was captured.
                self.voice_string = chunks
                print("Recode a piece of voice successfully!")
                return True

            if remaining == 0:  # chunk budget exhausted
                if chunks:
                    self.voice_string = chunks
                    print("Recode a piece of voice successfully!")
                    return True
                return False

if __name__ == "__main__":
    # Record one utterance and write it to disk only when read_audio()
    # reports that something was actually captured.
    recorder = GenAudio()
    if recorder.read_audio():
        recorder.save_wav("./mic_rec_data.pcm")

有了声音然后就可以调用语音识别的api实现文字识别了，使用百度的api如下

from aip import AipSpeech
import base64

import json
import time
import RecAudio
import pygame

# Credentials passed to the Baidu speech client (the values here are
# placeholders and must be replaced with real ones).
APP_ID = 'xxxxxxx'
API_KEY = 'xxxxxxxx'
SECRET_KEY = 'xxxxxxxx'

clientSpeech = AipSpeech(APP_ID, API_KEY, SECRET_KEY)


def get_file_content(file_path):
    """Return the raw bytes of the file at *file_path*."""
    with open(file_path, 'rb') as fp:
        return fp.read()


# Main loop: record an utterance, save it, send it for recognition and
# print the recognized text (or the error message).
while True:
    time.sleep(1)
    r = RecAudio.GenAudio()
    if not r.read_audio():
        # Nothing was captured; do not upload an empty recording.
        continue
    r.save_wav("./mic_rec_data.pcm")

    # NOTE(review): save_wav() writes a WAV container while the request
    # declares the format as 'pcm' -- confirm against the service docs
    # whether 'wav' should be sent instead.
    req = clientSpeech.asr(get_file_content('mic_rec_data.pcm'), 'pcm', 16000, {'dev_pid': 1536})

    result = ''
    if req.get('err_no') == 0:
        result = req['result'][0]
        print(result)
    else:
        # Fall back to the whole response when no error message is present.
        print(req.get('err_msg', req))

使用pygame播放声音

def play(file):
    """Play the audio file *file* with pygame and block until playback ends.

    Args:
        file: Path (or file object) handed to ``pygame.mixer.music.load``.
    """
    # Initialise the mixer with the settings used throughout this script.
    pygame.mixer.init(frequency=16000, size=-16, channels=2, buffer=4096)
    music = pygame.mixer.music
    music.load(file)
    music.play()
    # Poll once per second until the mixer reports that playback finished.
    while True:
        if not music.get_busy():
            break
        time.sleep(1)
    music.stop()

def ttsPlay(resultTxt):
    """Convert text to speech and play it, for custom dialogue replies.

    The spoken text is the fixed greeting prefix followed by *resultTxt*.

    Synthesis options used below (as described by the original author):
        vol: volume, 0-15; 5 (medium) is the default.
        per: voice selection -- 0 female, 1 male, 3 emotional synthesis
            "Du Xiaoyao", 4 emotional synthesis "Du Yaya"; the default is
            the standard female voice.

    Args:
        resultTxt: Text appended to the greeting before synthesis.
    """
    import os  # local import: this script's own import list has no `os`

    print(resultTxt)
    text = u'欢迎您,' + resultTxt
    ttsResult = clientSpeech.synthesis(text, 'zh', 1, {
        'spd': 5, 'pit': 5, 'vol': 8, 'per': 4,
    })

    # The original code only writes the result when it is not a dict, i.e.
    # a dict is treated as a failed synthesis. In that case report it and
    # skip playback instead of playing a missing or stale file.
    if isinstance(ttsResult, dict):
        print(ttsResult)
        return

    filename = './audio/' + 'hello' + '.mp3'
    # Make sure the output directory exists before writing into it.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, 'wb') as f:
        f.write(ttsResult)

    play(filename)

先弄到这里，接下来还可以试试用串口控制麦克风阵列实现定向拾音和全向拾音，这个录音效果真的很棒，不过需要注意的是供电和音频线不能同时接在树莓派上，会有噪声，需要用隔离电源或者用手机适配器提供5V电源。

使用科大讯飞麦克风阵列实现智能对话

相关推荐