AI系列网址:AI 系列 总目录
目标需求
使用录音形式,模拟微信语音聊天。按住录音,松开发送语音,并完成语音识别。
ps:百度的语音识别有60秒的时长限制,需要自己控制好录音长度。
实现方案
采用 C# WinForm 程序实现桌面版,采用 Accord 实现录音、停止等基础语音操作;点击停止按钮后,
自动调用百度语音识别接口,并将识别内容显示在文本框中。
备注:语音识别需要配套阵列麦克风(请先注册百度开发者账号)。百度语音识别接口请参考:http://ai.baidu.com/docs#/ASR-Online-Csharp-SDK/top
实现效果展示
实现过程
1、下载Accord 完成语音操作引用
accord 官方 地址:http://accord-framework.net/intro.html
官网中有示例demo,笔者的就是在示例demo上做改造的。
建立自己的项目,引用包中的dll
界面代码:
using System;
using System.Drawing;
using System.IO;
using System.Windows.Forms;
using Accord.Audio;
using Accord.Audio.Formats;
using Accord.DirectSound;
using Accord.Audio.Filters;
using Baidu.Aip.API;
namespace SampleApp
{
public partial class MainForm : Form
{
// In-memory stream that the encoder writes the recording into; it is later
// rewound for playback, saving, volume adjustment and recognition.
private MemoryStream stream;
// Audio capture source created when recording starts.
private IAudioSource source;
// Audio output device created when playback starts.
private IAudioOutput output;
// Encoder that writes captured frames into `stream`.
private WaveEncoder encoder;
// Decoder that reads frames back from `stream`.
private WaveDecoder decoder;
// Sample buffer handed to the wave chart; sized to the capture frame size.
private float[] current;
// Running totals accumulated for every captured frame in source_NewFrame.
private int frames;
private int samples;
// Accumulated length of the captured audio; displayed by updateTimer_Tick.
private TimeSpan duration;
/// <summary>
/// Builds the main window and prepares the waveform chart.
/// Note: speech recognition requires a matching array microphone.
/// </summary>
public MainForm()
{
    this.InitializeComponent();

    // Wave chart setup: simple mode with one green series named "wave"
    this.chart.SimpleMode = true;
    this.chart.AddWaveform("wave", Color.Green, 1, false);

    // Bring the buttons into their initial state
    this.updateButtons();

    // The idle-time hook below is left disabled.
    // Application.Idle += ProcessFrame;
}
// Placeholder with an event-handler signature: the body is empty and no
// visible code subscribes it to an event.
void ProcessFrame(object sender, EventArgs e) {
}
/// <summary>
/// Starts recording audio from the capture device into a new in-memory stream.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void btnRecord_Click(object sender, EventArgs e)
{
    // Every recording starts with fresh statistics; without this reset the
    // totals accumulated in source_NewFrame (and therefore the displayed
    // length) would also include all earlier recordings.
    duration = TimeSpan.Zero;
    samples = 0;
    frames = 0;

    // Create capture device (this is the core of the recorder)
    source = new AudioCaptureDevice()
    {
        DesiredFrameSize = 4096,
        // Listen on 16000 Hz (the 22050 Hz setting is kept below for reference)
        SampleRate = 16000,
        //SampleRate = 22050,
        // Single channel
        Channels = 1,
        // We will be reading 16-bit PCM
        Format = SampleFormat.Format16Bit
    };

    // Wire up some events
    source.NewFrame += source_NewFrame;
    source.AudioSourceError += source_AudioSourceError;

    // Create buffer for wavechart control
    current = new float[source.DesiredFrameSize];

    // Create stream to store the recording and the encoder that fills it
    stream = new MemoryStream();
    encoder = new WaveEncoder(stream);

    // Start capturing and refresh the button states
    source.Start();
    updateButtons();
}
/// <summary>
/// Plays back the recorded audio stream.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void btnPlay_Click(object sender, EventArgs e)
{
    // Rewind the recording and open a decoder on top of it
    stream.Position = 0;
    decoder = new WaveDecoder(stream);

    // Start from the track bar cursor when it lies inside the recording,
    // then let the track bar span the whole recording
    int cursor = trackBar1.Value;
    if (cursor < decoder.Frames)
    {
        decoder.Seek(cursor);
    }
    trackBar1.Maximum = decoder.Samples;

    // Output device that will play the recording
    int sampleRate = decoder.SampleRate;
    int channels = decoder.Channels;
    output = new AudioOutputDevice(this.Handle, sampleRate, channels);

    // Playback callbacks
    output.FramePlayingStarted += output_FramePlayingStarted;
    output.NewFrameRequested += output_NewFrameRequested;
    output.Stopped += output_PlayingFinished;

    // Start playing and refresh the button states
    output.Play();
    updateButtons();
}
/// <summary>
/// Stops recording or playback. When a recording was stopped, the captured
/// audio is sent to speech recognition and the result is shown in the text box.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void btnStop_Click(object sender, EventArgs e)
{
    // Remember whether we were recording: only then is there new audio to
    // recognize. Stopping a playback must not trigger another recognition call.
    bool wasRecording = source != null && source.IsRunning;

    // Stops both cases
    if (source != null)
    {
        // If we were recording
        source.SignalToStop();
        source.WaitForStop();
    }
    if (output != null)
    {
        // If we were playing
        output.SignalToStop();
        output.WaitForStop();
    }
    updateButtons();

    // Also zero out the buffers and screen (the buffer only exists once a
    // recording has been started)
    if (current != null)
    {
        Array.Clear(current, 0, current.Length);
        updateWaveform(current, current.Length);
    }

    if (!wasRecording || stream == null)
    {
        return;
    }

    // NOTE(review): assumes SpeechAPI offers an AsrData overload that accepts
    // the recorded stream — confirm. The call runs synchronously on this thread.
    SpeechAPI speechApi = new SpeechAPI();
    string result = speechApi.AsrData(stream, "wav");
    tb_result.Text = "语音识别结果:" + result;
}
/// <summary>
/// Callback invoked when the audio source reports an error.
/// </summary>
/// <param name="sender">The audio source that raised the error.</param>
/// <param name="e">Error details; its description becomes the exception message.</param>
/// <exception cref="InvalidOperationException">Always thrown, carrying the reported description.</exception>
private void source_AudioSourceError(object sender, AudioSourceErrorEventArgs e)
{
    // Throw a specific exception type instead of the bare System.Exception;
    // handlers that catch Exception keep working.
    throw new InvalidOperationException(e.Description);
}
/// <summary>
/// Called whenever a new input audio frame is available: shows it on the
/// wave chart, appends it to the recording and updates the running totals.
/// </summary>
/// <param name="sender">The audio source that produced the frame.</param>
/// <param name="eventArgs">Carries the captured signal.</param>
private void source_NewFrame(object sender, NewFrameEventArgs eventArgs)
{
    var captured = eventArgs.Signal;

    // Display: copy into the chart buffer and redraw
    captured.CopyTo(current);
    updateWaveform(current, captured.Length);

    // Storage: append the frame to the encoded stream
    encoder.Encode(captured);

    // Statistics
    duration = duration + captured.Duration;
    samples = samples + captured.Samples;
    frames = frames + captured.Length;
}
/// <summary>
/// Called when the output device starts playing a frame: moves the track bar
/// cursor and draws the frame on the wave chart.
/// </summary>
/// <param name="sender">The output device.</param>
/// <param name="e">Index and length of the frame being played.</param>
private void output_FramePlayingStarted(object sender, PlayFrameEventArgs e)
{
    updateTrackbar(e.FrameIndex);

    // Only draw when the frame ends before the end of the recording
    if (e.FrameIndex + e.Count >= decoder.Frames)
    {
        return;
    }

    // Decode the frame for display, then put the decoder back where the
    // playback left it
    int playbackPosition = decoder.Position;
    decoder.Seek(e.FrameIndex);
    var playing = decoder.Decode(e.Count);
    decoder.Seek(playbackPosition);

    updateWaveform(playing.ToFloat(), playing.Length);
}
/// <summary>
/// Called when the output device has stopped: refreshes the buttons and
/// blanks the wave chart.
/// </summary>
/// <param name="sender">The output device.</param>
/// <param name="e">Event data (unused).</param>
private void output_PlayingFinished(object sender, EventArgs e)
{
    updateButtons();

    // Zero the chart buffer and redraw it
    int bufferLength = current.Length;
    Array.Clear(current, 0, bufferLength);
    updateWaveform(current, bufferLength);
}
/// <summary>
/// Called when the output device asks for more audio: decodes the requested
/// number of frames into the device buffer, or requests a stop when the
/// decoder returns nothing.
/// </summary>
/// <param name="sender">The output device.</param>
/// <param name="e">Request data; receives the frame index, frame count and samples.</param>
private void output_NewFrameRequested(object sender, NewFrameRequestedEventArgs e)
{
    e.FrameIndex = decoder.Position;

    var decoded = decoder.Decode(e.Frames);
    if (decoded != null)
    {
        // Report how many frames were actually decoded and hand them over
        e.Frames = decoded.Length;
        decoded.CopyTo(e.Buffer);
    }
    else
    {
        // Nothing left to play
        e.Stop = true;
    }
}
/// <summary>
/// Draws the given samples on the "wave" series of the chart, marshalling the
/// call to the UI thread when required.
/// </summary>
/// <param name="samples">Samples to display.</param>
/// <param name="length">Length passed on to the chart together with the samples.</param>
private void updateWaveform(float[] samples, int length)
{
    if (InvokeRequired)
    {
        BeginInvoke(new Action(() =>
        {
            chart.UpdateWaveform("wave", samples, length);
        }));
    }
    else
    {
        // Draw the samples that were passed in. This branch used to draw the
        // `current` field instead, silently ignoring the argument.
        chart.UpdateWaveform("wave", samples, length);
    }
}
/// <summary>
/// Moves the track bar cursor to the given value, clamped to the track bar's
/// range, marshalling the call to the UI thread when required.
/// </summary>
/// <param name="value">Requested cursor position.</param>
private void updateTrackbar(int value)
{
    // The clamp is evaluated when the delegate runs, i.e. on the UI thread
    Action moveCursor = delegate
    {
        int notAboveMaximum = Math.Min(trackBar1.Maximum, value);
        trackBar1.Value = Math.Max(trackBar1.Minimum, notAboveMaximum);
    };

    if (InvokeRequired)
    {
        BeginInvoke(moveCursor);
    }
    else
    {
        moveCursor();
    }
}
/// <summary>
/// Enables or disables the controls according to the current state:
/// recording, playing or idle. Re-posts itself to the UI thread when required.
/// </summary>
private void updateButtons()
{
    if (InvokeRequired)
    {
        BeginInvoke((Action)updateButtons);
        return;
    }

    // Recording takes precedence; playback is only checked when not recording
    bool recording = source != null && source.IsRunning;
    bool playing = !recording && output != null && output.IsRunning;
    bool idle = !(recording || playing);

    // Backward / forward are disabled in every state
    btnBwd.Enabled = false;
    btnFwd.Enabled = false;

    // Play needs a recording and an idle device; stop is the only action
    // offered while busy; record is only offered while idle
    btnPlay.Enabled = idle && stream != null;
    btnStop.Enabled = !idle;
    btnRecord.Enabled = idle;

    if (idle)
    {
        // Usable once a decoder exists; the cursor returns to the start
        trackBar1.Enabled = decoder != null;
        trackBar1.Value = 0;
    }
    else
    {
        // Enabled during playback, disabled during recording
        trackBar1.Enabled = playing;
    }
}
/// <summary>
/// Signals the capture source and the output device to stop when the form
/// has been closed.
/// </summary>
/// <param name="sender">The form.</param>
/// <param name="e">Event data (unused).</param>
private void MainFormFormClosed(object sender, FormClosedEventArgs e)
{
    if (source != null)
    {
        source.SignalToStop();
    }

    if (output != null)
    {
        output.SignalToStop();
    }
}
/// <summary>
/// Writes the recorded stream to the file chosen in the save dialog.
/// </summary>
/// <param name="sender">The save dialog.</param>
/// <param name="e">Event data (unused).</param>
private void saveFileDialog1_FileOk(object sender, System.ComponentModel.CancelEventArgs e)
{
    // Nothing has been recorded yet: return before OpenFile creates an empty
    // file and WriteTo fails on the missing stream.
    if (stream == null)
    {
        return;
    }

    // `using` closes the file even when the write throws
    using (Stream fileStream = saveFileDialog1.OpenFile())
    {
        stream.WriteTo(fileStream);
    }
}
/// <summary>
/// Opens the save dialog as a modal dialog owned by this form.
/// </summary>
/// <param name="sender">The menu item.</param>
/// <param name="e">Event data (unused).</param>
private void saveToolStripMenuItem_Click(object sender, EventArgs e)
{
    // The dialog result is not inspected here
    this.saveFileDialog1.ShowDialog(this);
}
/// <summary>
/// Refreshes the label that shows the length of the recording.
/// </summary>
/// <param name="sender">The timer.</param>
/// <param name="e">Event data (unused).</param>
private void updateTimer_Tick(object sender, EventArgs e)
{
    // TotalSeconds is the whole duration as a fractional number. Seconds is
    // only the 0-59 integer component, so it wrapped around every minute and
    // never produced the decimals the "00.00" format asks for.
    lbLength.Text = String.Format("Length: {0:00.00} sec.", duration.TotalSeconds);
}
/// <summary>
/// Shows the about box as a modal dialog owned by this form.
/// </summary>
/// <param name="sender">The menu item.</param>
/// <param name="e">Event data (unused).</param>
private void aboutToolStripMenuItem_Click(object sender, EventArgs e)
{
    // A form shown with ShowDialog is not disposed when it closes, so it is
    // released explicitly here.
    // NOTE(review): assumes AboutBox is a Form (IDisposable) — confirm.
    using (AboutBox about = new AboutBox())
    {
        about.ShowDialog(this);
    }
}
/// <summary>
/// Closes the main form.
/// </summary>
/// <param name="sender">The menu item.</param>
/// <param name="e">Event data (unused).</param>
private void closeToolStripMenuItem_Click(object sender, EventArgs e)
{
    this.Close();
}
/// <summary>
/// Applies a volume factor of 1.25 to the recording.
/// </summary>
/// <param name="sender">The button.</param>
/// <param name="e">Event data (unused).</param>
private void btnIncreaseVolume_Click(object sender, EventArgs e)
{
    const float louder = 1.25f;
    this.adjustVolume(louder);
}
/// <summary>
/// Applies a volume factor of 0.75 to the recording.
/// </summary>
/// <param name="sender">The button.</param>
/// <param name="e">Event data (unused).</param>
private void btnDecreaseVolume_Click(object sender, EventArgs e)
{
    const float quieter = 0.75f;
    this.adjustVolume(quieter);
}
/// <summary>
/// Decodes the whole recording, applies a volume filter to it and encodes the
/// result back into the same stream from the beginning.
/// </summary>
/// <param name="value">Value passed to the volume filter (the callers use 1.25 and 0.75).</param>
private void adjustVolume(float value)
{
    // The volume buttons can be pressed before anything has been recorded;
    // without a stream there is nothing to adjust.
    if (stream == null)
    {
        return;
    }

    // Read the complete recording
    stream.Seek(0, SeekOrigin.Begin);
    decoder = new WaveDecoder(stream);
    var signal = decoder.Decode();

    // Change the volume of the decoded signal in place
    var volume = new VolumeFilter(value);
    volume.ApplyInPlace(signal);

    // Write the adjusted signal back over the recording
    stream.Seek(0, SeekOrigin.Begin);
    encoder = new WaveEncoder(stream);
    encoder.Encode(signal);
}
}
}
百度语音识别接口:
说明:百度已经提供 SDK,这里只对支持的语音格式做说明。
支持的语音格式
原始 PCM 的录音参数必须符合 8k/16k 采样率、16bit 位深、单声道,支持的格式有:pcm(不压缩)、wav(不压缩,pcm编码)、amr(压缩格式)。
/// <summary>
/// Reads an audio file and submits its bytes to the recognition client.
/// </summary>
/// <param name="filePath">Path of the audio file to recognize.</param>
/// <param name="format">Audio format name passed to the recognizer; defaults to "pcm".</param>
/// <param name="rate">Sample rate passed to the recognizer; defaults to 16000.</param>
/// <returns>The recognition result converted to a string.</returns>
public string AsrData(string filePath, string format = "pcm", int rate = 16000)
{
    var data = File.ReadAllBytes(filePath);
    // Forward the caller's sample rate; it used to be ignored in favour of a
    // hard-coded 16000.
    var result = _asrClient.Recognize(data, format, rate);
    return result.ToString();
}
结果评测:
对于普通麦克风,语音识别效果不好,需要配套阵列麦克风才可以。