由于如鹏网工作需要,要对大语音文件(长度超过5分钟)进行“语音转文字”的语音识别,试了百度和科大讯飞的接口,对大语音文件的识别都支持的不好,本来想找开源的语音识别项目,发现都要自己去做数据集的训练,不仅麻烦,而且训练不够的话识别准确度就会太低。最后试了微软的Azure认知服务(CognitiveServices Speech),感觉非常好用。
1、首先要到azure去申请一个账号,azure上提供了免费的试用账号,具体怎么申请很简单,我就不说了。
2、新建一个.Net项目
Azure认知服务对于.Net、Java、C++、Python、js等主流语言都有支持。我这里用.Net举例子,其他语言用法点击上图中的【快速入门指南】
Nuget安装SDK:Install-Package Microsoft.CognitiveServices.Speech
3、首先是一个工具类Helper.cs
它的作用是把大wav音频文件转换为“音频拉流”PullAudioInputStreamCallback
这个代码是从Azure的GitHub官方例子中Copy出来的。
using Microsoft.CognitiveServices.Speech.Audio;
using System.Diagnostics;
using System.IO;
namespace Demo
{
/// <summary>
/// Utility methods that open a wav file, parse its RIFF header, and expose the
/// raw PCM sample data as a stream the Speech SDK can pull from.
/// </summary>
public class Helper
{
    /// <summary>
    /// Opens a wav file and returns an <see cref="AudioConfig"/> whose stream is
    /// positioned at the start of the raw sample data (header already consumed).
    /// </summary>
    /// <param name="filename">Path of the wav file to open.</param>
    public static AudioConfig OpenWavFile(string filename)
    {
        BinaryReader reader = new BinaryReader(File.OpenRead(filename));
        try
        {
            return OpenWavFile(reader);
        }
        catch
        {
            // Don't leak the file handle if the header is malformed.
            reader.Dispose();
            throw;
        }
    }

    /// <summary>
    /// Consumes the wav header from <paramref name="reader"/> and wraps the rest
    /// of the stream (the bare PCM samples) in an <see cref="AudioConfig"/>.
    /// </summary>
    /// <param name="reader">Reader positioned at the start of a wav file.</param>
    public static AudioConfig OpenWavFile(BinaryReader reader)
    {
        AudioStreamFormat format = readWaveHeader(reader);
        return AudioConfig.FromStreamInput(new BinaryAudioStreamReader(reader), format);
    }

    /// <summary>
    /// Opens a wav file and returns a pull-stream reader positioned at the start
    /// of the raw sample data. The parsed format is discarded; the header is read
    /// only so it does not leak into subsequent <c>Read</c> calls.
    /// </summary>
    /// <param name="filename">Path of the wav file to open.</param>
    public static BinaryAudioStreamReader CreateWavReader(string filename)
    {
        BinaryReader reader = new BinaryReader(File.OpenRead(filename));
        try
        {
            // Advance past the header; the format itself is not needed here.
            readWaveHeader(reader);
            return new BinaryAudioStreamReader(reader);
        }
        catch
        {
            // Don't leak the file handle if the header is malformed.
            reader.Dispose();
            throw;
        }
    }

    /// <summary>
    /// Parses the RIFF/WAVE header and leaves <paramref name="reader"/> positioned
    /// at the first byte of the "data" chunk (the raw samples).
    /// Unlike the naive parser, this walks the sub-chunks, so files that carry
    /// extra chunks (e.g. "LIST"/"INFO" metadata) between "fmt " and "data"
    /// are handled instead of tripping an assert.
    /// </summary>
    /// <param name="reader">Reader positioned at the start of a wav file.</param>
    /// <returns>The PCM format described by the "fmt " chunk.</returns>
    public static AudioStreamFormat readWaveHeader(BinaryReader reader)
    {
        char[] data = new char[4];

        // RIFF container header: "RIFF" <fileSize> "WAVE"
        reader.Read(data, 0, 4);
        Trace.Assert((data[0] == 'R') && (data[1] == 'I') && (data[2] == 'F') && (data[3] == 'F'), "Wrong wav header");
        reader.ReadInt32(); // overall chunk size; unused, but must be consumed
        reader.Read(data, 0, 4);
        Trace.Assert((data[0] == 'W') && (data[1] == 'A') && (data[2] == 'V') && (data[3] == 'E'), "Wrong wav tag in wav header");

        uint samplesPerSecond = 0;
        ushort channels = 0;
        ushort bitsPerSample = 0;
        bool formatFound = false;

        // Walk the sub-chunks until the "data" chunk; skip anything unknown.
        while (true)
        {
            reader.Read(data, 0, 4);
            int chunkSize = reader.ReadInt32();
            string tag = new string(data);
            if (tag == "fmt ")
            {
                reader.ReadUInt16();                       // wFormatTag
                channels = reader.ReadUInt16();
                samplesPerSecond = reader.ReadUInt32();
                reader.ReadUInt32();                       // nAvgBytesPerSec
                reader.ReadUInt16();                       // nBlockAlign
                bitsPerSample = reader.ReadUInt16();
                // The first 16 bytes are the classic WAVEFORMAT; anything past
                // that (cbSize and extension) is irrelevant here, so skip it.
                if (chunkSize > 16)
                    reader.ReadBytes(chunkSize - 16);
                formatFound = true;
            }
            else if (tag == "data")
            {
                // Reader is now positioned at the raw sample data.
                break;
            }
            else
            {
                // Unknown chunk (e.g. "LIST") — skip its payload entirely.
                reader.ReadBytes(chunkSize);
            }
        }

        Trace.Assert(formatFound, "Missing fmt chunk in wav header");
        return AudioStreamFormat.GetWaveFormatPCM(samplesPerSecond, (byte)bitsPerSample, (byte)channels);
    }
}
/// <summary>
/// Adapts a <see cref="System.IO.BinaryReader"/> to the Speech SDK's native
/// pull-audio-stream callback interface.
/// </summary>
public sealed class BinaryAudioStreamReader : PullAudioInputStreamCallback
{
    private readonly System.IO.BinaryReader _source;
    private bool _disposed;

    /// <summary>
    /// Creates a reader over the given <paramref name="reader"/>.
    /// Note: the stream must contain bare sample data only — no container
    /// (wave header etc.), which must already have been consumed.
    /// </summary>
    /// <param name="reader">The underlying reader supplying the audio bytes.</param>
    public BinaryAudioStreamReader(System.IO.BinaryReader reader)
    {
        _source = reader;
    }

    /// <summary>
    /// Creates a reader over the given <paramref name="stream"/>.
    /// Note: the stream must contain bare sample data only — no container
    /// (wave header etc.), which must already have been consumed.
    /// </summary>
    /// <param name="stream">The underlying stream supplying the audio bytes.</param>
    public BinaryAudioStreamReader(System.IO.Stream stream)
        : this(new System.IO.BinaryReader(stream))
    {
    }

    /// <summary>
    /// Fills <paramref name="dataBuffer"/> with up to <paramref name="size"/>
    /// bytes from the underlying stream.
    /// </summary>
    /// <param name="dataBuffer">The buffer to fill.</param>
    /// <param name="size">Maximum number of bytes to read.</param>
    /// <returns>The number of bytes read, or 0 once the stream is exhausted.
    /// Blocks until data becomes available if none is immediately ready.</returns>
    public override int Read(byte[] dataBuffer, uint size)
    {
        return _source.Read(dataBuffer, 0, (int)size);
    }

    /// <summary>
    /// Releases the underlying reader. <paramref name="disposing"/> is true when
    /// called from <see cref="IDisposable.Dispose"/>, false from the finalizer.
    /// </summary>
    /// <param name="disposing">Flag to request disposal.</param>
    protected override void Dispose(bool disposing)
    {
        if (_disposed)
        {
            return;
        }

        if (disposing)
        {
            _source.Dispose();
        }

        _disposed = true;
        base.Dispose(disposing);
    }
}
}
4、编写主体识别代码:
using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;
using System;
using System.Threading.Tasks;
namespace Demo
{
/// <summary>
/// Console entry point: streams a large wav file into the Azure Speech SDK's
/// continuous recognizer via a push stream and prints recognized phrases.
/// </summary>
class SpeechToTestMain
{
    static void Main(string[] args)
    {
        // GetAwaiter().GetResult() surfaces the original exception instead of
        // wrapping it in an AggregateException the way .Wait() would.
        T1().GetAwaiter().GetResult();
        Console.WriteLine("ok");
        Console.ReadKey();
    }

    static async Task T1()
    {
        var file = @"E:\1.wav";
        var config = SpeechConfig.FromSubscription("这里填写在第一步中拿到的密钥", "westus");
        // Set config.SpeechRecognitionLanguage to pick the recognition language
        // (the default is English).
        // config.OutputFormat = OutputFormat.Detailed; lets recognizer.Recognized
        // use var best = e.Result.Best(); to get alternative renderings of a
        // phrase — e.g. whether a number is written as "3" or "three".

        // RunContinuationsAsynchronously keeps the SDK's callback threads from
        // running our continuations inline (deadlock/stall hazard).
        var stopRecognition = new TaskCompletionSource<int>(TaskCreationOptions.RunContinuationsAsynchronously);

        // Do not use AudioConfig.FromWavFileInput here: it cannot handle
        // large wav files.
        using (var pushStream = AudioInputStream.CreatePushStream())
        using (var audioInput = AudioConfig.FromStreamInput(pushStream))
        using (var recognizer = new SpeechRecognizer(config, audioInput))
        {
            recognizer.Recognized += (s, e) =>
            {
                if (e.Result.Reason == ResultReason.RecognizedSpeech)
                {
                    Console.WriteLine($"RECOGNIZED: Text={e.Result.Text} Duration={e.Result.Duration} OffsetInTicks={e.Result.OffsetInTicks}");
                }
                else if (e.Result.Reason == ResultReason.NoMatch)
                {
                    Console.WriteLine($"NOMATCH: Speech could not be recognized.");
                }
            };

            recognizer.Canceled += (s, e) =>
            {
                Console.WriteLine($"CANCELED: Reason={e.Reason}");
                if (e.Reason == CancellationReason.Error)
                {
                    Console.WriteLine($"CANCELED: ErrorCode={e.ErrorCode}");
                    Console.WriteLine($"CANCELED: ErrorDetails={e.ErrorDetails}");
                }
                stopRecognition.TrySetResult(0);
            };

            recognizer.SessionStarted += (s, e) =>
            {
                Console.WriteLine("\nSession started event.");
            };

            recognizer.SessionStopped += (s, e) =>
            {
                Console.WriteLine("\nSession stopped event.");
                stopRecognition.TrySetResult(0);
            };

            // Starts continuous recognition. Uses StopContinuousRecognitionAsync() to stop recognition.
            await recognizer.StartContinuousRecognitionAsync().ConfigureAwait(false);

            // Open and read the wave file, pushing the buffers into the recognizer.
            using (BinaryAudioStreamReader reader = Helper.CreateWavReader(file))
            {
                byte[] buffer = new byte[1000];
                while (true)
                {
                    var readSamples = reader.Read(buffer, (uint)buffer.Length);
                    if (readSamples == 0)
                    {
                        break;
                    }
                    pushStream.Write(buffer, readSamples);
                }
            }
            pushStream.Close();

            // Await completion instead of Task.WaitAny: blocking inside an
            // async method ties up a thread-pool thread for no benefit.
            await stopRecognition.Task.ConfigureAwait(false);

            // Stops recognition.
            await recognizer.StopContinuousRecognitionAsync().ConfigureAwait(false);
        }
    }
}
}
可以看到在recognizer.Recognized事件中,我们可以拿到识别出来的一段段的话,一个比较长的语音会分为多次触发recognizer.Recognized事件识别出来。e.Result.Text属性是识别出来的文本,e.Result.Duration是识别出来的这段话的长度,e.Result.OffsetInTicks是识别出来这句话在整个音频中的位置,可以用 TimeSpan.FromTicks(e.Result.OffsetInTicks)转换为TimeSpan类型。
5、注意:包括Azure认知服务在内的几乎所有语音识别引擎都只支持wav文件,不支持mp3等格式的文件。而且需要注意的是wav采样率必须是16000Hz,否则OffsetInTicks时间将会不准确。
可以用NAudio这个组件把mp3转换为wav文件,NAudio是全托管代码,不像ffmpeg是单独运行一个进程,无论是调试还是其他的都很麻烦。
下面是使用NAudio进行mp3转为wav的代码,再次强调:采样率必须是16000Hz。
// Decode the mp3 and convert it to 16 kHz mono PCM — the sample rate the
// recognizer needs for accurate OffsetInTicks values (see note above the snippet).
// NOTE(review): snippet assumes `mp3file` and `wavFile` path variables are in
// scope; WaveFormatConversionStream uses the OS ACM codecs, which may not
// support a one-step conversion from every mp3 sample rate — verify on target files.
using (Mp3FileReader reader = new Mp3FileReader(mp3file))
using (WaveStream pcmStream = new WaveFormatConversionStream(new WaveFormat(16000, 1),reader))
{
// Streams the converted PCM data into a wav file with the proper header.
WaveFileWriter.CreateWaveFile(wavFile, pcmStream);
}