下面是对 Kaldi 音频文件解码代码的改造,用于实时流式解码。
// Decodes one utterance segment from this->audio_source with the online
// nnet3 decoder.
//
// A fresh feature pipeline, decoder and silence-weighting object are built
// for every call; the pipeline is seeded from this->adaptation_state. Audio is
// pulled in chunks of `chunk_length` samples until the source reports no more
// data or (when this->do_endpointing is set) the decoder detects an endpoint.
// While decoding, a best-path partial result is passed to
// DecodePartialResult() roughly every `traceback_period_secs` seconds of
// audio. After the loop, if more than 0.1 s of audio was consumed, the final
// lattice is passed to DecoderFinalResult().
//
// Params:
//   more_data              out: result of the last audio_source->Read() call;
//                          presumably false means the source is exhausted
//                          (confirm against the audio source implementation).
//   chunk_length           number of samples requested per Read().
//   traceback_period_secs  interval, in seconds of decoded audio, between
//                          partial-result tracebacks.
//
// NOTE(review): the function is declared `static` yet dereferences `this`;
// it presumably needs to be a non-static member of the decoder class.
// Left unchanged here to keep the declared interface intact.
static void DecodeSegment(bool &more_data, int32 chunk_length,
                          BaseFloat traceback_period_secs) {
  OnlineNnet2FeaturePipeline feature_pipeline(*(this->feature_info));
  feature_pipeline.SetAdaptationState(*(this->adaptation_state));

  SingleUtteranceNnet3Decoder decoder(*(this->decoder_opts),
                                      *(this->trans_model),
                                      *(this->decodable_info_nnet3),
                                      *(this->decode_fst),
                                      &feature_pipeline);
  OnlineSilenceWeighting silence_weighting(*(this->trans_model),
                                           *(this->silence_weighting_config));

  Vector<BaseFloat> wave_part = Vector<BaseFloat>(chunk_length);
  std::vector<std::pair<int32, BaseFloat> > delta_weights;
  BaseFloat last_traceback = 0.0;
  BaseFloat num_seconds_decoded = 0.0;

  while (true) {
    more_data = this->audio_source->Read(&wave_part);
    feature_pipeline.AcceptWaveform(this->sample_rate, wave_part);
    if (!more_data) {
      // No further audio will arrive for this segment.
      feature_pipeline.InputFinished();
    }

    // Feed the decoder's current traceback into the silence weighting and
    // pass the resulting per-frame weights to the i-vector feature, if any.
    if (silence_weighting.Active() &&
        feature_pipeline.IvectorFeature() != NULL) {
      silence_weighting.ComputeCurrentTraceback(decoder.Decoder());
      silence_weighting.GetDeltaWeights(
          feature_pipeline.IvectorFeature()->NumFramesReady(),
          &delta_weights);
      feature_pipeline.IvectorFeature()->UpdateFrameWeights(delta_weights);
    }

    decoder.AdvanceDecoding();

    // Duration of the chunk just consumed, counted both for this segment and
    // for the running total kept on the object.
    const double chunk_secs = 1.0 * wave_part.Dim() / this->sample_rate;
    num_seconds_decoded += chunk_secs;
    this->total_time_decoded += chunk_secs;

    if (!more_data) {
      break;
    }
    if (this->do_endpointing
        && (decoder.NumFramesDecoded() > 0)
        && decoder.EndpointDetected(*(this->endpoint_config))) {
      break;
    }

    // Emit a partial (best-path) hypothesis once per traceback period.
    if ((num_seconds_decoded - last_traceback > traceback_period_secs)
        && (decoder.NumFramesDecoded() > 0)) {
      Lattice lat;
      decoder.GetBestPath(false, &lat);
      DecodePartialResult(lat);
      last_traceback += traceback_period_secs;
    }
  }

  if (num_seconds_decoded > 0.1) {
    decoder.FinalizeDecoding();
    CompactLattice clat;
    bool end_of_utterance = true;
    decoder.GetLattice(end_of_utterance, &clat);
    int32 num_words = 0;
    DecoderFinalResult(clat, &num_words);
    if (num_words >= this->min_words_for_ivector) {
      // Only update adaptation state if the utterance contained enough words
      feature_pipeline.GetAdaptationState(this->adaptation_state);
    }
  } else {
    // KALDI_VLOG takes an integer verbosity level and is streamed into;
    // passing the message as the macro argument does not log it.
    KALDI_VLOG(1) << "Less than 0.1 seconds decoded, discarding";
  }
}