|
// Copyright (c) 2017 Personal (Binbin Zhang)
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
#ifndef FRONTEND_FEATURE_PIPELINE_H_
|
|
#define FRONTEND_FEATURE_PIPELINE_H_
|
|
|
|
#include <condition_variable>
#include <cstdint>
#include <limits>
#include <mutex>
#include <queue>
#include <string>
#include <vector>

#include "fbank.h"
#include "utils/blocking_queue.h"
|
|
|
|
namespace wenet {
|
|
|
|
// Selects which preset FeaturePipelineConfig applies to its front-end
// parameters (window, mel scale, log base, normalization, ...).
enum class FeatureType {
  kKaldi = 0,    // Kaldi-style preset
  kWhisper = 1,  // Whisper-style preset
};
|
|
|
|
struct FeaturePipelineConfig {
|
|
int num_bins;
|
|
int sample_rate;
|
|
int frame_length;
|
|
int frame_shift;
|
|
float low_freq;
|
|
bool pre_emphasis;
|
|
bool scale_input_to_unit;
|
|
float log_floor;
|
|
LogBase log_base;
|
|
WindowType window_type;
|
|
MelType mel_type;
|
|
NormalizationType norm_type;
|
|
|
|
FeaturePipelineConfig(int num_bins, int sample_rate,
|
|
FeatureType feat_type = FeatureType::kKaldi)
|
|
: num_bins(num_bins), // 80 dim fbank
|
|
sample_rate(sample_rate) { // 16k sample rate
|
|
frame_length = sample_rate / 1000 * 25; // frame length 25ms
|
|
frame_shift = sample_rate / 1000 * 10; // frame shift 10ms
|
|
if (feat_type == FeatureType::kKaldi) {
|
|
low_freq = 20.0;
|
|
pre_emphasis = true;
|
|
log_floor = std::numeric_limits<float>::epsilon();
|
|
log_base = LogBase::kBaseE;
|
|
window_type = WindowType::kPovey;
|
|
mel_type = MelType::kHTK;
|
|
norm_type = NormalizationType::kKaldi;
|
|
scale_input_to_unit = false;
|
|
} else if (feat_type == FeatureType::kWhisper) {
|
|
low_freq = 0.0;
|
|
pre_emphasis = false;
|
|
log_floor = 1e-10;
|
|
log_base = LogBase::kBase10;
|
|
window_type = WindowType::kHanning;
|
|
mel_type = MelType::kSlaney;
|
|
scale_input_to_unit = true;
|
|
norm_type = NormalizationType::kWhisper;
|
|
}
|
|
}
|
|
|
|
void Info() const {
|
|
google::LOG(INFO) << "feature pipeline config"
|
|
<< " num_bins " << num_bins << " frame_length " << frame_length
|
|
<< " frame_shift " << frame_shift << " low_freq " << low_freq
|
|
<< " preemphasis " << pre_emphasis << " log_floor " << log_floor
|
|
<< " log_base " << int(log_base) << " window_type "
|
|
<< int(window_type) << " mel_type " << int(mel_type)
|
|
<< " norm_type " << int(norm_type);
|
|
}
|
|
};
|
|
|
|
// Typically, FeaturePipeline is used in two threads: one thread A calls
|
|
// AcceptWaveform() to add raw wav data and set_input_finished() to signal
|
|
// the end of input wav, another thread B (decoder thread) calls Read() to
|
|
// consume features. So a BlockingQueue is used to make this class thread safe.
|
|
|
|
// The Read() is designed as a blocking method when there is no feature
|
|
// in feature_queue_ and the input is not finished.
|
|
|
|
// See bin/decoder_main.cc, websocket/websocket_server.cc and
|
|
// decoder/torch_asr_decoder.cc for usage
|
|
|
|
class FeaturePipeline {
|
|
public:
|
|
explicit FeaturePipeline(const FeaturePipelineConfig& config);
|
|
|
|
// The feature extraction is done in AcceptWaveform().
|
|
void AcceptWaveform(const float* pcm, const int size);
|
|
void AcceptWaveform(const int16_t* pcm, const int size);
|
|
|
|
// Current extracted frames number.
|
|
int num_frames() const { return num_frames_; }
|
|
int feature_dim() const { return feature_dim_; }
|
|
const FeaturePipelineConfig& config() const { return config_; }
|
|
|
|
// The caller should call this method when speech input is end.
|
|
// Never call AcceptWaveform() after calling set_input_finished() !
|
|
void set_input_finished();
|
|
bool input_finished() const { return input_finished_; }
|
|
|
|
// Return False if input is finished and no feature could be read.
|
|
// Return True if a feature is read.
|
|
// This function is a blocking method. It will block the thread when
|
|
// there is no feature in feature_queue_ and the input is not finished.
|
|
bool ReadOne(std::vector<float>* feat);
|
|
|
|
// Read #num_frames frame features.
|
|
// Return False if less than #num_frames features are read and the
|
|
// input is finished.
|
|
// Return True if #num_frames features are read.
|
|
// This function is a blocking method when there is no feature
|
|
// in feature_queue_ and the input is not finished.
|
|
bool Read(int num_frames, std::vector<std::vector<float>>* feats);
|
|
|
|
void Reset();
|
|
bool IsLastFrame(int frame) const {
|
|
return input_finished_ && (frame == num_frames_ - 1);
|
|
}
|
|
|
|
int NumQueuedFrames() const { return feature_queue_.Size(); }
|
|
|
|
private:
|
|
const FeaturePipelineConfig& config_;
|
|
int feature_dim_;
|
|
Fbank fbank_;
|
|
|
|
BlockingQueue<std::vector<float>> feature_queue_;
|
|
int num_frames_;
|
|
bool input_finished_;
|
|
|
|
// The feature extraction is done in AcceptWaveform().
|
|
// This waveform sample points are consumed by frame size.
|
|
// The residual waveform sample points after framing are
|
|
// kept to be used in next AcceptWaveform() calling.
|
|
std::vector<float> remained_wav_;
|
|
|
|
// Used to block the Read when there is no feature in feature_queue_
|
|
// and the input is not finished.
|
|
mutable std::mutex mutex_;
|
|
std::condition_variable finish_condition_;
|
|
};
|
|
|
|
} // namespace wenet
|
|
|
|
#endif // FRONTEND_FEATURE_PIPELINE_H_
|