You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

78 lines
2.6 KiB

// Copyright (c) 2021 Mobvoi Inc (Zhendong Peng)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef DECODER_CTC_ENDPOINT_H_
#define DECODER_CTC_ENDPOINT_H_
#include <vector>
namespace wenet {
/// One endpointing rule: a trio of thresholds describing when decoding may
/// stop. The rule descriptions in CtcEndpointConfig express the two integer
/// thresholds in milliseconds.
/// NOTE(review): the exact comparison logic lives in the implementation file,
/// which is not visible here -- confirm how the fields are combined.
struct CtcEndpointRule {
  /// When true, the rule presumably applies only once something has been
  /// decoded (cf. "after decoding something" in CtcEndpointConfig).
  bool must_decoded_sth;
  /// Trailing-silence threshold (ms, per the CtcEndpointConfig rule comments).
  int min_trailing_silence;
  /// Utterance-length threshold (ms, per the CtcEndpointConfig rule comments).
  int min_utterance_length;

  /// Builds a rule. Every argument is optional; the defaults are
  /// (true, 1000, 0).
  CtcEndpointRule(bool must_decoded_sth = true,
                  int min_trailing_silence = 1000,
                  int min_utterance_length = 0)
      : must_decoded_sth{must_decoded_sth},
        min_trailing_silence{min_trailing_silence},
        min_utterance_length{min_utterance_length} {}
};
/// Configuration for CTC-based endpoint detection: how silence is recognised
/// plus the three rules that can end decoding.
struct CtcEndpointConfig {
  /// For endpointing purposes the CTC blank symbol is treated as silence.
  int blank = 0;                 // id of the blank symbol
  float blank_threshold = 0.8f;  // blank threshold for a frame to be silence

  /// Three rules are supported, and decoding terminates as soon as ANY of
  /// them evaluates to "true". Adding further rules requires changing this
  /// code. To disable a rule, set its silence-timeout to a very large number.

  /// rule1: times out after 5000 ms of silence, even if nothing was decoded.
  CtcEndpointRule rule1;
  /// rule2: times out after 1000 ms of silence once something was decoded.
  CtcEndpointRule rule2;
  /// rule3: times out once the utterance is 20000 ms long, regardless of
  /// anything else.
  CtcEndpointRule rule3;

  /// Installs the default thresholds described on each rule above.
  CtcEndpointConfig()
      : rule1{false, 5000, 0},
        rule2{true, 1000, 0},
        rule3{false, 0, 20000} {}
};
/// Endpoint detector driven by CTC output and the rules in a
/// CtcEndpointConfig. The constructor, Reset() and IsEndpoint() are only
/// declared here; their definitions live outside this header.
class CtcEndpoint {
 public:
  /// Creates a detector from the given configuration.
  explicit CtcEndpoint(const CtcEndpointConfig& config);

  /// Defined out of line.
  /// NOTE(review): presumably clears the per-utterance frame counters --
  /// confirm against the implementation.
  void Reset();

  /// Returns true if this set of endpointing rules decides that decoding
  /// should be terminated.
  /// @param ctc_log_probs CTC log-probabilities as nested float vectors
  ///        (presumably one inner vector per frame -- TODO confirm).
  /// @param decoded_something whether the decoder has produced any output
  ///        (presumably consumed by the must_decoded_sth rule flag).
  bool IsEndpoint(const std::vector<std::vector<float>>& ctc_log_probs,
                  bool decoded_something);

  /// Stores the frame shift, in milliseconds, used by this detector.
  void frame_shift_in_ms(int frame_shift_in_ms) { frame_shift_in_ms_ = frame_shift_in_ms; }

 private:
  CtcEndpointConfig config_;
  // Starts at -1 until the setter above is called (presumably a "not yet
  // configured" marker -- confirm against the implementation).
  int frame_shift_in_ms_{-1};
  int num_frames_decoded_{0};
  int num_frames_trailing_blank_{0};
};
} // namespace wenet
#endif // DECODER_CTC_ENDPOINT_H_