// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu)
//               2022 Binbin Zhang (binbzha@qq.com)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "decoder/torch_asr_model.h"
#include <algorithm>
#include <memory>
#include <stdexcept>
#include <utility>
#include "torch/script.h"
#ifndef IOS
#include "torch/torch.h"
#endif
#include <torch/csrc/jit/passes/tensorexpr_fuser.h>
namespace wenet {
#ifndef IOS
void TorchAsrModel::InitEngineThreads(int num_threads) {
  // For multi-thread performance
  at::set_num_threads(num_threads);
  VLOG(1) << "Num intra-op threads: " << at::get_num_threads();
}
#endif

void TorchAsrModel::Read(const std::string& model_path) {
  torch::DeviceType device = at::kCPU;
#ifdef USE_GPU
  if (!torch::cuda::is_available()) {
    VLOG(1) << "CUDA is not available! Please check your GPU settings";
    throw std::runtime_error("CUDA is not available!");
  } else {
    VLOG(1) << "CUDA available! Running on GPU";
    device = at::kCUDA;
  }
#endif
#ifdef USE_IPEX
  torch::jit::setTensorExprFuserEnabled(false);
#endif
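  // Load the TorchScript module on the selected device, then query the meta
  // methods exported by the model (subsampling rate, right context, sos/eos
  // ids, decoder type) that the decoder needs at runtime.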
  torch::jit::script::Module model = torch::jit::load(model_path, device);
  model_ = std::make_shared<TorchModule>(std::move(model));
  torch::NoGradGuard no_grad;
  model_->eval();
  torch::jit::IValue o1 = model_->run_method("subsampling_rate");
  CHECK_EQ(o1.isInt(), true);
  subsampling_rate_ = o1.toInt();
  torch::jit::IValue o2 = model_->run_method("right_context");
  CHECK_EQ(o2.isInt(), true);
  right_context_ = o2.toInt();
  torch::jit::IValue o3 = model_->run_method("sos_symbol");
  CHECK_EQ(o3.isInt(), true);
  sos_ = o3.toInt();
  torch::jit::IValue o4 = model_->run_method("eos_symbol");
  CHECK_EQ(o4.isInt(), true);
  eos_ = o4.toInt();
  torch::jit::IValue o5 = model_->run_method("is_bidirectional_decoder");
  CHECK_EQ(o5.isBool(), true);
  is_bidirectional_decoder_ = o5.toBool();

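  // The settings below likely trade peak JIT optimization for predictable
  // latency: with graph optimization off and a static fusion strategy, the
  // first few chunks of a stream do not trigger profiling-based recompilation.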
  torch::jit::setGraphExecutorOptimize(false);
  torch::jit::FusionStrategy static0 = {
      {torch::jit::FusionBehavior::STATIC, 0}};
  torch::jit::setFusionStrategy(static0);

VLOG(1) << "Torch Model Info:"; VLOG(1) << "\tsubsampling_rate " << subsampling_rate_; VLOG(1) << "\tright context " << right_context_; VLOG(1) << "\tsos " << sos_; VLOG(1) << "\teos " << eos_; VLOG(1) << "\tis bidirectional decoder " << is_bidirectional_decoder_; }
TorchAsrModel::TorchAsrModel(const TorchAsrModel& other) {
  // 1. Init the model info
  right_context_ = other.right_context_;
  subsampling_rate_ = other.subsampling_rate_;
  sos_ = other.sos_;
  eos_ = other.eos_;
  is_bidirectional_decoder_ = other.is_bidirectional_decoder_;
  chunk_size_ = other.chunk_size_;
  num_left_chunks_ = other.num_left_chunks_;
  offset_ = other.offset_;
  // 2. Model copy, just copy the model ptr since:
  // PyTorch allows using multiple CPU threads during TorchScript model
  // inference, please see https://pytorch.org/docs/stable/notes/cpu_
  // threading_torchscript_inference.html
  model_ = other.model_;
  // NOTE(Binbin Zhang):
  // inner states for forward are not copied here.
}

std::shared_ptr<AsrModel> TorchAsrModel::Copy() const {
  auto asr_model = std::make_shared<TorchAsrModel>(*this);
  // Reset the inner states for new decoding
  asr_model->Reset();
  return asr_model;
}

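// Reset clears all per-utterance streaming state: the encoder attention and
// CNN caches, the accumulated encoder outputs, the cached feature frames,
// and the frame offset.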
void TorchAsrModel::Reset() {
  offset_ = 0;
  att_cache_ = std::move(torch::zeros({0, 0, 0, 0}));
  cnn_cache_ = std::move(torch::zeros({0, 0, 0, 0}));
  encoder_outs_.clear();
  cached_feature_.clear();
}

void TorchAsrModel::ForwardEncoderFunc(
    const std::vector<std::vector<float>>& chunk_feats,
    std::vector<std::vector<float>>* out_prob) {
  // 1. Prepare libtorch required data, splice cached_feature_ and chunk_feats
  // The first dimension is for batchsize, which is 1.
  int num_frames = cached_feature_.size() + chunk_feats.size();
  const int feature_dim = chunk_feats[0].size();
  torch::Tensor feats =
      torch::zeros({1, num_frames, feature_dim}, torch::kFloat);
  for (size_t i = 0; i < cached_feature_.size(); ++i) {
    torch::Tensor row =
        torch::from_blob(const_cast<float*>(cached_feature_[i].data()),
                         {feature_dim}, torch::kFloat)
            .clone();
    feats[0][i] = std::move(row);
  }
  for (size_t i = 0; i < chunk_feats.size(); ++i) {
    torch::Tensor row =
        torch::from_blob(const_cast<float*>(chunk_feats[i].data()),
                         {feature_dim}, torch::kFloat)
            .clone();
    feats[0][cached_feature_.size() + i] = std::move(row);
  }

  // 2. Encoder chunk forward
#ifdef USE_GPU
  feats = feats.to(at::kCUDA);
  att_cache_ = att_cache_.to(at::kCUDA);
  cnn_cache_ = cnn_cache_.to(at::kCUDA);
#endif
  int required_cache_size = chunk_size_ * num_left_chunks_;
  torch::NoGradGuard no_grad;
  std::vector<torch::jit::IValue> inputs = {feats, offset_,
                                            required_cache_size, att_cache_,
                                            cnn_cache_};
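  // att_cache_ and cnn_cache_ carry the encoder's streaming state across
  // chunks; they are fed in here and replaced by the updated caches returned
  // from forward_encoder_chunk below.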
  // Refer interfaces in wenet/transformer/asr_model.py
  auto outputs = model_->get_method("forward_encoder_chunk")(inputs)
                     .toTuple()
                     ->elements();
  CHECK_EQ(outputs.size(), 3);
#ifdef USE_GPU
  torch::Tensor chunk_out = outputs[0].toTensor().to(at::kCPU);
  att_cache_ = outputs[1].toTensor().to(at::kCPU);
  cnn_cache_ = outputs[2].toTensor().to(at::kCPU);
#else
  torch::Tensor chunk_out = outputs[0].toTensor();
  att_cache_ = outputs[1].toTensor();
  cnn_cache_ = outputs[2].toTensor();
#endif
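  // offset_ accumulates the number of encoder output frames produced so far;
  // it is passed back to forward_encoder_chunk as the position offset of the
  // next chunk.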
  offset_ += chunk_out.size(1);

  // The first dimension of returned value is for batchsize, which is 1
#ifdef USE_GPU
  chunk_out = chunk_out.to(at::kCUDA);
  torch::Tensor ctc_log_probs =
      model_->run_method("ctc_activation", chunk_out).toTensor();
  ctc_log_probs = ctc_log_probs.to(at::kCPU)[0];
  encoder_outs_.push_back(std::move(chunk_out.to(at::kCPU)));
#else
  torch::Tensor ctc_log_probs =
      model_->run_method("ctc_activation", chunk_out).toTensor()[0];
  encoder_outs_.push_back(std::move(chunk_out));
#endif
  // Copy to output
  int num_outputs = ctc_log_probs.size(0);
  int output_dim = ctc_log_probs.size(1);
  out_prob->resize(num_outputs);
  for (int i = 0; i < num_outputs; i++) {
    (*out_prob)[i].resize(output_dim);
    memcpy((*out_prob)[i].data(), ctc_log_probs[i].data_ptr(),
           sizeof(float) * output_dim);
  }
}

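// Sums the decoder log-probability of every token in the hypothesis, plus the
// log-probability of <eos> at the final position, giving the attention score
// of one hypothesis.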
float TorchAsrModel::ComputeAttentionScore(const torch::Tensor& prob,
                                           const std::vector<int>& hyp,
                                           int eos) {
  float score = 0.0f;
  auto accessor = prob.accessor<float, 2>();
  for (size_t j = 0; j < hyp.size(); ++j) {
    score += accessor[j][hyp[j]];
  }
  score += accessor[hyp.size()][eos];
  return score;
}

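// Rescores the n-best CTC hypotheses with the attention decoder. When the
// model has a bidirectional decoder and reverse_weight > 0, the final score
// interpolates the left-to-right and right-to-left decoder scores.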
void TorchAsrModel::AttentionRescoring(
    const std::vector<std::vector<int>>& hyps, float reverse_weight,
    std::vector<float>* rescoring_score) {
  CHECK(rescoring_score != nullptr);
  int num_hyps = hyps.size();
  rescoring_score->resize(num_hyps, 0.0f);

  if (num_hyps == 0) {
    return;
  }
  // No encoder output
  if (encoder_outs_.size() == 0) {
    return;
  }

  torch::NoGradGuard no_grad;
  // Step 1: Prepare input for libtorch
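  // Every hypothesis is prefixed with <sos>, so the decoder input length of
  // hypothesis i is hyps[i].size() + 1.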
  torch::Tensor hyps_length = torch::zeros({num_hyps}, torch::kLong);
  int max_hyps_len = 0;
  for (size_t i = 0; i < num_hyps; ++i) {
    int length = hyps[i].size() + 1;
    max_hyps_len = std::max(length, max_hyps_len);
    hyps_length[i] = static_cast<int64_t>(length);
  }
  torch::Tensor hyps_tensor =
      torch::zeros({num_hyps, max_hyps_len}, torch::kLong);
  for (size_t i = 0; i < num_hyps; ++i) {
    const std::vector<int>& hyp = hyps[i];
    hyps_tensor[i][0] = sos_;
    for (size_t j = 0; j < hyp.size(); ++j) {
      hyps_tensor[i][j + 1] = hyp[j];
    }
  }

  // Step 2: Forward attention decoder by hyps and corresponding encoder_outs_
  torch::Tensor encoder_out = torch::cat(encoder_outs_, 1);
#ifdef USE_GPU
  hyps_tensor = hyps_tensor.to(at::kCUDA);
  hyps_length = hyps_length.to(at::kCUDA);
  encoder_out = encoder_out.to(at::kCUDA);
#endif
  auto outputs = model_
                     ->run_method("forward_attention_decoder", hyps_tensor,
                                  hyps_length, encoder_out, reverse_weight)
                     .toTuple()
                     ->elements();
#ifdef USE_GPU
  auto probs = outputs[0].toTensor().to(at::kCPU);
  auto r_probs = outputs[1].toTensor().to(at::kCPU);
#else
  auto probs = outputs[0].toTensor();
  auto r_probs = outputs[1].toTensor();
#endif
  CHECK_EQ(probs.size(0), num_hyps);
  CHECK_EQ(probs.size(1), max_hyps_len);

  // Step 3: Compute rescoring score
  for (size_t i = 0; i < num_hyps; ++i) {
    const std::vector<int>& hyp = hyps[i];
    float score = 0.0f;
    // left-to-right decoder score
    score = ComputeAttentionScore(probs[i], hyp, eos_);
    // Optional: Used for right to left score
    float r_score = 0.0f;
    if (is_bidirectional_decoder_ && reverse_weight > 0) {
      // right-to-left score
      CHECK_EQ(r_probs.size(0), num_hyps);
      CHECK_EQ(r_probs.size(1), max_hyps_len);
      std::vector<int> r_hyp(hyp.size());
      std::reverse_copy(hyp.begin(), hyp.end(), r_hyp.begin());
      // right to left decoder score
      r_score = ComputeAttentionScore(r_probs[i], r_hyp, eos_);
    }
    // combined left-to-right and right-to-left score
    (*rescoring_score)[i] =
        score * (1 - reverse_weight) + r_score * reverse_weight;
  }
}

} // namespace wenet