PaddleSpeech/runtime/engine/asr/nnet/u2_nnet.h

// Copyright 2022 Horizon Robotics. All Rights Reserved.
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// modified from
// https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/asr_model.h
#pragma once

#include "base/common.h"
#include "matrix/kaldi-matrix.h"
#include "nnet/nnet_itf.h"
#include "paddle/extension.h"
#include "paddle/jit/all.h"
#include "paddle/phi/api/all.h"

namespace ppspeech {


// Common interface for U2 nnet implementations. On top of NnetBase it keeps
// the model specification (right context, sos/eos values, bidecoder flag),
// the chunking options and the decoding offset, and exposes accessors for
// them. Cloning and the encoder chunk forward pass are pure virtual and must
// be supplied by a concrete subclass.
class U2NnetBase : public NnetBase {
  public:
    // Context size: the right context plus one.
    virtual int Context() const { return 1 + right_context_; }
    // Right context taken from the model specification.
    virtual int RightContext() const { return right_context_; }

    // Stored eos_ / sos_ values.
    virtual int EOS() const { return eos_; }
    virtual int SOS() const { return sos_; }
    // Bidecoder flag, reported as an int (1 when set, 0 otherwise).
    virtual int IsBidecoder() const { return is_bidecoder_ ? 1 : 0; }
    // Current value of offset_ (see the member comment below).
    virtual int Offset() const { return offset_; }
    // Overwrites chunk_size_ with the given value.
    virtual void SetChunkSize(int chunk_size) {
        chunk_size_ = chunk_size;
    }
    // Overwrites num_left_chunks_ with the given value.
    virtual void SetNumLeftChunks(int num_left_chunks) {
        num_left_chunks_ = num_left_chunks;
    }

    // Returns a copy of this nnet; the copy semantics are defined by the
    // concrete subclass.
    virtual std::shared_ptr<NnetBase> Clone() const = 0;

  protected:
    // Encoder forward pass for one chunk of features, implemented by the
    // subclass. Results are written through `ctc_probs` and `vocab_dim`.
    // NOTE(review): presumably `chunk_feats` is a flattened frames x feat_dim
    // buffer - confirm against the implementation.
    virtual void ForwardEncoderChunkImpl(
        const std::vector<kaldi::BaseFloat>& chunk_feats,
        const int32& feat_dim,
        std::vector<kaldi::BaseFloat>* ctc_probs,
        int32* vocab_dim) = 0;

    // ---- model specification ----
    int right_context_ = 0;

    int sos_ = 0;
    int eos_ = 0;

    bool is_bidecoder_ = false;

    // Number of decoder frames per chunk. A value > 0 selects the streaming
    // case; any other value selects the non-streaming case.
    int chunk_size_ = 16;
    // Number of left chunks to use; -1 means all left chunks.
    int num_left_chunks_ = -1;

    // ---- asr decoder state, not used in nnet ----
    // Current offset in encoder output time stamp. Used by position
    // embedding.
    int offset_ = 0;
    // Features cache.
    std::vector<std::vector<float>> cached_feats_;
};


class U2Nnet : public U2NnetBase {
  public:
    explicit U2Nnet(const ModelOptions& opts);
    U2Nnet(const U2Nnet& other);

    void FeedForward(const std::vector<kaldi::BaseFloat>& features,
                     const int32& feature_dim,
                     NnetOut* out) override;

    void Reset() override;

    bool IsLogProb() override { return true; }

    void Dim();

    void LoadModel(const std::string& model_path_w_prefix);
    void Warmup();

    std::shared_ptr<paddle::jit::Layer> model() const { return model_; }

    std::shared_ptr<NnetBase> Clone() const override;

    void ForwardEncoderChunkImpl(
        const std::vector<kaldi::BaseFloat>& chunk_feats,
        const int32& feat_dim,
        std::vector<kaldi::BaseFloat>* ctc_probs,
        int32* vocab_dim) override;

    float ComputePathScore(const paddle::Tensor& prob,
                           const std::vector<int>& hyp,
                           int eos);

    void AttentionRescoring(const std::vector<std::vector<int>>& hyps,
                            float reverse_weight,
                            std::vector<float>* rescoring_score) override;

    // debug
    void FeedEncoderOuts(const paddle::Tensor& encoder_out);

    void EncoderOuts(
        std::vector<std::vector<kaldi::BaseFloat>>* encoder_out) const;

    ModelOptions opts_; // hack, fix later
  private:

    phi::Place dev_;
    std::shared_ptr<paddle::jit::Layer> model_{nullptr};
    std::vector<paddle::Tensor> encoder_outs_;
    // transformer/conformer attention cache
    paddle::Tensor att_cache_ = paddle::full({0, 0, 0, 0}, 0.0);
    // conformer-only conv_module cache
    paddle::Tensor cnn_cache_ = paddle::full({0, 0, 0, 0}, 0.0);

    paddle::jit::Function forward_encoder_chunk_;
    paddle::jit::Function forward_attention_decoder_;
    paddle::jit::Function ctc_activation_;
    float cost_time_ = 0.0;
};

}  // namespace ppspeech
add DecoderBase and license 2 years ago			`// Copyright 2022 Horizon Robotics. All Rights Reserved.`
add u2 nnet, u2 nnet main, codelab, and can compile 2 years ago			`// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.`
			`//`
			`// Licensed under the Apache License, Version 2.0 (the "License");`
			`// you may not use this file except in compliance with the License.`
			`// You may obtain a copy of the License at`
			`//`
			`// http://www.apache.org/licenses/LICENSE-2.0`
			`//`
			`// Unless required by applicable law or agreed to in writing, software`
			`// distributed under the License is distributed on an "AS IS" BASIS,`
			`// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`// See the License for the specific language governing permissions and`
			`// limitations under the License.`

format code 2 years ago			`// modified from`
			`// https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/asr_model.h`
add u2 nnet, u2 nnet main, codelab, and can compile 2 years ago			`#pragma once`

			`#include "base/common.h"`
[speechx] rm openblas && refactor kaldi-matrix, kaldi-vector (#2824) * rm openblas && refactor kaldi-matrix kaldi-vector 2 years ago			`#include "matrix/kaldi-matrix.h"`
unify model opts; add attention rescore in decodable; rename ds2 ctc beam search 2 years ago			`#include "nnet/nnet_itf.h"`
add u2 nnet, u2 nnet main, codelab, and can compile 2 years ago			`#include "paddle/extension.h"`
			`#include "paddle/jit/all.h"`
			`#include "paddle/phi/api/all.h"`

			`namespace ppspeech {`


seprate recognizer; NnetBase as base class 2 years ago			`class U2NnetBase : public NnetBase {`
add u2 nnet, u2 nnet main, codelab, and can compile 2 years ago			`public:`
remove uesless code 2 years ago			`virtual int Context() const { return right_context_ + 1; }`
			`virtual int RightContext() const { return right_context_; }`
add u2 recg 2 years ago
remove uesless code 2 years ago			`virtual int EOS() const { return eos_; }`
			`virtual int SOS() const { return sos_; }`
			`virtual int IsBidecoder() const { return is_bidecoder_; }`
add u2 nnet, u2 nnet main, codelab, and can compile 2 years ago			`// current offset in decoder frame`
remove uesless code 2 years ago			`virtual int Offset() const { return offset_; }`
			`virtual void SetChunkSize(int chunk_size) { chunk_size_ = chunk_size; }`
			`virtual void SetNumLeftChunks(int num_left_chunks) {`
add u2 nnet, u2 nnet main, codelab, and can compile 2 years ago			`num_left_chunks_ = num_left_chunks;`
			`}`

[speechx] add batch recognizer decode. (#2866) * add recognizer_batch 2 years ago			`virtual std::shared_ptr<NnetBase> Clone() const = 0;`
add u2 nnet, u2 nnet main, codelab, and can compile 2 years ago
			`protected:`
			`virtual void ForwardEncoderChunkImpl(`
			`const std::vector<kaldi::BaseFloat>& chunk_feats,`
add nnetout struct 2 years ago			`const int32& feat_dim,`
add u2 nnet, u2 nnet main, codelab, and can compile 2 years ago			`std::vector<kaldi::BaseFloat>* ctc_probs,`
			`int32* vocab_dim) = 0;`

			`protected:`
			`// model specification`
			`int right_context_{0};`

			`int sos_{0};`
			`int eos_{0};`

			`bool is_bidecoder_{false};`

			`int chunk_size_{16}; // num of decoder frames. If chunk_size > 0, streaming`
			`// case. Otherwise, none streaming case`
			`int num_left_chunks_{-1}; // -1 means all left chunks`

add nnetout struct 2 years ago			`// asr decoder state, not used in nnet`
add u2 nnet, u2 nnet main, codelab, and can compile 2 years ago			`int offset_{0}; // current offset in encoder output time stamp. Used by`
			`// position embedding.`
			`std::vector<std::vector<float>> cached_feats_{}; // features cache`
			`};`


			`class U2Nnet : public U2NnetBase {`
			`public:`
cpplint 2 years ago			`explicit U2Nnet(const ModelOptions& opts);`
add u2 nnet, u2 nnet main, codelab, and can compile 2 years ago			`U2Nnet(const U2Nnet& other);`

[speechx]add kaldi-native-fbank && refactor frontend (#2794) * replace kaldi-fbank with kaldi-native-fbank * make kaldi-native-fbank work 2 years ago			`void FeedForward(const std::vector<kaldi::BaseFloat>& features,`
add nnetout struct 2 years ago			`const int32& feature_dim,`
			`NnetOut* out) override;`
add u2 nnet, u2 nnet main, codelab, and can compile 2 years ago
			`void Reset() override;`

fix LogLikelihood and add AdvanceChunk 2 years ago			`bool IsLogProb() override { return true; }`

add u2 nnet, u2 nnet main, codelab, and can compile 2 years ago			`void Dim();`

			`void LoadModel(const std::string& model_path_w_prefix);`
			`void Warmup();`

			`std::shared_ptr<paddle::jit::Layer> model() const { return model_; }`

[speechx] add batch recognizer decode. (#2866) * add recognizer_batch 2 years ago			`std::shared_ptr<NnetBase> Clone() const override;`
add u2 nnet, u2 nnet main, codelab, and can compile 2 years ago
			`void ForwardEncoderChunkImpl(`
			`const std::vector<kaldi::BaseFloat>& chunk_feats,`
add nnetout struct 2 years ago			`const int32& feat_dim,`
add u2 nnet, u2 nnet main, codelab, and can compile 2 years ago			`std::vector<kaldi::BaseFloat>* ctc_probs,`
			`int32* vocab_dim) override;`

			`float ComputePathScore(const paddle::Tensor& prob,`
			`const std::vector<int>& hyp,`
			`int eos);`

			`void AttentionRescoring(const std::vector<std::vector<int>>& hyps,`
			`float reverse_weight,`
			`std::vector<float>* rescoring_score) override;`

			`// debug`
cpplint 2 years ago			`void FeedEncoderOuts(const paddle::Tensor& encoder_out);`
add u2 nnet, u2 nnet main, codelab, and can compile 2 years ago
u2 nnet get encoder out and align with py 2 years ago			`void EncoderOuts(`
[speechx]add kaldi-native-fbank && refactor frontend (#2794) * replace kaldi-fbank with kaldi-native-fbank * make kaldi-native-fbank work 2 years ago			`std::vector<std::vector<kaldi::BaseFloat>>* encoder_out) const;`
add nnetout struct 2 years ago
[engine] add wfst recognizer in example (#3173) * update wfst script * add skip blank 2 years ago			`ModelOptions opts_; // hack, fix later`
add u2 nnet, u2 nnet main, codelab, and can compile 2 years ago			`private:`

			`phi::Place dev_;`
			`std::shared_ptr<paddle::jit::Layer> model_{nullptr};`
			`std::vector<paddle::Tensor> encoder_outs_;`
			`// transformer/conformer attention cache`
			`paddle::Tensor att_cache_ = paddle::full({0, 0, 0, 0}, 0.0);`
			`// conformer-only conv_module cache`
			`paddle::Tensor cnn_cache_ = paddle::full({0, 0, 0, 0}, 0.0);`

			`paddle::jit::Function forward_encoder_chunk_;`
			`paddle::jit::Function forward_attention_decoder_;`
			`paddle::jit::Function ctc_activation_;`
[engine] add wfst recognizer in example (#3173) * update wfst script * add skip blank 2 years ago			`float cost_time_ = 0.0;`
add u2 nnet, u2 nnet main, codelab, and can compile 2 years ago			`};`

			`} // namespace ppspeech`