PaddleSpeech/runtime/engine/asr/nnet/decodable.h

// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "base/common.h"
#include "kaldi/decoder/decodable-itf.h"
#include "matrix/kaldi-matrix.h"
#include "nnet/nnet_itf.h"
#include "nnet/nnet_producer.h"

namespace ppspeech {

// Forward declaration only; no declaration in this header's ppspeech
// namespace block refers to DecodableOpts.
struct DecodableOpts;

// A kaldi::DecodableInterface implementation that holds a shared
// NnetProducer.
//
// Only declarations live in this header. The per-method notes restate what
// the signatures and the original comments establish; anything beyond that
// is marked as an assumption.
//
// NOTE(review): the `virtual` members below presumably redeclare functions
// of kaldi::DecodableInterface. Marking them `override` would let the
// compiler check the signatures -- confirm against decodable-itf.h first.
class Decodable : public kaldi::DecodableInterface {
  public:
    // nnet_producer:   shared handle to the producer (presumably retained in
    //                  nnet_producer_ -- the definition is not in this file).
    // acoustic_scale:  defaults to 1.0 when omitted.
    explicit Decodable(const std::shared_ptr<NnetProducer>& nnet_producer,
                       kaldi::BaseFloat acoustic_scale = 1.0);

    // nnet logprob output for (frame, index); used by the wfst decoder.
    virtual kaldi::BaseFloat LogLikelihood(int32 frame, int32 index);

    // nnet output for one frame, written through `likelihood`.
    // The meaning of the bool result is defined by the implementation
    // (presumably false when the frame is unavailable -- TODO confirm).
    virtual bool FrameLikelihood(int32 frame,
                                 std::vector<kaldi::BaseFloat>* likelihood);

    // forward nnet with feats
    bool AdvanceChunk();

    // forward nnet with feats, and get nnet output through the two
    // out-parameters `logprobs` and `vocab_dim`.
    bool AdvanceChunk(kaldi::Vector<kaldi::BaseFloat>* logprobs,
                      int* vocab_dim);

    // Rescoring entry point: takes hypotheses as token-id sequences plus a
    // reverse weight and writes results through `rescoring_score`
    // (presumably one score per hypothesis -- TODO confirm).
    void AttentionRescoring(const std::vector<std::vector<int>>& hyps,
                            float reverse_weight,
                            std::vector<float>* rescoring_score);

    // Non-const, unlike the const queries below.
    virtual bool IsLastFrame(int32 frame);

    // nnet output dim, e.g. vocab size
    virtual int32 NumIndices() const;

    virtual int32 NumFramesReady() const;

    void Reset();

    // Forwards to NnetProducer::IsFinished(). nnet_producer_ is dereferenced
    // without a null check, so it must be non-null when this is called.
    bool IsInputFinished() const { return nnet_producer_->IsFinished(); }

    bool EnsureFrameHaveComputed(int32 frame);

    int32 TokenId2NnetId(int32 token_id);

    // for offline test
    void Acceptlikelihood(const kaldi::Matrix<kaldi::BaseFloat>& likelihood);

  private:
    std::shared_ptr<NnetProducer> nnet_producer_;

    // Default member initializers below give every scalar a defined value
    // even if a constructor's initializer list leaves it out; a value given
    // in a constructor's initializer list still takes precedence.

    // the frame is nnet prob frame rather than audio feature frame
    // nnet frame subsample the feature frame
    // eg: 35 frame features output 8 frame inferences
    int32 frame_offset_ = 0;
    int32 frames_ready_ = 0;

    // todo: feature frame mismatch with nnet inference frame
    // so use subsampled_frame
    int32 current_log_post_subsampled_offset_ = 0;
    int32 num_chunk_computed_ = 0;
    std::vector<kaldi::BaseFloat> framelikelihood_;

    // Same value as the constructor's default `acoustic_scale` argument.
    kaldi::BaseFloat acoustic_scale_ = 1.0;
};

}  // namespace ppspeech