parent e1fc57deb1
commit 290c23b9d7
@@ -0,0 +1,29 @@
# This file is used by clang-format to autoformat paddle source code
#
# clang-format is part of the llvm toolchain.
# You need to install llvm and clang to format the source code style.
#
# The basic usage is,
# clang-format -i -style=file PATH/TO/SOURCE/CODE
#
# The -style=file option implicitly uses the ".clang-format" file located in one
# of the parent directories.
# The -i option means in-place change.
#
# The documentation of clang-format is
# http://clang.llvm.org/docs/ClangFormat.html
# http://clang.llvm.org/docs/ClangFormatStyleOptions.html
---
Language: Cpp
BasedOnStyle: Google
IndentWidth: 4
TabWidth: 4
ContinuationIndentWidth: 4
MaxEmptyLinesToKeep: 2
AccessModifierOffset: -2 # private/protected/public have no indent in class
Standard: Cpp11
AllowAllParametersOfDeclarationOnNextLine: true
BinPackParameters: false
BinPackArguments: false
...
@@ -1,8 +1,8 @@
 include(FetchContent)
 FetchContent_Declare(
   gtest
-  URL https://github.com/google/googletest/archive/release-1.10.0.zip
-  URL_HASH SHA256=94c634d499558a76fa649edb13721dce6e98fb1e7018dfaeba3cd7a083945e91
+  URL https://github.com/google/googletest/archive/release-1.11.0.zip
+  URL_HASH SHA256=353571c2440176ded91c2de6d6cd88ddd41401d14692ec1f99e35d013feda55a
 )
 FetchContent_MakeAvailable(gtest)
@@ -0,0 +1,2 @@
data
exp
@@ -0,0 +1,3 @@
data
exp
*log
@@ -0,0 +1,3 @@
# Deepspeech2 Streaming NNet Test

Used for DeepSpeech2 (ds2) streaming nnet inference testing.
@@ -0,0 +1,19 @@
# This file contains the locations of the binaries required for running the examples.

unset GREP_OPTIONS

SPEECHX_ROOT=$PWD/../../../
SPEECHX_BUILD=$SPEECHX_ROOT/build/speechx

SPEECHX_TOOLS=$SPEECHX_ROOT/tools
TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin

[ -d $SPEECHX_BUILD ] || { echo "Error: 'build/speechx' directory not found. Please ensure that the project builds successfully."; }

export LC_ALL=C

SPEECHX_BIN=$SPEECHX_BUILD/nnet
export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN

PADDLE_LIB_PATH=$(python -c "import paddle ; print(':'.join(paddle.sysconfig.get_lib()), end='')")
export LD_LIBRARY_PATH=$PADDLE_LIB_PATH:$LD_LIBRARY_PATH
@@ -0,0 +1,59 @@
#!/bin/bash
set -x
set -e

. path.sh

# 1. compile
if [ ! -d ${SPEECHX_EXAMPLES} ]; then
    pushd ${SPEECHX_ROOT}
    bash build.sh
    popd
fi

# 2. download model
if [ ! -f data/model/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model.tar.gz ]; then
    mkdir -p data/model
    pushd data/model
    wget -c https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/static/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model.tar.gz
    tar xzvf asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model.tar.gz
    popd
fi

# produce wav scp
if [ ! -f data/wav.scp ]; then
    mkdir -p data
    pushd data
    wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
    echo "utt1 $PWD/zh.wav" > wav.scp
    popd
fi

data=data
exp=exp
mkdir -p $exp
ckpt_dir=./data/model
model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/


cmvn_json2kaldi_main \
    --json_file $model_dir/mean_std.json \
    --cmvn_write_path $exp/cmvn.ark \
    --binary=false
echo "convert json cmvn to kaldi ark."

compute_fbank_main \
    --num_bins 80 \
    --wav_rspecifier=scp:$data/wav.scp \
    --cmvn_file=$exp/cmvn.ark \
    --feature_wspecifier=ark,t:$exp/fbank.ark
echo "compute fbank feature."

u2_nnet_main \
    --model_path=$model_dir/export.jit \
    --feature_rspecifier=ark,t:$exp/fbank.ark \
    --nnet_decoder_chunk=16 \
    --receptive_field_length=7 \
    --downsampling_rate=4 \
    --acoustic_scale=1.0 \
    --nnet_prob_wspecifier=ark,t:$exp/probs.ark
@@ -0,0 +1,21 @@
#!/bin/bash

# This script is for memory checking, so please run ./run.sh first.

set +x
set -e

. ./path.sh

if [ ! -d ${SPEECHX_TOOLS}/valgrind/install ]; then
    echo "please install valgrind in the speechx tools dir."
    exit 1
fi

ckpt_dir=./data/model
model_dir=$ckpt_dir/exp/deepspeech2_online/checkpoints/

valgrind --tool=memcheck --track-origins=yes --leak-check=full --show-leak-kinds=all \
    ds2_model_test_main \
    --model_path=$model_dir/avg_1.jit.pdmodel \
    --param_path=$model_dir/avg_1.jit.pdparams
@@ -0,0 +1,5 @@
# U2/U2++ Streaming ASR

## Examples

* `wenetspeech` - Streaming decoding with the wenetspeech U2/U2++ model, using AISHELL test data for testing; see the sketch below for a typical invocation.
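A minimal way to run the `wenetspeech` example (a sketch; it assumes `path.sh` and `run.sh` live inside the example directory added in this change):

```bash
cd wenetspeech
bash run.sh   # downloads the U2++ model, extracts fbank features, and runs u2_nnet_main
```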
@@ -1,12 +1,40 @@
 project(nnet)
+set(srcs decodable.cc)

-add_library(nnet STATIC
-    decodable.cc
-    ds2_nnet.cc
-)
+if(USING_DS2)
+    list(APPEND srcs ds2_nnet.cc)
+endif()
+
+if(USING_U2)
+    list(APPEND srcs u2_nnet.cc)
+endif()
+
+add_library(nnet STATIC ${srcs})
 target_link_libraries(nnet absl::strings)

-set(bin_name ds2_nnet_main)
-add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
-target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
-target_link_libraries(${bin_name} utils kaldi-util kaldi-matrix gflags glog nnet ${DEPS})
+if(USING_U2)
+    target_compile_options(nnet PUBLIC ${PADDLE_COMPILE_FLAGS})
+    target_include_directories(nnet PUBLIC ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR})
+    # target_link_libraries(nnet ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS})
+endif()
+
+
+if(USING_DS2)
+    set(bin_name ds2_nnet_main)
+    add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
+    target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
+    target_link_libraries(${bin_name} utils kaldi-util kaldi-matrix gflags glog nnet)
+
+    target_link_libraries(${bin_name} ${DEPS})
+endif()
+
+# test bin
+if(USING_U2)
+    set(bin_name u2_nnet_main)
+    add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
+    target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
+    target_link_libraries(${bin_name} utils kaldi-util kaldi-matrix gflags glog nnet)
+
+    target_compile_options(${bin_name} PRIVATE ${PADDLE_COMPILE_FLAGS})
+    target_include_directories(${bin_name} PRIVATE ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR})
+    target_link_libraries(${bin_name} ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS})
+endif()
@@ -0,0 +1,706 @@
|
||||
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "nnet/u2_nnet.h"
|
||||
|
||||
#ifdef USE_PROFILING
|
||||
#include "paddle/fluid/platform/profiler.h"
|
||||
using paddle::platform::RecordEvent;
|
||||
using paddle::platform::TracerEventType;
|
||||
#endif // end USE_PROFILING
|
||||
|
||||
namespace ppspeech {
|
||||
|
||||
int U2NnetBase::num_frames_for_chunk(bool start) const {
|
||||
int num_needed_frames = 0; // num feat frames
|
||||
bool first = !start; // start == false is first
|
||||
|
||||
if (chunk_size_ > 0) {
|
||||
// streaming mode
|
||||
if (first) {
|
||||
// first chunk
|
||||
# 1 decoder frame needs `context` feat frames
|
||||
int context = this->context();
|
||||
num_needed_frames = (chunk_size_ - 1) * subsampling_rate_ + context;
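// e.g. with chunk_size_ = 16, subsampling_rate_ = 4 and context = 7 (the
// defaults used by u2_nnet_main): (16 - 1) * 4 + 7 = 67 feat frames.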
|
||||
} else {
|
||||
// after the first chunk, we stride by this many frames.
|
||||
num_needed_frames = chunk_size_ * subsampling_rate_;
|
||||
}
|
||||
} else {
|
||||
// non-streaming mode. feed all feats once.
|
||||
num_needed_frames = std::numeric_limits<int>::max();
|
||||
}
|
||||
|
||||
return num_needed_frames;
|
||||
}
|
||||
|
||||
// cache feats for next chunk
|
||||
void U2NnetBase::CacheFeature(const std::vector<kaldi::BaseFloat>& chunk_feats,
|
||||
int32 feat_dim) {
|
||||
// chunk_feats is nframes*feat_dim
|
||||
const int chunk_size = chunk_feats.size() / feat_dim;
|
||||
const int cached_feat_size = this->context() - subsampling_rate_;
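// i.e. keep the last (context - subsampling_rate) feature frames of this
// chunk as left context for the next chunk.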
|
||||
if (chunk_size >= cached_feat_size) {
|
||||
cached_feats_.resize(cached_feat_size);
|
||||
for (int i = 0; i < cached_feat_size; ++i) {
|
||||
auto start =
|
||||
chunk_feats.begin() + chunk_size - cached_feat_size + i;
|
||||
auto end = start + feat_dim;
|
||||
cached_feats_[i] = std::vector<float>(start, end);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void U2NnetBase::ForwardEncoderChunk(
|
||||
const std::vector<kaldi::BaseFloat>& chunk_feats,
|
||||
int32 feat_dim,
|
||||
std::vector<kaldi::BaseFloat>* ctc_probs,
|
||||
int32* vocab_dim) {
|
||||
ctc_probs->clear();
|
||||
// int num_frames = cached_feats_.size() + chunk_feats.size();
|
||||
int num_frames = chunk_feats.size() / feat_dim;
|
||||
VLOG(3) << "foward encoder chunk: " << num_frames << " frames";
|
||||
VLOG(3) << "context: " << this->context() << " frames";
|
||||
|
||||
if (num_frames >= this->context()) {
|
||||
this->ForwardEncoderChunkImpl(
|
||||
chunk_feats, feat_dim, ctc_probs, vocab_dim);
|
||||
VLOG(3) << "after forward chunk";
|
||||
this->CacheFeature(chunk_feats, feat_dim);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void U2Nnet::LoadModel(const std::string& model_path_w_prefix) {
|
||||
paddle::jit::utils::InitKernelSignatureMap();
|
||||
|
||||
#ifdef USE_GPU
|
||||
dev_ = phi::GPUPlace();
|
||||
#else
|
||||
dev_ = phi::CPUPlace();
|
||||
#endif
|
||||
paddle::jit::Layer model = paddle::jit::Load(model_path_w_prefix, dev_);
|
||||
model_ = std::make_shared<paddle::jit::Layer>(std::move(model));
|
||||
|
||||
subsampling_rate_ = model_->Attribute<int>("subsampling_rate");
|
||||
right_context_ = model_->Attribute<int>("right_context");
|
||||
sos_ = model_->Attribute<int>("sos_symbol");
|
||||
eos_ = model_->Attribute<int>("eos_symbol");
|
||||
is_bidecoder_ = model_->Attribute<int>("is_bidirectional_decoder");
|
||||
|
||||
forward_encoder_chunk_ = model_->Function("forward_encoder_chunk");
|
||||
forward_attention_decoder_ = model_->Function("forward_attention_decoder");
|
||||
ctc_activation_ = model_->Function("ctc_activation");
|
||||
CHECK(forward_encoder_chunk_.IsValid());
|
||||
CHECK(forward_attention_decoder_.IsValid());
|
||||
CHECK(ctc_activation_.IsValid());
|
||||
|
||||
LOG(INFO) << "Paddle Model Info: ";
|
||||
LOG(INFO) << "\tsubsampling_rate " << subsampling_rate_;
|
||||
LOG(INFO) << "\tright context " << right_context_;
|
||||
LOG(INFO) << "\tsos " << sos_;
|
||||
LOG(INFO) << "\teos " << eos_;
|
||||
LOG(INFO) << "\tis bidecoder " << is_bidecoder_ << std::endl;
|
||||
|
||||
Warmup();
|
||||
}
|
||||
|
||||
void U2Nnet::Warmup() {
|
||||
#ifdef USE_PROFILING
|
||||
RecordEvent event("warmup", TracerEventType::UserDefined, 1);
|
||||
#endif
|
||||
|
||||
{
|
||||
#ifdef USE_PROFILING
|
||||
RecordEvent event(
|
||||
"warmup-encoder-ctc", TracerEventType::UserDefined, 1);
|
||||
#endif
|
||||
int feat_dim = 80;
|
||||
int frame_num = 16 * 4 + 3; // chunk_size * downsample_rate +
|
||||
// (receptive_field - downsample_rate)
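// i.e. 16 * 4 + (7 - 4) = 67 feature frames for one warmup chunk.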
|
||||
paddle::Tensor feats = paddle::full(
|
||||
{1, frame_num, feat_dim}, 0.12f, paddle::DataType::FLOAT32);
|
||||
paddle::Tensor offset = paddle::zeros({1}, paddle::DataType::INT32);
|
||||
paddle::Tensor att_cache =
|
||||
paddle::zeros({0, 0, 0, 0}, paddle::DataType::FLOAT32);
|
||||
paddle::Tensor cnn_cache =
|
||||
paddle::zeros({0, 0, 0, 0}, paddle::DataType::FLOAT32);
|
||||
std::vector<paddle::Tensor> inputs = {
|
||||
feats, offset, /*required_cache_size, */ att_cache, cnn_cache};
|
||||
std::vector<paddle::Tensor> outputs = forward_encoder_chunk_(inputs);
|
||||
|
||||
auto chunk_out = outputs[0];
|
||||
inputs = std::move(std::vector<paddle::Tensor>({chunk_out}));
|
||||
outputs = ctc_activation_(inputs);
|
||||
}
|
||||
|
||||
{
|
||||
#ifdef USE_PROFILING
|
||||
RecordEvent event("warmup-decoder", TracerEventType::UserDefined, 1);
|
||||
#endif
|
||||
auto hyps =
|
||||
paddle::full({10, 8}, 10, paddle::DataType::INT64, phi::CPUPlace());
|
||||
auto hyps_lens =
|
||||
paddle::full({10}, 8, paddle::DataType::INT64, phi::CPUPlace());
|
||||
auto encoder_out = paddle::ones(
|
||||
{1, 20, 512}, paddle::DataType::FLOAT32, phi::CPUPlace());
|
||||
|
||||
std::vector<paddle::experimental::Tensor> inputs{
|
||||
hyps, hyps_lens, encoder_out};
|
||||
|
||||
std::vector<paddle::experimental::Tensor> outputs =
|
||||
forward_attention_decoder_(inputs);
|
||||
}
|
||||
|
||||
Reset();
|
||||
}
|
||||
|
||||
U2Nnet::U2Nnet(const U2ModelOptions& opts) : opts_(opts) {
|
||||
LoadModel(opts_.model_path);
|
||||
}
|
||||
|
||||
// shallow copy
|
||||
U2Nnet::U2Nnet(const U2Nnet& other) {
|
||||
// copy meta
|
||||
right_context_ = other.right_context_;
|
||||
subsampling_rate_ = other.subsampling_rate_;
|
||||
sos_ = other.sos_;
|
||||
eos_ = other.eos_;
|
||||
is_bidecoder_ = other.is_bidecoder_;
|
||||
chunk_size_ = other.chunk_size_;
|
||||
num_left_chunks_ = other.num_left_chunks_;
|
||||
|
||||
forward_encoder_chunk_ = other.forward_encoder_chunk_;
|
||||
forward_attention_decoder_ = other.forward_attention_decoder_;
|
||||
ctc_activation_ = other.ctc_activation_;
|
||||
|
||||
// offset_ = other.offset_; // TODO: not used in nnets
|
||||
|
||||
// copy model ptr
|
||||
model_ = other.model_;
|
||||
|
||||
// ignore inner states
|
||||
}
|
||||
|
||||
std::shared_ptr<NnetInterface> U2Nnet::Copy() const {
|
||||
auto asr_model = std::make_shared<U2Nnet>(*this);
|
||||
// reset inner state for new decoding
|
||||
asr_model->Reset();
|
||||
return asr_model;
|
||||
}
|
||||
|
||||
void U2Nnet::Reset() {
|
||||
// offset_ = 0;
|
||||
// cached_feats_.clear(); // TODO: not used in nnets
|
||||
|
||||
att_cache_ =
|
||||
std::move(paddle::zeros({0, 0, 0, 0}, paddle::DataType::FLOAT32));
|
||||
cnn_cache_ =
|
||||
std::move(paddle::zeros({0, 0, 0, 0}, paddle::DataType::FLOAT32));
|
||||
|
||||
encoder_outs_.clear();
|
||||
}
|
||||
|
||||
// Debug API
|
||||
void U2Nnet::FeedEncoderOuts(paddle::Tensor& encoder_out) {
|
||||
// encoder_out (T,D)
|
||||
encoder_outs_.clear();
|
||||
encoder_outs_.push_back(encoder_out);
|
||||
}
|
||||
|
||||
|
||||
void U2Nnet::FeedForward(const kaldi::Vector<BaseFloat>& features,
|
||||
int32 feature_dim,
|
||||
kaldi::Vector<BaseFloat>* inferences,
|
||||
int32* inference_dim) {
|
||||
std::vector<kaldi::BaseFloat> chunk_feats(features.Data(),
|
||||
features.Data() + features.Dim());
|
||||
std::vector<kaldi::BaseFloat> ctc_probs;
|
||||
ForwardEncoderChunkImpl(
|
||||
chunk_feats, feature_dim, &ctc_probs, inference_dim);
|
||||
inferences->Resize(ctc_probs.size(), kaldi::kSetZero);
|
||||
std::memcpy(inferences->Data(),
|
||||
ctc_probs.data(),
|
||||
ctc_probs.size() * sizeof(kaldi::BaseFloat));
|
||||
}
|
||||
|
||||
|
||||
void U2Nnet::ForwardEncoderChunkImpl(
|
||||
const std::vector<kaldi::BaseFloat>& chunk_feats,
|
||||
int32 feat_dim,
|
||||
std::vector<kaldi::BaseFloat>* out_prob,
|
||||
int32* vocab_dim) {
|
||||
#ifdef USE_PROFILING
|
||||
RecordEvent event(
|
||||
"ForwardEncoderChunkImpl", TracerEventType::UserDefined, 1);
|
||||
#endif
|
||||
|
||||
// 1. splice cached_feature, and chunk_feats
|
||||
// First dimension is B, which is 1.
|
||||
// int num_frames = cached_feats_.size() + chunk_feats.size();
|
||||
|
||||
int num_frames = chunk_feats.size() / feat_dim;
|
||||
VLOG(3) << "num_frames: " << num_frames;
|
||||
VLOG(3) << "feat_dim: " << feat_dim;
|
||||
|
||||
// feats (B=1,T,D)
|
||||
paddle::Tensor feats =
|
||||
paddle::zeros({1, num_frames, feat_dim}, paddle::DataType::FLOAT32);
|
||||
float* feats_ptr = feats.mutable_data<float>();
|
||||
|
||||
// for (size_t i = 0; i < cached_feats_.size(); ++i) {
|
||||
// float* row = feats_ptr + i * feat_dim;
|
||||
// std::memcpy(row, cached_feats_[i].data(), feat_dim * sizeof(float));
|
||||
// }
|
||||
|
||||
// for (size_t i = 0; i < chunk_feats.size(); ++i) {
|
||||
// float* row = feats_ptr + (cached_feats_.size() + i) * feat_dim;
|
||||
// std::memcpy(row, chunk_feats[i].data(), feat_dim * sizeof(float));
|
||||
// }
|
||||
|
||||
// not cache feature in nnet
|
||||
CHECK(cached_feats_.size() == 0);
|
||||
// CHECK_EQ(std::is_same<float, kaldi::BaseFloat>::value, true);
|
||||
std::memcpy(feats_ptr,
|
||||
chunk_feats.data(),
|
||||
chunk_feats.size() * sizeof(kaldi::BaseFloat));
|
||||
|
||||
VLOG(3) << "feats shape: " << feats.shape()[0] << ", " << feats.shape()[1]
|
||||
<< ", " << feats.shape()[2];
|
||||
|
||||
#ifdef TEST_DEBUG
|
||||
{
|
||||
std::stringstream path("feat", std::ios_base::app | std::ios_base::out);
|
||||
path << offset_;
|
||||
std::ofstream feat_fobj(path.str().c_str(), std::ios::out);
|
||||
CHECK(feat_fobj.is_open());
|
||||
// feat_fobj << feats.shape()[0] << " " << feats.shape()[1] << " "
|
||||
// << feats.shape()[2] << "\n";
|
||||
for (int i = 0; i < feats.numel(); i++) {
|
||||
feat_fobj << std::setprecision(18) << feats_ptr[i] << " ";
|
||||
if ((i + 1) % feat_dim == 0) {
|
||||
feat_fobj << "\n";
|
||||
}
|
||||
}
|
||||
feat_fobj << "\n";
|
||||
}
|
||||
#endif
|
||||
|
||||
// Encoder chunk forward
|
||||
#ifdef USE_GPU
|
||||
feats = feats.copy_to(paddle::GPUPlace(), /*blocking*/ false);
|
||||
att_cache_ = att_cache_.copy_to(paddle::GPUPlace(), /*blocking*/ false);
|
||||
cnn_cache_ = cnn_cache_.copy_to(paddle::GPUPlace(), /*blocking*/ false);
|
||||
#endif
|
||||
|
||||
int required_cache_size = num_left_chunks_ * chunk_size_; // -1 * 16
|
||||
// must be a scalar, but paddle does not have a scalar type.
|
||||
paddle::Tensor offset = paddle::full({1}, offset_, paddle::DataType::INT32);
|
||||
// `required_cache_size` is frozen in the graph, so it is not specified in the
// function call.
|
||||
std::vector<paddle::Tensor> inputs = {
|
||||
feats, offset, /*required_cache_size, */ att_cache_, cnn_cache_};
|
||||
VLOG(3) << "inputs size: " << inputs.size();
|
||||
CHECK(inputs.size() == 4);
|
||||
std::vector<paddle::Tensor> outputs = forward_encoder_chunk_(inputs);
|
||||
VLOG(3) << "outputs size: " << outputs.size();
|
||||
CHECK(outputs.size() == 3);
|
||||
|
||||
#ifdef USE_GPU
|
||||
paddle::Tensor chunk_out = outputs[0].copy_to(paddle::CPUPlace());
|
||||
att_cache_ = outputs[1].copy_to(paddle::CPUPlace());
|
||||
cnn_cache_ = outputs[2].copy_to(paddle::CPUPlace());
|
||||
#else
|
||||
paddle::Tensor chunk_out = outputs[0];
|
||||
att_cache_ = outputs[1];
|
||||
cnn_cache_ = outputs[2];
|
||||
#endif
|
||||
|
||||
#ifdef TEST_DEBUG
|
||||
{
|
||||
std::stringstream path("encoder_logits",
|
||||
std::ios_base::app | std::ios_base::out);
|
||||
auto i = offset_ - chunk_out.shape()[1];
|
||||
path << std::max(i, 0L);
|
||||
std::ofstream logits_fobj(path.str().c_str(), std::ios::out);
|
||||
CHECK(logits_fobj.is_open());
|
||||
logits_fobj << chunk_out.shape()[0] << " " << chunk_out.shape()[1]
|
||||
<< " " << chunk_out.shape()[2] << "\n";
|
||||
const float* chunk_out_ptr = chunk_out.data<float>();
|
||||
logits_fobj << chunk_out_ptr << std::endl;
|
||||
for (int i = 0; i < chunk_out.numel(); i++) {
|
||||
logits_fobj << chunk_out_ptr[i] << " ";
|
||||
}
|
||||
logits_fobj << "\n";
|
||||
}
|
||||
#endif // end TEST_DEBUG
|
||||
|
||||
// current offset in decoder frame
|
||||
// not used in nnet
|
||||
offset_ += chunk_out.shape()[1];
|
||||
|
||||
// collects encoder outs.
|
||||
VLOG(2) << "encoder_outs_ size: " << encoder_outs_.size();
|
||||
encoder_outs_.push_back(chunk_out);
|
||||
|
||||
#ifdef TEST_DEBUG
|
||||
{
|
||||
std::stringstream path("encoder_logits_list",
|
||||
std::ios_base::app | std::ios_base::out);
|
||||
path << offset_ - encoder_outs_[0].shape()[1];
|
||||
std::ofstream logits_out_fobj(path.str().c_str(), std::ios::out);
|
||||
CHECK(logits_out_fobj.is_open());
|
||||
logits_out_fobj << encoder_outs_[0].shape()[0] << " "
|
||||
<< encoder_outs_[0].shape()[1] << " "
|
||||
<< encoder_outs_[0].shape()[2] << "\n";
|
||||
const float* encoder_outs_ptr = encoder_outs_[0].data<float>();
|
||||
logits_out_fobj << encoder_outs_ptr << std::endl;
|
||||
for (int i = 0; i < encoder_outs_[0].numel(); i++) {
|
||||
logits_out_fobj << encoder_outs_ptr[i] << " ";
|
||||
}
|
||||
logits_out_fobj << "\n";
|
||||
}
|
||||
#endif // end TEST_DEBUG
|
||||
|
||||
#ifdef USE_GPU
|
||||
|
||||
#error "Not implemented."
|
||||
|
||||
#else
|
||||
// compute ctc_activation == log_softmax
|
||||
inputs.clear();
|
||||
outputs.clear();
|
||||
inputs.push_back(chunk_out);
|
||||
CHECK(inputs.size() == 1);
|
||||
outputs = ctc_activation_(inputs);
|
||||
CHECK(outputs.size() == 1);
|
||||
paddle::Tensor ctc_log_probs = outputs[0];
|
||||
|
||||
#ifdef TEST_DEBUG
|
||||
{
|
||||
std::stringstream path("encoder_logprob",
|
||||
std::ios_base::app | std::ios_base::out);
|
||||
path << offset_ - chunk_out.shape()[1];
|
||||
|
||||
std::ofstream logprob_fobj(path.str().c_str(), std::ios::out);
|
||||
CHECK(logprob_fobj.is_open());
|
||||
logprob_fobj << ctc_log_probs.shape()[0] << " "
|
||||
<< ctc_log_probs.shape()[1] << " "
|
||||
<< ctc_log_probs.shape()[2] << "\n";
|
||||
const float* logprob_ptr = ctc_log_probs.data<float>();
|
||||
for (int i = 0; i < ctc_log_probs.numel(); i++) {
|
||||
logprob_fobj << logprob_ptr[i] << " ";
|
||||
if ((i + 1) % ctc_log_probs.shape()[2] == 0) {
|
||||
logprob_fobj << "\n";
|
||||
}
|
||||
}
|
||||
logprob_fobj << "\n";
|
||||
}
|
||||
#endif // end TEST_DEBUG
|
||||
|
||||
#endif // end USE_GPU
|
||||
|
||||
// Copy to output, (B=1,T,D)
|
||||
std::vector<int64_t> ctc_log_probs_shape = ctc_log_probs.shape();
|
||||
CHECK(ctc_log_probs_shape.size() == 3);
|
||||
int B = ctc_log_probs_shape[0];
|
||||
CHECK(B == 1);
|
||||
int T = ctc_log_probs_shape[1];
|
||||
int D = ctc_log_probs_shape[2];
|
||||
*vocab_dim = D;
|
||||
|
||||
float* ctc_log_probs_ptr = ctc_log_probs.data<float>();
|
||||
|
||||
// // vector<vector<float>>
|
||||
// out_prob->resize(T);
|
||||
// for (int i = 0; i < T; i++) {
|
||||
// (*out_prob)[i].resize(D);
|
||||
// float* dst_ptr = (*out_prob)[i].data();
|
||||
// float* src_ptr = ctc_log_probs_ptr + (i * D);
|
||||
// std::memcpy(dst_ptr, src_ptr, D * sizeof(float));
|
||||
// }
|
||||
// CHECK(std::is_same<float, kaldi::BaseFloat>::value);
|
||||
out_prob->resize(T * D);
|
||||
std::memcpy(
|
||||
out_prob->data(), ctc_log_probs_ptr, T * D * sizeof(kaldi::BaseFloat));
|
||||
|
||||
#ifdef TEST_DEBUG
|
||||
{
|
||||
std::stringstream path("encoder_logits_list_ctc",
|
||||
std::ios_base::app | std::ios_base::out);
|
||||
path << offset_ - encoder_outs_[0].shape()[1];
|
||||
std::ofstream logits_out_fobj(path.str().c_str(), std::ios::out);
|
||||
CHECK(logits_out_fobj.is_open());
|
||||
logits_out_fobj << encoder_outs_[0].shape()[0] << " "
|
||||
<< encoder_outs_[0].shape()[1] << " "
|
||||
<< encoder_outs_[0].shape()[2] << "\n";
|
||||
const float* encoder_outs_ptr = encoder_outs_[0].data<float>();
|
||||
logits_out_fobj << encoder_outs_ptr << std::endl;
|
||||
for (int i = 0; i < encoder_outs_[0].numel(); i++) {
|
||||
logits_out_fobj << encoder_outs_ptr[i] << " ";
|
||||
}
|
||||
logits_out_fobj << "\n";
|
||||
}
|
||||
#endif // end TEST_DEBUG
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
float U2Nnet::ComputePathScore(const paddle::Tensor& prob,
|
||||
const std::vector<int>& hyp,
|
||||
int eos) {
|
||||
// sum `hyp` path scores in `prob`
|
||||
// prob (1, Umax, V)
|
||||
// hyp (U,)
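// e.g. hyp = {a, b}: score = prob[0][a] + prob[1][b] + prob[2][eos]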
|
||||
float score = 0.0f;
|
||||
std::vector<int64_t> dims = prob.shape();
|
||||
CHECK(dims.size() == 3);
|
||||
VLOG(2) << "prob shape: " << dims[0] << ", " << dims[1] << ", " << dims[2];
|
||||
CHECK(dims[0] == 1);
|
||||
int vocab_dim = static_cast<int>(dims[2]);
|
||||
|
||||
const float* prob_ptr = prob.data<float>();
|
||||
for (size_t i = 0; i < hyp.size(); ++i) {
|
||||
const float* row = prob_ptr + i * vocab_dim;
|
||||
score += row[hyp[i]];
|
||||
}
|
||||
const float* row = prob_ptr + hyp.size() * vocab_dim;
|
||||
score += row[eos];
|
||||
return score;
|
||||
}
|
||||
|
||||
|
||||
void U2Nnet::AttentionRescoring(const std::vector<std::vector<int>>& hyps,
|
||||
float reverse_weight,
|
||||
std::vector<float>* rescoring_score) {
|
||||
#ifdef USE_PROFILING
|
||||
RecordEvent event("AttentionRescoring", TracerEventType::UserDefined, 1);
|
||||
#endif
|
||||
|
||||
CHECK(rescoring_score != nullptr);
|
||||
|
||||
int num_hyps = hyps.size();
|
||||
rescoring_score->resize(num_hyps, 0.0f);
|
||||
|
||||
if (num_hyps == 0) return;
|
||||
VLOG(2) << "num hyps: " << num_hyps;
|
||||
|
||||
if (encoder_outs_.size() == 0) {
|
||||
// no encoder outs
|
||||
std::cerr << "encoder_outs_.size() is zero. Please check it."
|
||||
<< std::endl;
|
||||
return;
|
||||
}
|
||||
|
||||
// prepare input
|
||||
paddle::Tensor hyps_lens =
|
||||
paddle::zeros({num_hyps}, paddle::DataType::INT64);
|
||||
int64_t* hyps_len_ptr = hyps_lens.mutable_data<int64_t>();
|
||||
int max_hyps_len = 0;
|
||||
for (size_t i = 0; i < num_hyps; ++i) {
|
||||
int len = hyps[i].size() + 1; // eos
|
||||
max_hyps_len = std::max(max_hyps_len, len);
|
||||
hyps_len_ptr[i] = static_cast<int64_t>(len);
|
||||
}
|
||||
|
||||
paddle::Tensor hyps_tensor =
|
||||
paddle::full({num_hyps, max_hyps_len}, eos_, paddle::DataType::INT64);
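// each row is laid out as [sos, hyp[0], ..., hyp[n-1]] and right-padded with
// eos up to max_hyps_len, e.g. hyp = {3, 5}, max_hyps_len = 4 -> [sos, 3, 5, eos]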
|
||||
int64_t* hyps_ptr = hyps_tensor.mutable_data<int64_t>();
|
||||
for (size_t i = 0; i < num_hyps; ++i) {
|
||||
const std::vector<int>& hyp = hyps[i];
|
||||
int64_t* row = hyps_ptr + max_hyps_len * i;
|
||||
row[0] = sos_;
|
||||
for (size_t j = 0; j < hyp.size(); ++j) {
|
||||
row[j + 1] = hyp[j];
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef TEST_DEBUG
|
||||
{
|
||||
std::stringstream path("encoder_logits_concat",
|
||||
std::ios_base::app | std::ios_base::out);
|
||||
for (int j = 0; j < encoder_outs_.size(); j++) {
|
||||
path << j;
|
||||
std::ofstream logits_out_fobj(path.str().c_str(), std::ios::out);
|
||||
CHECK(logits_out_fobj.is_open());
|
||||
logits_out_fobj << encoder_outs_[j].shape()[0] << " "
|
||||
<< encoder_outs_[j].shape()[1] << " "
|
||||
<< encoder_outs_[j].shape()[2] << "\n";
|
||||
const float* encoder_outs_ptr = encoder_outs_[j].data<float>();
|
||||
for (int i = 0; i < encoder_outs_[j].numel(); i++) {
|
||||
logits_out_fobj << encoder_outs_ptr[i] << " ";
|
||||
}
|
||||
logits_out_fobj << "\n";
|
||||
}
|
||||
}
|
||||
#endif // end TEST_DEBUG
|
||||
|
||||
// forward attention decoder by hyps and corresponding encoder_outs_
|
||||
paddle::Tensor encoder_out = paddle::concat(encoder_outs_, 1);
|
||||
VLOG(2) << "encoder_outs_ size: " << encoder_outs_.size();
|
||||
|
||||
#ifdef TEST_DEBUG
|
||||
{
|
||||
std::stringstream path("encoder_out0",
|
||||
std::ios_base::app | std::ios_base::out);
|
||||
std::ofstream encoder_out_fobj(path.str().c_str(), std::ios::out);
|
||||
CHECK(encoder_out_fobj.is_open());
|
||||
|
||||
encoder_out_fobj << encoder_outs_[0].shape()[0] << " "
|
||||
<< encoder_outs_[0].shape()[1] << " "
|
||||
<< encoder_outs_[0].shape()[2] << "\n";
|
||||
const float* enc_logprob_ptr = encoder_outs_[0].data<float>();
|
||||
|
||||
size_t size = encoder_outs_[0].numel();
|
||||
for (int i = 0; i < size; i++) {
|
||||
encoder_out_fobj << enc_logprob_ptr[i] << "\n";
|
||||
}
|
||||
}
|
||||
#endif // end TEST_DEBUG
|
||||
|
||||
#ifdef TEST_DEBUG
|
||||
{
|
||||
std::stringstream path("encoder_out",
|
||||
std::ios_base::app | std::ios_base::out);
|
||||
std::ofstream encoder_out_fobj(path.str().c_str(), std::ios::out);
|
||||
CHECK(encoder_out_fobj.is_open());
|
||||
|
||||
encoder_out_fobj << encoder_out.shape()[0] << " "
|
||||
<< encoder_out.shape()[1] << " "
|
||||
<< encoder_out.shape()[2] << "\n";
|
||||
const float* enc_logprob_ptr = encoder_out.data<float>();
|
||||
|
||||
size_t size = encoder_out.numel();
|
||||
for (int i = 0; i < size; i++) {
|
||||
encoder_out_fobj << enc_logprob_ptr[i] << "\n";
|
||||
}
|
||||
}
|
||||
#endif // end TEST_DEBUG
|
||||
|
||||
std::vector<paddle::experimental::Tensor> inputs{
|
||||
hyps_tensor, hyps_lens, encoder_out};
|
||||
std::vector<paddle::Tensor> outputs = forward_attention_decoder_(inputs);
|
||||
CHECK(outputs.size() == 2);
|
||||
|
||||
// (B, Umax, V)
|
||||
paddle::Tensor probs = outputs[0];
|
||||
std::vector<int64_t> probs_shape = probs.shape();
|
||||
CHECK(probs_shape.size() == 3);
|
||||
CHECK(probs_shape[0] == num_hyps);
|
||||
CHECK(probs_shape[1] == max_hyps_len);
|
||||
|
||||
#ifdef TEST_DEBUG
|
||||
{
|
||||
std::stringstream path("decoder_logprob",
|
||||
std::ios_base::app | std::ios_base::out);
|
||||
std::ofstream dec_logprob_fobj(path.str().c_str(), std::ios::out);
|
||||
CHECK(dec_logprob_fobj.is_open());
|
||||
|
||||
dec_logprob_fobj << probs.shape()[0] << " " << probs.shape()[1] << " "
|
||||
<< probs.shape()[2] << "\n";
|
||||
const float* dec_logprob_ptr = probs.data<float>();
|
||||
|
||||
size_t size = probs.numel();
|
||||
for (int i = 0; i < size; i++) {
|
||||
dec_logprob_fobj << dec_logprob_ptr[i] << "\n";
|
||||
}
|
||||
}
|
||||
#endif // end TEST_DEBUG
|
||||
|
||||
#ifdef TEST_DEBUG
|
||||
{
|
||||
std::stringstream path("hyps_lens",
|
||||
std::ios_base::app | std::ios_base::out);
|
||||
std::ofstream hyps_len_fobj(path.str().c_str(), std::ios::out);
|
||||
CHECK(hyps_len_fobj.is_open());
|
||||
|
||||
const int64_t* hyps_lens_ptr = hyps_lens.data<int64_t>();
|
||||
|
||||
size_t size = hyps_lens.numel();
|
||||
for (int i = 0; i < size; i++) {
|
||||
hyps_len_fobj << hyps_lens_ptr[i] << "\n";
|
||||
}
|
||||
}
|
||||
#endif // end TEST_DEBUG
|
||||
|
||||
#ifdef TEST_DEBUG
|
||||
{
|
||||
std::stringstream path("hyps_tensor",
|
||||
std::ios_base::app | std::ios_base::out);
|
||||
std::ofstream hyps_tensor_fobj(path.str().c_str(), std::ios::out);
|
||||
CHECK(hyps_tensor_fobj.is_open());
|
||||
|
||||
const int64_t* hyps_tensor_ptr = hyps_tensor.data<int64_t>();
|
||||
|
||||
size_t size = hyps_tensor.numel();
|
||||
for (int i = 0; i < size; i++) {
|
||||
hyps_tensor_fobj << hyps_tensor_ptr[i] << "\n";
|
||||
}
|
||||
}
|
||||
#endif // end TEST_DEBUG
|
||||
|
||||
paddle::Tensor r_probs = outputs[1];
|
||||
std::vector<int64_t> r_probs_shape = r_probs.shape();
|
||||
if (is_bidecoder_ && reverse_weight > 0) {
|
||||
CHECK(r_probs_shape.size() == 3);
|
||||
CHECK(r_probs_shape[0] == num_hyps);
|
||||
CHECK(r_probs_shape[1] == max_hyps_len);
|
||||
} else {
|
||||
// dummy r_probs
|
||||
CHECK(r_probs_shape.size() == 1);
|
||||
CHECK(r_probs_shape[0] == 1) << r_probs_shape[0];
|
||||
}
|
||||
|
||||
// compute rescoring score
|
||||
using IntArray = paddle::experimental::IntArray;
|
||||
std::vector<paddle::Tensor> probs_v =
|
||||
paddle::experimental::split_with_num(probs, num_hyps, 0);
|
||||
VLOG(2) << "split prob: " << probs_v.size() << " "
|
||||
<< probs_v[0].shape().size() << " 0: " << probs_v[0].shape()[0]
|
||||
<< ", " << probs_v[0].shape()[1] << ", " << probs_v[0].shape()[2];
|
||||
CHECK(static_cast<int>(probs_v.size()) == num_hyps)
|
||||
<< ": is " << probs_v.size() << " expect: " << num_hyps;
|
||||
|
||||
std::vector<paddle::Tensor> r_probs_v;
|
||||
if (is_bidecoder_ && reverse_weight > 0) {
|
||||
r_probs_v = paddle::experimental::split_with_num(r_probs, num_hyps, 0);
|
||||
CHECK(static_cast<int>(r_probs_v.size()) == num_hyps)
|
||||
<< "r_probs_v size: is " << r_probs_v.size()
|
||||
<< " expect: " << num_hyps;
|
||||
}
|
||||
|
||||
for (int i = 0; i < num_hyps; ++i) {
|
||||
const std::vector<int>& hyp = hyps[i];
|
||||
|
||||
// left-to-right decoder score
|
||||
float score = 0.0f;
|
||||
score = ComputePathScore(probs_v[i], hyp, eos_);
|
||||
|
||||
// right-to-left decoder score
|
||||
float r_score = 0.0f;
|
||||
if (is_bidecoder_ && reverse_weight > 0) {
|
||||
std::vector<int> r_hyp(hyp.size());
|
||||
std::reverse_copy(hyp.begin(), hyp.end(), r_hyp.begin());
|
||||
r_score = ComputePathScore(r_probs_v[i], r_hyp, eos_);
|
||||
}
|
||||
|
||||
// combined left-to-right and right-to-left score
|
||||
(*rescoring_score)[i] =
|
||||
score * (1 - reverse_weight) + r_score * reverse_weight;
|
||||
VLOG(1) << "hyp " << i << " score: " << score << " r_score: " << r_score
|
||||
<< " reverse_weight: " << reverse_weight;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace ppspeech
|
@@ -0,0 +1,157 @@
|
||||
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "base/common.h"
|
||||
#include "kaldi/matrix/kaldi-matrix.h"
|
||||
|
||||
#include "kaldi/util/options-itf.h"
|
||||
#include "nnet/nnet_itf.h"
|
||||
|
||||
#include "paddle/extension.h"
|
||||
#include "paddle/jit/all.h"
|
||||
#include "paddle/phi/api/all.h"
|
||||
|
||||
namespace ppspeech {
|
||||
|
||||
struct U2ModelOptions {
|
||||
std::string model_path;
|
||||
int thread_num;
|
||||
bool use_gpu;
|
||||
U2ModelOptions() : model_path(""), thread_num(1), use_gpu(false) {}
|
||||
|
||||
void Register(kaldi::OptionsItf* opts) {
|
||||
opts->Register("model-path", &model_path, "model file path");
|
||||
opts->Register("thread-num", &thread_num, "thread num");
|
||||
opts->Register("use-gpu", &use_gpu, "if use gpu");
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
class U2NnetBase : public NnetInterface {
|
||||
public:
|
||||
virtual int context() const { return right_context_ + 1; }
|
||||
virtual int right_context() const { return right_context_; }
|
||||
virtual int subsampling_rate() const { return subsampling_rate_; }
|
||||
virtual int eos() const { return eos_; }
|
||||
virtual int sos() const { return sos_; }
|
||||
virtual int is_bidecoder() const { return is_bidecoder_; }
|
||||
// current offset in decoder frame
|
||||
virtual int offset() const { return offset_; }
|
||||
virtual void set_chunk_size(int chunk_size) { chunk_size_ = chunk_size; }
|
||||
virtual void set_num_left_chunks(int num_left_chunks) {
|
||||
num_left_chunks_ = num_left_chunks;
|
||||
}
|
||||
// start == false: this is the first chunk of an utterance; otherwise a later chunk
|
||||
virtual int num_frames_for_chunk(bool start) const;
|
||||
|
||||
virtual std::shared_ptr<NnetInterface> Copy() const = 0;
|
||||
|
||||
virtual void ForwardEncoderChunk(
|
||||
const std::vector<kaldi::BaseFloat>& chunk_feats,
|
||||
int32 feat_dim,
|
||||
std::vector<kaldi::BaseFloat>* ctc_probs,
|
||||
int32* vocab_dim);
|
||||
|
||||
virtual void AttentionRescoring(const std::vector<std::vector<int>>& hyps,
|
||||
float reverse_weight,
|
||||
std::vector<float>* rescoring_score) = 0;
|
||||
|
||||
protected:
|
||||
virtual void ForwardEncoderChunkImpl(
|
||||
const std::vector<kaldi::BaseFloat>& chunk_feats,
|
||||
int32 feat_dim,
|
||||
std::vector<kaldi::BaseFloat>* ctc_probs,
|
||||
int32* vocab_dim) = 0;
|
||||
|
||||
virtual void CacheFeature(const std::vector<kaldi::BaseFloat>& chunk_feats,
|
||||
int32 feat_dim);
|
||||
|
||||
protected:
|
||||
// model specification
|
||||
int right_context_{0};
|
||||
int subsampling_rate_{1};
|
||||
|
||||
int sos_{0};
|
||||
int eos_{0};
|
||||
|
||||
bool is_bidecoder_{false};
|
||||
|
||||
int chunk_size_{16}; // num of decoder frames. If chunk_size > 0, streaming
|
||||
// case. Otherwise, non-streaming case
|
||||
int num_left_chunks_{-1}; // -1 means all left chunks
|
||||
|
||||
// asr decoder state
|
||||
int offset_{0}; // current offset in encoder output time stamp. Used by
|
||||
// position embedding.
|
||||
std::vector<std::vector<float>> cached_feats_{}; // features cache
|
||||
};
|
||||
|
||||
|
||||
class U2Nnet : public U2NnetBase {
|
||||
public:
|
||||
U2Nnet(const U2ModelOptions& opts);
|
||||
U2Nnet(const U2Nnet& other);
|
||||
|
||||
void FeedForward(const kaldi::Vector<kaldi::BaseFloat>& features,
|
||||
int32 feature_dim,
|
||||
kaldi::Vector<kaldi::BaseFloat>* inferences,
|
||||
int32* inference_dim) override;
|
||||
|
||||
void Reset() override;
|
||||
|
||||
void Dim();
|
||||
|
||||
void LoadModel(const std::string& model_path_w_prefix);
|
||||
void Warmup();
|
||||
|
||||
std::shared_ptr<paddle::jit::Layer> model() const { return model_; }
|
||||
|
||||
std::shared_ptr<NnetInterface> Copy() const override;
|
||||
|
||||
void ForwardEncoderChunkImpl(
|
||||
const std::vector<kaldi::BaseFloat>& chunk_feats,
|
||||
int32 feat_dim,
|
||||
std::vector<kaldi::BaseFloat>* ctc_probs,
|
||||
int32* vocab_dim) override;
|
||||
|
||||
float ComputePathScore(const paddle::Tensor& prob,
|
||||
const std::vector<int>& hyp,
|
||||
int eos);
|
||||
|
||||
void AttentionRescoring(const std::vector<std::vector<int>>& hyps,
|
||||
float reverse_weight,
|
||||
std::vector<float>* rescoring_score) override;
|
||||
|
||||
// debug
|
||||
void FeedEncoderOuts(paddle::Tensor& encoder_out);
|
||||
|
||||
private:
|
||||
U2ModelOptions opts_;
|
||||
|
||||
phi::Place dev_;
|
||||
std::shared_ptr<paddle::jit::Layer> model_{nullptr};
|
||||
std::vector<paddle::Tensor> encoder_outs_;
|
||||
// transformer/conformer attention cache
|
||||
paddle::Tensor att_cache_ = paddle::full({0, 0, 0, 0}, 0.0);
|
||||
// conformer-only conv_module cache
|
||||
paddle::Tensor cnn_cache_ = paddle::full({0, 0, 0, 0}, 0.0);
|
||||
|
||||
paddle::jit::Function forward_encoder_chunk_;
|
||||
paddle::jit::Function forward_attention_decoder_;
|
||||
paddle::jit::Function ctc_activation_;
|
||||
};
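// A minimal usage sketch (not part of the API; the model path is illustrative):
//
//   U2ModelOptions opts;
//   opts.model_path = "export.jit";  // hypothetical exported jit model prefix
//   U2Nnet nnet(opts);
//   std::vector<kaldi::BaseFloat> chunk_feats = /* num_frames * feat_dim */ {};
//   std::vector<kaldi::BaseFloat> ctc_probs;
//   int32 vocab_dim = 0;
//   nnet.ForwardEncoderChunk(chunk_feats, /*feat_dim=*/80, &ctc_probs, &vocab_dim);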
|
||||
|
||||
} // namespace ppspeech
|
@@ -0,0 +1,180 @@
|
||||
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "nnet/u2_nnet.h"
|
||||
#include "base/common.h"
|
||||
#include "frontend/audio/assembler.h"
|
||||
#include "frontend/audio/data_cache.h"
|
||||
#include "kaldi/util/table-types.h"
|
||||
#include "nnet/decodable.h"
|
||||
|
||||
DEFINE_string(feature_rspecifier, "", "test feature rspecifier");
|
||||
DEFINE_string(nnet_prob_wspecifier, "", "nnet prob wspecifier");
|
||||
|
||||
DEFINE_string(model_path, "", "paddle nnet model");
|
||||
|
||||
DEFINE_int32(nnet_decoder_chunk, 16, "nnet forward chunk");
|
||||
DEFINE_int32(receptive_field_length,
|
||||
7,
|
||||
"receptive field of two CNN(kernel=3) downsampling module.");
|
||||
DEFINE_int32(downsampling_rate,
|
||||
4,
|
||||
"two CNN(kernel=3) module downsampling rate.");
|
||||
DEFINE_double(acoustic_scale, 1.0, "acoustic scale");
|
||||
|
||||
using kaldi::BaseFloat;
|
||||
using kaldi::Matrix;
|
||||
using std::vector;
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
gflags::SetUsageMessage("Usage:");
|
||||
gflags::ParseCommandLineFlags(&argc, &argv, false);
|
||||
google::InitGoogleLogging(argv[0]);
|
||||
google::InstallFailureSignalHandler();
|
||||
FLAGS_logtostderr = 1;
|
||||
|
||||
int32 num_done = 0, num_err = 0;
|
||||
|
||||
CHECK(FLAGS_feature_rspecifier.size() > 0);
|
||||
CHECK(FLAGS_nnet_prob_wspecifier.size() > 0);
|
||||
CHECK(FLAGS_model_path.size() > 0);
|
||||
LOG(INFO) << "input rspecifier: " << FLAGS_feature_rspecifier;
|
||||
LOG(INFO) << "output wspecifier: " << FLAGS_nnet_prob_wspecifier;
|
||||
LOG(INFO) << "model path: " << FLAGS_model_path;
|
||||
kaldi::SequentialBaseFloatMatrixReader feature_reader(
|
||||
FLAGS_feature_rspecifier);
|
||||
kaldi::BaseFloatMatrixWriter nnet_out_writer(FLAGS_nnet_prob_wspecifier);
|
||||
|
||||
ppspeech::U2ModelOptions model_opts;
|
||||
model_opts.model_path = FLAGS_model_path;
|
||||
|
||||
int32 chunk_size =
|
||||
(FLAGS_nnet_decoder_chunk - 1) * FLAGS_downsampling_rate +
|
||||
FLAGS_receptive_field_length;
|
||||
int32 chunk_stride = FLAGS_downsampling_rate * FLAGS_nnet_decoder_chunk;
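// with the default flags: chunk_size = (16 - 1) * 4 + 7 = 67 frames,
// chunk_stride = 16 * 4 = 64 frames.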
|
||||
int32 receptive_field_length = FLAGS_receptive_field_length;
|
||||
LOG(INFO) << "chunk size (frame): " << chunk_size;
|
||||
LOG(INFO) << "chunk stride (frame): " << chunk_stride;
|
||||
LOG(INFO) << "receptive field (frame): " << receptive_field_length;
|
||||
|
||||
std::shared_ptr<ppspeech::U2Nnet> nnet(new ppspeech::U2Nnet(model_opts));
|
||||
std::shared_ptr<ppspeech::DataCache> raw_data(new ppspeech::DataCache());
|
||||
std::shared_ptr<ppspeech::Decodable> decodable(
|
||||
new ppspeech::Decodable(nnet, raw_data, FLAGS_acoustic_scale));
|
||||
kaldi::Timer timer;
|
||||
|
||||
for (; !feature_reader.Done(); feature_reader.Next()) {
|
||||
string utt = feature_reader.Key();
|
||||
kaldi::Matrix<BaseFloat> feature = feature_reader.Value();
|
||||
|
||||
int nframes = feature.NumRows();
|
||||
int feat_dim = feature.NumCols();
|
||||
raw_data->SetDim(feat_dim);
|
||||
LOG(INFO) << "utt: " << utt;
|
||||
LOG(INFO) << "feat shape: " << nframes << ", " << feat_dim;
|
||||
|
||||
// // pad feats
|
||||
// int32 padding_len = 0;
|
||||
// if ((feature.NumRows() - chunk_size) % chunk_stride != 0) {
|
||||
// padding_len =
|
||||
// chunk_stride - (feature.NumRows() - chunk_size) %
|
||||
// chunk_stride;
|
||||
// feature.Resize(feature.NumRows() + padding_len,
|
||||
// feature.NumCols(),
|
||||
// kaldi::kCopyData);
|
||||
// }
|
||||
|
||||
int32 num_chunks = (feature.NumRows() - chunk_size) / chunk_stride + 1;
|
||||
int32 frame_idx = 0;
|
||||
std::vector<kaldi::Vector<kaldi::BaseFloat>> prob_vec;
|
||||
int32 ori_feature_len = feature.NumRows();
|
||||
|
||||
for (int chunk_idx = 0; chunk_idx < num_chunks; ++chunk_idx) {
|
||||
kaldi::Vector<kaldi::BaseFloat> feature_chunk(chunk_size *
|
||||
feat_dim);
|
||||
|
||||
int32 feature_chunk_size = 0;
|
||||
if (ori_feature_len > chunk_idx * chunk_stride) {
|
||||
feature_chunk_size = std::min(
|
||||
ori_feature_len - chunk_idx * chunk_stride, chunk_size);
|
||||
}
|
||||
if (feature_chunk_size < receptive_field_length) {
|
||||
LOG(WARNING) << "utt: " << utt << " skip last "
|
||||
<< feature_chunk_size << " frames, expect is "
|
||||
<< receptive_field_length;
|
||||
break;
|
||||
}
|
||||
|
||||
int32 start = chunk_idx * chunk_stride;
|
||||
for (int row_id = 0; row_id < chunk_size; ++row_id) {
|
||||
kaldi::SubVector<kaldi::BaseFloat> feat_row(feature, start);
|
||||
kaldi::SubVector<kaldi::BaseFloat> feature_chunk_row(
|
||||
feature_chunk.Data() + row_id * feat_dim, feat_dim);
|
||||
|
||||
feature_chunk_row.CopyFromVec(feat_row);
|
||||
++start;
|
||||
}
|
||||
|
||||
// feat to frontend pipeline cache
|
||||
raw_data->Accept(feature_chunk);
|
||||
|
||||
// send data finish signal
|
||||
if (chunk_idx == num_chunks - 1) {
|
||||
raw_data->SetFinished();
|
||||
}
|
||||
|
||||
// get nnet outputs
|
||||
vector<kaldi::BaseFloat> prob;
|
||||
while (decodable->FrameLikelihood(frame_idx, &prob)) {
|
||||
kaldi::Vector<kaldi::BaseFloat> vec_tmp(prob.size());
|
||||
std::memcpy(vec_tmp.Data(),
|
||||
prob.data(),
|
||||
sizeof(kaldi::BaseFloat) * prob.size());
|
||||
prob_vec.push_back(vec_tmp);
|
||||
frame_idx++;
|
||||
}
|
||||
}
|
||||
|
||||
// after processing one utt, reset the decoder state.
|
||||
decodable->Reset();
|
||||
|
||||
if (prob_vec.size() == 0) {
|
||||
// the TokenWriter can not write empty string.
|
||||
++num_err;
|
||||
LOG(WARNING) << " the nnet prob of " << utt << " is empty";
|
||||
continue;
|
||||
}
|
||||
|
||||
// write nnet output
|
||||
kaldi::MatrixIndexT nrow = prob_vec.size();
|
||||
kaldi::MatrixIndexT ncol = prob_vec[0].Dim();
|
||||
LOG(INFO) << "nnet out shape: " << nrow << ", " << ncol;
|
||||
kaldi::Matrix<kaldi::BaseFloat> result(nrow, ncol);
|
||||
for (int32 row_idx = 0; row_idx < nrow; ++row_idx) {
|
||||
for (int32 col_idx = 0; col_idx < ncol; ++col_idx) {
|
||||
result(row_idx, col_idx) = prob_vec[row_idx](col_idx);
|
||||
}
|
||||
}
|
||||
nnet_out_writer.Write(utt, result);
|
||||
|
||||
++num_done;
|
||||
}
|
||||
|
||||
double elapsed = timer.Elapsed();
|
||||
LOG(INFO) << " cost:" << elapsed << " sec";
|
||||
|
||||
LOG(INFO) << "Done " << num_done << " utterances, " << num_err
|
||||
<< " with errors.";
|
||||
return (num_done != 0 ? 0 : 1);
|
||||
}
|
@@ -1,3 +1 @@
-cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
-
 add_subdirectory(websocket)
@@ -1,4 +1,5 @@
 
 add_library(utils
     file_utils.cc
+    math.cc
 )
@@ -0,0 +1,5 @@
#!/bin/bash
set -ex

PYTHON=python3.7
test -d venv || virtualenv -p ${PYTHON} venv