From 854b63b519e58b73ba1afde14c87d8fca998d7cb Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 17 Mar 2022 03:27:10 +0000 Subject: [PATCH 1/2] speechx doc --- speechx/README.md | 45 ++++++++++++++++++++++++++++++++++---- speechx/TODO.md | 3 --- speechx/examples/README.md | 13 ++++++++++- 3 files changed, 53 insertions(+), 8 deletions(-) delete mode 100644 speechx/TODO.md diff --git a/speechx/README.md b/speechx/README.md index fb6ff2fc..7d73b61c 100644 --- a/speechx/README.md +++ b/speechx/README.md @@ -1,24 +1,61 @@ -# SpeechX -- Speech Inference All in One +# SpeechX -- All in One Speech Task Inference -> Test under `Ubuntu 16.04.7 LTS`. +## Environment + +We develop under: +* docker - registry.baidubce.com/paddlepaddle/paddle:2.1.1-gpu-cuda10.2-cudnn7 +* os - Ubuntu 16.04.7 LTS +* gcc/g++ - 8.2.0 +* cmake - 3.16.0 + +> We make sure all things work fine under docker, and recommend using it to develop and deploy. + +* [How to Install Docker](https://docs.docker.com/engine/install/) +* [A Docker Tutorial for Beginners](https://docker-curriculum.com/) +* [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/overview.html) ## Build +1. First, launch the docker container. + +``` +nvidia-docker run --privileged --net=host --ipc=host -it --rm -v $PWD:/workspace --name=dev registry.baidubce.com/paddlepaddle/paddle:2.1.1-gpu-cuda10.2-cudnn7 /bin/bash ``` + +* More `Paddle` docker images can be found [here](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html). + +* If you only want to work on CPU, please download the corresponding [image](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html), and use `docker` instead of `nvidia-docker`. + + +2. Build `speechx` and `examples`. + +``` +pushd /path/to/speechx ./build.sh ``` -## Valgrind +3. Go to `examples` to have fun. + +For more details, please see `README.md` under `examples`. 
+ + +## Valgrind (Optional) > If using docker please check `--privileged` is set when `docker run`. -1. Fatal error at startup: a function redirection which is mandatory for this platform-tool combination cannot be set up +* Fatal error at startup: `a function redirection which is mandatory for this platform-tool combination cannot be set up` ``` apt-get install libc6-dbg ``` +* Install + ``` pushd tools ./setup_valgrind.sh popd ``` + +## TODO + +* DecibelNormalizer: there is a slight difference between offline and online db norm. The online db norm reads the feature chunk by chunk, so its feature size differs from that of the offline db norm. In normalizer.cc:73, the samples.size() is different, which causes the results to differ. diff --git a/speechx/TODO.md b/speechx/TODO.md deleted file mode 100644 index d65adaa2..00000000 --- a/speechx/TODO.md +++ /dev/null @@ -1,3 +0,0 @@ -# TODO - -* DecibelNormalizer: there is a little bit difference between offline and online db norm. The computation of online db norm read feature chunk by chunk, which causes the feature size is different with offline db norm. In normalizer.cc:73, the samples.size() is different, which causes the difference of result. diff --git a/speechx/examples/README.md b/speechx/examples/README.md index fde9a361..28001ad9 100644 --- a/speechx/examples/README.md +++ b/speechx/examples/README.md @@ -1,5 +1,16 @@ # Examples -* decoder - offline decoder +* decoder - online decoder to work as offline * feat - mfcc, linear * nnet - ds2 nn + +## How to run + +`run.sh` is the entry point. 
+ +Example to play `decoder`: + +``` +pushd decoder +bash run.sh +``` \ No newline at end of file From 6abc5d9f7eca30f3d1f96cdf4291b45b47bb0280 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 17 Mar 2022 03:27:23 +0000 Subject: [PATCH 2/2] format --- paddleaudio/setup.py | 1 + paddlespeech/cli/utils.py | 2 +- .../server/bin/paddlespeech_server.py | 2 +- speechx/examples/README.md | 2 +- .../examples/feat/linear_spectrogram_main.cc | 4 +- .../decoder/ctc_beam_search_decoder.cc | 6 +- speechx/speechx/frontend/feature_cache.cc | 3 +- speechx/speechx/frontend/feature_cache.h | 3 +- .../frontend/feature_extractor_controller.h | 1 - .../feature_extractor_controller_impl.h | 1 - .../frontend/feature_extractor_interface.h | 5 +- .../speechx/frontend/linear_spectrogram.cc | 9 +- speechx/speechx/frontend/linear_spectrogram.h | 7 +- speechx/speechx/frontend/normalizer.cc | 9 +- speechx/speechx/frontend/normalizer.h | 14 +-- speechx/speechx/frontend/raw_audio.cc | 5 +- speechx/speechx/frontend/raw_audio.h | 10 +- speechx/speechx/nnet/decodable-itf.h | 119 +++++++++++------- speechx/speechx/nnet/decodable.cc | 7 +- utils/generate_infer_yaml.py | 2 +- 20 files changed, 112 insertions(+), 100 deletions(-) diff --git a/paddleaudio/setup.py b/paddleaudio/setup.py index 6c757d33..930f86e4 100644 --- a/paddleaudio/setup.py +++ b/paddleaudio/setup.py @@ -61,6 +61,7 @@ def remove_version_py(filename='paddleaudio/__init__.py'): if "__version__" not in line: f.write(line) + remove_version_py() write_version_py() diff --git a/paddlespeech/cli/utils.py b/paddlespeech/cli/utils.py index d7dcc90c..f7d64b9a 100644 --- a/paddlespeech/cli/utils.py +++ b/paddlespeech/cli/utils.py @@ -192,7 +192,7 @@ class ConfigCache: try: cfg = yaml.load(file, Loader=yaml.FullLoader) self._data.update(cfg) - except: + except Exception as e: self.flush() @property diff --git a/paddlespeech/server/bin/paddlespeech_server.py b/paddlespeech/server/bin/paddlespeech_server.py index 7e7f03b2..f6a7f429 100644 --- 
a/paddlespeech/server/bin/paddlespeech_server.py +++ b/paddlespeech/server/bin/paddlespeech_server.py @@ -174,7 +174,7 @@ class ServerStatsExecutor(): "Failed to get the table of TTS pretrained models supported in the service." ) return False - + elif self.task == 'cls': try: from paddlespeech.cli.cls.infer import pretrained_models diff --git a/speechx/examples/README.md b/speechx/examples/README.md index 28001ad9..941c4272 100644 --- a/speechx/examples/README.md +++ b/speechx/examples/README.md @@ -13,4 +13,4 @@ Example to play `decoder`: ``` pushd decoder bash run.sh -``` \ No newline at end of file +``` diff --git a/speechx/examples/feat/linear_spectrogram_main.cc b/speechx/examples/feat/linear_spectrogram_main.cc index f137a52c..9ed4d6f9 100644 --- a/speechx/examples/feat/linear_spectrogram_main.cc +++ b/speechx/examples/feat/linear_spectrogram_main.cc @@ -164,8 +164,8 @@ int main(int argc, char* argv[]) { // test feature linear_spectorgram: wave --> decibel_normalizer --> hanning // window -->linear_spectrogram --> cmvn int32 num_done = 0, num_err = 0; - //std::unique_ptr data_source(new - //ppspeech::RawDataCache()); + // std::unique_ptr data_source(new + // ppspeech::RawDataCache()); std::unique_ptr data_source( new ppspeech::RawAudioCache()); diff --git a/speechx/speechx/decoder/ctc_beam_search_decoder.cc b/speechx/speechx/decoder/ctc_beam_search_decoder.cc index 582f2c95..84f1453c 100644 --- a/speechx/speechx/decoder/ctc_beam_search_decoder.cc +++ b/speechx/speechx/decoder/ctc_beam_search_decoder.cc @@ -52,14 +52,14 @@ CTCBeamSearch::CTCBeamSearch(const CTCBeamSearchOptions& opts) } void CTCBeamSearch::Reset() { - //num_frame_decoded_ = 0; - //ResetPrefixes(); + // num_frame_decoded_ = 0; + // ResetPrefixes(); InitDecoder(); } void CTCBeamSearch::InitDecoder() { num_frame_decoded_ = 0; - //ResetPrefixes(); + // ResetPrefixes(); prefixes_.clear(); root_ = std::make_shared(); diff --git a/speechx/speechx/frontend/feature_cache.cc 
b/speechx/speechx/frontend/feature_cache.cc index b353df16..d23b3a8b 100644 --- a/speechx/speechx/frontend/feature_cache.cc +++ b/speechx/speechx/frontend/feature_cache.cc @@ -29,8 +29,7 @@ FeatureCache::FeatureCache( base_extractor_ = std::move(base_extractor); } -void FeatureCache::Accept( - const kaldi::VectorBase& inputs) { +void FeatureCache::Accept(const kaldi::VectorBase& inputs) { base_extractor_->Accept(inputs); // feed current data bool result = false; diff --git a/speechx/speechx/frontend/feature_cache.h b/speechx/speechx/frontend/feature_cache.h index 459134ee..e52d8b29 100644 --- a/speechx/speechx/frontend/feature_cache.h +++ b/speechx/speechx/frontend/feature_cache.h @@ -24,8 +24,7 @@ class FeatureCache : public FeatureExtractorInterface { explicit FeatureCache( int32 max_size = kint16max, std::unique_ptr base_extractor = NULL); - virtual void Accept( - const kaldi::VectorBase& inputs); + virtual void Accept(const kaldi::VectorBase& inputs); // feats dim = num_frames * feature_dim virtual bool Read(kaldi::Vector* feats); // feature cache only cache feature which from base extractor diff --git a/speechx/speechx/frontend/feature_extractor_controller.h b/speechx/speechx/frontend/feature_extractor_controller.h index 5860f391..0544a1e2 100644 --- a/speechx/speechx/frontend/feature_extractor_controller.h +++ b/speechx/speechx/frontend/feature_extractor_controller.h @@ -11,4 +11,3 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - diff --git a/speechx/speechx/frontend/feature_extractor_controller_impl.h b/speechx/speechx/frontend/feature_extractor_controller_impl.h index 5860f391..0544a1e2 100644 --- a/speechx/speechx/frontend/feature_extractor_controller_impl.h +++ b/speechx/speechx/frontend/feature_extractor_controller_impl.h @@ -11,4 +11,3 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. - diff --git a/speechx/speechx/frontend/feature_extractor_interface.h b/speechx/speechx/frontend/feature_extractor_interface.h index cb6fec1b..3668fbda 100644 --- a/speechx/speechx/frontend/feature_extractor_interface.h +++ b/speechx/speechx/frontend/feature_extractor_interface.h @@ -21,10 +21,9 @@ namespace ppspeech { class FeatureExtractorInterface { public: - // accept input data, accept feature or raw waves which decided + // accept input data, accept feature or raw waves which decided // by the base_extractor - virtual void Accept( - const kaldi::VectorBase& inputs) = 0; + virtual void Accept(const kaldi::VectorBase& inputs) = 0; // get the processed result // the length of output = feature_row * feature_dim, // the Matrix is squashed into Vector diff --git a/speechx/speechx/frontend/linear_spectrogram.cc b/speechx/speechx/frontend/linear_spectrogram.cc index b8a18e02..41bc8743 100644 --- a/speechx/speechx/frontend/linear_spectrogram.cc +++ b/speechx/speechx/frontend/linear_spectrogram.cc @@ -57,8 +57,9 @@ bool LinearSpectrogram::Read(Vector* feats) { if (flag == false || input_feats.Dim() == 0) return false; vector input_feats_vec(input_feats.Dim()); - std::memcpy(input_feats_vec.data(), input_feats.Data(), - input_feats.Dim()*sizeof(BaseFloat)); + std::memcpy(input_feats_vec.data(), + input_feats.Data(), + input_feats.Dim() * sizeof(BaseFloat)); vector> result; Compute(input_feats_vec, result); int32 feat_size = 0; @@ -86,10 +87,10 @@ bool LinearSpectrogram::NumpyFft(vector* v, vector* img) const { Vector v_tmp; v_tmp.Resize(v->size()); - std::memcpy(v_tmp.Data(), v->data(), sizeof(BaseFloat)*(v->size())); + std::memcpy(v_tmp.Data(), v->data(), sizeof(BaseFloat) * (v->size())); RealFft(&v_tmp, true); v->resize(v_tmp.Dim()); - std::memcpy(v->data(), v_tmp.Data(), sizeof(BaseFloat)*(v->size())); + std::memcpy(v->data(), v_tmp.Data(), sizeof(BaseFloat) * 
(v->size())); real->push_back(v->at(0)); img->push_back(0); diff --git a/speechx/speechx/frontend/linear_spectrogram.h b/speechx/speechx/frontend/linear_spectrogram.h index b2bb414d..ffdfbbe9 100644 --- a/speechx/speechx/frontend/linear_spectrogram.h +++ b/speechx/speechx/frontend/linear_spectrogram.h @@ -38,16 +38,13 @@ class LinearSpectrogram : public FeatureExtractorInterface { explicit LinearSpectrogram( const LinearSpectrogramOptions& opts, std::unique_ptr base_extractor); - virtual void Accept( - const kaldi::VectorBase& inputs); + virtual void Accept(const kaldi::VectorBase& inputs); virtual bool Read(kaldi::Vector* feats); // the dim_ is the dim of single frame feature virtual size_t Dim() const { return dim_; } virtual void SetFinished() { base_extractor_->SetFinished(); } virtual bool IsFinished() const { return base_extractor_->IsFinished(); } - virtual void Reset() { - base_extractor_->Reset(); - } + virtual void Reset() { base_extractor_->Reset(); } private: void Hanning(std::vector* data) const; diff --git a/speechx/speechx/frontend/normalizer.cc b/speechx/speechx/frontend/normalizer.cc index 285c8e03..1adddb40 100644 --- a/speechx/speechx/frontend/normalizer.cc +++ b/speechx/speechx/frontend/normalizer.cc @@ -34,14 +34,12 @@ DecibelNormalizer::DecibelNormalizer( dim_ = 1; } -void DecibelNormalizer::Accept( - const kaldi::VectorBase& waves) { +void DecibelNormalizer::Accept(const kaldi::VectorBase& waves) { base_extractor_->Accept(waves); } bool DecibelNormalizer::Read(kaldi::Vector* waves) { - if (base_extractor_->Read(waves) == false || - waves->Dim() == 0) { + if (base_extractor_->Read(waves) == false || waves->Dim() == 0) { return false; } Compute(waves); @@ -88,7 +86,8 @@ bool DecibelNormalizer::Compute(VectorBase* waves) const { item *= std::pow(10.0, gain / 20.0); } - std::memcpy(waves->Data(), samples.data(), sizeof(BaseFloat)*samples.size()); + std::memcpy( + waves->Data(), samples.data(), sizeof(BaseFloat) * samples.size()); return true; } 
diff --git a/speechx/speechx/frontend/normalizer.h b/speechx/speechx/frontend/normalizer.h index 24542eba..352d1e16 100644 --- a/speechx/speechx/frontend/normalizer.h +++ b/speechx/speechx/frontend/normalizer.h @@ -45,16 +45,13 @@ class DecibelNormalizer : public FeatureExtractorInterface { explicit DecibelNormalizer( const DecibelNormalizerOptions& opts, std::unique_ptr base_extractor); - virtual void Accept( - const kaldi::VectorBase& waves); + virtual void Accept(const kaldi::VectorBase& waves); virtual bool Read(kaldi::Vector* waves); // noramlize audio, the dim is 1. virtual size_t Dim() const { return dim_; } virtual void SetFinished() { base_extractor_->SetFinished(); } virtual bool IsFinished() const { return base_extractor_->IsFinished(); } - virtual void Reset() { - base_extractor_->Reset(); - } + virtual void Reset() { base_extractor_->Reset(); } private: bool Compute(kaldi::VectorBase* waves) const; @@ -69,8 +66,7 @@ class CMVN : public FeatureExtractorInterface { public: explicit CMVN(std::string cmvn_file, std::unique_ptr base_extractor); - virtual void Accept( - const kaldi::VectorBase& inputs); + virtual void Accept(const kaldi::VectorBase& inputs); // the length of feats = feature_row * feature_dim, // the Matrix is squashed into Vector @@ -79,9 +75,7 @@ class CMVN : public FeatureExtractorInterface { virtual size_t Dim() const { return dim_; } virtual void SetFinished() { base_extractor_->SetFinished(); } virtual bool IsFinished() const { return base_extractor_->IsFinished(); } - virtual void Reset() { - base_extractor_->Reset(); - } + virtual void Reset() { base_extractor_->Reset(); } private: void Compute(kaldi::VectorBase* feats) const; diff --git a/speechx/speechx/frontend/raw_audio.cc b/speechx/speechx/frontend/raw_audio.cc index 0f3d83ec..21f64362 100644 --- a/speechx/speechx/frontend/raw_audio.cc +++ b/speechx/speechx/frontend/raw_audio.cc @@ -32,7 +32,7 @@ void RawAudioCache::Accept(const VectorBase& waves) { 
ready_feed_condition_.wait(lock); } for (size_t idx = 0; idx < waves.Dim(); ++idx) { - int32 buffer_idx = (idx + start_) % ring_buffer_.size(); + int32 buffer_idx = (idx + start_) % ring_buffer_.size(); ring_buffer_[buffer_idx] = waves(idx); } data_length_ += waves.Dim(); @@ -44,7 +44,8 @@ bool RawAudioCache::Read(Vector* waves) { std::unique_lock lock(mutex_); while (chunk_size > data_length_) { // when audio is empty and no more data feed - // ready_read_condition will block in dead lock. so replace with timeout_ + // ready_read_condition will block in dead lock. so replace with + // timeout_ // ready_read_condition_.wait(lock); int32 elapsed = static_cast(timer.Elapsed() * 1000); if (elapsed > timeout_) { diff --git a/speechx/speechx/frontend/raw_audio.h b/speechx/speechx/frontend/raw_audio.h index 7726f825..ce75c137 100644 --- a/speechx/speechx/frontend/raw_audio.h +++ b/speechx/speechx/frontend/raw_audio.h @@ -35,9 +35,9 @@ class RawAudioCache : public FeatureExtractorInterface { } virtual bool IsFinished() const { return finished_; } virtual void Reset() { - start_ = 0; - data_length_ = 0; - finished_ = false; + start_ = 0; + data_length_ = 0; + finished_ = false; } private: @@ -72,9 +72,7 @@ class RawDataCache : public FeatureExtractorInterface { virtual void SetFinished() { finished_ = true; } virtual bool IsFinished() const { return finished_; } void SetDim(int32 dim) { dim_ = dim; } - virtual void Reset() { - finished_ = true; - } + virtual void Reset() { finished_ = true; } private: kaldi::Vector data_; diff --git a/speechx/speechx/nnet/decodable-itf.h b/speechx/speechx/nnet/decodable-itf.h index 37c3007b..8e9a5a72 100644 --- a/speechx/speechx/nnet/decodable-itf.h +++ b/speechx/speechx/nnet/decodable-itf.h @@ -1,3 +1,17 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + // itf/decodable-itf.h // Copyright 2009-2011 Microsoft Corporation; Saarland University; @@ -42,8 +56,10 @@ namespace kaldi { For online decoding, where the features are coming in in real time, it is important to understand the IsLastFrame() and NumFramesReady() functions. - There are two ways these are used: the old online-decoding code, in ../online/, - and the new online-decoding code, in ../online2/. In the old online-decoding + There are two ways these are used: the old online-decoding code, in + ../online/, + and the new online-decoding code, in ../online2/. In the old + online-decoding code, the decoder would do: \code{.cc} for (int frame = 0; !decodable.IsLastFrame(frame); frame++) { @@ -52,13 +68,16 @@ namespace kaldi { \endcode and the call to IsLastFrame would block if the features had not arrived yet. The decodable object would have to know when to terminate the decoding. This - online-decoding mode is still supported, it is what happens when you call, for + online-decoding mode is still supported, it is what happens when you call, + for example, LatticeFasterDecoder::Decode(). We realized that this "blocking" mode of decoding is not very convenient because it forces the program to be multi-threaded and makes it complex to - control endpointing. In the "new" decoding code, you don't call (for example) - LatticeFasterDecoder::Decode(), you call LatticeFasterDecoder::InitDecoding(), + control endpointing. 
In the "new" decoding code, you don't call (for + example) + LatticeFasterDecoder::Decode(), you call + LatticeFasterDecoder::InitDecoding(), and then each time you get more features, you provide them to the decodable object, and you call LatticeFasterDecoder::AdvanceDecoding(), which does something like this: @@ -68,7 +87,8 @@ namespace kaldi { } \endcode So the decodable object never has IsLastFrame() called. For decoding where - you are starting with a matrix of features, the NumFramesReady() function will + you are starting with a matrix of features, the NumFramesReady() function + will always just return the number of frames in the file, and IsLastFrame() will return true for the last frame. @@ -80,45 +100,54 @@ namespace kaldi { frame of the file once we've decided to terminate decoding. */ class DecodableInterface { - public: - /// Returns the log likelihood, which will be negated in the decoder. - /// The "frame" starts from zero. You should verify that NumFramesReady() > frame - /// before calling this. - virtual BaseFloat LogLikelihood(int32 frame, int32 index) = 0; - - /// Returns true if this is the last frame. Frames are zero-based, so the - /// first frame is zero. IsLastFrame(-1) will return false, unless the file - /// is empty (which is a case that I'm not sure all the code will handle, so - /// be careful). Caution: the behavior of this function in an online setting - /// is being changed somewhat. In future it may return false in cases where - /// we haven't yet decided to terminate decoding, but later true if we decide - /// to terminate decoding. The plan in future is to rely more on - /// NumFramesReady(), and in future, IsLastFrame() would always return false - /// in an online-decoding setting, and would only return true in a - /// decoding-from-matrix setting where we want to allow the last delta or LDA - /// features to be flushed out for compatibility with the baseline setup. 
- virtual bool IsLastFrame(int32 frame) const = 0; - - /// The call NumFramesReady() will return the number of frames currently available - /// for this decodable object. This is for use in setups where you don't want the - /// decoder to block while waiting for input. This is newly added as of Jan 2014, - /// and I hope, going forward, to rely on this mechanism more than IsLastFrame to - /// know when to stop decoding. - virtual int32 NumFramesReady() const { - KALDI_ERR << "NumFramesReady() not implemented for this decodable type."; - return -1; - } - - /// Returns the number of states in the acoustic model - /// (they will be indexed one-based, i.e. from 1 to NumIndices(); - /// this is for compatibility with OpenFst). - virtual int32 NumIndices() const = 0; - - virtual bool FrameLogLikelihood(int32 frame, - std::vector* likelihood) = 0; - - - virtual ~DecodableInterface() {} + public: + /// Returns the log likelihood, which will be negated in the decoder. + /// The "frame" starts from zero. You should verify that NumFramesReady() > + /// frame + /// before calling this. + virtual BaseFloat LogLikelihood(int32 frame, int32 index) = 0; + + /// Returns true if this is the last frame. Frames are zero-based, so the + /// first frame is zero. IsLastFrame(-1) will return false, unless the file + /// is empty (which is a case that I'm not sure all the code will handle, so + /// be careful). Caution: the behavior of this function in an online + /// setting + /// is being changed somewhat. In future it may return false in cases where + /// we haven't yet decided to terminate decoding, but later true if we + /// decide + /// to terminate decoding. 
The plan in future is to rely more on + /// NumFramesReady(), and in future, IsLastFrame() would always return false + /// in an online-decoding setting, and would only return true in a + /// decoding-from-matrix setting where we want to allow the last delta or + /// LDA + /// features to be flushed out for compatibility with the baseline setup. + virtual bool IsLastFrame(int32 frame) const = 0; + + /// The call NumFramesReady() will return the number of frames currently + /// available + /// for this decodable object. This is for use in setups where you don't + /// want the + /// decoder to block while waiting for input. This is newly added as of Jan + /// 2014, + /// and I hope, going forward, to rely on this mechanism more than + /// IsLastFrame to + /// know when to stop decoding. + virtual int32 NumFramesReady() const { + KALDI_ERR + << "NumFramesReady() not implemented for this decodable type."; + return -1; + } + + /// Returns the number of states in the acoustic model + /// (they will be indexed one-based, i.e. from 1 to NumIndices(); + /// this is for compatibility with OpenFst). 
+ virtual int32 NumIndices() const = 0; + + virtual bool FrameLogLikelihood( + int32 frame, std::vector* likelihood) = 0; + + + virtual ~DecodableInterface() {} }; /// @} } // namespace Kaldi diff --git a/speechx/speechx/nnet/decodable.cc b/speechx/speechx/nnet/decodable.cc index 3cc07f38..6c0909ca 100644 --- a/speechx/speechx/nnet/decodable.cc +++ b/speechx/speechx/nnet/decodable.cc @@ -23,10 +23,7 @@ using kaldi::Vector; Decodable::Decodable(const std::shared_ptr& nnet, const std::shared_ptr& frontend) - : frontend_(frontend), - nnet_(nnet), - frame_offset_(0), - frames_ready_(0) {} + : frontend_(frontend), nnet_(nnet), frame_offset_(0), frames_ready_(0) {} void Decodable::Acceptlikelihood(const Matrix& likelihood) { frames_ready_ += likelihood.NumRows(); @@ -83,7 +80,7 @@ void Decodable::Reset() { frontend_->Reset(); nnet_->Reset(); frame_offset_ = 0; - frames_ready_ = 0; + frames_ready_ = 0; } } // namespace ppspeech \ No newline at end of file diff --git a/utils/generate_infer_yaml.py b/utils/generate_infer_yaml.py index b8a797ad..02ff262b 100644 --- a/utils/generate_infer_yaml.py +++ b/utils/generate_infer_yaml.py @@ -148,7 +148,7 @@ def merge_configs( for item in remove_train_list: try: remove_config_part(config, [item]) - except: + except Exception as e: print(item + " " + "can not be removed") # Save the config