diff --git a/paddleaudio/setup.py b/paddleaudio/setup.py index 6c757d336..930f86e41 100644 --- a/paddleaudio/setup.py +++ b/paddleaudio/setup.py @@ -61,6 +61,7 @@ def remove_version_py(filename='paddleaudio/__init__.py'): if "__version__" not in line: f.write(line) + remove_version_py() write_version_py() diff --git a/paddlespeech/cli/utils.py b/paddlespeech/cli/utils.py index d7dcc90c7..f7d64b9a9 100644 --- a/paddlespeech/cli/utils.py +++ b/paddlespeech/cli/utils.py @@ -192,7 +192,7 @@ class ConfigCache: try: cfg = yaml.load(file, Loader=yaml.FullLoader) self._data.update(cfg) - except: + except Exception as e: self.flush() @property diff --git a/paddlespeech/server/bin/paddlespeech_server.py b/paddlespeech/server/bin/paddlespeech_server.py index 7e7f03b2d..f6a7f4295 100644 --- a/paddlespeech/server/bin/paddlespeech_server.py +++ b/paddlespeech/server/bin/paddlespeech_server.py @@ -174,7 +174,7 @@ class ServerStatsExecutor(): "Failed to get the table of TTS pretrained models supported in the service." ) return False - + elif self.task == 'cls': try: from paddlespeech.cli.cls.infer import pretrained_models diff --git a/speechx/examples/README.md b/speechx/examples/README.md index 28001ad9c..941c4272d 100644 --- a/speechx/examples/README.md +++ b/speechx/examples/README.md @@ -13,4 +13,4 @@ Example to play `decoder`: ``` pushd decoder bash run.sh -``` \ No newline at end of file +``` diff --git a/speechx/examples/feat/linear_spectrogram_main.cc b/speechx/examples/feat/linear_spectrogram_main.cc index f137a52cc..9ed4d6f93 100644 --- a/speechx/examples/feat/linear_spectrogram_main.cc +++ b/speechx/examples/feat/linear_spectrogram_main.cc @@ -164,8 +164,8 @@ int main(int argc, char* argv[]) { // test feature linear_spectorgram: wave --> decibel_normalizer --> hanning // window -->linear_spectrogram --> cmvn int32 num_done = 0, num_err = 0; - //std::unique_ptr data_source(new - //ppspeech::RawDataCache()); + // std::unique_ptr data_source(new + // ppspeech::RawDataCache()); std::unique_ptr data_source( new ppspeech::RawAudioCache()); diff --git a/speechx/speechx/decoder/ctc_beam_search_decoder.cc b/speechx/speechx/decoder/ctc_beam_search_decoder.cc index 582f2c950..84f1453c0 100644 --- a/speechx/speechx/decoder/ctc_beam_search_decoder.cc +++ b/speechx/speechx/decoder/ctc_beam_search_decoder.cc @@ -52,14 +52,14 @@ CTCBeamSearch::CTCBeamSearch(const CTCBeamSearchOptions& opts) } void CTCBeamSearch::Reset() { - //num_frame_decoded_ = 0; - //ResetPrefixes(); + // num_frame_decoded_ = 0; + // ResetPrefixes(); InitDecoder(); } void CTCBeamSearch::InitDecoder() { num_frame_decoded_ = 0; - //ResetPrefixes(); + // ResetPrefixes(); prefixes_.clear(); root_ = std::make_shared(); diff --git a/speechx/speechx/frontend/feature_cache.cc b/speechx/speechx/frontend/feature_cache.cc index b353df164..d23b3a8b2 100644 --- a/speechx/speechx/frontend/feature_cache.cc +++ b/speechx/speechx/frontend/feature_cache.cc @@ -29,8 +29,7 @@ FeatureCache::FeatureCache( base_extractor_ = std::move(base_extractor); } -void FeatureCache::Accept( - const kaldi::VectorBase& inputs) { +void FeatureCache::Accept(const kaldi::VectorBase& inputs) { base_extractor_->Accept(inputs); // feed current data bool result = false; diff --git a/speechx/speechx/frontend/feature_cache.h b/speechx/speechx/frontend/feature_cache.h index 459134ee8..e52d8b298 100644 --- a/speechx/speechx/frontend/feature_cache.h +++ b/speechx/speechx/frontend/feature_cache.h @@ -24,8 +24,7 @@ class FeatureCache : public FeatureExtractorInterface { explicit FeatureCache( int32 max_size = kint16max, std::unique_ptr base_extractor = NULL); - virtual void Accept( - const kaldi::VectorBase& inputs); + virtual void Accept(const kaldi::VectorBase& inputs); // feats dim = num_frames * feature_dim virtual bool Read(kaldi::Vector* feats); // feature cache only cache feature which from base extractor diff --git a/speechx/speechx/frontend/feature_extractor_controller.h b/speechx/speechx/frontend/feature_extractor_controller.h index 5860f391c..0544a1e29 100644 --- a/speechx/speechx/frontend/feature_extractor_controller.h +++ b/speechx/speechx/frontend/feature_extractor_controller.h @@ -11,4 +11,3 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - diff --git a/speechx/speechx/frontend/feature_extractor_controller_impl.h b/speechx/speechx/frontend/feature_extractor_controller_impl.h index 5860f391c..0544a1e29 100644 --- a/speechx/speechx/frontend/feature_extractor_controller_impl.h +++ b/speechx/speechx/frontend/feature_extractor_controller_impl.h @@ -11,4 +11,3 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - diff --git a/speechx/speechx/frontend/feature_extractor_interface.h b/speechx/speechx/frontend/feature_extractor_interface.h index cb6fec1b8..3668fbda7 100644 --- a/speechx/speechx/frontend/feature_extractor_interface.h +++ b/speechx/speechx/frontend/feature_extractor_interface.h @@ -21,10 +21,9 @@ namespace ppspeech { class FeatureExtractorInterface { public: - // accept input data, accept feature or raw waves which decided + // accept input data, accept feature or raw waves which decided // by the base_extractor - virtual void Accept( - const kaldi::VectorBase& inputs) = 0; + virtual void Accept(const kaldi::VectorBase& inputs) = 0; // get the processed result // the length of output = feature_row * feature_dim, // the Matrix is squashed into Vector diff --git a/speechx/speechx/frontend/linear_spectrogram.cc b/speechx/speechx/frontend/linear_spectrogram.cc index b8a18e028..41bc8743a 100644 --- a/speechx/speechx/frontend/linear_spectrogram.cc +++ b/speechx/speechx/frontend/linear_spectrogram.cc @@ -57,8 +57,9 @@ bool LinearSpectrogram::Read(Vector* feats) { if (flag == false || input_feats.Dim() == 0) return false; vector input_feats_vec(input_feats.Dim()); - std::memcpy(input_feats_vec.data(), input_feats.Data(), - input_feats.Dim()*sizeof(BaseFloat)); + std::memcpy(input_feats_vec.data(), + input_feats.Data(), + input_feats.Dim() * sizeof(BaseFloat)); vector> result; Compute(input_feats_vec, result); int32 feat_size = 0; @@ -86,10 +87,10 @@ bool LinearSpectrogram::NumpyFft(vector* v, vector* img) const { Vector v_tmp; v_tmp.Resize(v->size()); - std::memcpy(v_tmp.Data(), v->data(), sizeof(BaseFloat)*(v->size())); + std::memcpy(v_tmp.Data(), v->data(), sizeof(BaseFloat) * (v->size())); RealFft(&v_tmp, true); v->resize(v_tmp.Dim()); - std::memcpy(v->data(), v_tmp.Data(), sizeof(BaseFloat)*(v->size())); + std::memcpy(v->data(), v_tmp.Data(), sizeof(BaseFloat) * (v->size())); real->push_back(v->at(0)); img->push_back(0); diff --git a/speechx/speechx/frontend/linear_spectrogram.h b/speechx/speechx/frontend/linear_spectrogram.h index b2bb414d5..ffdfbbe92 100644 --- a/speechx/speechx/frontend/linear_spectrogram.h +++ b/speechx/speechx/frontend/linear_spectrogram.h @@ -38,16 +38,13 @@ class LinearSpectrogram : public FeatureExtractorInterface { explicit LinearSpectrogram( const LinearSpectrogramOptions& opts, std::unique_ptr base_extractor); - virtual void Accept( - const kaldi::VectorBase& inputs); + virtual void Accept(const kaldi::VectorBase& inputs); virtual bool Read(kaldi::Vector* feats); // the dim_ is the dim of single frame feature virtual size_t Dim() const { return dim_; } virtual void SetFinished() { base_extractor_->SetFinished(); } virtual bool IsFinished() const { return base_extractor_->IsFinished(); } - virtual void Reset() { - base_extractor_->Reset(); - } + virtual void Reset() { base_extractor_->Reset(); } private: void Hanning(std::vector* data) const; diff --git a/speechx/speechx/frontend/normalizer.cc b/speechx/speechx/frontend/normalizer.cc index 285c8e03e..1adddb401 100644 --- a/speechx/speechx/frontend/normalizer.cc +++ b/speechx/speechx/frontend/normalizer.cc @@ -34,14 +34,12 @@ DecibelNormalizer::DecibelNormalizer( dim_ = 1; } -void DecibelNormalizer::Accept( - const kaldi::VectorBase& waves) { +void DecibelNormalizer::Accept(const kaldi::VectorBase& waves) { base_extractor_->Accept(waves); } bool DecibelNormalizer::Read(kaldi::Vector* waves) { - if (base_extractor_->Read(waves) == false || - waves->Dim() == 0) { + if (base_extractor_->Read(waves) == false || waves->Dim() == 0) { return false; } Compute(waves); @@ -88,7 +86,8 @@ bool DecibelNormalizer::Compute(VectorBase* waves) const { item *= std::pow(10.0, gain / 20.0); } - std::memcpy(waves->Data(), samples.data(), sizeof(BaseFloat)*samples.size()); + std::memcpy( + waves->Data(), samples.data(), sizeof(BaseFloat) * samples.size()); return true; } diff --git a/speechx/speechx/frontend/normalizer.h b/speechx/speechx/frontend/normalizer.h index 24542ebad..352d1e167 100644 --- a/speechx/speechx/frontend/normalizer.h +++ b/speechx/speechx/frontend/normalizer.h @@ -45,16 +45,13 @@ class DecibelNormalizer : public FeatureExtractorInterface { explicit DecibelNormalizer( const DecibelNormalizerOptions& opts, std::unique_ptr base_extractor); - virtual void Accept( - const kaldi::VectorBase& waves); + virtual void Accept(const kaldi::VectorBase& waves); virtual bool Read(kaldi::Vector* waves); // noramlize audio, the dim is 1. virtual size_t Dim() const { return dim_; } virtual void SetFinished() { base_extractor_->SetFinished(); } virtual bool IsFinished() const { return base_extractor_->IsFinished(); } - virtual void Reset() { - base_extractor_->Reset(); - } + virtual void Reset() { base_extractor_->Reset(); } private: bool Compute(kaldi::VectorBase* waves) const; @@ -69,8 +66,7 @@ class CMVN : public FeatureExtractorInterface { public: explicit CMVN(std::string cmvn_file, std::unique_ptr base_extractor); - virtual void Accept( - const kaldi::VectorBase& inputs); + virtual void Accept(const kaldi::VectorBase& inputs); // the length of feats = feature_row * feature_dim, // the Matrix is squashed into Vector @@ -79,9 +75,7 @@ class CMVN : public FeatureExtractorInterface { virtual size_t Dim() const { return dim_; } virtual void SetFinished() { base_extractor_->SetFinished(); } virtual bool IsFinished() const { return base_extractor_->IsFinished(); } - virtual void Reset() { - base_extractor_->Reset(); - } + virtual void Reset() { base_extractor_->Reset(); } private: void Compute(kaldi::VectorBase* feats) const; diff --git a/speechx/speechx/frontend/raw_audio.cc b/speechx/speechx/frontend/raw_audio.cc index 0f3d83ec0..21f643628 100644 --- a/speechx/speechx/frontend/raw_audio.cc +++ b/speechx/speechx/frontend/raw_audio.cc @@ -32,7 +32,7 @@ void RawAudioCache::Accept(const VectorBase& waves) { ready_feed_condition_.wait(lock); } for (size_t idx = 0; idx < waves.Dim(); ++idx) { - int32 buffer_idx = (idx + start_) % ring_buffer_.size(); + int32 buffer_idx = (idx + start_) % ring_buffer_.size(); ring_buffer_[buffer_idx] = waves(idx); } data_length_ += waves.Dim(); @@ -44,7 +44,8 @@ bool RawAudioCache::Read(Vector* waves) { std::unique_lock lock(mutex_); while (chunk_size > data_length_) { // when audio is empty and no more data feed - // ready_read_condition will block in dead lock. so replace with timeout_ + // ready_read_condition will block in dead lock. so replace with + // timeout_ // ready_read_condition_.wait(lock); int32 elapsed = static_cast(timer.Elapsed() * 1000); if (elapsed > timeout_) { diff --git a/speechx/speechx/frontend/raw_audio.h b/speechx/speechx/frontend/raw_audio.h index 7726f825b..ce75c137c 100644 --- a/speechx/speechx/frontend/raw_audio.h +++ b/speechx/speechx/frontend/raw_audio.h @@ -35,9 +35,9 @@ class RawAudioCache : public FeatureExtractorInterface { } virtual bool IsFinished() const { return finished_; } virtual void Reset() { - start_ = 0; - data_length_ = 0; - finished_ = false; + start_ = 0; + data_length_ = 0; + finished_ = false; } private: @@ -72,9 +72,7 @@ class RawDataCache : public FeatureExtractorInterface { virtual void SetFinished() { finished_ = true; } virtual bool IsFinished() const { return finished_; } void SetDim(int32 dim) { dim_ = dim; } - virtual void Reset() { - finished_ = true; - } + virtual void Reset() { finished_ = true; } private: kaldi::Vector data_; diff --git a/speechx/speechx/nnet/decodable-itf.h b/speechx/speechx/nnet/decodable-itf.h index 37c3007b3..8e9a5a72a 100644 --- a/speechx/speechx/nnet/decodable-itf.h +++ b/speechx/speechx/nnet/decodable-itf.h @@ -1,3 +1,17 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + // itf/decodable-itf.h // Copyright 2009-2011 Microsoft Corporation; Saarland University; @@ -42,8 +56,10 @@ namespace kaldi { For online decoding, where the features are coming in in real time, it is important to understand the IsLastFrame() and NumFramesReady() functions. - There are two ways these are used: the old online-decoding code, in ../online/, - and the new online-decoding code, in ../online2/. In the old online-decoding + There are two ways these are used: the old online-decoding code, in + ../online/, + and the new online-decoding code, in ../online2/. In the old + online-decoding code, the decoder would do: \code{.cc} for (int frame = 0; !decodable.IsLastFrame(frame); frame++) { @@ -52,13 +68,16 @@ namespace kaldi { \endcode and the call to IsLastFrame would block if the features had not arrived yet. The decodable object would have to know when to terminate the decoding. This - online-decoding mode is still supported, it is what happens when you call, for + online-decoding mode is still supported, it is what happens when you call, + for example, LatticeFasterDecoder::Decode(). We realized that this "blocking" mode of decoding is not very convenient because it forces the program to be multi-threaded and makes it complex to - control endpointing. In the "new" decoding code, you don't call (for example) - LatticeFasterDecoder::Decode(), you call LatticeFasterDecoder::InitDecoding(), + control endpointing. In the "new" decoding code, you don't call (for + example) + LatticeFasterDecoder::Decode(), you call + LatticeFasterDecoder::InitDecoding(), and then each time you get more features, you provide them to the decodable object, and you call LatticeFasterDecoder::AdvanceDecoding(), which does something like this: @@ -68,7 +87,8 @@ namespace kaldi { } \endcode So the decodable object never has IsLastFrame() called. For decoding where - you are starting with a matrix of features, the NumFramesReady() function will + you are starting with a matrix of features, the NumFramesReady() function + will always just return the number of frames in the file, and IsLastFrame() will return true for the last frame. @@ -80,45 +100,54 @@ namespace kaldi { frame of the file once we've decided to terminate decoding. */ class DecodableInterface { - public: - /// Returns the log likelihood, which will be negated in the decoder. - /// The "frame" starts from zero. You should verify that NumFramesReady() > frame - /// before calling this. - virtual BaseFloat LogLikelihood(int32 frame, int32 index) = 0; - - /// Returns true if this is the last frame. Frames are zero-based, so the - /// first frame is zero. IsLastFrame(-1) will return false, unless the file - /// is empty (which is a case that I'm not sure all the code will handle, so - /// be careful). Caution: the behavior of this function in an online setting - /// is being changed somewhat. In future it may return false in cases where - /// we haven't yet decided to terminate decoding, but later true if we decide - /// to terminate decoding. The plan in future is to rely more on - /// NumFramesReady(), and in future, IsLastFrame() would always return false - /// in an online-decoding setting, and would only return true in a - /// decoding-from-matrix setting where we want to allow the last delta or LDA - /// features to be flushed out for compatibility with the baseline setup. - virtual bool IsLastFrame(int32 frame) const = 0; - - /// The call NumFramesReady() will return the number of frames currently available - /// for this decodable object. This is for use in setups where you don't want the - /// decoder to block while waiting for input. This is newly added as of Jan 2014, - /// and I hope, going forward, to rely on this mechanism more than IsLastFrame to - /// know when to stop decoding. - virtual int32 NumFramesReady() const { - KALDI_ERR << "NumFramesReady() not implemented for this decodable type."; - return -1; - } - - /// Returns the number of states in the acoustic model - /// (they will be indexed one-based, i.e. from 1 to NumIndices(); - /// this is for compatibility with OpenFst). - virtual int32 NumIndices() const = 0; - - virtual bool FrameLogLikelihood(int32 frame, - std::vector* likelihood) = 0; - - - virtual ~DecodableInterface() {} + public: + /// Returns the log likelihood, which will be negated in the decoder. + /// The "frame" starts from zero. You should verify that NumFramesReady() > + /// frame + /// before calling this. + virtual BaseFloat LogLikelihood(int32 frame, int32 index) = 0; + + /// Returns true if this is the last frame. Frames are zero-based, so the + /// first frame is zero. IsLastFrame(-1) will return false, unless the file + /// is empty (which is a case that I'm not sure all the code will handle, so + /// be careful). Caution: the behavior of this function in an online + /// setting + /// is being changed somewhat. In future it may return false in cases where + /// we haven't yet decided to terminate decoding, but later true if we + /// decide + /// to terminate decoding. The plan in future is to rely more on + /// NumFramesReady(), and in future, IsLastFrame() would always return false + /// in an online-decoding setting, and would only return true in a + /// decoding-from-matrix setting where we want to allow the last delta or + /// LDA + /// features to be flushed out for compatibility with the baseline setup. + virtual bool IsLastFrame(int32 frame) const = 0; + + /// The call NumFramesReady() will return the number of frames currently + /// available + /// for this decodable object. This is for use in setups where you don't + /// want the + /// decoder to block while waiting for input. This is newly added as of Jan + /// 2014, + /// and I hope, going forward, to rely on this mechanism more than + /// IsLastFrame to + /// know when to stop decoding. + virtual int32 NumFramesReady() const { + KALDI_ERR + << "NumFramesReady() not implemented for this decodable type."; + return -1; + } + + /// Returns the number of states in the acoustic model + /// (they will be indexed one-based, i.e. from 1 to NumIndices(); + /// this is for compatibility with OpenFst). + virtual int32 NumIndices() const = 0; + + virtual bool FrameLogLikelihood( + int32 frame, std::vector* likelihood) = 0; + + + virtual ~DecodableInterface() {} }; /// @} } // namespace Kaldi diff --git a/speechx/speechx/nnet/decodable.cc b/speechx/speechx/nnet/decodable.cc index 3cc07f38b..6c0909ca1 100644 --- a/speechx/speechx/nnet/decodable.cc +++ b/speechx/speechx/nnet/decodable.cc @@ -23,10 +23,7 @@ using kaldi::Vector; Decodable::Decodable(const std::shared_ptr& nnet, const std::shared_ptr& frontend) - : frontend_(frontend), - nnet_(nnet), - frame_offset_(0), - frames_ready_(0) {} + : frontend_(frontend), nnet_(nnet), frame_offset_(0), frames_ready_(0) {} void Decodable::Acceptlikelihood(const Matrix& likelihood) { frames_ready_ += likelihood.NumRows(); @@ -83,7 +80,7 @@ void Decodable::Reset() { frontend_->Reset(); nnet_->Reset(); frame_offset_ = 0; - frames_ready_ = 0; + frames_ready_ = 0; } } // namespace ppspeech \ No newline at end of file diff --git a/utils/generate_infer_yaml.py b/utils/generate_infer_yaml.py index b8a797ad9..02ff262b0 100644 --- a/utils/generate_infer_yaml.py +++ b/utils/generate_infer_yaml.py @@ -148,7 +148,7 @@ def merge_configs( for item in remove_train_list: try: remove_config_part(config, [item]) - except: + except Exception as e: print(item + " " + "can not be removed") # Save the config