format

4 years ago · 6abc5d9f7e
parent 854b63b519
commit 6abc5d9f7e
20 changed files with 112 additions and 100 deletions
--- a/paddleaudio/setup.py
+++ b/paddleaudio/setup.py
@ -61,6 +61,7 @@ def remove_version_py(filename='paddleaudio/__init__.py'):
            if "__version__" not in line:
                f.write(line)
 remove_version_py()
 write_version_py()
--- a/paddlespeech/cli/utils.py
+++ b/paddlespeech/cli/utils.py
@ -192,7 +192,7 @@ class ConfigCache:
            try:
                cfg = yaml.load(file, Loader=yaml.FullLoader)
                self._data.update(cfg)
-            except:
+            except Exception as e:
                self.flush()
    @property
--- a/speechx/examples/feat/linear_spectrogram_main.cc
+++ b/speechx/examples/feat/linear_spectrogram_main.cc
@ -164,8 +164,8 @@ int main(int argc, char* argv[]) {
    // test feature linear_spectrogram: wave --> decibel_normalizer --> hanning
    // window --> linear_spectrogram --> cmvn
    int32 num_done = 0, num_err = 0;
-    //std::unique_ptr<ppspeech::FeatureExtractorInterface> data_source(new
+    // std::unique_ptr<ppspeech::FeatureExtractorInterface> data_source(new
-     //ppspeech::RawDataCache());
+    // ppspeech::RawDataCache());
    std::unique_ptr<ppspeech::FeatureExtractorInterface> data_source(
        new ppspeech::RawAudioCache());
--- a/speechx/speechx/decoder/ctc_beam_search_decoder.cc
+++ b/speechx/speechx/decoder/ctc_beam_search_decoder.cc
@ -52,14 +52,14 @@ CTCBeamSearch::CTCBeamSearch(const CTCBeamSearchOptions& opts)
 }
 void CTCBeamSearch::Reset() {
-    //num_frame_decoded_ = 0;
+    // num_frame_decoded_ = 0;
-    //ResetPrefixes();
+    // ResetPrefixes();
    InitDecoder();
 }
 void CTCBeamSearch::InitDecoder() {
    num_frame_decoded_ = 0;
-    //ResetPrefixes();
+    // ResetPrefixes();
    prefixes_.clear();
    root_ = std::make_shared<PathTrie>();
--- a/speechx/speechx/frontend/feature_cache.cc
+++ b/speechx/speechx/frontend/feature_cache.cc
@ -29,8 +29,7 @@ FeatureCache::FeatureCache(
    base_extractor_ = std::move(base_extractor);
 }
-void FeatureCache::Accept(
+void FeatureCache::Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs) {
    const kaldi::VectorBase<kaldi::BaseFloat>& inputs) {
    base_extractor_->Accept(inputs);
    // feed current data
    bool result = false;
--- a/speechx/speechx/frontend/feature_cache.h
+++ b/speechx/speechx/frontend/feature_cache.h
@ -24,8 +24,7 @@ class FeatureCache : public FeatureExtractorInterface {
    explicit FeatureCache(
        int32 max_size = kint16max,
        std::unique_ptr<FeatureExtractorInterface> base_extractor = NULL);
-    virtual void Accept(
+    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
        const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
    // feats dim = num_frames * feature_dim
    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
    // the feature cache only caches features that come from the base extractor
--- a/speechx/speechx/frontend/feature_extractor_controller.h
+++ b/speechx/speechx/frontend/feature_extractor_controller.h
@ -11,4 +11,3 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
--- a/speechx/speechx/frontend/feature_extractor_controller_impl.h
+++ b/speechx/speechx/frontend/feature_extractor_controller_impl.h
@ -11,4 +11,3 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
--- a/speechx/speechx/frontend/feature_extractor_interface.h
+++ b/speechx/speechx/frontend/feature_extractor_interface.h
@ -23,8 +23,7 @@ class FeatureExtractorInterface {
  public:
    // accept input data: either features or raw waves, as decided
    // by the base_extractor
-    virtual void Accept(
+    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs) = 0;
        const kaldi::VectorBase<kaldi::BaseFloat>& inputs) = 0;
    // get the processed result
    // the length of output = feature_row * feature_dim,
    // the Matrix is squashed into Vector
--- a/speechx/speechx/frontend/linear_spectrogram.cc
+++ b/speechx/speechx/frontend/linear_spectrogram.cc
@ -57,8 +57,9 @@ bool LinearSpectrogram::Read(Vector<BaseFloat>* feats) {
    if (flag == false || input_feats.Dim() == 0) return false;
    vector<BaseFloat> input_feats_vec(input_feats.Dim());
-    std::memcpy(input_feats_vec.data(), input_feats.Data(), 
+    std::memcpy(input_feats_vec.data(),
-        input_feats.Dim()*sizeof(BaseFloat));
+                input_feats.Data(),
                input_feats.Dim() * sizeof(BaseFloat));
    vector<vector<BaseFloat>> result;
    Compute(input_feats_vec, result);
    int32 feat_size = 0;
@ -86,10 +87,10 @@ bool LinearSpectrogram::NumpyFft(vector<BaseFloat>* v,
                                 vector<BaseFloat>* img) const {
    Vector<BaseFloat> v_tmp;
    v_tmp.Resize(v->size());
-    std::memcpy(v_tmp.Data(), v->data(), sizeof(BaseFloat)*(v->size()));
+    std::memcpy(v_tmp.Data(), v->data(), sizeof(BaseFloat) * (v->size()));
    RealFft(&v_tmp, true);
    v->resize(v_tmp.Dim());
-    std::memcpy(v->data(), v_tmp.Data(), sizeof(BaseFloat)*(v->size()));
+    std::memcpy(v->data(), v_tmp.Data(), sizeof(BaseFloat) * (v->size()));
    real->push_back(v->at(0));
    img->push_back(0);
--- a/speechx/speechx/frontend/linear_spectrogram.h
+++ b/speechx/speechx/frontend/linear_spectrogram.h
@ -38,16 +38,13 @@ class LinearSpectrogram : public FeatureExtractorInterface {
    explicit LinearSpectrogram(
        const LinearSpectrogramOptions& opts,
        std::unique_ptr<FeatureExtractorInterface> base_extractor);
-    virtual void Accept(
+    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
        const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
    // the dim_ is the dim of single frame feature
    virtual size_t Dim() const { return dim_; }
    virtual void SetFinished() { base_extractor_->SetFinished(); }
    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
-    virtual void Reset() {
+    virtual void Reset() { base_extractor_->Reset(); }
        base_extractor_->Reset();
    }
  private:
    void Hanning(std::vector<kaldi::BaseFloat>* data) const;
--- a/speechx/speechx/frontend/normalizer.cc
+++ b/speechx/speechx/frontend/normalizer.cc
@ -34,14 +34,12 @@ DecibelNormalizer::DecibelNormalizer(
    dim_ = 1;
 }
-void DecibelNormalizer::Accept(
+void DecibelNormalizer::Accept(const kaldi::VectorBase<BaseFloat>& waves) {
    const kaldi::VectorBase<BaseFloat>& waves) {
    base_extractor_->Accept(waves);
 }
 bool DecibelNormalizer::Read(kaldi::Vector<BaseFloat>* waves) {
-    if (base_extractor_->Read(waves) == false || 
+    if (base_extractor_->Read(waves) == false || waves->Dim() == 0) {
        waves->Dim() == 0) {
        return false;
    }
    Compute(waves);
@ -88,7 +86,8 @@ bool DecibelNormalizer::Compute(VectorBase<BaseFloat>* waves) const {
        item *= std::pow(10.0, gain / 20.0);
    }
-    std::memcpy(waves->Data(), samples.data(), sizeof(BaseFloat)*samples.size());
+    std::memcpy(
        waves->Data(), samples.data(), sizeof(BaseFloat) * samples.size());
    return true;
 }
--- a/speechx/speechx/frontend/normalizer.h
+++ b/speechx/speechx/frontend/normalizer.h
@ -45,16 +45,13 @@ class DecibelNormalizer : public FeatureExtractorInterface {
    explicit DecibelNormalizer(
        const DecibelNormalizerOptions& opts,
        std::unique_ptr<FeatureExtractorInterface> base_extractor);
-    virtual void Accept(
+    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& waves);
        const kaldi::VectorBase<kaldi::BaseFloat>& waves);
    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* waves);
    // normalize audio, the dim is 1.
    virtual size_t Dim() const { return dim_; }
    virtual void SetFinished() { base_extractor_->SetFinished(); }
    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
-    virtual void Reset() {
+    virtual void Reset() { base_extractor_->Reset(); }
        base_extractor_->Reset();
    }
  private:
    bool Compute(kaldi::VectorBase<kaldi::BaseFloat>* waves) const;
@ -69,8 +66,7 @@ class CMVN : public FeatureExtractorInterface {
  public:
    explicit CMVN(std::string cmvn_file,
                  std::unique_ptr<FeatureExtractorInterface> base_extractor);
-    virtual void Accept(
+    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
        const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
    // the length of feats = feature_row * feature_dim,
    // the Matrix is squashed into Vector
@ -79,9 +75,7 @@ class CMVN : public FeatureExtractorInterface {
    virtual size_t Dim() const { return dim_; }
    virtual void SetFinished() { base_extractor_->SetFinished(); }
    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
-    virtual void Reset() {
+    virtual void Reset() { base_extractor_->Reset(); }
        base_extractor_->Reset();
    }
  private:
    void Compute(kaldi::VectorBase<kaldi::BaseFloat>* feats) const;
--- a/speechx/speechx/frontend/raw_audio.cc
+++ b/speechx/speechx/frontend/raw_audio.cc
@ -44,7 +44,8 @@ bool RawAudioCache::Read(Vector<BaseFloat>* waves) {
    std::unique_lock<std::mutex> lock(mutex_);
    while (chunk_size > data_length_) {
        // when audio is empty and no more data feed
-        // ready_read_condition will block in dead lock. so replace with timeout_
+        // ready_read_condition will block in dead lock. so replace with
        // timeout_
        // ready_read_condition_.wait(lock);
        int32 elapsed = static_cast<int32>(timer.Elapsed() * 1000);
        if (elapsed > timeout_) {
--- a/speechx/speechx/frontend/raw_audio.h
+++ b/speechx/speechx/frontend/raw_audio.h
@ -35,9 +35,9 @@ class RawAudioCache : public FeatureExtractorInterface {
    }
    virtual bool IsFinished() const { return finished_; }
    virtual void Reset() {
-      start_ = 0;
+        start_ = 0;
-      data_length_ = 0;
+        data_length_ = 0;
-      finished_ = false;
+        finished_ = false;
    }
  private:
@ -72,9 +72,7 @@ class RawDataCache : public FeatureExtractorInterface {
    virtual void SetFinished() { finished_ = true; }
    virtual bool IsFinished() const { return finished_; }
    void SetDim(int32 dim) { dim_ = dim; }
-    virtual void Reset() {
+    virtual void Reset() { finished_ = true; }
      finished_ = true;
    }
  private:
    kaldi::Vector<kaldi::BaseFloat> data_;
--- a/speechx/speechx/nnet/decodable-itf.h
+++ b/speechx/speechx/nnet/decodable-itf.h
@ -1,3 +1,17 @@
 // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // itf/decodable-itf.h
 // Copyright 2009-2011  Microsoft Corporation;  Saarland University;
@ -42,8 +56,10 @@ namespace kaldi {
    For online decoding, where the features are coming in in real time, it is
    important to understand the IsLastFrame() and NumFramesReady() functions.
-    There are two ways these are used: the old online-decoding code, in ../online/,
+    There are two ways these are used: the old online-decoding code, in
-    and the new online-decoding code, in ../online2/.  In the old online-decoding
+   ../online/,
    and the new online-decoding code, in ../online2/.  In the old
   online-decoding
    code, the decoder would do:
    \code{.cc}
    for (int frame = 0; !decodable.IsLastFrame(frame); frame++) {
@ -52,13 +68,16 @@ namespace kaldi {
    \endcode
   and the call to IsLastFrame would block if the features had not arrived yet.
   The decodable object would have to know when to terminate the decoding.  This
-   online-decoding mode is still supported, it is what happens when you call, for
+   online-decoding mode is still supported, it is what happens when you call,
   for
   example, LatticeFasterDecoder::Decode().
   We realized that this "blocking" mode of decoding is not very convenient
   because it forces the program to be multi-threaded and makes it complex to
-   control endpointing.  In the "new" decoding code, you don't call (for example)
+   control endpointing.  In the "new" decoding code, you don't call (for
-   LatticeFasterDecoder::Decode(), you call LatticeFasterDecoder::InitDecoding(),
+   example)
   LatticeFasterDecoder::Decode(), you call
   LatticeFasterDecoder::InitDecoding(),
   and then each time you get more features, you provide them to the decodable
   object, and you call LatticeFasterDecoder::AdvanceDecoding(), which does
   something like this:
@ -68,7 +87,8 @@ namespace kaldi {
   }
   \endcode
   So the decodable object never has IsLastFrame() called.  For decoding where
-   you are starting with a matrix of features, the NumFramesReady() function will
+   you are starting with a matrix of features, the NumFramesReady() function
   will
   always just return the number of frames in the file, and IsLastFrame() will
   return true for the last frame.
@ -80,45 +100,54 @@ namespace kaldi {
   frame of the file once we've decided to terminate decoding.
 */
 class DecodableInterface {
- public:
+  public:
-  /// Returns the log likelihood, which will be negated in the decoder.
+    /// Returns the log likelihood, which will be negated in the decoder.
-  /// The "frame" starts from zero.  You should verify that NumFramesReady() > frame
+    /// The "frame" starts from zero.  You should verify that NumFramesReady() >
-  /// before calling this.
+    /// frame
-  virtual BaseFloat LogLikelihood(int32 frame, int32 index) = 0;
+    /// before calling this.
-
+    virtual BaseFloat LogLikelihood(int32 frame, int32 index) = 0;
-  /// Returns true if this is the last frame.  Frames are zero-based, so the
+
-  /// first frame is zero.  IsLastFrame(-1) will return false, unless the file
+    /// Returns true if this is the last frame.  Frames are zero-based, so the
-  /// is empty (which is a case that I'm not sure all the code will handle, so
+    /// first frame is zero.  IsLastFrame(-1) will return false, unless the file
-  /// be careful).  Caution: the behavior of this function in an online setting
+    /// is empty (which is a case that I'm not sure all the code will handle, so
-  /// is being changed somewhat.  In future it may return false in cases where
+    /// be careful).  Caution: the behavior of this function in an online
-  /// we haven't yet decided to terminate decoding, but later true if we decide
+    /// setting
-  /// to terminate decoding.  The plan in future is to rely more on
+    /// is being changed somewhat.  In future it may return false in cases where
-  /// NumFramesReady(), and in future, IsLastFrame() would always return false
+    /// we haven't yet decided to terminate decoding, but later true if we
-  /// in an online-decoding setting, and would only return true in a
+    /// decide
-  /// decoding-from-matrix setting where we want to allow the last delta or LDA
+    /// to terminate decoding.  The plan in future is to rely more on
-  /// features to be flushed out for compatibility with the baseline setup.
+    /// NumFramesReady(), and in future, IsLastFrame() would always return false
-  virtual bool IsLastFrame(int32 frame) const = 0;
+    /// in an online-decoding setting, and would only return true in a
-
+    /// decoding-from-matrix setting where we want to allow the last delta or
-  /// The call NumFramesReady() will return the number of frames currently available
+    /// LDA
-  /// for this decodable object.  This is for use in setups where you don't want the
+    /// features to be flushed out for compatibility with the baseline setup.
-  /// decoder to block while waiting for input.  This is newly added as of Jan 2014,
+    virtual bool IsLastFrame(int32 frame) const = 0;
-  /// and I hope, going forward, to rely on this mechanism more than IsLastFrame to
+
-  /// know when to stop decoding.
+    /// The call NumFramesReady() will return the number of frames currently
-  virtual int32 NumFramesReady() const {
+    /// available
-    KALDI_ERR << "NumFramesReady() not implemented for this decodable type.";
+    /// for this decodable object.  This is for use in setups where you don't
-    return -1;
+    /// want the
-  }
+    /// decoder to block while waiting for input.  This is newly added as of Jan
-
+    /// 2014,
-  /// Returns the number of states in the acoustic model
+    /// and I hope, going forward, to rely on this mechanism more than
-  /// (they will be indexed one-based, i.e. from 1 to NumIndices();
+    /// IsLastFrame to
-  /// this is for compatibility with OpenFst).
+    /// know when to stop decoding.
-  virtual int32 NumIndices() const = 0;
+    virtual int32 NumFramesReady() const {
-
+        KALDI_ERR
-  virtual bool FrameLogLikelihood(int32 frame, 
+            << "NumFramesReady() not implemented for this decodable type.";
-                                  std::vector<kaldi::BaseFloat>* likelihood) = 0;
+        return -1;
-
+    }
-
+
-  virtual ~DecodableInterface() {}
+    /// Returns the number of states in the acoustic model
    /// (they will be indexed one-based, i.e. from 1 to NumIndices();
    /// this is for compatibility with OpenFst).
    virtual int32 NumIndices() const = 0;
    virtual bool FrameLogLikelihood(
        int32 frame, std::vector<kaldi::BaseFloat>* likelihood) = 0;
    virtual ~DecodableInterface() {}
 };
 /// @}
 }  // namespace kaldi
--- a/speechx/speechx/nnet/decodable.cc
+++ b/speechx/speechx/nnet/decodable.cc
@ -23,10 +23,7 @@ using kaldi::Vector;
 Decodable::Decodable(const std::shared_ptr<NnetInterface>& nnet,
                     const std::shared_ptr<FeatureExtractorInterface>& frontend)
-    : frontend_(frontend),
+    : frontend_(frontend), nnet_(nnet), frame_offset_(0), frames_ready_(0) {}
      nnet_(nnet),
      frame_offset_(0),
      frames_ready_(0) {}
 void Decodable::Acceptlikelihood(const Matrix<BaseFloat>& likelihood) {
    frames_ready_ += likelihood.NumRows();
--- a/utils/generate_infer_yaml.py
+++ b/utils/generate_infer_yaml.py
@ -148,7 +148,7 @@ def merge_configs(
    for item in remove_train_list:
        try:
            remove_config_part(config, [item])
-        except:
+        except Exception as e:
            print(item + " " + "can not be removed")
    # Save the config