Merge pull request #1576 from zh794390558/sx

[speechx] speechx readme
4 years ago · 020e573a5d
parent 01940cd474 6abc5d9f7e
commit 020e573a5d
22 changed files with 164 additions and 107 deletions
--- a/paddleaudio/setup.py
+++ b/paddleaudio/setup.py
@ -61,6 +61,7 @@ def remove_version_py(filename='paddleaudio/__init__.py'):
            if "__version__" not in line:
                f.write(line)
 remove_version_py()
 write_version_py()
--- a/paddlespeech/cli/utils.py
+++ b/paddlespeech/cli/utils.py
@ -192,7 +192,7 @@ class ConfigCache:
            try:
                cfg = yaml.load(file, Loader=yaml.FullLoader)
                self._data.update(cfg)
-            except:
+            except Exception as e:
                self.flush()
    @property
--- a/speechx/README.md
+++ b/speechx/README.md
@ -1,24 +1,61 @@
-# SpeechX -- Speech Inference All in One
+# SpeechX -- All in One Speech Task Inference 
-> Test under `Ubuntu 16.04.7 LTS`.
+## Environment
 We develop under:
 * docker - registry.baidubce.com/paddlepaddle/paddle:2.1.1-gpu-cuda10.2-cudnn7
 * os - Ubuntu 16.04.7 LTS
 * gcc/g++ - 8.2.0
 * cmake - 3.16.0
 > We make sure everything works fine under docker, and recommend using it to develop and deploy.
 * [How to Install Docker](https://docs.docker.com/engine/install/)
 * [A Docker Tutorial for Beginners](https://docker-curriculum.com/)
 * [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/overview.html)
 ## Build
 1. First, launch the docker container.
 ```
 nvidia-docker run --privileged  --net=host --ipc=host -it --rm -v $PWD:/workspace --name=dev registry.baidubce.com/paddlepaddle/paddle:2.1.1-gpu-cuda10.2-cudnn7 /bin/bash
 ```
 * You can find more `Paddle` docker images [here](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html).
 * If you only want to work on CPU, please download the corresponding [image](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html), and use `docker` instead of `nvidia-docker`.
 2. Build `speechx` and `examples`.
 ```
 pushd /path/to/speechx
 ./build.sh
 ```
-## Valgrind
+3. Go to `examples` to have fun.
 For more details, please see `README.md` under `examples`.
 ## Valgrind (Optional)
 > If using docker, please check that `--privileged` is set when calling `docker run`.
-1. Fatal error at startup: a function redirection which is mandatory for this platform-tool combination cannot be set up
+* Fatal error at startup: `a function redirection which is mandatory for this platform-tool combination cannot be set up`
 ```
 apt-get install libc6-dbg
 ```
 * Install
 ```
 pushd tools
 ./setup_valgrind.sh
 popd
 ```
 ## TODO
 * DecibelNormalizer: there is a slight difference between offline and online db norm. The online db norm computation reads features chunk by chunk, so the feature size differs from that of the offline db norm. In normalizer.cc:73, the samples.size() is different, which causes the difference in the result.
--- a/speechx/TODO.md
+++ b/speechx/TODO.md
@ -1,3 +0,0 @@
 # TODO
 * DecibelNormalizer: there is a slight difference between offline and online db norm. The online db norm computation reads features chunk by chunk, so the feature size differs from that of the offline db norm. In normalizer.cc:73, the samples.size() is different, which causes the difference in the result.
--- a/speechx/examples/README.md
+++ b/speechx/examples/README.md
@ -1,5 +1,16 @@
 # Examples
-* decoder - offline decoder 
+* decoder - online decoder to work as offline
 * feat - mfcc, linear 
 * nnet - ds2 nn
 ## How to run
 `run.sh` is the entry point.
 Example to play `decoder`:
 ```
 pushd decoder
 bash run.sh
 ```
--- a/speechx/speechx/frontend/feature_cache.cc
+++ b/speechx/speechx/frontend/feature_cache.cc
@ -29,8 +29,7 @@ FeatureCache::FeatureCache(
    base_extractor_ = std::move(base_extractor);
 }
-void FeatureCache::Accept(
+void FeatureCache::Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs) {
    const kaldi::VectorBase<kaldi::BaseFloat>& inputs) {
    base_extractor_->Accept(inputs);
    // feed current data
    bool result = false;
--- a/speechx/speechx/frontend/feature_cache.h
+++ b/speechx/speechx/frontend/feature_cache.h
@ -24,8 +24,7 @@ class FeatureCache : public FeatureExtractorInterface {
    explicit FeatureCache(
        int32 max_size = kint16max,
        std::unique_ptr<FeatureExtractorInterface> base_extractor = NULL);
-    virtual void Accept(
+    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
        const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
    // feats dim = num_frames * feature_dim
    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
    // feature cache only caches features that come from the base extractor
--- a/speechx/speechx/frontend/feature_extractor_controller.h
+++ b/speechx/speechx/frontend/feature_extractor_controller.h
@ -11,4 +11,3 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
--- a/speechx/speechx/frontend/feature_extractor_controller_impl.h
+++ b/speechx/speechx/frontend/feature_extractor_controller_impl.h
@ -11,4 +11,3 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
--- a/speechx/speechx/frontend/feature_extractor_interface.h
+++ b/speechx/speechx/frontend/feature_extractor_interface.h
@ -23,8 +23,7 @@ class FeatureExtractorInterface {
  public:
    // accept input data: either features or raw waves, as decided
    // by the base_extractor
-    virtual void Accept(
+    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs) = 0;
        const kaldi::VectorBase<kaldi::BaseFloat>& inputs) = 0;
    // get the processed result
    // the length of output = feature_row * feature_dim,
    // the Matrix is squashed into Vector
--- a/speechx/speechx/frontend/linear_spectrogram.cc
+++ b/speechx/speechx/frontend/linear_spectrogram.cc
@ -57,7 +57,8 @@ bool LinearSpectrogram::Read(Vector<BaseFloat>* feats) {
    if (flag == false || input_feats.Dim() == 0) return false;
    vector<BaseFloat> input_feats_vec(input_feats.Dim());
-    std::memcpy(input_feats_vec.data(), input_feats.Data(), 
+    std::memcpy(input_feats_vec.data(),
                input_feats.Data(),
                input_feats.Dim() * sizeof(BaseFloat));
    vector<vector<BaseFloat>> result;
    Compute(input_feats_vec, result);
--- a/speechx/speechx/frontend/linear_spectrogram.h
+++ b/speechx/speechx/frontend/linear_spectrogram.h
@ -38,16 +38,13 @@ class LinearSpectrogram : public FeatureExtractorInterface {
    explicit LinearSpectrogram(
        const LinearSpectrogramOptions& opts,
        std::unique_ptr<FeatureExtractorInterface> base_extractor);
-    virtual void Accept(
+    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
        const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
    // the dim_ is the dim of single frame feature
    virtual size_t Dim() const { return dim_; }
    virtual void SetFinished() { base_extractor_->SetFinished(); }
    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
-    virtual void Reset() {
+    virtual void Reset() { base_extractor_->Reset(); }
        base_extractor_->Reset();
    }
  private:
    void Hanning(std::vector<kaldi::BaseFloat>* data) const;
--- a/speechx/speechx/frontend/normalizer.cc
+++ b/speechx/speechx/frontend/normalizer.cc
@ -34,14 +34,12 @@ DecibelNormalizer::DecibelNormalizer(
    dim_ = 1;
 }
-void DecibelNormalizer::Accept(
+void DecibelNormalizer::Accept(const kaldi::VectorBase<BaseFloat>& waves) {
    const kaldi::VectorBase<BaseFloat>& waves) {
    base_extractor_->Accept(waves);
 }
 bool DecibelNormalizer::Read(kaldi::Vector<BaseFloat>* waves) {
-    if (base_extractor_->Read(waves) == false || 
+    if (base_extractor_->Read(waves) == false || waves->Dim() == 0) {
        waves->Dim() == 0) {
        return false;
    }
    Compute(waves);
@ -88,7 +86,8 @@ bool DecibelNormalizer::Compute(VectorBase<BaseFloat>* waves) const {
        item *= std::pow(10.0, gain / 20.0);
    }
-    std::memcpy(waves->Data(), samples.data(), sizeof(BaseFloat)*samples.size());
+    std::memcpy(
        waves->Data(), samples.data(), sizeof(BaseFloat) * samples.size());
    return true;
 }
--- a/speechx/speechx/frontend/normalizer.h
+++ b/speechx/speechx/frontend/normalizer.h
@ -45,16 +45,13 @@ class DecibelNormalizer : public FeatureExtractorInterface {
    explicit DecibelNormalizer(
        const DecibelNormalizerOptions& opts,
        std::unique_ptr<FeatureExtractorInterface> base_extractor);
-    virtual void Accept(
+    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& waves);
        const kaldi::VectorBase<kaldi::BaseFloat>& waves);
    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* waves);
    // normalize audio, the dim is 1.
    virtual size_t Dim() const { return dim_; }
    virtual void SetFinished() { base_extractor_->SetFinished(); }
    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
-    virtual void Reset() {
+    virtual void Reset() { base_extractor_->Reset(); }
        base_extractor_->Reset();
    }
  private:
    bool Compute(kaldi::VectorBase<kaldi::BaseFloat>* waves) const;
@ -69,8 +66,7 @@ class CMVN : public FeatureExtractorInterface {
  public:
    explicit CMVN(std::string cmvn_file,
                  std::unique_ptr<FeatureExtractorInterface> base_extractor);
-    virtual void Accept(
+    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
        const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
    // the length of feats = feature_row * feature_dim,
    // the Matrix is squashed into Vector
@ -79,9 +75,7 @@ class CMVN : public FeatureExtractorInterface {
    virtual size_t Dim() const { return dim_; }
    virtual void SetFinished() { base_extractor_->SetFinished(); }
    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
-    virtual void Reset() {
+    virtual void Reset() { base_extractor_->Reset(); }
        base_extractor_->Reset();
    }
  private:
    void Compute(kaldi::VectorBase<kaldi::BaseFloat>* feats) const;
--- a/speechx/speechx/frontend/raw_audio.cc
+++ b/speechx/speechx/frontend/raw_audio.cc
@ -44,7 +44,8 @@ bool RawAudioCache::Read(Vector<BaseFloat>* waves) {
    std::unique_lock<std::mutex> lock(mutex_);
    while (chunk_size > data_length_) {
        // when audio is empty and no more data feed
-        // ready_read_condition will block in dead lock. so replace with timeout_
+        // ready_read_condition will block in dead lock. so replace with
        // timeout_
        // ready_read_condition_.wait(lock);
        int32 elapsed = static_cast<int32>(timer.Elapsed() * 1000);
        if (elapsed > timeout_) {
--- a/speechx/speechx/frontend/raw_audio.h
+++ b/speechx/speechx/frontend/raw_audio.h
@ -72,9 +72,7 @@ class RawDataCache : public FeatureExtractorInterface {
    virtual void SetFinished() { finished_ = true; }
    virtual bool IsFinished() const { return finished_; }
    void SetDim(int32 dim) { dim_ = dim; }
-    virtual void Reset() {
+    virtual void Reset() { finished_ = true; }
      finished_ = true;
    }
  private:
    kaldi::Vector<kaldi::BaseFloat> data_;
--- a/speechx/speechx/nnet/decodable-itf.h
+++ b/speechx/speechx/nnet/decodable-itf.h
@ -1,3 +1,17 @@
 // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // itf/decodable-itf.h
 // Copyright 2009-2011  Microsoft Corporation;  Saarland University;
@ -42,8 +56,10 @@ namespace kaldi {
    For online decoding, where the features are coming in in real time, it is
    important to understand the IsLastFrame() and NumFramesReady() functions.
-    There are two ways these are used: the old online-decoding code, in ../online/,
+    There are two ways these are used: the old online-decoding code, in
-    and the new online-decoding code, in ../online2/.  In the old online-decoding
+   ../online/,
    and the new online-decoding code, in ../online2/.  In the old
   online-decoding
    code, the decoder would do:
    \code{.cc}
    for (int frame = 0; !decodable.IsLastFrame(frame); frame++) {
@ -52,13 +68,16 @@ namespace kaldi {
    \endcode
   and the call to IsLastFrame would block if the features had not arrived yet.
   The decodable object would have to know when to terminate the decoding.  This
-   online-decoding mode is still supported, it is what happens when you call, for
+   online-decoding mode is still supported, it is what happens when you call,
   for
   example, LatticeFasterDecoder::Decode().
   We realized that this "blocking" mode of decoding is not very convenient
   because it forces the program to be multi-threaded and makes it complex to
-   control endpointing.  In the "new" decoding code, you don't call (for example)
+   control endpointing.  In the "new" decoding code, you don't call (for
-   LatticeFasterDecoder::Decode(), you call LatticeFasterDecoder::InitDecoding(),
+   example)
   LatticeFasterDecoder::Decode(), you call
   LatticeFasterDecoder::InitDecoding(),
   and then each time you get more features, you provide them to the decodable
   object, and you call LatticeFasterDecoder::AdvanceDecoding(), which does
   something like this:
@ -68,7 +87,8 @@ namespace kaldi {
   }
   \endcode
   So the decodable object never has IsLastFrame() called.  For decoding where
-   you are starting with a matrix of features, the NumFramesReady() function will
+   you are starting with a matrix of features, the NumFramesReady() function
   will
   always just return the number of frames in the file, and IsLastFrame() will
   return true for the last frame.
@ -82,30 +102,39 @@ namespace kaldi {
 class DecodableInterface {
  public:
    /// Returns the log likelihood, which will be negated in the decoder.
-  /// The "frame" starts from zero.  You should verify that NumFramesReady() > frame
+    /// The "frame" starts from zero.  You should verify that NumFramesReady() >
    /// frame
    /// before calling this.
    virtual BaseFloat LogLikelihood(int32 frame, int32 index) = 0;
    /// Returns true if this is the last frame.  Frames are zero-based, so the
    /// first frame is zero.  IsLastFrame(-1) will return false, unless the file
    /// is empty (which is a case that I'm not sure all the code will handle, so
-  /// be careful).  Caution: the behavior of this function in an online setting
+    /// be careful).  Caution: the behavior of this function in an online
    /// setting
    /// is being changed somewhat.  In future it may return false in cases where
-  /// we haven't yet decided to terminate decoding, but later true if we decide
+    /// we haven't yet decided to terminate decoding, but later true if we
    /// decide
    /// to terminate decoding.  The plan in future is to rely more on
    /// NumFramesReady(), and in future, IsLastFrame() would always return false
    /// in an online-decoding setting, and would only return true in a
-  /// decoding-from-matrix setting where we want to allow the last delta or LDA
+    /// decoding-from-matrix setting where we want to allow the last delta or
    /// LDA
    /// features to be flushed out for compatibility with the baseline setup.
    virtual bool IsLastFrame(int32 frame) const = 0;
-  /// The call NumFramesReady() will return the number of frames currently available
+    /// The call NumFramesReady() will return the number of frames currently
-  /// for this decodable object.  This is for use in setups where you don't want the
+    /// available
-  /// decoder to block while waiting for input.  This is newly added as of Jan 2014,
+    /// for this decodable object.  This is for use in setups where you don't
-  /// and I hope, going forward, to rely on this mechanism more than IsLastFrame to
+    /// want the
    /// decoder to block while waiting for input.  This is newly added as of Jan
    /// 2014,
    /// and I hope, going forward, to rely on this mechanism more than
    /// IsLastFrame to
    /// know when to stop decoding.
    virtual int32 NumFramesReady() const {
-    KALDI_ERR << "NumFramesReady() not implemented for this decodable type.";
+        KALDI_ERR
            << "NumFramesReady() not implemented for this decodable type.";
        return -1;
    }
@ -114,8 +143,8 @@ class DecodableInterface {
    /// this is for compatibility with OpenFst).
    virtual int32 NumIndices() const = 0;
-  virtual bool FrameLogLikelihood(int32 frame, 
+    virtual bool FrameLogLikelihood(
-                                  std::vector<kaldi::BaseFloat>* likelihood) = 0;
+        int32 frame, std::vector<kaldi::BaseFloat>* likelihood) = 0;
    virtual ~DecodableInterface() {}
--- a/speechx/speechx/nnet/decodable.cc
+++ b/speechx/speechx/nnet/decodable.cc
@ -23,10 +23,7 @@ using kaldi::Vector;
 Decodable::Decodable(const std::shared_ptr<NnetInterface>& nnet,
                     const std::shared_ptr<FeatureExtractorInterface>& frontend)
-    : frontend_(frontend),
+    : frontend_(frontend), nnet_(nnet), frame_offset_(0), frames_ready_(0) {}
      nnet_(nnet),
      frame_offset_(0),
      frames_ready_(0) {}
 void Decodable::Acceptlikelihood(const Matrix<BaseFloat>& likelihood) {
    frames_ready_ += likelihood.NumRows();
--- a/utils/generate_infer_yaml.py
+++ b/utils/generate_infer_yaml.py
@ -148,7 +148,7 @@ def merge_configs(
    for item in remove_train_list:
        try:
            remove_config_part(config, [item])
-        except:
+        except Exception as e:
            print(item + " " + "can not be removed")
    # Save the config
		`@ -1,3 +0,0 @@`
			`# TODO`

			`* DecibelNormalizer: there is a slight difference between offline and online db norm. The online db norm computation reads features chunk by chunk, so the feature size differs from that of the offline db norm. In normalizer.cc:73, the samples.size() is different, which causes the difference in the result.`