Merge pull request #1576 from zh794390558/sx

[speechx] speechx readme
pull/1578/head
Hui Zhang 3 years ago committed by GitHub
commit 020e573a5d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -61,6 +61,7 @@ def remove_version_py(filename='paddleaudio/__init__.py'):
if "__version__" not in line: if "__version__" not in line:
f.write(line) f.write(line)
remove_version_py() remove_version_py()
write_version_py() write_version_py()

@ -192,7 +192,7 @@ class ConfigCache:
try: try:
cfg = yaml.load(file, Loader=yaml.FullLoader) cfg = yaml.load(file, Loader=yaml.FullLoader)
self._data.update(cfg) self._data.update(cfg)
except: except Exception as e:
self.flush() self.flush()
@property @property

@ -1,24 +1,61 @@
# SpeechX -- Speech Inference All in One # SpeechX -- All in One Speech Task Inference
> Test under `Ubuntu 16.04.7 LTS`. ## Environment
We develop under:
* docker - registry.baidubce.com/paddlepaddle/paddle:2.1.1-gpu-cuda10.2-cudnn7
* os - Ubuntu 16.04.7 LTS
* gcc/g++ - 8.2.0
* cmake - 3.16.0
> We make sure everything works fine under docker, and recommend using it to develop and deploy.
* [How to Install Docker](https://docs.docker.com/engine/install/)
* [A Docker Tutorial for Beginners](https://docker-curriculum.com/)
* [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/overview.html)
## Build ## Build
1. First, launch the docker container.
```
nvidia-docker run --privileged --net=host --ipc=host -it --rm -v $PWD:/workspace --name=dev registry.baidubce.com/paddlepaddle/paddle:2.1.1-gpu-cuda10.2-cudnn7 /bin/bash
``` ```
* You can find more `Paddle` docker images [here](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html).
* If you only want to work on CPU, please download the corresponding [image](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html), and use `docker` instead of `nvidia-docker`.
2. Build `speechx` and `examples`.
```
pushd /path/to/speechx
./build.sh ./build.sh
``` ```
## Valgrind 3. Go to `examples` to have fun.
For more details, please see `README.md` under `examples`.
## Valgrind (Optional)
> If using docker please check `--privileged` is set when `docker run`. > If using docker please check `--privileged` is set when `docker run`.
1. Fatal error at startup: a function redirection which is mandatory for this platform-tool combination cannot be set up * Fatal error at startup: `a function redirection which is mandatory for this platform-tool combination cannot be set up`
``` ```
apt-get install libc6-dbg apt-get install libc6-dbg
``` ```
* Install
``` ```
pushd tools pushd tools
./setup_valgrind.sh ./setup_valgrind.sh
popd popd
``` ```
## TODO
* DecibelNormalizer: there is a slight difference between offline and online db norm. The online db norm computation reads the feature chunk by chunk, so the feature size differs from that of the offline db norm. In normalizer.cc:73, `samples.size()` is different, which causes the difference in the result.

@ -1,3 +0,0 @@
# TODO
* DecibelNormalizer: there is a little bit difference between offline and online db norm. The computation of online db norm read feature chunk by chunk, which causes the feature size is different with offline db norm. In normalizer.cc:73, the samples.size() is different, which causes the difference of result.

@ -1,5 +1,16 @@
# Examples # Examples
* decoder - offline decoder * decoder - online decoder to work as offline
* feat - mfcc, linear * feat - mfcc, linear
* nnet - ds2 nn * nnet - ds2 nn
## How to run
`run.sh` is the entry point.
Example of running `decoder`:
```
pushd decoder
bash run.sh
```

@ -29,8 +29,7 @@ FeatureCache::FeatureCache(
base_extractor_ = std::move(base_extractor); base_extractor_ = std::move(base_extractor);
} }
void FeatureCache::Accept( void FeatureCache::Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs) {
const kaldi::VectorBase<kaldi::BaseFloat>& inputs) {
base_extractor_->Accept(inputs); base_extractor_->Accept(inputs);
// feed current data // feed current data
bool result = false; bool result = false;

@ -24,8 +24,7 @@ class FeatureCache : public FeatureExtractorInterface {
explicit FeatureCache( explicit FeatureCache(
int32 max_size = kint16max, int32 max_size = kint16max,
std::unique_ptr<FeatureExtractorInterface> base_extractor = NULL); std::unique_ptr<FeatureExtractorInterface> base_extractor = NULL);
virtual void Accept( virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
// feats dim = num_frames * feature_dim // feats dim = num_frames * feature_dim
virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats); virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
// feature cache only cache feature which from base extractor // feature cache only cache feature which from base extractor

@ -11,4 +11,3 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.

@ -11,4 +11,3 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.

@ -23,8 +23,7 @@ class FeatureExtractorInterface {
public: public:
// accept input data, accept feature or raw waves which decided // accept input data, accept feature or raw waves which decided
// by the base_extractor // by the base_extractor
virtual void Accept( virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs) = 0;
const kaldi::VectorBase<kaldi::BaseFloat>& inputs) = 0;
// get the processed result // get the processed result
// the length of output = feature_row * feature_dim, // the length of output = feature_row * feature_dim,
// the Matrix is squashed into Vector // the Matrix is squashed into Vector

@ -57,7 +57,8 @@ bool LinearSpectrogram::Read(Vector<BaseFloat>* feats) {
if (flag == false || input_feats.Dim() == 0) return false; if (flag == false || input_feats.Dim() == 0) return false;
vector<BaseFloat> input_feats_vec(input_feats.Dim()); vector<BaseFloat> input_feats_vec(input_feats.Dim());
std::memcpy(input_feats_vec.data(), input_feats.Data(), std::memcpy(input_feats_vec.data(),
input_feats.Data(),
input_feats.Dim() * sizeof(BaseFloat)); input_feats.Dim() * sizeof(BaseFloat));
vector<vector<BaseFloat>> result; vector<vector<BaseFloat>> result;
Compute(input_feats_vec, result); Compute(input_feats_vec, result);

@ -38,16 +38,13 @@ class LinearSpectrogram : public FeatureExtractorInterface {
explicit LinearSpectrogram( explicit LinearSpectrogram(
const LinearSpectrogramOptions& opts, const LinearSpectrogramOptions& opts,
std::unique_ptr<FeatureExtractorInterface> base_extractor); std::unique_ptr<FeatureExtractorInterface> base_extractor);
virtual void Accept( virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats); virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
// the dim_ is the dim of single frame feature // the dim_ is the dim of single frame feature
virtual size_t Dim() const { return dim_; } virtual size_t Dim() const { return dim_; }
virtual void SetFinished() { base_extractor_->SetFinished(); } virtual void SetFinished() { base_extractor_->SetFinished(); }
virtual bool IsFinished() const { return base_extractor_->IsFinished(); } virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
virtual void Reset() { virtual void Reset() { base_extractor_->Reset(); }
base_extractor_->Reset();
}
private: private:
void Hanning(std::vector<kaldi::BaseFloat>* data) const; void Hanning(std::vector<kaldi::BaseFloat>* data) const;

@ -34,14 +34,12 @@ DecibelNormalizer::DecibelNormalizer(
dim_ = 1; dim_ = 1;
} }
void DecibelNormalizer::Accept( void DecibelNormalizer::Accept(const kaldi::VectorBase<BaseFloat>& waves) {
const kaldi::VectorBase<BaseFloat>& waves) {
base_extractor_->Accept(waves); base_extractor_->Accept(waves);
} }
bool DecibelNormalizer::Read(kaldi::Vector<BaseFloat>* waves) { bool DecibelNormalizer::Read(kaldi::Vector<BaseFloat>* waves) {
if (base_extractor_->Read(waves) == false || if (base_extractor_->Read(waves) == false || waves->Dim() == 0) {
waves->Dim() == 0) {
return false; return false;
} }
Compute(waves); Compute(waves);
@ -88,7 +86,8 @@ bool DecibelNormalizer::Compute(VectorBase<BaseFloat>* waves) const {
item *= std::pow(10.0, gain / 20.0); item *= std::pow(10.0, gain / 20.0);
} }
std::memcpy(waves->Data(), samples.data(), sizeof(BaseFloat)*samples.size()); std::memcpy(
waves->Data(), samples.data(), sizeof(BaseFloat) * samples.size());
return true; return true;
} }

@ -45,16 +45,13 @@ class DecibelNormalizer : public FeatureExtractorInterface {
explicit DecibelNormalizer( explicit DecibelNormalizer(
const DecibelNormalizerOptions& opts, const DecibelNormalizerOptions& opts,
std::unique_ptr<FeatureExtractorInterface> base_extractor); std::unique_ptr<FeatureExtractorInterface> base_extractor);
virtual void Accept( virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& waves);
const kaldi::VectorBase<kaldi::BaseFloat>& waves);
virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* waves); virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* waves);
// normalize audio, the dim is 1. // normalize audio, the dim is 1.
virtual size_t Dim() const { return dim_; } virtual size_t Dim() const { return dim_; }
virtual void SetFinished() { base_extractor_->SetFinished(); } virtual void SetFinished() { base_extractor_->SetFinished(); }
virtual bool IsFinished() const { return base_extractor_->IsFinished(); } virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
virtual void Reset() { virtual void Reset() { base_extractor_->Reset(); }
base_extractor_->Reset();
}
private: private:
bool Compute(kaldi::VectorBase<kaldi::BaseFloat>* waves) const; bool Compute(kaldi::VectorBase<kaldi::BaseFloat>* waves) const;
@ -69,8 +66,7 @@ class CMVN : public FeatureExtractorInterface {
public: public:
explicit CMVN(std::string cmvn_file, explicit CMVN(std::string cmvn_file,
std::unique_ptr<FeatureExtractorInterface> base_extractor); std::unique_ptr<FeatureExtractorInterface> base_extractor);
virtual void Accept( virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
// the length of feats = feature_row * feature_dim, // the length of feats = feature_row * feature_dim,
// the Matrix is squashed into Vector // the Matrix is squashed into Vector
@ -79,9 +75,7 @@ class CMVN : public FeatureExtractorInterface {
virtual size_t Dim() const { return dim_; } virtual size_t Dim() const { return dim_; }
virtual void SetFinished() { base_extractor_->SetFinished(); } virtual void SetFinished() { base_extractor_->SetFinished(); }
virtual bool IsFinished() const { return base_extractor_->IsFinished(); } virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
virtual void Reset() { virtual void Reset() { base_extractor_->Reset(); }
base_extractor_->Reset();
}
private: private:
void Compute(kaldi::VectorBase<kaldi::BaseFloat>* feats) const; void Compute(kaldi::VectorBase<kaldi::BaseFloat>* feats) const;

@ -44,7 +44,8 @@ bool RawAudioCache::Read(Vector<BaseFloat>* waves) {
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
while (chunk_size > data_length_) { while (chunk_size > data_length_) {
// when audio is empty and no more data feed // when audio is empty and no more data feed
// ready_read_condition will block in dead lock. so replace with timeout_ // ready_read_condition will block in dead lock. so replace with
// timeout_
// ready_read_condition_.wait(lock); // ready_read_condition_.wait(lock);
int32 elapsed = static_cast<int32>(timer.Elapsed() * 1000); int32 elapsed = static_cast<int32>(timer.Elapsed() * 1000);
if (elapsed > timeout_) { if (elapsed > timeout_) {

@ -72,9 +72,7 @@ class RawDataCache : public FeatureExtractorInterface {
virtual void SetFinished() { finished_ = true; } virtual void SetFinished() { finished_ = true; }
virtual bool IsFinished() const { return finished_; } virtual bool IsFinished() const { return finished_; }
void SetDim(int32 dim) { dim_ = dim; } void SetDim(int32 dim) { dim_ = dim; }
virtual void Reset() { virtual void Reset() { finished_ = true; }
finished_ = true;
}
private: private:
kaldi::Vector<kaldi::BaseFloat> data_; kaldi::Vector<kaldi::BaseFloat> data_;

@ -1,3 +1,17 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// itf/decodable-itf.h // itf/decodable-itf.h
// Copyright 2009-2011 Microsoft Corporation; Saarland University; // Copyright 2009-2011 Microsoft Corporation; Saarland University;
@ -42,8 +56,10 @@ namespace kaldi {
For online decoding, where the features are coming in in real time, it is For online decoding, where the features are coming in in real time, it is
important to understand the IsLastFrame() and NumFramesReady() functions. important to understand the IsLastFrame() and NumFramesReady() functions.
There are two ways these are used: the old online-decoding code, in ../online/, There are two ways these are used: the old online-decoding code, in
and the new online-decoding code, in ../online2/. In the old online-decoding ../online/,
and the new online-decoding code, in ../online2/. In the old
online-decoding
code, the decoder would do: code, the decoder would do:
\code{.cc} \code{.cc}
for (int frame = 0; !decodable.IsLastFrame(frame); frame++) { for (int frame = 0; !decodable.IsLastFrame(frame); frame++) {
@ -52,13 +68,16 @@ namespace kaldi {
\endcode \endcode
and the call to IsLastFrame would block if the features had not arrived yet. and the call to IsLastFrame would block if the features had not arrived yet.
The decodable object would have to know when to terminate the decoding. This The decodable object would have to know when to terminate the decoding. This
online-decoding mode is still supported, it is what happens when you call, for online-decoding mode is still supported, it is what happens when you call,
for
example, LatticeFasterDecoder::Decode(). example, LatticeFasterDecoder::Decode().
We realized that this "blocking" mode of decoding is not very convenient We realized that this "blocking" mode of decoding is not very convenient
because it forces the program to be multi-threaded and makes it complex to because it forces the program to be multi-threaded and makes it complex to
control endpointing. In the "new" decoding code, you don't call (for example) control endpointing. In the "new" decoding code, you don't call (for
LatticeFasterDecoder::Decode(), you call LatticeFasterDecoder::InitDecoding(), example)
LatticeFasterDecoder::Decode(), you call
LatticeFasterDecoder::InitDecoding(),
and then each time you get more features, you provide them to the decodable and then each time you get more features, you provide them to the decodable
object, and you call LatticeFasterDecoder::AdvanceDecoding(), which does object, and you call LatticeFasterDecoder::AdvanceDecoding(), which does
something like this: something like this:
@ -68,7 +87,8 @@ namespace kaldi {
} }
\endcode \endcode
So the decodable object never has IsLastFrame() called. For decoding where So the decodable object never has IsLastFrame() called. For decoding where
you are starting with a matrix of features, the NumFramesReady() function will you are starting with a matrix of features, the NumFramesReady() function
will
always just return the number of frames in the file, and IsLastFrame() will always just return the number of frames in the file, and IsLastFrame() will
return true for the last frame. return true for the last frame.
@ -82,30 +102,39 @@ namespace kaldi {
class DecodableInterface { class DecodableInterface {
public: public:
/// Returns the log likelihood, which will be negated in the decoder. /// Returns the log likelihood, which will be negated in the decoder.
/// The "frame" starts from zero. You should verify that NumFramesReady() > frame /// The "frame" starts from zero. You should verify that NumFramesReady() >
/// frame
/// before calling this. /// before calling this.
virtual BaseFloat LogLikelihood(int32 frame, int32 index) = 0; virtual BaseFloat LogLikelihood(int32 frame, int32 index) = 0;
/// Returns true if this is the last frame. Frames are zero-based, so the /// Returns true if this is the last frame. Frames are zero-based, so the
/// first frame is zero. IsLastFrame(-1) will return false, unless the file /// first frame is zero. IsLastFrame(-1) will return false, unless the file
/// is empty (which is a case that I'm not sure all the code will handle, so /// is empty (which is a case that I'm not sure all the code will handle, so
/// be careful). Caution: the behavior of this function in an online setting /// be careful). Caution: the behavior of this function in an online
/// setting
/// is being changed somewhat. In future it may return false in cases where /// is being changed somewhat. In future it may return false in cases where
/// we haven't yet decided to terminate decoding, but later true if we decide /// we haven't yet decided to terminate decoding, but later true if we
/// decide
/// to terminate decoding. The plan in future is to rely more on /// to terminate decoding. The plan in future is to rely more on
/// NumFramesReady(), and in future, IsLastFrame() would always return false /// NumFramesReady(), and in future, IsLastFrame() would always return false
/// in an online-decoding setting, and would only return true in a /// in an online-decoding setting, and would only return true in a
/// decoding-from-matrix setting where we want to allow the last delta or LDA /// decoding-from-matrix setting where we want to allow the last delta or
/// LDA
/// features to be flushed out for compatibility with the baseline setup. /// features to be flushed out for compatibility with the baseline setup.
virtual bool IsLastFrame(int32 frame) const = 0; virtual bool IsLastFrame(int32 frame) const = 0;
/// The call NumFramesReady() will return the number of frames currently available /// The call NumFramesReady() will return the number of frames currently
/// for this decodable object. This is for use in setups where you don't want the /// available
/// decoder to block while waiting for input. This is newly added as of Jan 2014, /// for this decodable object. This is for use in setups where you don't
/// and I hope, going forward, to rely on this mechanism more than IsLastFrame to /// want the
/// decoder to block while waiting for input. This is newly added as of Jan
/// 2014,
/// and I hope, going forward, to rely on this mechanism more than
/// IsLastFrame to
/// know when to stop decoding. /// know when to stop decoding.
virtual int32 NumFramesReady() const { virtual int32 NumFramesReady() const {
KALDI_ERR << "NumFramesReady() not implemented for this decodable type."; KALDI_ERR
<< "NumFramesReady() not implemented for this decodable type.";
return -1; return -1;
} }
@ -114,8 +143,8 @@ class DecodableInterface {
/// this is for compatibility with OpenFst). /// this is for compatibility with OpenFst).
virtual int32 NumIndices() const = 0; virtual int32 NumIndices() const = 0;
virtual bool FrameLogLikelihood(int32 frame, virtual bool FrameLogLikelihood(
std::vector<kaldi::BaseFloat>* likelihood) = 0; int32 frame, std::vector<kaldi::BaseFloat>* likelihood) = 0;
virtual ~DecodableInterface() {} virtual ~DecodableInterface() {}

@ -23,10 +23,7 @@ using kaldi::Vector;
Decodable::Decodable(const std::shared_ptr<NnetInterface>& nnet, Decodable::Decodable(const std::shared_ptr<NnetInterface>& nnet,
const std::shared_ptr<FeatureExtractorInterface>& frontend) const std::shared_ptr<FeatureExtractorInterface>& frontend)
: frontend_(frontend), : frontend_(frontend), nnet_(nnet), frame_offset_(0), frames_ready_(0) {}
nnet_(nnet),
frame_offset_(0),
frames_ready_(0) {}
void Decodable::Acceptlikelihood(const Matrix<BaseFloat>& likelihood) { void Decodable::Acceptlikelihood(const Matrix<BaseFloat>& likelihood) {
frames_ready_ += likelihood.NumRows(); frames_ready_ += likelihood.NumRows();

@ -148,7 +148,7 @@ def merge_configs(
for item in remove_train_list: for item in remove_train_list:
try: try:
remove_config_part(config, [item]) remove_config_part(config, [item])
except: except Exception as e:
print(item + " " + "can not be removed") print(item + " " + "can not be removed")
# Save the config # Save the config

Loading…
Cancel
Save