Merge pull request #1576 from zh794390558/sx

[speechx] speechx readme
pull/1578/head
Hui Zhang 2 years ago committed by GitHub
commit 020e573a5d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -61,6 +61,7 @@ def remove_version_py(filename='paddleaudio/__init__.py'):
if "__version__" not in line:
f.write(line)
remove_version_py()
write_version_py()

@ -192,7 +192,7 @@ class ConfigCache:
try:
cfg = yaml.load(file, Loader=yaml.FullLoader)
self._data.update(cfg)
except:
except Exception as e:
self.flush()
@property

@ -174,7 +174,7 @@ class ServerStatsExecutor():
"Failed to get the table of TTS pretrained models supported in the service."
)
return False
elif self.task == 'cls':
try:
from paddlespeech.cli.cls.infer import pretrained_models

@ -1,24 +1,61 @@
# SpeechX -- Speech Inference All in One
# SpeechX -- All in One Speech Task Inference
> Test under `Ubuntu 16.04.7 LTS`.
## Environment
We develop under:
* docker - registry.baidubce.com/paddlepaddle/paddle:2.1.1-gpu-cuda10.2-cudnn7
* os - Ubuntu 16.04.7 LTS
* gcc/g++ - 8.2.0
* cmake - 3.16.0
> We make sure all things work fun under docker, and recommend using it to develop and deploy.
* [How to Install Docker](https://docs.docker.com/engine/install/)
* [A Docker Tutorial for Beginners](https://docker-curriculum.com/)
* [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/overview.html)
## Build
1. First to launch docker container.
```
nvidia-docker run --privileged --net=host --ipc=host -it --rm -v $PWD:/workspace --name=dev registry.baidubce.com/paddlepaddle/paddle:2.1.1-gpu-cuda10.2-cudnn7 /bin/bash
```
* More `Paddle` docker images you can see [here](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html).
* If you want only work under cpu, please download corresponded [image](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html), and using `docker` instead `nviida-docker`.
2. Build `speechx` and `examples`.
```
pushd /path/to/speechx
./build.sh
```
## Valgrind
3. Go to `examples` to have a fun.
More details please see `README.md` under `examples`.
## Valgrind (Optional)
> If using docker please check `--privileged` is set when `docker run`.
1. Fatal error at startup: a function redirection which is mandatory for this platform-tool combination cannot be set up
* Fatal error at startup: `a function redirection which is mandatory for this platform-tool combination cannot be set up`
```
apt-get install libc6-dbg
```
* Install
```
pushd tools
./setup_valgrind.sh
popd
```
## TODO
* DecibelNormalizer: there is a little bit difference between offline and online db norm. The computation of online db norm read feature chunk by chunk, which causes the feature size is different with offline db norm. In normalizer.cc:73, the samples.size() is different, which causes the difference of result.

@ -1,3 +0,0 @@
# TODO
* DecibelNormalizer: there is a little bit difference between offline and online db norm. The computation of online db norm read feature chunk by chunk, which causes the feature size is different with offline db norm. In normalizer.cc:73, the samples.size() is different, which causes the difference of result.

@ -1,5 +1,16 @@
# Examples
* decoder - offline decoder
* decoder - online decoder to work as offline
* feat - mfcc, linear
* nnet - ds2 nn
## How to run
`run.sh` is the entry point.
Example to play `decoder`:
```
pushd decoder
bash run.sh
```

@ -164,8 +164,8 @@ int main(int argc, char* argv[]) {
// test feature linear_spectorgram: wave --> decibel_normalizer --> hanning
// window -->linear_spectrogram --> cmvn
int32 num_done = 0, num_err = 0;
//std::unique_ptr<ppspeech::FeatureExtractorInterface> data_source(new
//ppspeech::RawDataCache());
// std::unique_ptr<ppspeech::FeatureExtractorInterface> data_source(new
// ppspeech::RawDataCache());
std::unique_ptr<ppspeech::FeatureExtractorInterface> data_source(
new ppspeech::RawAudioCache());

@ -52,14 +52,14 @@ CTCBeamSearch::CTCBeamSearch(const CTCBeamSearchOptions& opts)
}
void CTCBeamSearch::Reset() {
//num_frame_decoded_ = 0;
//ResetPrefixes();
// num_frame_decoded_ = 0;
// ResetPrefixes();
InitDecoder();
}
void CTCBeamSearch::InitDecoder() {
num_frame_decoded_ = 0;
//ResetPrefixes();
// ResetPrefixes();
prefixes_.clear();
root_ = std::make_shared<PathTrie>();

@ -29,8 +29,7 @@ FeatureCache::FeatureCache(
base_extractor_ = std::move(base_extractor);
}
void FeatureCache::Accept(
const kaldi::VectorBase<kaldi::BaseFloat>& inputs) {
void FeatureCache::Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs) {
base_extractor_->Accept(inputs);
// feed current data
bool result = false;

@ -24,8 +24,7 @@ class FeatureCache : public FeatureExtractorInterface {
explicit FeatureCache(
int32 max_size = kint16max,
std::unique_ptr<FeatureExtractorInterface> base_extractor = NULL);
virtual void Accept(
const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
// feats dim = num_frames * feature_dim
virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
// feature cache only cache feature which from base extractor

@ -11,4 +11,3 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

@ -11,4 +11,3 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

@ -21,10 +21,9 @@ namespace ppspeech {
class FeatureExtractorInterface {
public:
// accept input data, accept feature or raw waves which decided
// accept input data, accept feature or raw waves which decided
// by the base_extractor
virtual void Accept(
const kaldi::VectorBase<kaldi::BaseFloat>& inputs) = 0;
virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs) = 0;
// get the processed result
// the length of output = feature_row * feature_dim,
// the Matrix is squashed into Vector

@ -57,8 +57,9 @@ bool LinearSpectrogram::Read(Vector<BaseFloat>* feats) {
if (flag == false || input_feats.Dim() == 0) return false;
vector<BaseFloat> input_feats_vec(input_feats.Dim());
std::memcpy(input_feats_vec.data(), input_feats.Data(),
input_feats.Dim()*sizeof(BaseFloat));
std::memcpy(input_feats_vec.data(),
input_feats.Data(),
input_feats.Dim() * sizeof(BaseFloat));
vector<vector<BaseFloat>> result;
Compute(input_feats_vec, result);
int32 feat_size = 0;
@ -86,10 +87,10 @@ bool LinearSpectrogram::NumpyFft(vector<BaseFloat>* v,
vector<BaseFloat>* img) const {
Vector<BaseFloat> v_tmp;
v_tmp.Resize(v->size());
std::memcpy(v_tmp.Data(), v->data(), sizeof(BaseFloat)*(v->size()));
std::memcpy(v_tmp.Data(), v->data(), sizeof(BaseFloat) * (v->size()));
RealFft(&v_tmp, true);
v->resize(v_tmp.Dim());
std::memcpy(v->data(), v_tmp.Data(), sizeof(BaseFloat)*(v->size()));
std::memcpy(v->data(), v_tmp.Data(), sizeof(BaseFloat) * (v->size()));
real->push_back(v->at(0));
img->push_back(0);

@ -38,16 +38,13 @@ class LinearSpectrogram : public FeatureExtractorInterface {
explicit LinearSpectrogram(
const LinearSpectrogramOptions& opts,
std::unique_ptr<FeatureExtractorInterface> base_extractor);
virtual void Accept(
const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
// the dim_ is the dim of single frame feature
virtual size_t Dim() const { return dim_; }
virtual void SetFinished() { base_extractor_->SetFinished(); }
virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
virtual void Reset() {
base_extractor_->Reset();
}
virtual void Reset() { base_extractor_->Reset(); }
private:
void Hanning(std::vector<kaldi::BaseFloat>* data) const;

@ -34,14 +34,12 @@ DecibelNormalizer::DecibelNormalizer(
dim_ = 1;
}
void DecibelNormalizer::Accept(
const kaldi::VectorBase<BaseFloat>& waves) {
void DecibelNormalizer::Accept(const kaldi::VectorBase<BaseFloat>& waves) {
base_extractor_->Accept(waves);
}
bool DecibelNormalizer::Read(kaldi::Vector<BaseFloat>* waves) {
if (base_extractor_->Read(waves) == false ||
waves->Dim() == 0) {
if (base_extractor_->Read(waves) == false || waves->Dim() == 0) {
return false;
}
Compute(waves);
@ -88,7 +86,8 @@ bool DecibelNormalizer::Compute(VectorBase<BaseFloat>* waves) const {
item *= std::pow(10.0, gain / 20.0);
}
std::memcpy(waves->Data(), samples.data(), sizeof(BaseFloat)*samples.size());
std::memcpy(
waves->Data(), samples.data(), sizeof(BaseFloat) * samples.size());
return true;
}

@ -45,16 +45,13 @@ class DecibelNormalizer : public FeatureExtractorInterface {
explicit DecibelNormalizer(
const DecibelNormalizerOptions& opts,
std::unique_ptr<FeatureExtractorInterface> base_extractor);
virtual void Accept(
const kaldi::VectorBase<kaldi::BaseFloat>& waves);
virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& waves);
virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* waves);
// noramlize audio, the dim is 1.
virtual size_t Dim() const { return dim_; }
virtual void SetFinished() { base_extractor_->SetFinished(); }
virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
virtual void Reset() {
base_extractor_->Reset();
}
virtual void Reset() { base_extractor_->Reset(); }
private:
bool Compute(kaldi::VectorBase<kaldi::BaseFloat>* waves) const;
@ -69,8 +66,7 @@ class CMVN : public FeatureExtractorInterface {
public:
explicit CMVN(std::string cmvn_file,
std::unique_ptr<FeatureExtractorInterface> base_extractor);
virtual void Accept(
const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
// the length of feats = feature_row * feature_dim,
// the Matrix is squashed into Vector
@ -79,9 +75,7 @@ class CMVN : public FeatureExtractorInterface {
virtual size_t Dim() const { return dim_; }
virtual void SetFinished() { base_extractor_->SetFinished(); }
virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
virtual void Reset() {
base_extractor_->Reset();
}
virtual void Reset() { base_extractor_->Reset(); }
private:
void Compute(kaldi::VectorBase<kaldi::BaseFloat>* feats) const;

@ -32,7 +32,7 @@ void RawAudioCache::Accept(const VectorBase<BaseFloat>& waves) {
ready_feed_condition_.wait(lock);
}
for (size_t idx = 0; idx < waves.Dim(); ++idx) {
int32 buffer_idx = (idx + start_) % ring_buffer_.size();
int32 buffer_idx = (idx + start_) % ring_buffer_.size();
ring_buffer_[buffer_idx] = waves(idx);
}
data_length_ += waves.Dim();
@ -44,7 +44,8 @@ bool RawAudioCache::Read(Vector<BaseFloat>* waves) {
std::unique_lock<std::mutex> lock(mutex_);
while (chunk_size > data_length_) {
// when audio is empty and no more data feed
// ready_read_condition will block in dead lock. so replace with timeout_
// ready_read_condition will block in dead lock. so replace with
// timeout_
// ready_read_condition_.wait(lock);
int32 elapsed = static_cast<int32>(timer.Elapsed() * 1000);
if (elapsed > timeout_) {

@ -35,9 +35,9 @@ class RawAudioCache : public FeatureExtractorInterface {
}
virtual bool IsFinished() const { return finished_; }
virtual void Reset() {
start_ = 0;
data_length_ = 0;
finished_ = false;
start_ = 0;
data_length_ = 0;
finished_ = false;
}
private:
@ -72,9 +72,7 @@ class RawDataCache : public FeatureExtractorInterface {
virtual void SetFinished() { finished_ = true; }
virtual bool IsFinished() const { return finished_; }
void SetDim(int32 dim) { dim_ = dim; }
virtual void Reset() {
finished_ = true;
}
virtual void Reset() { finished_ = true; }
private:
kaldi::Vector<kaldi::BaseFloat> data_;

@ -1,3 +1,17 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// itf/decodable-itf.h
// Copyright 2009-2011 Microsoft Corporation; Saarland University;
@ -42,8 +56,10 @@ namespace kaldi {
For online decoding, where the features are coming in in real time, it is
important to understand the IsLastFrame() and NumFramesReady() functions.
There are two ways these are used: the old online-decoding code, in ../online/,
and the new online-decoding code, in ../online2/. In the old online-decoding
There are two ways these are used: the old online-decoding code, in
../online/,
and the new online-decoding code, in ../online2/. In the old
online-decoding
code, the decoder would do:
\code{.cc}
for (int frame = 0; !decodable.IsLastFrame(frame); frame++) {
@ -52,13 +68,16 @@ namespace kaldi {
\endcode
and the call to IsLastFrame would block if the features had not arrived yet.
The decodable object would have to know when to terminate the decoding. This
online-decoding mode is still supported, it is what happens when you call, for
online-decoding mode is still supported, it is what happens when you call,
for
example, LatticeFasterDecoder::Decode().
We realized that this "blocking" mode of decoding is not very convenient
because it forces the program to be multi-threaded and makes it complex to
control endpointing. In the "new" decoding code, you don't call (for example)
LatticeFasterDecoder::Decode(), you call LatticeFasterDecoder::InitDecoding(),
control endpointing. In the "new" decoding code, you don't call (for
example)
LatticeFasterDecoder::Decode(), you call
LatticeFasterDecoder::InitDecoding(),
and then each time you get more features, you provide them to the decodable
object, and you call LatticeFasterDecoder::AdvanceDecoding(), which does
something like this:
@ -68,7 +87,8 @@ namespace kaldi {
}
\endcode
So the decodable object never has IsLastFrame() called. For decoding where
you are starting with a matrix of features, the NumFramesReady() function will
you are starting with a matrix of features, the NumFramesReady() function
will
always just return the number of frames in the file, and IsLastFrame() will
return true for the last frame.
@ -80,45 +100,54 @@ namespace kaldi {
frame of the file once we've decided to terminate decoding.
*/
class DecodableInterface {
public:
/// Returns the log likelihood, which will be negated in the decoder.
/// The "frame" starts from zero. You should verify that NumFramesReady() > frame
/// before calling this.
virtual BaseFloat LogLikelihood(int32 frame, int32 index) = 0;
/// Returns true if this is the last frame. Frames are zero-based, so the
/// first frame is zero. IsLastFrame(-1) will return false, unless the file
/// is empty (which is a case that I'm not sure all the code will handle, so
/// be careful). Caution: the behavior of this function in an online setting
/// is being changed somewhat. In future it may return false in cases where
/// we haven't yet decided to terminate decoding, but later true if we decide
/// to terminate decoding. The plan in future is to rely more on
/// NumFramesReady(), and in future, IsLastFrame() would always return false
/// in an online-decoding setting, and would only return true in a
/// decoding-from-matrix setting where we want to allow the last delta or LDA
/// features to be flushed out for compatibility with the baseline setup.
virtual bool IsLastFrame(int32 frame) const = 0;
/// The call NumFramesReady() will return the number of frames currently available
/// for this decodable object. This is for use in setups where you don't want the
/// decoder to block while waiting for input. This is newly added as of Jan 2014,
/// and I hope, going forward, to rely on this mechanism more than IsLastFrame to
/// know when to stop decoding.
virtual int32 NumFramesReady() const {
KALDI_ERR << "NumFramesReady() not implemented for this decodable type.";
return -1;
}
/// Returns the number of states in the acoustic model
/// (they will be indexed one-based, i.e. from 1 to NumIndices();
/// this is for compatibility with OpenFst).
virtual int32 NumIndices() const = 0;
virtual bool FrameLogLikelihood(int32 frame,
std::vector<kaldi::BaseFloat>* likelihood) = 0;
virtual ~DecodableInterface() {}
public:
/// Returns the log likelihood, which will be negated in the decoder.
/// The "frame" starts from zero. You should verify that NumFramesReady() >
/// frame
/// before calling this.
virtual BaseFloat LogLikelihood(int32 frame, int32 index) = 0;
/// Returns true if this is the last frame. Frames are zero-based, so the
/// first frame is zero. IsLastFrame(-1) will return false, unless the file
/// is empty (which is a case that I'm not sure all the code will handle, so
/// be careful). Caution: the behavior of this function in an online
/// setting
/// is being changed somewhat. In future it may return false in cases where
/// we haven't yet decided to terminate decoding, but later true if we
/// decide
/// to terminate decoding. The plan in future is to rely more on
/// NumFramesReady(), and in future, IsLastFrame() would always return false
/// in an online-decoding setting, and would only return true in a
/// decoding-from-matrix setting where we want to allow the last delta or
/// LDA
/// features to be flushed out for compatibility with the baseline setup.
virtual bool IsLastFrame(int32 frame) const = 0;
/// The call NumFramesReady() will return the number of frames currently
/// available
/// for this decodable object. This is for use in setups where you don't
/// want the
/// decoder to block while waiting for input. This is newly added as of Jan
/// 2014,
/// and I hope, going forward, to rely on this mechanism more than
/// IsLastFrame to
/// know when to stop decoding.
virtual int32 NumFramesReady() const {
KALDI_ERR
<< "NumFramesReady() not implemented for this decodable type.";
return -1;
}
/// Returns the number of states in the acoustic model
/// (they will be indexed one-based, i.e. from 1 to NumIndices();
/// this is for compatibility with OpenFst).
virtual int32 NumIndices() const = 0;
virtual bool FrameLogLikelihood(
int32 frame, std::vector<kaldi::BaseFloat>* likelihood) = 0;
virtual ~DecodableInterface() {}
};
/// @}
} // namespace Kaldi

@ -23,10 +23,7 @@ using kaldi::Vector;
Decodable::Decodable(const std::shared_ptr<NnetInterface>& nnet,
const std::shared_ptr<FeatureExtractorInterface>& frontend)
: frontend_(frontend),
nnet_(nnet),
frame_offset_(0),
frames_ready_(0) {}
: frontend_(frontend), nnet_(nnet), frame_offset_(0), frames_ready_(0) {}
void Decodable::Acceptlikelihood(const Matrix<BaseFloat>& likelihood) {
frames_ready_ += likelihood.NumRows();
@ -83,7 +80,7 @@ void Decodable::Reset() {
frontend_->Reset();
nnet_->Reset();
frame_offset_ = 0;
frames_ready_ = 0;
frames_ready_ = 0;
}
} // namespace ppspeech

@ -148,7 +148,7 @@ def merge_configs(
for item in remove_train_list:
try:
remove_config_part(config, [item])
except:
except Exception as e:
print(item + " " + "can not be removed")
# Save the config

Loading…
Cancel
Save