pull/1576/head
Hui Zhang 4 years ago
parent 854b63b519
commit 6abc5d9f7e

@ -61,6 +61,7 @@ def remove_version_py(filename='paddleaudio/__init__.py'):
if "__version__" not in line: if "__version__" not in line:
f.write(line) f.write(line)
remove_version_py() remove_version_py()
write_version_py() write_version_py()

@ -192,7 +192,7 @@ class ConfigCache:
try: try:
cfg = yaml.load(file, Loader=yaml.FullLoader) cfg = yaml.load(file, Loader=yaml.FullLoader)
self._data.update(cfg) self._data.update(cfg)
except: except Exception as e:
self.flush() self.flush()
@property @property

@ -164,8 +164,8 @@ int main(int argc, char* argv[]) {
// test feature linear_spectrogram: wave --> decibel_normalizer --> hanning // test feature linear_spectrogram: wave --> decibel_normalizer --> hanning
// window -->linear_spectrogram --> cmvn // window -->linear_spectrogram --> cmvn
int32 num_done = 0, num_err = 0; int32 num_done = 0, num_err = 0;
//std::unique_ptr<ppspeech::FeatureExtractorInterface> data_source(new // std::unique_ptr<ppspeech::FeatureExtractorInterface> data_source(new
//ppspeech::RawDataCache()); // ppspeech::RawDataCache());
std::unique_ptr<ppspeech::FeatureExtractorInterface> data_source( std::unique_ptr<ppspeech::FeatureExtractorInterface> data_source(
new ppspeech::RawAudioCache()); new ppspeech::RawAudioCache());

@ -52,14 +52,14 @@ CTCBeamSearch::CTCBeamSearch(const CTCBeamSearchOptions& opts)
} }
void CTCBeamSearch::Reset() { void CTCBeamSearch::Reset() {
//num_frame_decoded_ = 0; // num_frame_decoded_ = 0;
//ResetPrefixes(); // ResetPrefixes();
InitDecoder(); InitDecoder();
} }
void CTCBeamSearch::InitDecoder() { void CTCBeamSearch::InitDecoder() {
num_frame_decoded_ = 0; num_frame_decoded_ = 0;
//ResetPrefixes(); // ResetPrefixes();
prefixes_.clear(); prefixes_.clear();
root_ = std::make_shared<PathTrie>(); root_ = std::make_shared<PathTrie>();

@ -29,8 +29,7 @@ FeatureCache::FeatureCache(
base_extractor_ = std::move(base_extractor); base_extractor_ = std::move(base_extractor);
} }
void FeatureCache::Accept( void FeatureCache::Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs) {
const kaldi::VectorBase<kaldi::BaseFloat>& inputs) {
base_extractor_->Accept(inputs); base_extractor_->Accept(inputs);
// feed current data // feed current data
bool result = false; bool result = false;

@ -24,8 +24,7 @@ class FeatureCache : public FeatureExtractorInterface {
explicit FeatureCache( explicit FeatureCache(
int32 max_size = kint16max, int32 max_size = kint16max,
std::unique_ptr<FeatureExtractorInterface> base_extractor = NULL); std::unique_ptr<FeatureExtractorInterface> base_extractor = NULL);
virtual void Accept( virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
// feats dim = num_frames * feature_dim // feats dim = num_frames * feature_dim
virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats); virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
// the feature cache only caches features that come from the base extractor // the feature cache only caches features that come from the base extractor

@ -11,4 +11,3 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.

@ -11,4 +11,3 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.

@ -23,8 +23,7 @@ class FeatureExtractorInterface {
public: public:
// accept input data: features or raw waves, as decided // accept input data: features or raw waves, as decided
// by the base_extractor // by the base_extractor
virtual void Accept( virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs) = 0;
const kaldi::VectorBase<kaldi::BaseFloat>& inputs) = 0;
// get the processed result // get the processed result
// the length of output = feature_row * feature_dim, // the length of output = feature_row * feature_dim,
// the Matrix is squashed into Vector // the Matrix is squashed into Vector

@ -57,8 +57,9 @@ bool LinearSpectrogram::Read(Vector<BaseFloat>* feats) {
if (flag == false || input_feats.Dim() == 0) return false; if (flag == false || input_feats.Dim() == 0) return false;
vector<BaseFloat> input_feats_vec(input_feats.Dim()); vector<BaseFloat> input_feats_vec(input_feats.Dim());
std::memcpy(input_feats_vec.data(), input_feats.Data(), std::memcpy(input_feats_vec.data(),
input_feats.Dim()*sizeof(BaseFloat)); input_feats.Data(),
input_feats.Dim() * sizeof(BaseFloat));
vector<vector<BaseFloat>> result; vector<vector<BaseFloat>> result;
Compute(input_feats_vec, result); Compute(input_feats_vec, result);
int32 feat_size = 0; int32 feat_size = 0;
@ -86,10 +87,10 @@ bool LinearSpectrogram::NumpyFft(vector<BaseFloat>* v,
vector<BaseFloat>* img) const { vector<BaseFloat>* img) const {
Vector<BaseFloat> v_tmp; Vector<BaseFloat> v_tmp;
v_tmp.Resize(v->size()); v_tmp.Resize(v->size());
std::memcpy(v_tmp.Data(), v->data(), sizeof(BaseFloat)*(v->size())); std::memcpy(v_tmp.Data(), v->data(), sizeof(BaseFloat) * (v->size()));
RealFft(&v_tmp, true); RealFft(&v_tmp, true);
v->resize(v_tmp.Dim()); v->resize(v_tmp.Dim());
std::memcpy(v->data(), v_tmp.Data(), sizeof(BaseFloat)*(v->size())); std::memcpy(v->data(), v_tmp.Data(), sizeof(BaseFloat) * (v->size()));
real->push_back(v->at(0)); real->push_back(v->at(0));
img->push_back(0); img->push_back(0);

@ -38,16 +38,13 @@ class LinearSpectrogram : public FeatureExtractorInterface {
explicit LinearSpectrogram( explicit LinearSpectrogram(
const LinearSpectrogramOptions& opts, const LinearSpectrogramOptions& opts,
std::unique_ptr<FeatureExtractorInterface> base_extractor); std::unique_ptr<FeatureExtractorInterface> base_extractor);
virtual void Accept( virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats); virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
// the dim_ is the dim of single frame feature // the dim_ is the dim of single frame feature
virtual size_t Dim() const { return dim_; } virtual size_t Dim() const { return dim_; }
virtual void SetFinished() { base_extractor_->SetFinished(); } virtual void SetFinished() { base_extractor_->SetFinished(); }
virtual bool IsFinished() const { return base_extractor_->IsFinished(); } virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
virtual void Reset() { virtual void Reset() { base_extractor_->Reset(); }
base_extractor_->Reset();
}
private: private:
void Hanning(std::vector<kaldi::BaseFloat>* data) const; void Hanning(std::vector<kaldi::BaseFloat>* data) const;

@ -34,14 +34,12 @@ DecibelNormalizer::DecibelNormalizer(
dim_ = 1; dim_ = 1;
} }
void DecibelNormalizer::Accept( void DecibelNormalizer::Accept(const kaldi::VectorBase<BaseFloat>& waves) {
const kaldi::VectorBase<BaseFloat>& waves) {
base_extractor_->Accept(waves); base_extractor_->Accept(waves);
} }
bool DecibelNormalizer::Read(kaldi::Vector<BaseFloat>* waves) { bool DecibelNormalizer::Read(kaldi::Vector<BaseFloat>* waves) {
if (base_extractor_->Read(waves) == false || if (base_extractor_->Read(waves) == false || waves->Dim() == 0) {
waves->Dim() == 0) {
return false; return false;
} }
Compute(waves); Compute(waves);
@ -88,7 +86,8 @@ bool DecibelNormalizer::Compute(VectorBase<BaseFloat>* waves) const {
item *= std::pow(10.0, gain / 20.0); item *= std::pow(10.0, gain / 20.0);
} }
std::memcpy(waves->Data(), samples.data(), sizeof(BaseFloat)*samples.size()); std::memcpy(
waves->Data(), samples.data(), sizeof(BaseFloat) * samples.size());
return true; return true;
} }

@ -45,16 +45,13 @@ class DecibelNormalizer : public FeatureExtractorInterface {
explicit DecibelNormalizer( explicit DecibelNormalizer(
const DecibelNormalizerOptions& opts, const DecibelNormalizerOptions& opts,
std::unique_ptr<FeatureExtractorInterface> base_extractor); std::unique_ptr<FeatureExtractorInterface> base_extractor);
virtual void Accept( virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& waves);
const kaldi::VectorBase<kaldi::BaseFloat>& waves);
virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* waves); virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* waves);
// normalize audio, the dim is 1. // normalize audio, the dim is 1.
virtual size_t Dim() const { return dim_; } virtual size_t Dim() const { return dim_; }
virtual void SetFinished() { base_extractor_->SetFinished(); } virtual void SetFinished() { base_extractor_->SetFinished(); }
virtual bool IsFinished() const { return base_extractor_->IsFinished(); } virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
virtual void Reset() { virtual void Reset() { base_extractor_->Reset(); }
base_extractor_->Reset();
}
private: private:
bool Compute(kaldi::VectorBase<kaldi::BaseFloat>* waves) const; bool Compute(kaldi::VectorBase<kaldi::BaseFloat>* waves) const;
@ -69,8 +66,7 @@ class CMVN : public FeatureExtractorInterface {
public: public:
explicit CMVN(std::string cmvn_file, explicit CMVN(std::string cmvn_file,
std::unique_ptr<FeatureExtractorInterface> base_extractor); std::unique_ptr<FeatureExtractorInterface> base_extractor);
virtual void Accept( virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
// the length of feats = feature_row * feature_dim, // the length of feats = feature_row * feature_dim,
// the Matrix is squashed into Vector // the Matrix is squashed into Vector
@ -79,9 +75,7 @@ class CMVN : public FeatureExtractorInterface {
virtual size_t Dim() const { return dim_; } virtual size_t Dim() const { return dim_; }
virtual void SetFinished() { base_extractor_->SetFinished(); } virtual void SetFinished() { base_extractor_->SetFinished(); }
virtual bool IsFinished() const { return base_extractor_->IsFinished(); } virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
virtual void Reset() { virtual void Reset() { base_extractor_->Reset(); }
base_extractor_->Reset();
}
private: private:
void Compute(kaldi::VectorBase<kaldi::BaseFloat>* feats) const; void Compute(kaldi::VectorBase<kaldi::BaseFloat>* feats) const;

@ -44,7 +44,8 @@ bool RawAudioCache::Read(Vector<BaseFloat>* waves) {
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
while (chunk_size > data_length_) { while (chunk_size > data_length_) {
// when the audio is empty and no more data is fed, // when the audio is empty and no more data is fed,
// ready_read_condition would block in a deadlock, so replace it with timeout_ // ready_read_condition would block in a deadlock, so replace it with
// timeout_
// ready_read_condition_.wait(lock); // ready_read_condition_.wait(lock);
int32 elapsed = static_cast<int32>(timer.Elapsed() * 1000); int32 elapsed = static_cast<int32>(timer.Elapsed() * 1000);
if (elapsed > timeout_) { if (elapsed > timeout_) {

@ -35,9 +35,9 @@ class RawAudioCache : public FeatureExtractorInterface {
} }
virtual bool IsFinished() const { return finished_; } virtual bool IsFinished() const { return finished_; }
virtual void Reset() { virtual void Reset() {
start_ = 0; start_ = 0;
data_length_ = 0; data_length_ = 0;
finished_ = false; finished_ = false;
} }
private: private:
@ -72,9 +72,7 @@ class RawDataCache : public FeatureExtractorInterface {
virtual void SetFinished() { finished_ = true; } virtual void SetFinished() { finished_ = true; }
virtual bool IsFinished() const { return finished_; } virtual bool IsFinished() const { return finished_; }
void SetDim(int32 dim) { dim_ = dim; } void SetDim(int32 dim) { dim_ = dim; }
virtual void Reset() { virtual void Reset() { finished_ = true; }
finished_ = true;
}
private: private:
kaldi::Vector<kaldi::BaseFloat> data_; kaldi::Vector<kaldi::BaseFloat> data_;

@ -1,3 +1,17 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// itf/decodable-itf.h // itf/decodable-itf.h
// Copyright 2009-2011 Microsoft Corporation; Saarland University; // Copyright 2009-2011 Microsoft Corporation; Saarland University;
@ -42,8 +56,10 @@ namespace kaldi {
For online decoding, where the features are coming in in real time, it is For online decoding, where the features are coming in in real time, it is
important to understand the IsLastFrame() and NumFramesReady() functions. important to understand the IsLastFrame() and NumFramesReady() functions.
There are two ways these are used: the old online-decoding code, in ../online/, There are two ways these are used: the old online-decoding code, in
and the new online-decoding code, in ../online2/. In the old online-decoding ../online/,
and the new online-decoding code, in ../online2/. In the old
online-decoding
code, the decoder would do: code, the decoder would do:
\code{.cc} \code{.cc}
for (int frame = 0; !decodable.IsLastFrame(frame); frame++) { for (int frame = 0; !decodable.IsLastFrame(frame); frame++) {
@ -52,13 +68,16 @@ namespace kaldi {
\endcode \endcode
and the call to IsLastFrame would block if the features had not arrived yet. and the call to IsLastFrame would block if the features had not arrived yet.
The decodable object would have to know when to terminate the decoding. This The decodable object would have to know when to terminate the decoding. This
online-decoding mode is still supported, it is what happens when you call, for online-decoding mode is still supported, it is what happens when you call,
for
example, LatticeFasterDecoder::Decode(). example, LatticeFasterDecoder::Decode().
We realized that this "blocking" mode of decoding is not very convenient We realized that this "blocking" mode of decoding is not very convenient
because it forces the program to be multi-threaded and makes it complex to because it forces the program to be multi-threaded and makes it complex to
control endpointing. In the "new" decoding code, you don't call (for example) control endpointing. In the "new" decoding code, you don't call (for
LatticeFasterDecoder::Decode(), you call LatticeFasterDecoder::InitDecoding(), example)
LatticeFasterDecoder::Decode(), you call
LatticeFasterDecoder::InitDecoding(),
and then each time you get more features, you provide them to the decodable and then each time you get more features, you provide them to the decodable
object, and you call LatticeFasterDecoder::AdvanceDecoding(), which does object, and you call LatticeFasterDecoder::AdvanceDecoding(), which does
something like this: something like this:
@ -68,7 +87,8 @@ namespace kaldi {
} }
\endcode \endcode
So the decodable object never has IsLastFrame() called. For decoding where So the decodable object never has IsLastFrame() called. For decoding where
you are starting with a matrix of features, the NumFramesReady() function will you are starting with a matrix of features, the NumFramesReady() function
will
always just return the number of frames in the file, and IsLastFrame() will always just return the number of frames in the file, and IsLastFrame() will
return true for the last frame. return true for the last frame.
@ -80,45 +100,54 @@ namespace kaldi {
frame of the file once we've decided to terminate decoding. frame of the file once we've decided to terminate decoding.
*/ */
class DecodableInterface { class DecodableInterface {
public: public:
/// Returns the log likelihood, which will be negated in the decoder. /// Returns the log likelihood, which will be negated in the decoder.
/// The "frame" starts from zero. You should verify that NumFramesReady() > frame /// The "frame" starts from zero. You should verify that NumFramesReady() >
/// before calling this. /// frame
virtual BaseFloat LogLikelihood(int32 frame, int32 index) = 0; /// before calling this.
virtual BaseFloat LogLikelihood(int32 frame, int32 index) = 0;
/// Returns true if this is the last frame. Frames are zero-based, so the
/// first frame is zero. IsLastFrame(-1) will return false, unless the file /// Returns true if this is the last frame. Frames are zero-based, so the
/// is empty (which is a case that I'm not sure all the code will handle, so /// first frame is zero. IsLastFrame(-1) will return false, unless the file
/// be careful). Caution: the behavior of this function in an online setting /// is empty (which is a case that I'm not sure all the code will handle, so
/// is being changed somewhat. In future it may return false in cases where /// be careful). Caution: the behavior of this function in an online
/// we haven't yet decided to terminate decoding, but later true if we decide /// setting
/// to terminate decoding. The plan in future is to rely more on /// is being changed somewhat. In future it may return false in cases where
/// NumFramesReady(), and in future, IsLastFrame() would always return false /// we haven't yet decided to terminate decoding, but later true if we
/// in an online-decoding setting, and would only return true in a /// decide
/// decoding-from-matrix setting where we want to allow the last delta or LDA /// to terminate decoding. The plan in future is to rely more on
/// features to be flushed out for compatibility with the baseline setup. /// NumFramesReady(), and in future, IsLastFrame() would always return false
virtual bool IsLastFrame(int32 frame) const = 0; /// in an online-decoding setting, and would only return true in a
/// decoding-from-matrix setting where we want to allow the last delta or
/// The call NumFramesReady() will return the number of frames currently available /// LDA
/// for this decodable object. This is for use in setups where you don't want the /// features to be flushed out for compatibility with the baseline setup.
/// decoder to block while waiting for input. This is newly added as of Jan 2014, virtual bool IsLastFrame(int32 frame) const = 0;
/// and I hope, going forward, to rely on this mechanism more than IsLastFrame to
/// know when to stop decoding. /// The call NumFramesReady() will return the number of frames currently
virtual int32 NumFramesReady() const { /// available
KALDI_ERR << "NumFramesReady() not implemented for this decodable type."; /// for this decodable object. This is for use in setups where you don't
return -1; /// want the
} /// decoder to block while waiting for input. This is newly added as of Jan
/// 2014,
/// Returns the number of states in the acoustic model /// and I hope, going forward, to rely on this mechanism more than
/// (they will be indexed one-based, i.e. from 1 to NumIndices(); /// IsLastFrame to
/// this is for compatibility with OpenFst). /// know when to stop decoding.
virtual int32 NumIndices() const = 0; virtual int32 NumFramesReady() const {
KALDI_ERR
virtual bool FrameLogLikelihood(int32 frame, << "NumFramesReady() not implemented for this decodable type.";
std::vector<kaldi::BaseFloat>* likelihood) = 0; return -1;
}
virtual ~DecodableInterface() {} /// Returns the number of states in the acoustic model
/// (they will be indexed one-based, i.e. from 1 to NumIndices();
/// this is for compatibility with OpenFst).
virtual int32 NumIndices() const = 0;
virtual bool FrameLogLikelihood(
int32 frame, std::vector<kaldi::BaseFloat>* likelihood) = 0;
virtual ~DecodableInterface() {}
}; };
/// @} /// @}
} // namespace kaldi } // namespace kaldi

@ -23,10 +23,7 @@ using kaldi::Vector;
Decodable::Decodable(const std::shared_ptr<NnetInterface>& nnet, Decodable::Decodable(const std::shared_ptr<NnetInterface>& nnet,
const std::shared_ptr<FeatureExtractorInterface>& frontend) const std::shared_ptr<FeatureExtractorInterface>& frontend)
: frontend_(frontend), : frontend_(frontend), nnet_(nnet), frame_offset_(0), frames_ready_(0) {}
nnet_(nnet),
frame_offset_(0),
frames_ready_(0) {}
void Decodable::Acceptlikelihood(const Matrix<BaseFloat>& likelihood) { void Decodable::Acceptlikelihood(const Matrix<BaseFloat>& likelihood) {
frames_ready_ += likelihood.NumRows(); frames_ready_ += likelihood.NumRows();

@ -148,7 +148,7 @@ def merge_configs(
for item in remove_train_list: for item in remove_train_list:
try: try:
remove_config_part(config, [item]) remove_config_part(config, [item])
except: except Exception as e:
print(item + " " + "can not be removed") print(item + " " + "can not be removed")
# Save the config # Save the config

Loading…
Cancel
Save