diff --git a/speechx/examples/README.md b/speechx/examples/README.md
index 50f5f902..18b37281 100644
--- a/speechx/examples/README.md
+++ b/speechx/examples/README.md
@@ -1,7 +1,7 @@
 # Examples for SpeechX
 
-* ds2_ol - ds2 streaming test under `aishell-1` test dataset. 
-  The entrypoint is `ds2_ol/aishell/run.sh`
+* ds2_ol - ds2 streaming test under `aishell-1` test dataset.
+The entrypoint is `ds2_ol/aishell/run.sh`
 
 ## How to run
 
diff --git a/speechx/examples/ds2_ol/README.md b/speechx/examples/ds2_ol/README.md
index 18f248a1..ed88ef6b 100644
--- a/speechx/examples/ds2_ol/README.md
+++ b/speechx/examples/ds2_ol/README.md
@@ -1,12 +1,12 @@
-# Deepspeech2 Streaming
+# Deepspeech2 Streaming ASR
 
-Please go to `aishell` to test it.
-
-* aishell
-Deepspeech2 Streaming Decoding under aishell dataset.
 * websocket
 Streaming ASR with websocket.
 
+* aishell
+Streaming Decoding under aishell dataset, for local WER test and so on.
+
+## More
 The below is for developing and offline testing:
 * nnet
 * feat
diff --git a/speechx/examples/ds2_ol/aishell/run.sh b/speechx/examples/ds2_ol/aishell/run.sh
index 06f27427..049c9bf9 100755
--- a/speechx/examples/ds2_ol/aishell/run.sh
+++ b/speechx/examples/ds2_ol/aishell/run.sh
@@ -112,8 +112,8 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file}_lm > $exp/${wer}.lm
 fi
 
+wfst=$data/wfst/
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
-    wfst=$data/wfst/
     mkdir -p $wfst
     if [ ! -f $wfst/aishell_graph.zip ]; then
         pushd $wfst
@@ -122,18 +122,18 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
         mv aishell_graph/* $wfst
         popd
     fi
+fi
 
-    graph_dir=$wfst/
-
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
     # TLG decoder
     utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wfst.log \
     wfst-decoder-ol \
         --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
         --model_path=$model_dir/avg_1.jit.pdmodel \
         --param_path=$model_dir/avg_1.jit.pdiparams \
-        --word_symbol_table=$graph_dir/words.txt \
+        --word_symbol_table=$wfst/words.txt \
         --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
-        --graph_path=$graph_dir/TLG.fst --max_active=7500 \
+        --graph_path=$wfst/TLG.fst --max_active=7500 \
         --acoustic_scale=1.2 \
         --result_wspecifier=ark,t:$data/split${nj}/JOB/result_tlg
 
@@ -142,40 +142,21 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
 fi
 
 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
-
-    cmvn=$data/cmvn.ark
-    if [ ! -f $data/split${nj}/1/${aishell_wav_scp} ]; then
-        cmvn-json2kaldi --json_file=$ckpt_dir/data/mean_std.json --cmvn_write_path=$cmvn
-        ./local/split_data.sh $data ${data}/${aishell_wav_scp} $aishell_wav_scp $nj
-    fi
-
-    wfst=$data/wfst/
-    mkdir -p $wfst
-    if [ ! -f $wfst/aishell_graph.zip ]; then
-        pushd $wfst
-        wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_graph.zip
-        unzip aishell_graph.zip
-        popd
-    fi
-
-    graph_dir=$wfst/aishell_graph
-
     # TLG decoder
     utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recognizer.log \
     recognizer_test_main \
         --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
         --cmvn_file=$cmvn \
         --model_path=$model_dir/avg_1.jit.pdmodel \
-        --convert2PCM32=true \
+        --to_float32=true \
         --streaming_chunk=30 \
-        --params_path=$model_dir/avg_1.jit.pdiparams \
-        --word_symbol_table=$graph_dir/words.txt \
+        --param_path=$model_dir/avg_1.jit.pdiparams \
+        --word_symbol_table=$wfst/words.txt \
         --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
-        --graph_path=$graph_dir/TLG.fst --max_active=7500 \
+        --graph_path=$wfst/TLG.fst --max_active=7500 \
         --acoustic_scale=1.2 \
         --result_wspecifier=ark,t:$data/split${nj}/JOB/result_recognizer
 
     cat $data/split${nj}/*/result_recognizer > $exp/${label_file}_recognizer
     utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file}_recognizer > $exp/${wer}.recognizer
-fi
-
+fi
\ No newline at end of file
diff --git a/speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc b/speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc
index 6f532af4..f8f62f84 100644
--- a/speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc
+++ b/speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc
@@ -115,7 +115,7 @@ int main(int argc, char* argv[]) {
             flag = feature_cache.Read(&features);
             feats.push_back(features);
             feature_rows += features.Dim() / feature_cache.Dim();
-        } while(flag == true && features.Dim() != 0);
+        } while (flag == true && features.Dim() != 0);
 
         sample_offset += cur_chunk_size;
     }
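The loop touched above drains the frontend after each push of samples: it keeps calling `Read()` until the cache hands back an empty vector. Below is a minimal, self-contained sketch of that drain pattern; `ChunkSource` is a stub standing in for the SpeechX `FeatureCache` and kaldi vector types, and every name in it is illustrative rather than part of the SpeechX API.

```cpp
// Sketch only: the read-until-empty pattern used by the offline feat binaries,
// with a stub chunk source instead of FeatureCache.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

// Stand-in for FeatureCache: hands out fixed-size chunks until exhausted.
struct ChunkSource {
    std::vector<float> data;
    std::size_t pos = 0;
    // Returns false when no more data is available.
    bool Read(std::vector<float>* chunk, std::size_t chunk_size) {
        if (pos >= data.size()) return false;
        std::size_t len = std::min(chunk_size, data.size() - pos);
        chunk->assign(data.begin() + pos, data.begin() + pos + len);
        pos += len;
        return true;
    }
};

int main() {
    ChunkSource source;
    source.data.assign(10, 1.0f);  // 10 samples, chunked by 4
    std::vector<std::vector<float>> feats;
    std::vector<float> chunk;
    bool flag = false;
    do {
        flag = source.Read(&chunk, 4);
        if (flag && !chunk.empty()) feats.push_back(chunk);
    } while (flag && !chunk.empty());
    std::cout << "chunks read: " << feats.size() << "\n";  // 3
    return 0;
}
```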
diff --git a/speechx/examples/ds2_ol/websocket/websocket_client.sh b/speechx/examples/ds2_ol/websocket/websocket_client.sh
index 3c6b4e91..2a52d2a3 100755
--- a/speechx/examples/ds2_ol/websocket/websocket_client.sh
+++ b/speechx/examples/ds2_ol/websocket/websocket_client.sh
@@ -14,9 +14,7 @@ fi
 # input
 mkdir -p data
 data=$PWD/data
-ckpt_dir=$data/model
-model_dir=$ckpt_dir/exp/deepspeech2_online/checkpoints/
-vocb_dir=$ckpt_dir/data/lang_char
+
 # output
 aishell_wav_scp=aishell_test.scp
 if [ ! -d $data/test ]; then
@@ -34,4 +32,4 @@ export GLOG_logtostderr=1
 
 # websocket client
 websocket_client_main \
-    --wav_rspecifier=scp:$data/$aishell_wav_scp --streaming_chunk=0.36
+    --wav_rspecifier=scp:$data/$aishell_wav_scp --streaming_chunk=0.36
\ No newline at end of file
diff --git a/speechx/examples/ds2_ol/websocket/websocket_server.sh b/speechx/examples/ds2_ol/websocket/websocket_server.sh
index 0e9e796c..0e389f89 100755
--- a/speechx/examples/ds2_ol/websocket/websocket_server.sh
+++ b/speechx/examples/ds2_ol/websocket/websocket_server.sh
@@ -19,12 +19,26 @@ ckpt_dir=$data/model
 model_dir=$ckpt_dir/exp/deepspeech2_online/checkpoints/
 vocb_dir=$ckpt_dir/data/lang_char/
 
+# output
+aishell_wav_scp=aishell_test.scp
+if [ ! -d $data/test ]; then
+    pushd $data
+    wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_test.zip
+    unzip aishell_test.zip
+    popd
+
+    realpath $data/test/*/*.wav > $data/wavlist
+    awk -F '/' '{ print $(NF) }' $data/wavlist | awk -F '.' '{ print $1 }' > $data/utt_id
+    paste $data/utt_id $data/wavlist > $data/$aishell_wav_scp
+fi
+
+
 if [ ! -f $ckpt_dir/data/mean_std.json ]; then
-  mkdir -p $ckpt_dir
-  pushd $ckpt_dir
-  wget -c https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
-  tar xzfv asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
-  popd
+    mkdir -p $ckpt_dir
+    pushd $ckpt_dir
+    wget -c https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
+    tar xzfv asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
+    popd
 fi
 
 export GLOG_logtostderr=1
@@ -49,9 +63,9 @@ websocket_server_main \
     --cmvn_file=$cmvn \
     --model_path=$model_dir/avg_1.jit.pdmodel \
     --streaming_chunk=0.1 \
-    --convert2PCM32=true \
+    --to_float32=true \
     --param_path=$model_dir/avg_1.jit.pdiparams \
-    --word_symbol_table=$data/wfst/words.txt \
+    --word_symbol_table=$wfst/words.txt \
     --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
-    --graph_path=$data/wfst/TLG.fst --max_active=7500 \
+    --graph_path=$wfst/TLG.fst --max_active=7500 \
     --acoustic_scale=1.2
diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h
index aff8d39a..f6bd77ca 100644
--- a/speechx/speechx/decoder/param.h
+++ b/speechx/speechx/decoder/param.h
@@ -21,7 +21,7 @@
 
 DEFINE_string(cmvn_file, "", "read cmvn");
 DEFINE_double(streaming_chunk, 0.1, "streaming feature chunk size");
-DEFINE_bool(convert2PCM32, true, "audio convert to pcm32");
+DEFINE_bool(to_float32, true, "audio convert to pcm32");
 DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model");
 DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param");
 DEFINE_string(word_symbol_table, "words.txt", "word symbol table");
@@ -52,7 +52,7 @@ FeaturePipelineOptions InitFeaturePipelineOptions() {
     FeaturePipelineOptions opts;
     opts.cmvn_file = FLAGS_cmvn_file;
     opts.linear_spectrogram_opts.streaming_chunk = FLAGS_streaming_chunk;
-    opts.convert2PCM32 = FLAGS_convert2PCM32;
+    opts.to_float32 = FLAGS_to_float32;
     kaldi::FrameExtractionOptions frame_opts;
     frame_opts.frame_length_ms = 20;
     frame_opts.frame_shift_ms = 10;
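The `convert2PCM32` to `to_float32` rename above only touches the flag and option names; the conversion itself stays `val * (1 / 2^15)`, i.e. int16 samples are rescaled into the float32 range [-1, 1). Here is a self-contained sketch of that scaling, assuming nothing beyond the formula in `AudioCache::Convert2PCM32`; the function and variable names are mine, not part of the SpeechX API.

```cpp
// Sketch: int16 PCM -> float32 in [-1, 1), the same scaling as
// AudioCache::Convert2PCM32 (val * 1 / 2^15).
#include <cstdint>
#include <iostream>
#include <vector>

float ToFloat32(int16_t sample) {
    return static_cast<float>(sample) * (1.0f / static_cast<float>(1 << 15));
}

int main() {
    std::vector<int16_t> pcm = {0, 16384, -32768, 32767};
    for (int16_t s : pcm) {
        std::cout << s << " -> " << ToFloat32(s) << "\n";
    }
    // 0 -> 0, 16384 -> 0.5, -32768 -> -1, 32767 -> ~0.99997
    return 0;
}
```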
diff --git a/speechx/speechx/frontend/audio/audio_cache.cc b/speechx/speechx/frontend/audio/audio_cache.cc
index e8af6668..b7a15acd 100644
--- a/speechx/speechx/frontend/audio/audio_cache.cc
+++ b/speechx/speechx/frontend/audio/audio_cache.cc
@@ -21,17 +21,18 @@
 using kaldi::BaseFloat;
 using kaldi::VectorBase;
 using kaldi::Vector;
 
-AudioCache::AudioCache(int buffer_size, bool convert2PCM32)
+AudioCache::AudioCache(int buffer_size, bool to_float32)
     : finished_(false),
-      capacity_(buffer_size),
+      capacity_(buffer_size),  // unit: sample
       size_(0),
       offset_(0),
-      timeout_(1),
-      convert2PCM32_(convert2PCM32) {
+      timeout_(1),  // ms
+      to_float32_(to_float32) {
     ring_buffer_.resize(capacity_);
 }
 
 BaseFloat AudioCache::Convert2PCM32(BaseFloat val) {
+    // sample type int16, int16->float32
     return val * (1. / std::pow(2.0, 15));
 }
 
@@ -43,8 +44,7 @@ void AudioCache::Accept(const VectorBase<BaseFloat>& waves) {
     for (size_t idx = 0; idx < waves.Dim(); ++idx) {
         int32 buffer_idx = (idx + offset_ + size_) % ring_buffer_.size();
         ring_buffer_[buffer_idx] = waves(idx);
-        if (convert2PCM32_)
-            ring_buffer_[buffer_idx] = Convert2PCM32(waves(idx));
+        if (to_float32_) ring_buffer_[buffer_idx] = Convert2PCM32(waves(idx));
     }
     size_ += waves.Dim();
 }
diff --git a/speechx/speechx/frontend/audio/audio_cache.h b/speechx/speechx/frontend/audio/audio_cache.h
index a681ef09..371b2cc2 100644
--- a/speechx/speechx/frontend/audio/audio_cache.h
+++ b/speechx/speechx/frontend/audio/audio_cache.h
@@ -24,7 +24,7 @@ namespace ppspeech {
 
 class AudioCache : public FrontendInterface {
   public:
     explicit AudioCache(int buffer_size = 1000 * kint16max,
-                        bool convert2PCM32 = true);
+                        bool to_float32 = true);
 
     virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& waves);
@@ -50,14 +50,15 @@ class AudioCache : public FrontendInterface {
     kaldi::BaseFloat Convert2PCM32(kaldi::BaseFloat val);
 
     std::vector<kaldi::BaseFloat> ring_buffer_;
-    size_t offset_;    // offset in ring_buffer_
-    size_t size_;      // samples in ring_buffer_ now
-    size_t capacity_;  // capacity of ring_buffer_
+    size_t offset_;    // offset in ring_buffer_, begin of data
+    size_t size_;      // samples in ring_buffer_, size of valid data
+    size_t capacity_;  // capacity of ring_buffer_, full size of data buffer,
+                       // unit: sample
     bool finished_;    // reach audio end
     std::mutex mutex_;
     std::condition_variable ready_feed_condition_;
     kaldi::int32 timeout_;  // millisecond
-    bool convert2PCM32_;
+    bool to_float32_;
 
     DISALLOW_COPY_AND_ASSIGN(AudioCache);
 };
diff --git a/speechx/speechx/frontend/audio/cmvn.cc b/speechx/speechx/frontend/audio/cmvn.cc
index c7e446c9..1ea83aba 100644
--- a/speechx/speechx/frontend/audio/cmvn.cc
+++ b/speechx/speechx/frontend/audio/cmvn.cc
@@ -37,14 +37,17 @@ CMVN::CMVN(std::string cmvn_file, unique_ptr<FrontendInterface> base_extractor)
 }
 
 void CMVN::Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs) {
+    // feed waves/feats to compute feature
     base_extractor_->Accept(inputs);
     return;
 }
 
 bool CMVN::Read(kaldi::Vector<kaldi::BaseFloat>* feats) {
+    // compute feature
     if (base_extractor_->Read(feats) == false || feats->Dim() == 0) {
         return false;
     }
+    // apply cmvn
     Compute(feats);
     return true;
 }
diff --git a/speechx/speechx/frontend/audio/data_cache.h b/speechx/speechx/frontend/audio/data_cache.h
index a812278c..852385e9 100644
--- a/speechx/speechx/frontend/audio/data_cache.h
+++ b/speechx/speechx/frontend/audio/data_cache.h
@@ -27,6 +27,7 @@ class DataCache : public FrontendInterface {
   public:
     explicit DataCache() { finished_ = false; }
 
+    // accept waves/feats
     virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs) {
         data_ = inputs;
     }
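`AudioCache::Accept` above writes incoming samples into a fixed ring buffer with modulo indexing and, when `to_float32` is set, rescales each sample as it is stored. The following is a stand-alone sketch of that write path under simplifying assumptions (no locking, no blocking when the buffer is full); the type and member names are illustrative, not the SpeechX classes.

```cpp
// Sketch of a ring-buffer write like AudioCache::Accept: index modulo the
// buffer size, optional int16 -> float32 scaling on the way in.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

struct RingBuffer {
    std::vector<float> buf;
    std::size_t offset = 0;  // begin of valid data
    std::size_t size = 0;    // number of valid samples
    bool to_float32 = true;

    explicit RingBuffer(std::size_t capacity) : buf(capacity) {}

    void Accept(const std::vector<int16_t>& waves) {
        for (std::size_t idx = 0; idx < waves.size(); ++idx) {
            std::size_t buffer_idx = (idx + offset + size) % buf.size();
            float val = static_cast<float>(waves[idx]);
            if (to_float32) val *= 1.0f / (1 << 15);
            buf[buffer_idx] = val;
        }
        size += waves.size();
    }
};

int main() {
    RingBuffer cache(8);
    cache.Accept({16384, -16384, 32767});
    std::cout << cache.buf[0] << " " << cache.buf[1] << " " << cache.buf[2] << "\n";
    // 0.5 -0.5 ~0.99997
    return 0;
}
```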
diff --git a/speechx/speechx/frontend/audio/fbank.h b/speechx/speechx/frontend/audio/fbank.h
index 68267b3d..96a97dfd 100644
--- a/speechx/speechx/frontend/audio/fbank.h
+++ b/speechx/speechx/frontend/audio/fbank.h
@@ -15,23 +15,56 @@
 // wrap the fbank feat of kaldi, todo (SmileGoat)
 
 #include "kaldi/feat/feature-mfcc.h"
-
 #incldue "kaldi/matrix/kaldi-vector.h"
 
 namespace ppspeech {
 
-class FbankExtractor : FrontendInterface {
+struct FbankOptions {
+    kaldi::FrameExtractionOptions frame_opts;
+    kaldi::BaseFloat streaming_chunk;  // second
+
+    FbankOptions() : streaming_chunk(0.1), frame_opts() {}
+
+    void Register(kaldi::OptionsItf* opts) {
+        opts->Register("streaming-chunk",
+                       &streaming_chunk,
+                       "streaming chunk size, default: 0.1 sec");
+        frame_opts.Register(opts);
+    }
+};
+
+
+class Fbank : FrontendInterface {
   public:
-    explicit FbankExtractor(const FbankOptions& opts,
-                            share_ptr<FrontendInterface> pre_extractor);
-    virtual void AcceptWaveform(
-        const kaldi::Vector<kaldi::BaseFloat>& input) = 0;
-    virtual void Read(kaldi::Vector<kaldi::BaseFloat>* feat) = 0;
-    virtual size_t Dim() const = 0;
+    explicit Fbank(const FbankOptions& opts,
+                   unique_ptr<FrontendInterface> base_extractor);
+    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
+    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
+
+    // the dim_ is the dim of single frame feature
+    virtual size_t Dim() const { return dim_; }
+
+    virtual void SetFinished() { base_extractor_->SetFinished(); }
+
+    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
+
+    virtual void Reset() {
+        base_extractor_->Reset();
+        remained_wav_.Resize(0);
+    }
 
   private:
-    bool Compute(const kaldi::Vector<kaldi::BaseFloat>& wave,
-                 kaldi::Vector<kaldi::BaseFloat>* feat) const;
+    bool Compute(const kaldi::Vector<kaldi::BaseFloat>& waves,
+                 kaldi::Vector<kaldi::BaseFloat>* feats);
+
+    // kaldi::FeatureWindowFunction feature_window_funtion_;
+    // kaldi::BaseFloat hanning_window_energy_;
+    size_t dim_;
+    FbankOptions opts_;
+    std::unique_ptr<FrontendInterface> base_extractor_;
+    kaldi::Vector<kaldi::BaseFloat> remained_wav_;
+    int chunk_sample_size_;
+    DISALLOW_COPY_AND_ASSIGN(Fbank);
 };
 
 }  // namespace ppspeech
\ No newline at end of file
diff --git a/speechx/speechx/frontend/audio/feature_cache.cc b/speechx/speechx/frontend/audio/feature_cache.cc
index b5768460..e1e1043b 100644
--- a/speechx/speechx/frontend/audio/feature_cache.cc
+++ b/speechx/speechx/frontend/audio/feature_cache.cc
@@ -28,11 +28,13 @@ FeatureCache::FeatureCache(FeatureCacheOptions opts,
     max_size_ = opts.max_size;
     frame_chunk_stride_ = opts.frame_chunk_stride;
     frame_chunk_size_ = opts.frame_chunk_size;
+    timeout_ = opts.timeout;  // ms
     base_extractor_ = std::move(base_extractor);
     dim_ = base_extractor_->Dim();
 }
 
 void FeatureCache::Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs) {
+    // read inputs
     base_extractor_->Accept(inputs);
     // feed current data
     bool result = false;
@@ -49,9 +51,8 @@ bool FeatureCache::Read(kaldi::Vector<kaldi::BaseFloat>* feats) {
     while (cache_.empty() && base_extractor_->IsFinished() == false) {
         // todo refactor: wait
         // ready_read_condition_.wait(lock);
-        int32 elapsed = static_cast<int32>(timer.Elapsed() * 1000);
-        // todo replace 1 with timeout_, 1 ms
-        if (elapsed > 1) {
+        int32 elapsed = static_cast<int32>(timer.Elapsed() * 1000);  // ms
+        if (elapsed > timeout_) {
             return false;
         }
         usleep(100);  // sleep 0.1 ms
@@ -70,6 +71,8 @@ bool FeatureCache::Compute() {
     Vector<BaseFloat> feature;
     bool result = base_extractor_->Read(&feature);
     if (result == false || feature.Dim() == 0) return false;
+
+    // join with remained
     int32 joint_len = feature.Dim() + remained_feature_.Dim();
     int32 num_chunk =
         ((joint_len / dim_) - frame_chunk_size_) / frame_chunk_stride_ + 1;
@@ -82,6 +85,7 @@ bool FeatureCache::Compute() {
 
     for (int chunk_idx = 0; chunk_idx < num_chunk; ++chunk_idx) {
         int32 start = chunk_idx * frame_chunk_stride_ * dim_;
+
         Vector<BaseFloat> feature_chunk(frame_chunk_size_ * dim_);
         SubVector<BaseFloat> tmp(joint_feature.Data() + start,
                                  frame_chunk_size_ * dim_);
@@ -89,6 +93,7 @@ bool FeatureCache::Compute() {
 
         std::unique_lock<std::mutex> lock(mutex_);
         while (cache_.size() >= max_size_) {
+            // cache full, wait
            ready_feed_condition_.wait(lock);
        }
 
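The `FeatureCache::Read` change above replaces the hard-coded 1 ms wait with the configurable `timeout` from `FeatureCacheOptions`: while the cache is empty and the upstream extractor has not finished, the reader polls and gives up once the elapsed time exceeds the timeout. Below is a self-contained sketch of that polling pattern using std::chrono in place of the kaldi timer; all names are illustrative and nothing here is the actual SpeechX implementation.

```cpp
// Sketch of a timed polling read: return false if nothing arrives within
// timeout_ms, mirroring the elapsed > timeout_ check in FeatureCache::Read.
#include <chrono>
#include <deque>
#include <iostream>
#include <thread>

template <typename T>
bool TimedPop(std::deque<T>* cache, T* out, int timeout_ms) {
    auto start = std::chrono::steady_clock::now();
    while (cache->empty()) {
        auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(
                           std::chrono::steady_clock::now() - start)
                           .count();
        if (elapsed > timeout_ms) return false;  // give up, like Read()
        std::this_thread::sleep_for(std::chrono::microseconds(100));
    }
    *out = cache->front();
    cache->pop_front();
    return true;
}

int main() {
    std::deque<int> cache;  // empty: the timed read should fail
    int value = 0;
    bool ok = TimedPop(&cache, &value, /*timeout_ms=*/1);
    std::cout << std::boolalpha << ok << "\n";  // false

    cache.push_back(42);
    ok = TimedPop(&cache, &value, /*timeout_ms=*/1);
    std::cout << ok << " " << value << "\n";  // true 42
    return 0;
}
```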
diff --git a/speechx/speechx/frontend/audio/feature_cache.h b/speechx/speechx/frontend/audio/feature_cache.h
index 607f72c0..0dc704bb 100644
--- a/speechx/speechx/frontend/audio/feature_cache.h
+++ b/speechx/speechx/frontend/audio/feature_cache.h
@@ -23,8 +23,12 @@ struct FeatureCacheOptions {
     int32 max_size;
     int32 frame_chunk_size;
     int32 frame_chunk_stride;
+    int32 timeout;  // ms
     FeatureCacheOptions()
-        : max_size(kint16max), frame_chunk_size(1), frame_chunk_stride(1) {}
+        : max_size(kint16max),
+          frame_chunk_size(1),
+          frame_chunk_stride(1),
+          timeout(1) {}
 };
 
 class FeatureCache : public FrontendInterface {
@@ -64,14 +68,15 @@
     bool Compute();
 
     int32 dim_;
-    size_t max_size_;
-    int32 frame_chunk_size_;
-    int32 frame_chunk_stride_;
+    size_t max_size_;           // cache capacity
+    int32 frame_chunk_size_;    // window
+    int32 frame_chunk_stride_;  // stride
+    std::unique_ptr<FrontendInterface> base_extractor_;
+    kaldi::int32 timeout_;  // ms
     kaldi::Vector<kaldi::BaseFloat> remained_feature_;
-    std::unique_ptr<FrontendInterface> base_extractor_;
+    std::queue<kaldi::Vector<kaldi::BaseFloat>> cache_;  // feature cache
     std::mutex mutex_;
-    std::queue<kaldi::Vector<kaldi::BaseFloat>> cache_;
     std::condition_variable ready_feed_condition_;
     std::condition_variable ready_read_condition_;
 
diff --git a/speechx/speechx/frontend/audio/feature_pipeline.cc b/speechx/speechx/frontend/audio/feature_pipeline.cc
index 86eca2e0..5914fedb 100644
--- a/speechx/speechx/frontend/audio/feature_pipeline.cc
+++ b/speechx/speechx/frontend/audio/feature_pipeline.cc
@@ -20,7 +20,7 @@ using std::unique_ptr;
 
 FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) {
     unique_ptr<FrontendInterface> data_source(
-        new ppspeech::AudioCache(1000 * kint16max, opts.convert2PCM32));
+        new ppspeech::AudioCache(1000 * kint16max, opts.to_float32));
 
     unique_ptr<FrontendInterface> linear_spectrogram(
         new ppspeech::LinearSpectrogram(opts.linear_spectrogram_opts,
diff --git a/speechx/speechx/frontend/audio/feature_pipeline.h b/speechx/speechx/frontend/audio/feature_pipeline.h
index 7bd6c84f..580c02fa 100644
--- a/speechx/speechx/frontend/audio/feature_pipeline.h
+++ b/speechx/speechx/frontend/audio/feature_pipeline.h
@@ -27,12 +27,12 @@ namespace ppspeech {
 
 struct FeaturePipelineOptions {
     std::string cmvn_file;
-    bool convert2PCM32;
+    bool to_float32;
     LinearSpectrogramOptions linear_spectrogram_opts;
     FeatureCacheOptions feature_cache_opts;
     FeaturePipelineOptions()
         : cmvn_file(""),
-          convert2PCM32(false),
+          to_float32(false),
           linear_spectrogram_opts(),
           feature_cache_opts() {}
 };
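For reference, `InitFeaturePipelineOptions()` in `param.h` is where the renamed flag ends up: the gflags values are copied into `FeaturePipelineOptions` before the pipeline is built. The sketch below shows that wiring with plain function parameters instead of `FLAGS_*`; the `*Like` structs only mirror the fields visible in the headers above and are not the SpeechX types themselves.

```cpp
// Sketch: filling a FeaturePipelineOptions-like struct the way
// InitFeaturePipelineOptions() does, with plain values instead of FLAGS_*.
#include <iostream>
#include <string>

struct LinearSpectrogramOptionsLike {
    double streaming_chunk = 0.1;  // seconds
};

struct FeatureCacheOptionsLike {
    int timeout = 1;  // ms
};

struct FeaturePipelineOptionsLike {
    std::string cmvn_file;
    bool to_float32 = false;
    LinearSpectrogramOptionsLike linear_spectrogram_opts;
    FeatureCacheOptionsLike feature_cache_opts;
};

FeaturePipelineOptionsLike InitOptions(const std::string& cmvn_file,
                                       double streaming_chunk,
                                       bool to_float32) {
    FeaturePipelineOptionsLike opts;
    opts.cmvn_file = cmvn_file;
    opts.linear_spectrogram_opts.streaming_chunk = streaming_chunk;
    opts.to_float32 = to_float32;  // was opts.convert2PCM32
    return opts;
}

int main() {
    FeaturePipelineOptionsLike opts = InitOptions("data/cmvn.ark", 0.1, true);
    std::cout << opts.cmvn_file << " " << std::boolalpha << opts.to_float32 << "\n";
    return 0;
}
```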