add aishell eg & add json parser & add write cmvn binary

4 years ago · ec33f8d73b
parent e75b906e11
commit ec33f8d73b
13 changed files with 42406 additions and 37 deletions
--- a/speechx/examples/aishell/local/split_data.sh
+++ b/speechx/examples/aishell/local/split_data.sh
@ -2,7 +2,9 @@

 data=$1
 feat_scp=$2
-numsplit=$3
+split_feat_name=$3
+numsplit=$4
+

 if ! [ "$numsplit" -gt 0 ]; then
  echo "Invalid num-split argument";
@ -10,7 +12,7 @@ if ! [ "$numsplit" -gt 0 ]; then
 fi

 directories=$(for n in `seq $numsplit`; do echo $data/split${numsplit}/$n; done)
-feat_split_scp=$(for n in `seq $numsplit`; do echo $data/split${numsplit}/$n/feats.scp; done)
+feat_split_scp=$(for n in `seq $numsplit`; do echo $data/split${numsplit}/$n/${split_feat_name}; done)
 echo $feat_split_scp
 # if this mkdir fails due to argument-list being too long, iterate.
 if ! mkdir -p $directories >&/dev/null; then
--- a/speechx/examples/aishell/run.sh
+++ b/speechx/examples/aishell/run.sh
@ -22,54 +22,60 @@ if [ ! -d ../paddle_asr_model ]; then
 fi

 mkdir -p data
-if [ ! -d ./test ]; then
+data=$PWD/data
+aishell_wav_scp=aishell_test.scp
+if [ ! -d $data/test ]; then
    wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_test.zip
-    unzip aishell_test.zip
-    realpath ./test/*/*.wav > wavlist
-    awk -F '/' '{ print $(NF) }' wavlist | awk -F '.' '{ print $1 }' > utt_id
-    paste utt_id wavlist > aishell_test.scp
+    unzip -d $data aishell_test.zip
+    realpath $data/test/*/*.wav > $data/wavlist
+    awk -F '/' '{ print $(NF) }' $data/wavlist | awk -F '.' '{ print $1 }' > $data/utt_id
+    paste $data/utt_id $data/wavlist > $data/$aishell_wav_scp
 fi

-if [ ! -d aishell_ds2_online_model ]; then
-    mkdir -p aishell_ds2_online_model 
-    wget -P ./aishell_ds2_online_model -c https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/aishell_ds2_online_cer8.00_release.tar.gz
-    tar xzfv ./aishell_ds2_online_model/aishell_ds2_online_cer8.00_release.tar.gz -C ./aishell_ds2_online_model
+model_dir=$PWD/aishell_ds2_online_model
+if [ ! -d $model_dir ]; then
+    mkdir -p $model_dir 
+    wget -P $model_dir -c https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
+    tar xzfv $model_dir/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz -C $model_dir
 fi

 # 3. make feature
-aishell_wav_scp=./aishell_test.scp
-aishell_online_model=./aishell_ds2_online_model/exp/deepspeech2_online/checkpoints
-model_dir=../paddle_asr_model
-feat_ark=./feats.ark
-feat_scp=./aishell_feat.scp
-cmvn=./cmvn.ark
+aishell_online_model=$model_dir/exp/deepspeech2_online/checkpoints
+lm_model_dir=../paddle_asr_model
 label_file=./aishell_result
 wer=./aishell_wer

+nj=40
 export GLOG_logtostderr=1

+./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj
+
+data=$PWD/data
 # 3. gen linear feat
-linear_spectrogram_main \
-    --wav_rspecifier=scp:$aishell_wav_scp \
-    --feature_wspecifier=ark,scp:$feat_ark,$feat_scp \
-    --cmvn_write_path=$cmvn \
-    --streaming_chunk=10
+cmvn=$PWD/cmvn.ark
+cmvn_json2binary_main --json_file=$model_dir/data/mean_std.json --cmvn_write_path=$cmvn
+
+utils/run.pl JOB=1:$nj $data/split${nj}/JOB/feat_log \
+linear_spectrogram_without_db_norm_main \
+    --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
+    --feature_wspecifier=ark,scp:$data/split${nj}/JOB/feat.ark,$data/split${nj}/JOB/feat.scp \
+    --cmvn_file=$cmvn \
+    --streaming_chunk=0.36

-nj=10
-data=./data
-text=./test/text
-# recognizer
-./local/split_data.sh data aishell_feat.scp $nj
+text=$data/test/text

+# 4. recognizer
 utils/run.pl JOB=1:$nj $data/split${nj}/JOB/log \
  offline_decoder_sliding_chunk_main \
-    --feature_rspecifier=scp:$data/split${nj}/JOB/feats.scp \
+    --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
    --model_path=$aishell_online_model/avg_1.jit.pdmodel \
    --param_path=$aishell_online_model/avg_1.jit.pdiparams \
-    --dict_file=$model_dir/vocab.txt \
-    --lm_path=$model_dir/avg_1.jit.klm \
+    --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
+    --dict_file=$lm_model_dir/vocab.txt \
+    --lm_path=$lm_model_dir/avg_1.jit.klm \
    --result_wspecifier=ark,t:$data/split${nj}/JOB/result

 cat $data/split${nj}/*/result > $label_file

 local/compute-wer.py --char=1 --v=1 $label_file $text > $wer
+tail $wer
--- a/speechx/examples/decoder/offline_decoder_sliding_chunk_main.cc
+++ b/speechx/examples/decoder/offline_decoder_sliding_chunk_main.cc
@ -34,6 +34,12 @@ DEFINE_int32(receptive_field_length,
 DEFINE_int32(downsampling_rate,
             4,
             "two CNN(kernel=5) module downsampling rate.");
+// Output names handed to ModelOptions::output_names in main(). The default
+// lists four comma-separated names.
+DEFINE_string(model_output_names,
+              "save_infer_model/scale_0.tmp_1,save_infer_model/"
+              "scale_1.tmp_1,save_infer_model/scale_2.tmp_1,save_infer_model/"
+              "scale_3.tmp_1",
+              "model output names");
+// NOTE(review): despite the "names" wording, main() assigns this flag to
+// ModelOptions::cache_shape, so the value looks like a shape spec — confirm.
+DEFINE_string(model_cache_names, "5-1-1024,5-1-1024", "model cache names");

 using kaldi::BaseFloat;
 using kaldi::Matrix;
@ -68,7 +74,8 @@ int main(int argc, char* argv[]) {
    ppspeech::ModelOptions model_opts;
    model_opts.model_path = model_graph;
    model_opts.params_path = model_params;
-    model_opts.cache_shape = "5-1-1024,5-1-1024";
+    model_opts.cache_shape = FLAGS_model_cache_names;
+    model_opts.output_names = FLAGS_model_output_names;
    std::shared_ptr<ppspeech::PaddleNnet> nnet(
        new ppspeech::PaddleNnet(model_opts));
    std::shared_ptr<ppspeech::DataCache> raw_data(new ppspeech::DataCache());
--- a/speechx/examples/feat/CMakeLists.txt
+++ b/speechx/examples/feat/CMakeLists.txt
@ -7,4 +7,12 @@ target_link_libraries(mfcc-test kaldi-mfcc)

 add_executable(linear_spectrogram_main ${CMAKE_CURRENT_SOURCE_DIR}/linear_spectrogram_main.cc)
 target_include_directories(linear_spectrogram_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
-target_link_libraries(linear_spectrogram_main frontend kaldi-util kaldi-feat-common gflags glog)
+target_link_libraries(linear_spectrogram_main frontend kaldi-util kaldi-feat-common gflags glog)
+
+add_executable(linear_spectrogram_without_db_norm_main ${CMAKE_CURRENT_SOURCE_DIR}/linear_spectrogram_without_db_norm_main.cc)
+target_include_directories(linear_spectrogram_without_db_norm_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
+target_link_libraries(linear_spectrogram_without_db_norm_main frontend kaldi-util kaldi-feat-common gflags glog)
+
+add_executable(cmvn_json2binary_main ${CMAKE_CURRENT_SOURCE_DIR}/cmvn_json2binary_main.cc)
+target_include_directories(cmvn_json2binary_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
+target_link_libraries(cmvn_json2binary_main utils kaldi-util kaldi-matrix gflags glog)
--- a/speechx/examples/feat/linear_spectrogram_main.cc
+++ b/speechx/examples/feat/linear_spectrogram_main.cc
@ -182,6 +182,7 @@ int main(int argc, char* argv[]) {
    ppspeech::LinearSpectrogramOptions opt;
    opt.frame_opts.frame_length_ms = 20;
    opt.frame_opts.frame_shift_ms = 10;
+    opt.streaming_chunk = FLAGS_streaming_chunk;
    opt.frame_opts.dither = 0.0;
    opt.frame_opts.remove_dc_offset = false;
    opt.frame_opts.window_type = "hanning";
@ -257,6 +258,7 @@ int main(int argc, char* argv[]) {
            }
        }
        feat_writer.Write(utt, features);
+        feature_cache.Reset();

        if (num_done % 50 == 0 && num_done != 0)
            KALDI_VLOG(2) << "Processed " << num_done << " utterances";
--- a/speechx/speechx/frontend/audio/audio_cache.cc
+++ b/speechx/speechx/frontend/audio/audio_cache.cc
@ -21,15 +21,20 @@ using kaldi::BaseFloat;
 using kaldi::VectorBase;
 using kaldi::Vector;

-AudioCache::AudioCache(int buffer_size)
+AudioCache::AudioCache(int buffer_size, bool convert2PCM32)
    : finished_(false),
      capacity_(buffer_size),
      size_(0),
      offset_(0),
-      timeout_(1) {
+      timeout_(1),
+      convert2PCM32_(convert2PCM32) {
    ring_buffer_.resize(capacity_);
 }

+// Scales one sample by 1 / 2^15 (i.e. divides it by 32768).
+// NOTE(review): presumably maps 16-bit PCM amplitudes into [-1, 1) — confirm
+// against the callers that enable convert2PCM32.
+//
+// @param val  sample value to scale
+// @return     val * (1 / 32768)
+BaseFloat AudioCache::Convert2PCM32(BaseFloat val) {
+    // 2^15 is exactly representable, so a literal replaces the per-sample
+    // std::pow call without changing the result.
+    return val * (1. / 32768.);
+}
+
 void AudioCache::Accept(const VectorBase<BaseFloat>& waves) {
    std::unique_lock<std::mutex> lock(mutex_);
    while (size_ + waves.Dim() > ring_buffer_.size()) {
@ -38,6 +43,8 @@ void AudioCache::Accept(const VectorBase<BaseFloat>& waves) {
    for (size_t idx = 0; idx < waves.Dim(); ++idx) {
        int32 buffer_idx = (idx + offset_) % ring_buffer_.size();
        ring_buffer_[buffer_idx] = waves(idx);
+        if (convert2PCM32_)
+            ring_buffer_[buffer_idx] = Convert2PCM32(waves(idx));
    }
    size_ += waves.Dim();
 }
--- a/speechx/speechx/frontend/audio/audio_cache.h
+++ b/speechx/speechx/frontend/audio/audio_cache.h
@ -23,7 +23,8 @@ namespace ppspeech {
 // waves cache
 class AudioCache : public FrontendInterface {
  public:
-    explicit AudioCache(int buffer_size = 100*kint16max);
+    explicit AudioCache(int buffer_size = 1000 * kint16max,
+                        bool convert2PCM32 = false);

    virtual void Accept(const kaldi::VectorBase<BaseFloat>& waves);

@ -46,6 +47,8 @@ class AudioCache : public FrontendInterface {
    }

  private:
+    kaldi::BaseFloat Convert2PCM32(kaldi::BaseFloat val);
+
    std::vector<kaldi::BaseFloat> ring_buffer_;
    size_t offset_;    // offset in ring_buffer_
    size_t size_;      // samples in ring_buffer_ now
@ -54,6 +57,7 @@ class AudioCache : public FrontendInterface {
    mutable std::mutex mutex_;
    std::condition_variable ready_feed_condition_;
    kaldi::int32 timeout_;  // millisecond
+    bool convert2PCM32_;

    DISALLOW_COPY_AND_ASSIGN(AudioCache);
 };
--- a/speechx/speechx/frontend/audio/linear_spectrogram.h
+++ b/speechx/speechx/frontend/audio/linear_spectrogram.h
@ -46,7 +46,10 @@ class LinearSpectrogram : public FrontendInterface {
    virtual size_t Dim() const { return dim_; }
    virtual void SetFinished() { base_extractor_->SetFinished(); }
    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
-    virtual void Reset() { base_extractor_->Reset(); }
+    // Resets the wrapped base extractor and empties reminded_wav_ by
+    // resizing it to 0.
+    // NOTE(review): reminded_wav_ presumably holds leftover samples from the
+    // previous chunk, so clearing it avoids leaking audio across
+    // utterances — confirm.
+    virtual void Reset() { 
+        base_extractor_->Reset();
+        reminded_wav_.Resize(0);
+     }

  private:
    bool Compute(const kaldi::Vector<kaldi::BaseFloat>& waves,
--- a/speechx/speechx/utils/CMakeLists.txt
+++ b/speechx/speechx/utils/CMakeLists.txt
@ -1,4 +1,5 @@

 add_library(utils
  file_utils.cc
+  simdjson.cpp
 )
--- a/speechx/speechx/utils/file_utils.cc
+++ b/speechx/speechx/utils/file_utils.cc
@ -31,4 +31,14 @@ bool ReadFileToVector(const std::string& filename,

    return true;
 }
-}
+
+// Reads the whole file at `path` and returns its contents as one string.
+//
+// @param path  file to read
+// @return      the file contents; an empty string if the file cannot be
+//              opened (an error message is printed to stderr in that case)
+std::string ReadFile2String(const std::string& path) {
+    std::ifstream input_file(path);
+    if (!input_file.is_open()) {
+        std::cerr << "please input a valid file" << std::endl;
+        // Return explicitly instead of falling through and reading from a
+        // stream that was never opened.
+        return std::string();
+    }
+    return std::string((std::istreambuf_iterator<char>(input_file)),
+                       std::istreambuf_iterator<char>());
+}
+
+}
--- a/speechx/speechx/utils/file_utils.h
+++ b/speechx/speechx/utils/file_utils.h
@ -18,4 +18,7 @@ namespace ppspeech {

 bool ReadFileToVector(const std::string& filename,
                      std::vector<std::string>* data);
+
+std::string ReadFile2String(const std::string& path);
+
 }
--- a/speechx/speechx/utils/simdjson.cpp
+++ b/speechx/speechx/utils/simdjson.cpp
--- a/speechx/speechx/utils/simdjson.h
+++ b/speechx/speechx/utils/simdjson.h