add aishell eg & add json parser & add write cmvn binary

4 years ago · ec33f8d73b
parent e75b906e11
commit ec33f8d73b
13 changed files with 42406 additions and 37 deletions
--- a/speechx/examples/aishell/local/split_data.sh
+++ b/speechx/examples/aishell/local/split_data.sh
@ -2,7 +2,9 @@
 data=$1
 feat_scp=$2
-numsplit=$3
+split_feat_name=$3
 numsplit=$4
 if ! [ "$numsplit" -gt 0 ]; then
  echo "Invalid num-split argument";
@ -10,7 +12,7 @@ if ! [ "$numsplit" -gt 0 ]; then
 fi
 directories=$(for n in `seq $numsplit`; do echo $data/split${numsplit}/$n; done)
-feat_split_scp=$(for n in `seq $numsplit`; do echo $data/split${numsplit}/$n/feats.scp; done)
+feat_split_scp=$(for n in `seq $numsplit`; do echo $data/split${numsplit}/$n/${split_feat_name}; done)
 echo $feat_split_scp
 # if this mkdir fails due to argument-list being too long, iterate.
 if ! mkdir -p $directories >&/dev/null; then
--- a/speechx/examples/aishell/run.sh
+++ b/speechx/examples/aishell/run.sh
@ -22,54 +22,60 @@ if [ ! -d ../paddle_asr_model ]; then
 fi
 mkdir -p data
-if [ ! -d ./test ]; then
+data=$PWD/data
 aishell_wav_scp=aishell_test.scp
 if [ ! -d $data/test ]; then
    wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_test.zip
-    unzip aishell_test.zip
+    unzip -d $data aishell_test.zip
-    realpath ./test/*/*.wav > wavlist
+    realpath $data/test/*/*.wav > $data/wavlist
-    awk -F '/' '{ print $(NF) }' wavlist | awk -F '.' '{ print $1 }' > utt_id
+    awk -F '/' '{ print $(NF) }' $data/wavlist | awk -F '.' '{ print $1 }' > $data/utt_id
-    paste utt_id wavlist > aishell_test.scp
+    paste $data/utt_id $data/wavlist > $data/$aishell_wav_scp
 fi
-if [ ! -d aishell_ds2_online_model ]; then
+model_dir=$PWD/aishell_ds2_online_model
-    mkdir -p aishell_ds2_online_model 
+if [ ! -d $model_dir ]; then
-    wget -P ./aishell_ds2_online_model -c https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/aishell_ds2_online_cer8.00_release.tar.gz
+    mkdir -p $model_dir 
-    tar xzfv ./aishell_ds2_online_model/aishell_ds2_online_cer8.00_release.tar.gz -C ./aishell_ds2_online_model
+    wget -P $model_dir -c https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
    tar xzfv $model_dir/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz -C $model_dir
 fi
 # 3. make feature
-aishell_wav_scp=./aishell_test.scp
+aishell_online_model=$model_dir/exp/deepspeech2_online/checkpoints
-aishell_online_model=./aishell_ds2_online_model/exp/deepspeech2_online/checkpoints
+lm_model_dir=../paddle_asr_model
 model_dir=../paddle_asr_model
 feat_ark=./feats.ark
 feat_scp=./aishell_feat.scp
 cmvn=./cmvn.ark
 label_file=./aishell_result
 wer=./aishell_wer
 nj=40
 export GLOG_logtostderr=1
 ./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj
 data=$PWD/data
 # 3. gen linear feat
-linear_spectrogram_main \
+cmvn=$PWD/cmvn.ark
-    --wav_rspecifier=scp:$aishell_wav_scp \
+cmvn_json2binary_main --json_file=$model_dir/data/mean_std.json --cmvn_write_path=$cmvn
-    --feature_wspecifier=ark,scp:$feat_ark,$feat_scp \
+
-    --cmvn_write_path=$cmvn \
+utils/run.pl JOB=1:$nj $data/split${nj}/JOB/feat_log \
-    --streaming_chunk=10
+linear_spectrogram_without_db_norm_main \
    --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
    --feature_wspecifier=ark,scp:$data/split${nj}/JOB/feat.ark,$data/split${nj}/JOB/feat.scp \
    --cmvn_file=$cmvn \
    --streaming_chunk=0.36
-nj=10
+text=$data/test/text
 data=./data
 text=./test/text
 # recognizer
 ./local/split_data.sh data aishell_feat.scp $nj
 # 4. recognizer
 utils/run.pl JOB=1:$nj $data/split${nj}/JOB/log \
  offline_decoder_sliding_chunk_main \
-    --feature_rspecifier=scp:$data/split${nj}/JOB/feats.scp \
+    --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
    --model_path=$aishell_online_model/avg_1.jit.pdmodel \
    --param_path=$aishell_online_model/avg_1.jit.pdiparams \
-    --dict_file=$model_dir/vocab.txt \
+    --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
-    --lm_path=$model_dir/avg_1.jit.klm \
+    --dict_file=$lm_model_dir/vocab.txt \
    --lm_path=$lm_model_dir/avg_1.jit.klm \
    --result_wspecifier=ark,t:$data/split${nj}/JOB/result
 cat $data/split${nj}/*/result > $label_file
 local/compute-wer.py --char=1 --v=1 $label_file $text > $wer
 tail $wer
--- a/speechx/examples/decoder/offline_decoder_sliding_chunk_main.cc
+++ b/speechx/examples/decoder/offline_decoder_sliding_chunk_main.cc
@ -34,6 +34,12 @@ DEFINE_int32(receptive_field_length,
 DEFINE_int32(downsampling_rate,
             4,
             "two CNN(kernel=5) module downsampling rate.");
 // Comma-separated list of the inference model's output variable names.
 // main() copies this value into ModelOptions::output_names.
 DEFINE_string(model_output_names,
               "save_infer_model/scale_0.tmp_1,save_infer_model/"
               "scale_1.tmp_1,save_infer_model/scale_2.tmp_1,save_infer_model/"
               "scale_3.tmp_1",
               "model output names");
 // Despite the flag name, main() assigns this value to
 // ModelOptions::cache_shape, and the default is a comma-separated list of
 // shapes rather than names. The flag name is kept so existing command lines
 // keep working; only the help text is corrected.
 // NOTE(review): each entry looks like dash-separated dimensions -- confirm
 // against PaddleNnet's parsing of cache_shape.
 DEFINE_string(model_cache_names,
               "5-1-1024,5-1-1024",
               "model cache shapes, e.g. 5-1-1024,5-1-1024");
 using kaldi::BaseFloat;
 using kaldi::Matrix;
@ -68,7 +74,8 @@ int main(int argc, char* argv[]) {
    ppspeech::ModelOptions model_opts;
    model_opts.model_path = model_graph;
    model_opts.params_path = model_params;
-    model_opts.cache_shape = "5-1-1024,5-1-1024";
+    model_opts.cache_shape = FLAGS_model_cache_names;
    model_opts.output_names = FLAGS_model_output_names;
    std::shared_ptr<ppspeech::PaddleNnet> nnet(
        new ppspeech::PaddleNnet(model_opts));
    std::shared_ptr<ppspeech::DataCache> raw_data(new ppspeech::DataCache());
--- a/speechx/examples/feat/CMakeLists.txt
+++ b/speechx/examples/feat/CMakeLists.txt
@ -7,4 +7,12 @@ target_link_libraries(mfcc-test kaldi-mfcc)
 add_executable(linear_spectrogram_main ${CMAKE_CURRENT_SOURCE_DIR}/linear_spectrogram_main.cc)
 target_include_directories(linear_spectrogram_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
-target_link_libraries(linear_spectrogram_main frontend kaldi-util kaldi-feat-common gflags glog)
+target_link_libraries(linear_spectrogram_main frontend kaldi-util kaldi-feat-common gflags glog)
 add_executable(linear_spectrogram_without_db_norm_main ${CMAKE_CURRENT_SOURCE_DIR}/linear_spectrogram_without_db_norm_main.cc)
 target_include_directories(linear_spectrogram_without_db_norm_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
 target_link_libraries(linear_spectrogram_without_db_norm_main frontend kaldi-util kaldi-feat-common gflags glog)
 add_executable(cmvn_json2binary_main ${CMAKE_CURRENT_SOURCE_DIR}/cmvn_json2binary_main.cc)
 target_include_directories(cmvn_json2binary_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
 target_link_libraries(cmvn_json2binary_main utils kaldi-util kaldi-matrix gflags glog)
--- a/speechx/examples/feat/linear_spectrogram_main.cc
+++ b/speechx/examples/feat/linear_spectrogram_main.cc
@ -182,6 +182,7 @@ int main(int argc, char* argv[]) {
    ppspeech::LinearSpectrogramOptions opt;
    opt.frame_opts.frame_length_ms = 20;
    opt.frame_opts.frame_shift_ms = 10;
    opt.streaming_chunk = FLAGS_streaming_chunk;
    opt.frame_opts.dither = 0.0;
    opt.frame_opts.remove_dc_offset = false;
    opt.frame_opts.window_type = "hanning";
@ -257,6 +258,7 @@ int main(int argc, char* argv[]) {
            }
        }
        feat_writer.Write(utt, features);
        feature_cache.Reset();
        if (num_done % 50 == 0 && num_done != 0)
            KALDI_VLOG(2) << "Processed " << num_done << " utterances";
--- a/speechx/speechx/frontend/audio/audio_cache.cc
+++ b/speechx/speechx/frontend/audio/audio_cache.cc
@ -21,15 +21,20 @@ using kaldi::BaseFloat;
 using kaldi::VectorBase;
 using kaldi::Vector;
-AudioCache::AudioCache(int buffer_size)
+AudioCache::AudioCache(int buffer_size, bool convert2PCM32)
    : finished_(false),
      capacity_(buffer_size),
      size_(0),
      offset_(0),
-      timeout_(1) {
+      timeout_(1),
      convert2PCM32_(convert2PCM32) {
    ring_buffer_.resize(capacity_);
 }
 // Scales one sample by 2^-15 (i.e. divides it by 32768) and returns the
 // result.
 // NOTE(review): presumably this maps 16-bit PCM amplitudes into [-1, 1) --
 // confirm against the callers that enable convert2PCM32.
 BaseFloat AudioCache::Convert2PCM32(BaseFloat val) {
     // 2^-15 is exactly representable in floating point, so this constant is
     // bit-identical to 1. / std::pow(2.0, 15) while avoiding a pow() call
     // for every sample pushed into the ring buffer.
     constexpr double kScale = 1.0 / 32768.0;
     return val * kScale;
 }
 void AudioCache::Accept(const VectorBase<BaseFloat>& waves) {
    std::unique_lock<std::mutex> lock(mutex_);
    while (size_ + waves.Dim() > ring_buffer_.size()) {
@ -38,6 +43,8 @@ void AudioCache::Accept(const VectorBase<BaseFloat>& waves) {
    for (size_t idx = 0; idx < waves.Dim(); ++idx) {
        int32 buffer_idx = (idx + offset_) % ring_buffer_.size();
        ring_buffer_[buffer_idx] = waves(idx);
        if (convert2PCM32_)
            ring_buffer_[buffer_idx] = Convert2PCM32(waves(idx));
    }
    size_ += waves.Dim();
 }
--- a/speechx/speechx/frontend/audio/audio_cache.h
+++ b/speechx/speechx/frontend/audio/audio_cache.h
@ -23,7 +23,8 @@ namespace ppspeech {
 // waves cache
 class AudioCache : public FrontendInterface {
  public:
-    explicit AudioCache(int buffer_size = 100*kint16max);
+    explicit AudioCache(int buffer_size = 1000 * kint16max,
                        bool convert2PCM32 = false);
    virtual void Accept(const kaldi::VectorBase<BaseFloat>& waves);
@ -46,6 +47,8 @@ class AudioCache : public FrontendInterface {
    }
  private:
    kaldi::BaseFloat Convert2PCM32(kaldi::BaseFloat val);
    std::vector<kaldi::BaseFloat> ring_buffer_;
    size_t offset_;    // offset in ring_buffer_
    size_t size_;      // samples in ring_buffer_ now
@ -54,6 +57,7 @@ class AudioCache : public FrontendInterface {
    mutable std::mutex mutex_;
    std::condition_variable ready_feed_condition_;
    kaldi::int32 timeout_;  // millisecond
    bool convert2PCM32_;
    DISALLOW_COPY_AND_ASSIGN(AudioCache);
 };
--- a/speechx/speechx/frontend/audio/linear_spectrogram.h
+++ b/speechx/speechx/frontend/audio/linear_spectrogram.h
@ -46,7 +46,10 @@ class LinearSpectrogram : public FrontendInterface {
    virtual size_t Dim() const { return dim_; }
    virtual void SetFinished() { base_extractor_->SetFinished(); }
    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
-    virtual void Reset() { base_extractor_->Reset(); }
+    virtual void Reset() { 
        // Reset the wrapped upstream extractor first.
        base_extractor_->Reset();
        // Then shrink reminded_wav_ to length 0.
        // NOTE(review): reminded_wav_ presumably holds leftover samples
        // carried between chunks, so clearing it keeps one utterance from
        // leaking into the next -- confirm against Compute().
        reminded_wav_.Resize(0);
     }
  private:
    bool Compute(const kaldi::Vector<kaldi::BaseFloat>& waves,
--- a/speechx/speechx/utils/CMakeLists.txt
+++ b/speechx/speechx/utils/CMakeLists.txt
@ -1,4 +1,5 @@
 # `utils` library: file helpers (file_utils.cc) plus the simdjson source
 # (simdjson.cpp), compiled directly into this target.
 add_library(utils
  file_utils.cc
  simdjson.cpp
 )
--- a/speechx/speechx/utils/file_utils.cc
+++ b/speechx/speechx/utils/file_utils.cc
@ -31,4 +31,14 @@ bool ReadFileToVector(const std::string& filename,
    return true;
 }
-}
+
 // Reads the whole file at `path` into a string.
 //
 // The file is opened with std::ifstream's default (text) mode and its
 // contents are returned unmodified by this function. If the file cannot be
 // opened, a diagnostic naming the path is written to std::cerr and an empty
 // string is returned.
 std::string ReadFile2String(const std::string& path) {
     std::ifstream input_file(path);
     if (!input_file.is_open()) {
         // Name the offending path so the failure can be traced, and return
         // explicitly rather than reading from a stream that never opened.
         std::cerr << "please input a valid file: " << path << std::endl;
         return std::string();
     }
     return std::string((std::istreambuf_iterator<char>(input_file)),
                        std::istreambuf_iterator<char>());
 }
 }
--- a/speechx/speechx/utils/file_utils.h
+++ b/speechx/speechx/utils/file_utils.h
@ -18,4 +18,7 @@ namespace ppspeech {
 bool ReadFileToVector(const std::string& filename,
                      std::vector<std::string>* data);
 std::string ReadFile2String(const std::string& path);
 }
--- a/speechx/speechx/utils/simdjson.cpp
+++ b/speechx/speechx/utils/simdjson.cpp
--- a/speechx/speechx/utils/simdjson.h
+++ b/speechx/speechx/utils/simdjson.h