diff --git a/speechx/examples/aishell/run.sh b/speechx/examples/aishell/run.sh
index a21ba086..8a16a865 100755
--- a/speechx/examples/aishell/run.sh
+++ b/speechx/examples/aishell/run.sh
@@ -48,7 +48,7 @@ wer=./aishell_wer
 nj=40
 export GLOG_logtostderr=1
 
-./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj
+#./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj
 
 data=$PWD/data
 # 3. gen linear feat
@@ -72,10 +72,42 @@ utils/run.pl JOB=1:$nj $data/split${nj}/JOB/log \
     --param_path=$aishell_online_model/avg_1.jit.pdiparams \
     --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
     --dict_file=$lm_model_dir/vocab.txt \
-    --lm_path=$lm_model_dir/avg_1.jit.klm \
     --result_wspecifier=ark,t:$data/split${nj}/JOB/result
 
-cat $data/split${nj}/*/result > $label_file
+cat $data/split${nj}/*/result > ${label_file}
+local/compute-wer.py --char=1 --v=1 ${label_file} $text > ${wer}
+
+# 4. decode with lm
+utils/run.pl JOB=1:$nj $data/split${nj}/JOB/log_lm \
+offline_decoder_sliding_chunk_main \
+    --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
+    --model_path=$aishell_online_model/avg_1.jit.pdmodel \
+    --param_path=$aishell_online_model/avg_1.jit.pdiparams \
+    --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
+    --dict_file=$lm_model_dir/vocab.txt \
+    --lm_path=$lm_model_dir/avg_1.jit.klm \
+    --result_wspecifier=ark,t:$data/split${nj}/JOB/result_lm
+
+cat $data/split${nj}/*/result_lm > ${label_file}_lm
+local/compute-wer.py --char=1 --v=1 ${label_file}_lm $text > ${wer}_lm
+
+graph_dir=./aishell_graph
+if [ ! -d $graph_dir ]; then
+    wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_graph.zip
+    unzip aishell_graph.zip
+fi
+
+# 5. test TLG decoder
+utils/run.pl JOB=1:$nj $data/split${nj}/JOB/log_tlg \
+offline_wfst_decoder_main \
+    --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
+    --model_path=$aishell_online_model/avg_1.jit.pdmodel \
+    --param_path=$aishell_online_model/avg_1.jit.pdiparams \
+    --word_symbol_table=$graph_dir/words.txt \
+    --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
+    --graph_path=$graph_dir/TLG.fst --max_active=7500 \
+    --acoustic_scale=1.2 \
+    --result_wspecifier=ark,t:$data/split${nj}/JOB/result_tlg
 
-local/compute-wer.py --char=1 --v=1 $label_file $text > $wer
-tail $wer
+cat $data/split${nj}/*/result_tlg > ${label_file}_tlg
+local/compute-wer.py --char=1 --v=1 ${label_file}_tlg $text > ${wer}_tlg
\ No newline at end of file
diff --git a/speechx/examples/decoder/offline_decoder_sliding_chunk_main.cc b/speechx/examples/decoder/offline_decoder_sliding_chunk_main.cc
index be56342f..1bb9ce22 100644
--- a/speechx/examples/decoder/offline_decoder_sliding_chunk_main.cc
+++ b/speechx/examples/decoder/offline_decoder_sliding_chunk_main.cc
@@ -27,7 +27,7 @@ DEFINE_string(result_wspecifier, "", "test result wspecifier");
 DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model");
 DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param");
 DEFINE_string(dict_file, "vocab.txt", "vocabulary of lm");
-DEFINE_string(lm_path, "lm.klm", "language model");
+DEFINE_string(lm_path, "", "language model");
 DEFINE_int32(receptive_field_length,
              7,
              "receptive field of two CNN(kernel=5) downsampling module.");
@@ -45,7 +45,6 @@ using kaldi::BaseFloat;
 using kaldi::Matrix;
 using std::vector;
 
-
 // test ds2 online decoder by feeding speech feature
 int main(int argc, char* argv[]) {
     gflags::ParseCommandLineFlags(&argc, &argv, false);
@@ -63,7 +62,6 @@ int main(int argc, char* argv[]) {
     LOG(INFO) << "dict path: " << dict_file;
     LOG(INFO) << "lm path: " << lm_path;
 
-
     int32 num_done = 0, num_err = 0;
 
     ppspeech::CTCBeamSearchOptions opts;
@@ -139,6 +137,10 @@ int main(int argc, char* argv[]) {
         std::string result;
         result = decoder.GetFinalBestPath();
         KALDI_LOG << " the result of " << utt << " is " << result;
+        if (result.empty()) {
+            // the TokenWriter can not write an empty string.
+            result = " ";
+        }
         result_writer.Write(utt, result);
         decodable->Reset();
         decoder.Reset();
diff --git a/speechx/examples/decoder/offline_wfst_decoder_main.cc b/speechx/examples/decoder/offline_wfst_decoder_main.cc
index 90dc8840..2520922a 100644
--- a/speechx/examples/decoder/offline_wfst_decoder_main.cc
+++ b/speechx/examples/decoder/offline_wfst_decoder_main.cc
@@ -22,10 +22,11 @@
 #include "nnet/decodable.h"
 #include "nnet/paddle_nnet.h"
 
-DEFINE_string(feature_respecifier, "", "test feature rspecifier");
+DEFINE_string(feature_rspecifier, "", "test feature rspecifier");
+DEFINE_string(result_wspecifier, "", "test result wspecifier");
 DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model");
 DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param");
-DEFINE_string(word_symbol_table, "vocab.txt", "word symbol table");
+DEFINE_string(word_symbol_table, "words.txt", "word symbol table");
 DEFINE_string(graph_path, "TLG", "decoder graph");
 DEFINE_double(acoustic_scale, 1.0, "acoustic scale");
 DEFINE_int32(max_active, 7500, "decoder graph");
@@ -35,22 +36,33 @@ DEFINE_int32(receptive_field_length,
 DEFINE_int32(downsampling_rate,
              4,
              "two CNN(kernel=5) module downsampling rate.");
+DEFINE_string(model_output_names,
+              "save_infer_model/scale_0.tmp_1,save_infer_model/"
+              "scale_1.tmp_1,save_infer_model/scale_2.tmp_1,save_infer_model/"
+              "scale_3.tmp_1",
+              "model output names");
+DEFINE_string(model_cache_names, "5-1-1024,5-1-1024", "model cache names");
 
 using kaldi::BaseFloat;
 using kaldi::Matrix;
 using std::vector;
 
-// test clg decoder by feeding speech feature.
+// test TLG decoder by feeding speech feature.
 int main(int argc, char* argv[]) {
     gflags::ParseCommandLineFlags(&argc, &argv, false);
     google::InitGoogleLogging(argv[0]);
 
     kaldi::SequentialBaseFloatMatrixReader feature_reader(
-        FLAGS_feature_respecifier);
+        FLAGS_feature_rspecifier);
+    kaldi::TokenWriter result_writer(FLAGS_result_wspecifier);
 
     std::string model_graph = FLAGS_model_path;
     std::string model_params = FLAGS_param_path;
     std::string word_symbol_table = FLAGS_word_symbol_table;
     std::string graph_path = FLAGS_graph_path;
+    LOG(INFO) << "model path: " << model_graph;
+    LOG(INFO) << "model param: " << model_params;
+    LOG(INFO) << "word symbol path: " << word_symbol_table;
+    LOG(INFO) << "graph path: " << graph_path;
 
     int32 num_done = 0, num_err = 0;
@@ -65,7 +77,8 @@ int main(int argc, char* argv[]) {
     ppspeech::ModelOptions model_opts;
     model_opts.model_path = model_graph;
     model_opts.params_path = model_params;
-    model_opts.cache_shape = "5-1-1024,5-1-1024";
+    model_opts.cache_shape = FLAGS_model_cache_names;
+    model_opts.output_names = FLAGS_model_output_names;
     std::shared_ptr<ppspeech::NnetInterface> nnet(
         new ppspeech::PaddleNnet(model_opts));
     std::shared_ptr<ppspeech::DataCache> raw_data(new ppspeech::DataCache());
@@ -127,6 +140,11 @@ int main(int argc, char* argv[]) {
         std::string result;
         result = decoder.GetFinalBestPath();
         KALDI_LOG << " the result of " << utt << " is " << result;
+        if (result.empty()) {
+            // the TokenWriter can not write an empty string.
+ result = " "; + } + result_writer.Write(utt, result); decodable->Reset(); decoder.Reset(); ++num_done; diff --git a/speechx/speechx/nnet/paddle_nnet.cc b/speechx/speechx/nnet/paddle_nnet.cc index c4b91cf6..c065d530 100644 --- a/speechx/speechx/nnet/paddle_nnet.cc +++ b/speechx/speechx/nnet/paddle_nnet.cc @@ -94,7 +94,6 @@ PaddleNnet::PaddleNnet(const ModelOptions& opts) : opts_(opts) { void PaddleNnet::Reset() { InitCacheEncouts(opts_); } paddle_infer::Predictor* PaddleNnet::GetPredictor() { - LOG(INFO) << "attempt to get a new predictor instance " << std::endl; paddle_infer::Predictor* predictor = nullptr; std::lock_guard guard(pool_mutex); int pred_id = 0; @@ -110,7 +109,6 @@ paddle_infer::Predictor* PaddleNnet::GetPredictor() { if (predictor) { pool_usages[pred_id] = true; predictor_to_thread_id[predictor] = pred_id; - LOG(INFO) << pred_id << " predictor create success"; } else { LOG(INFO) << "Failed to get predictor from pool !!!"; } @@ -119,7 +117,6 @@ paddle_infer::Predictor* PaddleNnet::GetPredictor() { } int PaddleNnet::ReleasePredictor(paddle_infer::Predictor* predictor) { - LOG(INFO) << "attempt to releae a predictor"; std::lock_guard guard(pool_mutex); auto iter = predictor_to_thread_id.find(predictor); @@ -128,10 +125,8 @@ int PaddleNnet::ReleasePredictor(paddle_infer::Predictor* predictor) { return 0; } - LOG(INFO) << iter->second << " predictor will be release"; pool_usages[iter->second] = false; predictor_to_thread_id.erase(predictor); - LOG(INFO) << "release success"; return 0; } @@ -152,7 +147,6 @@ void PaddleNnet::FeedForward(const Vector& features, int feat_row = features.Dim() / feature_dim; std::vector input_names = predictor->GetInputNames(); std::vector output_names = predictor->GetOutputNames(); - LOG(INFO) << "feat info: rows, cols: " << feat_row << ", " << feature_dim; std::unique_ptr input_tensor = predictor->GetInputHandle(input_names[0]);