From ed6bb7a54b82404723ed4a738cc8fc7e9355ed17 Mon Sep 17 00:00:00 2001
From: huangyuxin
Date: Tue, 26 Oct 2021 08:49:48 +0000
Subject: [PATCH] refactor the code

---
 deepspeech/decoders/recog.py                |  7 +++---
 deepspeech/models/ds2_online/deepspeech2.py |  1 -
 deepspeech/utils/check_kwargs.py            | 16 ++++++++++++-
 deepspeech/utils/spec_augment.py            | 13 ++++++++++
 .../aishell/s0/conf/deepspeech2_online.yaml | 14 +++++------
 examples/aishell/s0/local/data.sh           |  5 ++--
 examples/librispeech/s2/README.md           |  2 +-
 examples/librispeech/s2/local/recog.sh      |  4 +---
 examples/librispeech/s2/run.sh              |  9 +++++++
 utils/feat-to-shape.py                      | 24 +++++++++----------
 10 files changed, 63 insertions(+), 32 deletions(-)

diff --git a/deepspeech/decoders/recog.py b/deepspeech/decoders/recog.py
index e175c2477..4dd8af381 100644
--- a/deepspeech/decoders/recog.py
+++ b/deepspeech/decoders/recog.py
@@ -25,7 +25,6 @@ from .utils import add_results_to_json
 from deepspeech.exps import dynamic_import_tester
 from deepspeech.io.reader import LoadInputsAndTargets
 from deepspeech.models.asr_interface import ASRInterface
-from deepspeech.models.lm.transformer import TransformerLM
 from deepspeech.models.lm_interface import dynamic_import_lm
 from deepspeech.utils.log import Log
 # from espnet.asr.asr_utils import get_model_conf
@@ -51,12 +50,14 @@ def load_trained_model(args):
     model = exp.model
     return model, char_list, exp, confs
 
+
 def get_config(config_path):
     stream = open(config_path, mode='r', encoding="utf-8")
     config = yaml.load(stream, Loader=yaml.FullLoader)
     stream.close()
     return config
 
+
 def recog_v2(args):
     """Decode with custom models that implements ScorerInterface.
 
@@ -85,8 +86,8 @@
         if args.preprocess_conf is None else args.preprocess_conf,
         preprocess_args={"train": False}, )
 
-    if args.use_lm:
-        lm_path = args.rnnlm_path
+    if args.rnnlm:
+        lm_path = args.rnnlm
         lm_config_path = args.rnnlm_conf
         lm_config = get_config(lm_config_path)
         lm_class = dynamic_import_lm("transformer")
diff --git a/deepspeech/models/ds2_online/deepspeech2.py b/deepspeech/models/ds2_online/deepspeech2.py
index db8ea0fcb..52e0c7b17 100644
--- a/deepspeech/models/ds2_online/deepspeech2.py
+++ b/deepspeech/models/ds2_online/deepspeech2.py
@@ -397,7 +397,6 @@ class DeepSpeech2ModelOnline(nn.Layer):
 
 class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline):
     def __init__(self, *args, **kwargs):
-        print("*args", *args)
         super().__init__(*args, **kwargs)
 
     def forward(self, audio_chunk, audio_chunk_lens, chunk_state_h_box,
diff --git a/deepspeech/utils/check_kwargs.py b/deepspeech/utils/check_kwargs.py
index 593bfa248..1ee7329ba 100644
--- a/deepspeech/utils/check_kwargs.py
+++ b/deepspeech/utils/check_kwargs.py
@@ -1,3 +1,16 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import inspect
 
 
@@ -17,4 +30,5 @@ def check_kwargs(func, kwargs, name=None):
         name = func.__name__
     for k in kwargs.keys():
         if k not in params:
-            raise TypeError(f"{name}() got an unexpected keyword argument '{k}'")
+            raise TypeError(
+                f"{name}() got an unexpected keyword argument '{k}'")
diff --git a/deepspeech/utils/spec_augment.py b/deepspeech/utils/spec_augment.py
index e69de29bb..185a92b8d 100644
--- a/deepspeech/utils/spec_augment.py
+++ b/deepspeech/utils/spec_augment.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/examples/aishell/s0/conf/deepspeech2_online.yaml b/examples/aishell/s0/conf/deepspeech2_online.yaml
index b0789d2db..c15e71a31 100644
--- a/examples/aishell/s0/conf/deepspeech2_online.yaml
+++ b/examples/aishell/s0/conf/deepspeech2_online.yaml
@@ -18,11 +18,11 @@ collator:
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
-  spectrum_type: fbank #linear, mfcc, fbank
-  feat_dim: 80
+  spectrum_type: linear #linear, mfcc, fbank
+  feat_dim:
   delta_delta: False
   stride_ms: 10.0
-  window_ms: 25.0
+  window_ms: 20.0
   n_fft: None
   max_freq: None
   target_sample_rate: 16000
@@ -36,17 +36,17 @@ collator:
 
 model:
   num_conv_layers: 2
-  num_rnn_layers: 4
+  num_rnn_layers: 5
   rnn_layer_size: 1024
   rnn_direction: forward # [forward, bidirect]
-  num_fc_layers: 1
-  fc_layers_size_list: 1024,
+  num_fc_layers: 0
+  fc_layers_size_list: -1,
   use_gru: False
   blank_id: 0
   ctc_grad_norm_type: null
 
 training:
-  n_epoch: 80
+  n_epoch: 50
   accum_grad: 1
   lr: 2e-3
   lr_decay: 0.9 # 0.83
diff --git a/examples/aishell/s0/local/data.sh b/examples/aishell/s0/local/data.sh
index 1c3fb44b6..f4fccbe6e 100755
--- a/examples/aishell/s0/local/data.sh
+++ b/examples/aishell/s0/local/data.sh
@@ -30,11 +30,10 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     num_workers=$(nproc)
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
-    --spectrum_type="fbank" \
-    --feat_dim=80 \
+    --spectrum_type="linear" \
     --delta_delta=false \
     --stride_ms=10.0 \
-    --window_ms=25.0 \
+    --window_ms=20.0 \
     --sample_rate=16000 \
     --use_dB_normalization=True \
     --num_samples=2000 \
diff --git a/examples/librispeech/s2/README.md b/examples/librispeech/s2/README.md
index fc634ff68..9285a1831 100644
--- a/examples/librispeech/s2/README.md
+++ b/examples/librispeech/s2/README.md
@@ -23,5 +23,5 @@
 | test-clean | join_ctc_w/o_lm | 2620 | 52576 | 97.2 | 2.6 | 0.3 | 0.4 | 3.2 | 34.9 |
 | test-clean | join_ctc_w_lm | 2620 | 52576 | 97.9 | 1.8 | 0.2 | 0.3 | 2.4 | 27.8 |
 
-Compare with [ESPNET](https://github.com/espnet/espnet/blob/master/egs/librispeech/asr1/RESULTS.md#pytorch-large-transformer-with-specaug-4-gpus--transformer-lm-4-gpus)
+Compared with [ESPNET](https://github.com/espnet/espnet/blob/master/egs/librispeech/asr1/RESULTS.md#pytorch-large-transformer-with-specaug-4-gpus--transformer-lm-4-gpus), we use 8 GPUs, but our model (aheads4-adim256) is smaller than ESPNET's.
diff --git a/examples/librispeech/s2/local/recog.sh b/examples/librispeech/s2/local/recog.sh
index 811b161a4..b0e30cb55 100755
--- a/examples/librispeech/s2/local/recog.sh
+++ b/examples/librispeech/s2/local/recog.sh
@@ -11,7 +11,6 @@ tag=
 decode_config=conf/decode/decode.yaml
 
 # lm params
-use_lm=true
 lang_model=transformerLM.pdparams
 lmexpdir=exp/lm/transformer
 lmtag='nolm'
@@ -95,9 +94,8 @@ for dmethd in join_ctc; do
            --result-label ${decode_dir}/data.JOB.json \
            --model-conf ${config_path} \
            --model ${ckpt_prefix}.pdparams \
-           --use_rnnlm ${use_lm} \
            --rnnlm-conf ${rnnlm_config_path} \
-           --rnnlm-path ${lmexpdir}/${lang_model}
+           --rnnlm ${lmexpdir}/${lang_model}
 
        score_sclite.sh --bpe ${nbpe} --bpemodel ${bpemodel} --wer false ${decode_dir} ${dict}
 
diff --git a/examples/librispeech/s2/run.sh b/examples/librispeech/s2/run.sh
index 3c7569fba..92cb22fa2 100755
--- a/examples/librispeech/s2/run.sh
+++ b/examples/librispeech/s2/run.sh
@@ -10,6 +10,7 @@ stop_stage=100
 conf_path=conf/transformer.yaml
 dict_path=data/bpe_unigram_5000_units.txt
 avg_num=10
+use_lm=true
 
 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
 
@@ -46,3 +47,11 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
     # export ckpt avg_n
     CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
 fi
+
+if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ] && [ ${use_lm} == true ]; then
+    # use the transformer lm to score
+    if [ ! -f exp/lm/transformer/transformerLM.pdparams ]; then
+        wget -P exp/lm/transformer/ https://deepspeech.bj.bcebos.com/transformer_lm/transformerLM.pdparams
+    fi
+    bash local/recog.sh --ckpt_prefix exp/${ckpt}/checkpoints/${avg_ckpt}
+fi
diff --git a/utils/feat-to-shape.py b/utils/feat-to-shape.py
index 911bf5cf8..7b36b7e5f 100755
--- a/utils/feat-to-shape.py
+++ b/utils/feat-to-shape.py
@@ -12,33 +12,32 @@ from deepspeech.utils.cli_utils import is_scipy_wav_style
 
 def get_parser():
     parser = argparse.ArgumentParser(
         description="convert feature to its shape",
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-    )
-    parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter, )
+    parser.add_argument(
+        "--verbose", "-V", default=0, type=int, help="Verbose option")
     parser.add_argument(
         "--filetype",
         type=str,
         default="mat",
         choices=["mat", "hdf5", "sound.hdf5", "sound"],
         help="Specify the file format for the rspecifier. "
-        '"mat" is the matrix format in kaldi',
-    )
+        '"mat" is the matrix format in kaldi', )
     parser.add_argument(
         "--preprocess-conf",
         type=str,
         default=None,
-        help="The configuration file for the pre-processing",
-    )
+        help="The configuration file for the pre-processing", )
     parser.add_argument(
-        "rspecifier", type=str, help="Read specifier for feats. e.g. ark:some.ark"
-    )
+        "rspecifier",
+        type=str,
+        help="Read specifier for feats. e.g. ark:some.ark")
     parser.add_argument(
         "out",
         nargs="?",
         type=argparse.FileType("w"),
         default=sys.stdout,
-        help="The output filename. " "If omitted, then output to sys.stdout",
-    )
+        help="The output filename. "
+        "If omitted, then output to sys.stdout", )
     return parser
 
@@ -64,8 +63,7 @@ def main():
     # so change to file_reader_helper to return shape.
     # This make sense only with filetype="hdf5".
     for utt, mat in file_reader_helper(
-        args.rspecifier, args.filetype, return_shape=preprocessing is None
-    ):
+            args.rspecifier, args.filetype, return_shape=preprocessing is None):
         if preprocessing is not None:
             if is_scipy_wav_style(mat):
                 # If data is sound file, then got as Tuple[int, ndarray]