From c83c9800cc1d2a71328b0f930ab83acf46d52927 Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Fri, 21 Oct 2022 09:51:35 +0000
Subject: [PATCH] quant with wav scp

---
 examples/wenetspeech/asr1/local/quant.sh |  11 +-
 paddlespeech/s2t/exps/u2/bin/quant.py    | 245 +++++++++++------------
 2 files changed, 121 insertions(+), 135 deletions(-)

diff --git a/examples/wenetspeech/asr1/local/quant.sh b/examples/wenetspeech/asr1/local/quant.sh
index 9dfea9045..6a2a4c72b 100755
--- a/examples/wenetspeech/asr1/local/quant.sh
+++ b/examples/wenetspeech/asr1/local/quant.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
 if [ $# != 4 ];then
-    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_file"
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_scp"
     exit -1
 fi
 
@@ -11,16 +11,15 @@ echo "using $ngpu gpus..."
 config_path=$1
 decode_config_path=$2
 ckpt_prefix=$3
-audio_file=$4
+audio_scp=$4
 
 mkdir -p data
-wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/
 if [ $? -ne 0 ]; then
    exit 1
 fi
 
-if [ ! -f ${audio_file} ]; then
-    echo "Plase input the right audio_file path"
+if [ ! -f ${audio_scp} ]; then
+    echo "Plase input the right audio_scp path"
     exit 1
 fi
 
@@ -49,7 +48,7 @@ for type in  attention_rescoring; do
     --checkpoint_path ${ckpt_prefix} \
     --opts decode.decoding_method ${type} \
     --opts decode.decode_batch_size ${batch_size} \
-    --audio_file ${audio_file}
+    --audio_scp ${audio_scp}
 
     if [ $? -ne 0 ]; then
         echo "Failed in evaluation!"
diff --git a/paddlespeech/s2t/exps/u2/bin/quant.py b/paddlespeech/s2t/exps/u2/bin/quant.py
index c38134c57..a76bb52f0 100644
--- a/paddlespeech/s2t/exps/u2/bin/quant.py
+++ b/paddlespeech/s2t/exps/u2/bin/quant.py
@@ -20,6 +20,7 @@ import paddle
 import soundfile
 from paddleslim import PTQ
 from yacs.config import CfgNode
+from kaldiio import ReadHelper
 
 from paddlespeech.audio.transform.transformation import Transformation
 from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
@@ -34,7 +35,7 @@ class U2Infer():
     def __init__(self, config, args):
         self.args = args
         self.config = config
-        self.audio_file = args.audio_file
+        self.audio_scp = args.audio_scp
 
         self.preprocess_conf = config.preprocess_config
         self.preprocess_args = {"train": False}
@@ -63,133 +64,117 @@ class U2Infer():
         self.model.set_state_dict(model_dict)
 
     def run(self):
-        check(args.audio_file)
-
-        with paddle.no_grad():
-            # read
-            audio, sample_rate = soundfile.read(
-                self.audio_file, dtype="int16", always_2d=True)
-            audio = audio[:, 0]
-            logger.info(f"audio shape: {audio.shape}")
-
-            # fbank
-            feat = self.preprocessing(audio, **self.preprocess_args)
-            logger.info(f"feat shape: {feat.shape}")
-
-            ilen = paddle.to_tensor(feat.shape[0])
-            xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(0)
-            decode_config = self.config.decode
-            logger.info(f"decode cfg: {decode_config}")
-            reverse_weight = getattr(decode_config, 'reverse_weight', 0.0)
-            result_transcripts = self.model.decode(
-                xs,
-                ilen,
-                text_feature=self.text_feature,
-                decoding_method=decode_config.decoding_method,
-                beam_size=decode_config.beam_size,
-                ctc_weight=decode_config.ctc_weight,
-                decoding_chunk_size=decode_config.decoding_chunk_size,
-                num_decoding_left_chunks=decode_config.num_decoding_left_chunks,
-                simulate_streaming=decode_config.simulate_streaming,
-                reverse_weight=reverse_weight)
-            rsl = result_transcripts[0][0]
-            utt = Path(self.audio_file).name
-            logger.info(f"hyp: {utt} {rsl}")
-            # print(self.model)
-            # print(self.model.forward_encoder_chunk)
-
-            logger.info("-------------start quant ----------------------")
-            batch_size = 1
-            feat_dim = 80
-            model_size = 512
-            num_left_chunks = -1
-            reverse_weight = 0.3
-            logger.info(
-                f"U2 Export Model Params: batch_size {batch_size}, feat_dim {feat_dim}, model_size {model_size}, num_left_chunks {num_left_chunks}, reverse_weight {reverse_weight}"
-            )
-
-            # ######################## self.model.forward_encoder_chunk ############
-            # input_spec = [
-            #     # (T,), int16
-            #     paddle.static.InputSpec(shape=[None], dtype='int16'),
-            # ]
-            # self.model.forward_feature = paddle.jit.to_static(
-            #     self.model.forward_feature, input_spec=input_spec)
-
-            ######################### self.model.forward_encoder_chunk ############
-            input_spec = [
-                # xs, (B, T, D)
-                paddle.static.InputSpec(
-                    shape=[batch_size, None, feat_dim], dtype='float32'),
-                # offset, int, but need be tensor
-                paddle.static.InputSpec(shape=[1], dtype='int32'),
-                # required_cache_size, int
-                num_left_chunks,
-                # att_cache
-                paddle.static.InputSpec(
-                    shape=[None, None, None, None], dtype='float32'),
-                # cnn_cache
-                paddle.static.InputSpec(
-                    shape=[None, None, None, None], dtype='float32')
-            ]
-            self.model.forward_encoder_chunk = paddle.jit.to_static(
-                self.model.forward_encoder_chunk, input_spec=input_spec)
-
-            ######################### self.model.ctc_activation ########################
-            input_spec = [
-                # encoder_out, (B,T,D)
-                paddle.static.InputSpec(
-                    shape=[batch_size, None, model_size], dtype='float32')
-            ]
-            self.model.ctc_activation = paddle.jit.to_static(
-                self.model.ctc_activation, input_spec=input_spec)
-
-            ######################### self.model.forward_attention_decoder ########################
-            input_spec = [
-                # hyps, (B, U)
-                paddle.static.InputSpec(shape=[None, None], dtype='int64'),
-                # hyps_lens, (B,)
-                paddle.static.InputSpec(shape=[None], dtype='int64'),
-                # encoder_out, (B,T,D)
-                paddle.static.InputSpec(
-                    shape=[batch_size, None, model_size], dtype='float32'),
-                reverse_weight
-            ]
-            self.model.forward_attention_decoder = paddle.jit.to_static(
-                self.model.forward_attention_decoder, input_spec=input_spec)
-            ################################################################################
-
-            # jit save
-            logger.info(f"export save: {self.args.export_path}")
-            config = {
-                'is_static': True,
-                'combine_params': True,
-                'skip_forward': True
-            }
-            self.ptq.save_quantized_model(self.model, self.args.export_path)
-            # paddle.jit.save(
-            #     self.model,
-            #     self.args.export_path,
-            #     combine_params=True,
-            #     skip_forward=True)
-
-
-def check(audio_file):
-    if not os.path.isfile(audio_file):
-        print("Please input the right audio file path")
-        sys.exit(-1)
-
-    logger.info("checking the audio file format......")
-    try:
-        sig, sample_rate = soundfile.read(audio_file)
-    except Exception as e:
-        logger.error(str(e))
-        logger.error(
-            "can not open the wav file, please check the audio file format")
-        sys.exit(-1)
-    logger.info("The sample rate is %d" % sample_rate)
-    assert (sample_rate == 16000)
-    logger.info("The audio file format is right")
+        cnt = 0
+        with ReadHelper(f"scp:{self.audio_scp}") as reader:
+            for key, (rate, audio) in reader:
+                assert rate == 16000
+                cnt += 1
+                if cnt > args.num_utts:
+                    break
+
+                with paddle.no_grad():
+                    logger.info(f"audio shape: {audio.shape}")
+
+                    # fbank
+                    feat = self.preprocessing(audio, **self.preprocess_args)
+                    logger.info(f"feat shape: {feat.shape}")
+
+                    ilen = paddle.to_tensor(feat.shape[0])
+                    xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(0)
+                    decode_config = self.config.decode
+                    logger.info(f"decode cfg: {decode_config}")
+                    result_transcripts = self.model.decode(
+                        xs,
+                        ilen,
+                        text_feature=self.text_feature,
+                        decoding_method=decode_config.decoding_method,
+                        beam_size=decode_config.beam_size,
+                        ctc_weight=decode_config.ctc_weight,
+                        decoding_chunk_size=decode_config.decoding_chunk_size,
+                        num_decoding_left_chunks=decode_config.num_decoding_left_chunks,
+                        simulate_streaming=decode_config.simulate_streaming,
+                        reverse_weight=decode_config.reverse_weight)
+                    rsl = result_transcripts[0][0]
+                    utt = key
+                    logger.info(f"hyp: {utt} {rsl}")
+                    # print(self.model)
+                    # print(self.model.forward_encoder_chunk)
+
+
+        logger.info("-------------start quant ----------------------")
+        batch_size = 1
+        feat_dim = 80
+        model_size = 512
+        num_left_chunks = -1
+        reverse_weight = 0.3
+        logger.info(
+            f"U2 Export Model Params: batch_size {batch_size}, feat_dim {feat_dim}, model_size {model_size}, num_left_chunks {num_left_chunks}, reverse_weight {reverse_weight}"
+        )
+
+        # ######################## self.model.forward_encoder_chunk ############
+        # input_spec = [
+        #     # (T,), int16
+        #     paddle.static.InputSpec(shape=[None], dtype='int16'),
+        # ]
+        # self.model.forward_feature = paddle.jit.to_static(
+        #     self.model.forward_feature, input_spec=input_spec)
+
+        ######################### self.model.forward_encoder_chunk ############
+        input_spec = [
+            # xs, (B, T, D)
+            paddle.static.InputSpec(
+                shape=[batch_size, None, feat_dim], dtype='float32'),
+            # offset, int, but need be tensor
+            paddle.static.InputSpec(shape=[1], dtype='int32'),
+            # required_cache_size, int
+            num_left_chunks,
+            # att_cache
+            paddle.static.InputSpec(
+                shape=[None, None, None, None], dtype='float32'),
+            # cnn_cache
+            paddle.static.InputSpec(
+                shape=[None, None, None, None], dtype='float32')
+        ]
+        self.model.forward_encoder_chunk = paddle.jit.to_static(
+            self.model.forward_encoder_chunk, input_spec=input_spec)
+
+        ######################### self.model.ctc_activation ########################
+        input_spec = [
+            # encoder_out, (B,T,D)
+            paddle.static.InputSpec(
+                shape=[batch_size, None, model_size], dtype='float32')
+        ]
+        self.model.ctc_activation = paddle.jit.to_static(
+            self.model.ctc_activation, input_spec=input_spec)
+
+        ######################### self.model.forward_attention_decoder ########################
+        input_spec = [
+            # hyps, (B, U)
+            paddle.static.InputSpec(shape=[None, None], dtype='int64'),
+            # hyps_lens, (B,)
+            paddle.static.InputSpec(shape=[None], dtype='int64'),
+            # encoder_out, (B,T,D)
+            paddle.static.InputSpec(
+                shape=[batch_size, None, model_size], dtype='float32'),
+            reverse_weight
+        ]
+        self.model.forward_attention_decoder = paddle.jit.to_static(
+            self.model.forward_attention_decoder, input_spec=input_spec)
+        ################################################################################
+
+        # jit save
+        logger.info(f"export save: {self.args.export_path}")
+        config = {
+            'is_static': True,
+            'combine_params': True,
+            'skip_forward': True
+        }
+        self.ptq.save_quantized_model(self.model, self.args.export_path)
+        # paddle.jit.save(
+        #     self.model,
+        #     self.args.export_path,
+        #     combine_params=True,
+        #     skip_forward=True)
 
 
 def main(config, args):
@@ -202,7 +187,9 @@ if __name__ == "__main__":
     parser.add_argument(
         "--result_file", type=str, help="path of save the asr result")
     parser.add_argument(
-        "--audio_file", type=str, help="path of the input audio file")
+        "--audio_scp", type=str, help="path of the input audio file")
+    parser.add_argument(
+        "--num_utts", type=int, default=200, help="num utts for quant calibrition.")
     parser.add_argument(
         "--export_path",
         type=str,