From 4e7106d9e2a3eb9ee5ab870dcae3a3c59eac338e Mon Sep 17 00:00:00 2001 From: 0x45f Date: Wed, 27 Jul 2022 09:32:11 +0000 Subject: [PATCH 01/57] Support dy2st --- paddlespeech/s2t/exps/u2/model.py | 165 +++++++++++++++++- paddlespeech/s2t/models/u2/u2.py | 42 ++++- .../engine/asr/online/python/asr_engine.py | 17 +- 3 files changed, 210 insertions(+), 14 deletions(-) diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index cdad3b8f7..b41f320b4 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -471,6 +471,165 @@ class U2Tester(U2Trainer): infer_model, input_spec = self.load_inferspec() assert isinstance(input_spec, list), type(input_spec) infer_model.eval() - static_model = paddle.jit.to_static(infer_model, input_spec=input_spec) - logger.info(f"Export code: {static_model.forward.code}") - paddle.jit.save(static_model, self.args.export_path) + # static_model = paddle.jit.to_static(infer_model, input_spec=input_spec) + # logger.info(f"Export code: {static_model.forward.code}") + # paddle.jit.save(static_model, self.args.export_path) + + # # to check outputs + # def flatten(out): + # if isinstance(out, paddle.Tensor): + # return [out] + + # flatten_out = [] + # for var in out: + # if isinstance(var, (list, tuple)): + # flatten_out.extend(flatten(var)) + # else: + # flatten_out.append(var) + # return flatten_out + + + # ######################### infer_model.forward_attention_decoder ######################## + # a = paddle.full(shape=[10, 8], fill_value=10, dtype='int64') + # b = paddle.full(shape=[10], fill_value=8, dtype='int64') + # # c = paddle.rand(shape=[1, 20, 512], dtype='float32') + # c = paddle.full(shape=[1, 20, 512], fill_value=1, dtype='float32') + + # out1 = infer_model.forward_attention_decoder(a, b, c) + # print(out1) + + # input_spec = [paddle.static.InputSpec(shape=[None, None], dtype='int64'), + # paddle.static.InputSpec(shape=[None], dtype='int64'), + # paddle.static.InputSpec(shape=[1, None, 512], dtype='float32')] + # static_model = paddle.jit.to_static(infer_model.forward_attention_decoder, input_spec=input_spec) + # paddle.jit.save(static_model, self.args.export_path) + # static_model = paddle.jit.load(self.args.export_path) + # out2 = static_model(a, b, c) + # # print(out2) + + # out1 = flatten(out1) + # out2 = flatten(out2) + # for i in range(len(out1)): + # print(np.equal(out1[i].numpy(), out2[i].numpy()).all()) + + + + + + + # ######################### infer_model.forward_encoder_chunk ######################## + # xs = paddle.rand(shape=[1, 67, 80], dtype='float32') + # offset = paddle.to_tensor([80], dtype='int32') + # required_cache_size = -16 + # att_cache = paddle.randn(shape=[12, 8, 80, 128], dtype='float32') + # cnn_cache = paddle.randn(shape=[12, 1, 512, 14], dtype='float32') + # # out1 = infer_model.forward_encoder_chunk(xs, offset, required_cache_size, att_cache, cnn_cache) + # # print(out1) + # zero_out1 = infer_model.forward_encoder_chunk(xs, offset, required_cache_size, att_cache=paddle.zeros([0, 0, 0, 0]), cnn_cache=paddle.zeros([0, 0, 0, 0])) + # # print(zero_out1) + + # input_spec = [ + # paddle.static.InputSpec(shape=[1, None, 80], dtype='float32'), + # paddle.static.InputSpec(shape=[1], dtype='int32'), + # -16, + # paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32'), + # paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32')] + # static_model = paddle.jit.to_static(infer_model.forward_encoder_chunk, input_spec=input_spec) + # 
paddle.jit.save(static_model, self.args.export_path) + # static_model = paddle.jit.load(self.args.export_path) + # # out2 = static_model(xs, offset, att_cache, cnn_cache) + # # print(out2) + # zero_out2 = static_model(xs, offset, paddle.zeros([0, 0, 0, 0]), paddle.zeros([0, 0, 0, 0])) + + # # out1 = flatten(out1) + # # out2 = flatten(out2) + # # for i in range(len(out1)): + # # print(np.equal(out1[i].numpy(), out2[i].numpy()).all()) + + # zero_out1 = flatten(zero_out1) + # zero_out2 = flatten(zero_out2) + # for i in range(len(zero_out1)): + # print(np.equal(zero_out1[i].numpy(), zero_out2[i].numpy()).all()) + + + + + + + + # ######################### infer_model.forward_encoder_chunk zero Tensor online ######################## + # xs1 = paddle.rand(shape=[1, 67, 80], dtype='float32') + # offset = paddle.to_tensor([0], dtype='int32') + # required_cache_size = -16 + # att_cache = paddle.zeros([0, 0, 0, 0]) + # cnn_cache=paddle.zeros([0, 0, 0, 0]) + + # xs, att_cache, cnn_cache = infer_model.forward_encoder_chunk(xs1, offset, required_cache_size, att_cache, cnn_cache) + # xs2 = paddle.rand(shape=[1, 67, 80], dtype='float32') + # offset = paddle.to_tensor([16], dtype='int32') + # out1 = infer_model.forward_encoder_chunk(xs2, offset, required_cache_size, att_cache, cnn_cache) + # # print(out1) + + # input_spec = [ + # paddle.static.InputSpec(shape=[1, None, 80], dtype='float32'), + # paddle.static.InputSpec(shape=[1], dtype='int32'), + # -16, + # paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32'), + # paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32')] + # static_model = paddle.jit.to_static(infer_model.forward_encoder_chunk, input_spec=input_spec) + # paddle.jit.save(static_model, self.args.export_path) + # static_model = paddle.jit.load(self.args.export_path) + + # offset = paddle.to_tensor([0], dtype='int32') + # att_cache = paddle.zeros([0, 0, 0, 0]) + # cnn_cache=paddle.zeros([0, 0, 0, 0]) + # xs, att_cache, cnn_cache = static_model(xs1, offset, att_cache, cnn_cache) + # xs = paddle.rand(shape=[1, 67, 80], dtype='float32') + # offset = paddle.to_tensor([16], dtype='int32') + # out2 = static_model(xs2, offset, att_cache, cnn_cache) + # # print(out2) + + # out1 = flatten(out1) + # out2 = flatten(out2) + # for i in range(len(out1)): + # print(np.equal(out1[i].numpy(), out2[i].numpy()).all()) + + + + + + + + ###################### save/load combine ######################## + paddle.jit.save(infer_model, '/workspace/conformer/PaddleSpeech-conformer/conformer/conformer', combine_params=True) + + + # xs1 = paddle.rand(shape=[1, 67, 80], dtype='float32') + # offset = paddle.to_tensor([0], dtype='int32') + # required_cache_size = -16 + # att_cache = paddle.zeros([0, 0, 0, 0]) + # cnn_cache=paddle.zeros([0, 0, 0, 0]) + + # xs, att_cache, cnn_cache = infer_model.forward_encoder_chunk(xs1, offset, required_cache_size, att_cache, cnn_cache) + # xs2 = paddle.rand(shape=[1, 67, 80], dtype='float32') + # offset = paddle.to_tensor([16], dtype='int32') + # out1 = infer_model.forward_encoder_chunk(xs2, offset, required_cache_size, att_cache, cnn_cache) + # # print(out1) + + + # from paddle.jit.layer import Layer + # layer = Layer() + # layer.load('/workspace/conformer/PaddleSpeech-conformer/conformer/conformer', paddle.CUDAPlace(0)) + + # offset = paddle.to_tensor([0], dtype='int32') + # att_cache = paddle.zeros([0, 0, 0, 0]) + # cnn_cache=paddle.zeros([0, 0, 0, 0]) + # xs, att_cache, cnn_cache = layer.forward_encoder_chunk(xs1, offset, att_cache, cnn_cache) + # 
offset = paddle.to_tensor([16], dtype='int32') + # out2 = layer.forward_encoder_chunk(xs2, offset, att_cache, cnn_cache) + # # print(out2) + + # out1 = flatten(out1) + # out2 = flatten(out2) + # for i in range(len(out1)): + # print(np.equal(out1[i].numpy(), out2[i].numpy()).all()) \ No newline at end of file diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 76f698e64..9148c7372 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -59,6 +59,20 @@ __all__ = ["U2Model", "U2InferModel"] logger = Log(__name__).getlog() +# input_spec1 = [paddle.static.InputSpec(shape=[None, None], dtype='int64'), +# paddle.static.InputSpec(shape=[None], dtype='int64'), +# paddle.static.InputSpec(shape=[1, None, 512], dtype='float32')] + +# input_spec2 = [ +# paddle.static.InputSpec(shape=[1, None, 80], dtype='float32'), +# paddle.static.InputSpec(shape=[1], dtype='int32'), +# -16, +# paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32'), +# paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32')] + +# input_spec3 = [paddle.static.InputSpec(shape=[1, 1, 1], dtype='int64'), +# paddle.static.InputSpec(shape=[1], dtype='int64')] + class U2BaseModel(ASRInterface, nn.Layer): """CTC-Attention hybrid Encoder-Decoder model""" @@ -599,7 +613,12 @@ class U2BaseModel(ASRInterface, nn.Layer): """ return self.eos - @jit.to_static + @jit.to_static(input_spec=[ + paddle.static.InputSpec(shape=[1, None, 80], dtype='float32'), + paddle.static.InputSpec(shape=[1], dtype='int32'), + -16, + paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32'), + paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32')]) def forward_encoder_chunk( self, xs: paddle.Tensor, @@ -655,7 +674,10 @@ class U2BaseModel(ASRInterface, nn.Layer): """ return self.ctc.log_softmax(xs) - @jit.to_static + @jit.to_static(input_spec=[ + paddle.static.InputSpec(shape=[None, None], dtype='int64'), + paddle.static.InputSpec(shape=[None], dtype='int64'), + paddle.static.InputSpec(shape=[1, None, 512], dtype='float32')]) def forward_attention_decoder( self, hyps: paddle.Tensor, @@ -918,6 +940,9 @@ class U2InferModel(U2Model): def __init__(self, configs: dict): super().__init__(configs) + @jit.to_static(input_spec=[ + paddle.static.InputSpec(shape=[1, 1, 1], dtype='int64'), + paddle.static.InputSpec(shape=[1], dtype='int64')]) def forward(self, feats, feats_lengths, @@ -933,9 +958,10 @@ class U2InferModel(U2Model): Returns: List[List[int]]: best path result """ - return self.ctc_greedy_search( - feats, - feats_lengths, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks, - simulate_streaming=simulate_streaming) + # return self.ctc_greedy_search( + # feats, + # feats_lengths, + # decoding_chunk_size=decoding_chunk_size, + # num_decoding_left_chunks=num_decoding_left_chunks, + # simulate_streaming=simulate_streaming) + return feats, feats_lengths diff --git a/paddlespeech/server/engine/asr/online/python/asr_engine.py b/paddlespeech/server/engine/asr/online/python/asr_engine.py index 4df38f09d..cd50f157a 100644 --- a/paddlespeech/server/engine/asr/online/python/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/python/asr_engine.py @@ -80,6 +80,10 @@ class PaddleASRConnectionHanddler: self.init_decoder() self.reset() + from paddle.jit.layer import Layer + self.jit_layer = Layer() + self.jit_layer.load('/workspace/conformer/PaddleSpeech-conformer/conformer/conformer', paddle.CUDAPlace(1)) 
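+        # NOTE: `jit_layer` holds the static graph exported by the dy2st work in
+        # this patch series; its `forward_encoder_chunk` is called below in place
+        # of the dygraph encoder when decoding chunks. The export path and
+        # `CUDAPlace(1)` are hardcoded for this experiment.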
+ def init_decoder(self): if "deepspeech2" in self.model_type: assert self.continuous_decoding is False, "ds2 model not support endpoint" @@ -474,9 +478,16 @@ class PaddleASRConnectionHanddler: # cur chunk chunk_xs = self.cached_feat[:, cur:end, :] # forward chunk - (y, self.att_cache, self.cnn_cache) = self.model.encoder.forward_chunk( - chunk_xs, self.offset, required_cache_size, - self.att_cache, self.cnn_cache) + # (y, self.att_cache, self.cnn_cache) = self.model.encoder.forward_chunk( + # chunk_xs, self.offset, required_cache_size, + # self.att_cache, self.cnn_cache) + + (y, self.att_cache, self.cnn_cache) = self.jit_layer.forward_encoder_chunk( + chunk_xs, + paddle.to_tensor([self.offset], dtype='int32'), + self.att_cache, + self.cnn_cache) + outputs.append(y) # update the global offset, in decoding frame unit From e5a6c243f1f53ea3d3d28a957010db98cdcd6db4 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 1 Aug 2022 08:03:04 +0000 Subject: [PATCH 02/57] fix jit save for conformer --- paddlespeech/s2t/exps/u2/model.py | 205 ++++++------------------------ paddlespeech/s2t/models/u2/u2.py | 62 ++++----- 2 files changed, 62 insertions(+), 205 deletions(-) diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index b41f320b4..141e83bce 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -25,8 +25,6 @@ import paddle from paddle import distributed as dist from paddlespeech.s2t.frontend.featurizer import TextFeaturizer -from paddlespeech.s2t.io.dataloader import BatchDataLoader -from paddlespeech.s2t.io.dataloader import StreamDataLoader from paddlespeech.s2t.io.dataloader import DataLoaderFactory from paddlespeech.s2t.models.u2 import U2Model from paddlespeech.s2t.training.optimizer import OptimizerFactory @@ -109,7 +107,8 @@ class U2Trainer(Trainer): def valid(self): self.model.eval() if not self.use_streamdata: - logger.info(f"Valid Total Examples: {len(self.valid_loader.dataset)}") + logger.info( + f"Valid Total Examples: {len(self.valid_loader.dataset)}") valid_losses = defaultdict(list) num_seen_utts = 1 total_loss = 0.0 @@ -136,7 +135,8 @@ class U2Trainer(Trainer): msg += "epoch: {}, ".format(self.epoch) msg += "step: {}, ".format(self.iteration) if not self.use_streamdata: - msg += "batch: {}/{}, ".format(i + 1, len(self.valid_loader)) + msg += "batch: {}/{}, ".format(i + 1, + len(self.valid_loader)) msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in valid_dump.items()) logger.info(msg) @@ -157,7 +157,8 @@ class U2Trainer(Trainer): self.before_train() if not self.use_streamdata: - logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") + logger.info( + f"Train Total Examples: {len(self.train_loader.dataset)}") while self.epoch < self.config.n_epoch: with Timer("Epoch-Train Time Cost: {}"): self.model.train() @@ -225,14 +226,18 @@ class U2Trainer(Trainer): config = self.config.clone() self.use_streamdata = config.get("use_stream_data", False) if self.train: - self.train_loader = DataLoaderFactory.get_dataloader('train', config, self.args) - self.valid_loader = DataLoaderFactory.get_dataloader('valid', config, self.args) + self.train_loader = DataLoaderFactory.get_dataloader( + 'train', config, self.args) + self.valid_loader = DataLoaderFactory.get_dataloader( + 'valid', config, self.args) logger.info("Setup train/valid Dataloader!") else: decode_batch_size = config.get('decode', dict()).get( 'decode_batch_size', 1) - self.test_loader = DataLoaderFactory.get_dataloader('test', config, self.args) - 
self.align_loader = DataLoaderFactory.get_dataloader('align', config, self.args) + self.test_loader = DataLoaderFactory.get_dataloader('test', config, + self.args) + self.align_loader = DataLoaderFactory.get_dataloader( + 'align', config, self.args) logger.info("Setup test/align Dataloader!") def setup_model(self): @@ -470,166 +475,30 @@ class U2Tester(U2Trainer): def export(self): infer_model, input_spec = self.load_inferspec() assert isinstance(input_spec, list), type(input_spec) + del input_spec infer_model.eval() - # static_model = paddle.jit.to_static(infer_model, input_spec=input_spec) - # logger.info(f"Export code: {static_model.forward.code}") - # paddle.jit.save(static_model, self.args.export_path) - - # # to check outputs - # def flatten(out): - # if isinstance(out, paddle.Tensor): - # return [out] - - # flatten_out = [] - # for var in out: - # if isinstance(var, (list, tuple)): - # flatten_out.extend(flatten(var)) - # else: - # flatten_out.append(var) - # return flatten_out - - - # ######################### infer_model.forward_attention_decoder ######################## - # a = paddle.full(shape=[10, 8], fill_value=10, dtype='int64') - # b = paddle.full(shape=[10], fill_value=8, dtype='int64') - # # c = paddle.rand(shape=[1, 20, 512], dtype='float32') - # c = paddle.full(shape=[1, 20, 512], fill_value=1, dtype='float32') - - # out1 = infer_model.forward_attention_decoder(a, b, c) - # print(out1) - - # input_spec = [paddle.static.InputSpec(shape=[None, None], dtype='int64'), - # paddle.static.InputSpec(shape=[None], dtype='int64'), - # paddle.static.InputSpec(shape=[1, None, 512], dtype='float32')] - # static_model = paddle.jit.to_static(infer_model.forward_attention_decoder, input_spec=input_spec) - # paddle.jit.save(static_model, self.args.export_path) - # static_model = paddle.jit.load(self.args.export_path) - # out2 = static_model(a, b, c) - # # print(out2) - - # out1 = flatten(out1) - # out2 = flatten(out2) - # for i in range(len(out1)): - # print(np.equal(out1[i].numpy(), out2[i].numpy()).all()) - - - - - - - # ######################### infer_model.forward_encoder_chunk ######################## - # xs = paddle.rand(shape=[1, 67, 80], dtype='float32') - # offset = paddle.to_tensor([80], dtype='int32') - # required_cache_size = -16 - # att_cache = paddle.randn(shape=[12, 8, 80, 128], dtype='float32') - # cnn_cache = paddle.randn(shape=[12, 1, 512, 14], dtype='float32') - # # out1 = infer_model.forward_encoder_chunk(xs, offset, required_cache_size, att_cache, cnn_cache) - # # print(out1) - # zero_out1 = infer_model.forward_encoder_chunk(xs, offset, required_cache_size, att_cache=paddle.zeros([0, 0, 0, 0]), cnn_cache=paddle.zeros([0, 0, 0, 0])) - # # print(zero_out1) - - # input_spec = [ - # paddle.static.InputSpec(shape=[1, None, 80], dtype='float32'), - # paddle.static.InputSpec(shape=[1], dtype='int32'), - # -16, - # paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32'), - # paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32')] - # static_model = paddle.jit.to_static(infer_model.forward_encoder_chunk, input_spec=input_spec) - # paddle.jit.save(static_model, self.args.export_path) - # static_model = paddle.jit.load(self.args.export_path) - # # out2 = static_model(xs, offset, att_cache, cnn_cache) - # # print(out2) - # zero_out2 = static_model(xs, offset, paddle.zeros([0, 0, 0, 0]), paddle.zeros([0, 0, 0, 0])) - - # # out1 = flatten(out1) - # # out2 = flatten(out2) - # # for i in range(len(out1)): - # # print(np.equal(out1[i].numpy(), 
out2[i].numpy()).all()) - - # zero_out1 = flatten(zero_out1) - # zero_out2 = flatten(zero_out2) - # for i in range(len(zero_out1)): - # print(np.equal(zero_out1[i].numpy(), zero_out2[i].numpy()).all()) - - - - - - - - # ######################### infer_model.forward_encoder_chunk zero Tensor online ######################## - # xs1 = paddle.rand(shape=[1, 67, 80], dtype='float32') - # offset = paddle.to_tensor([0], dtype='int32') - # required_cache_size = -16 - # att_cache = paddle.zeros([0, 0, 0, 0]) - # cnn_cache=paddle.zeros([0, 0, 0, 0]) - - # xs, att_cache, cnn_cache = infer_model.forward_encoder_chunk(xs1, offset, required_cache_size, att_cache, cnn_cache) - # xs2 = paddle.rand(shape=[1, 67, 80], dtype='float32') - # offset = paddle.to_tensor([16], dtype='int32') - # out1 = infer_model.forward_encoder_chunk(xs2, offset, required_cache_size, att_cache, cnn_cache) - # # print(out1) - - # input_spec = [ - # paddle.static.InputSpec(shape=[1, None, 80], dtype='float32'), - # paddle.static.InputSpec(shape=[1], dtype='int32'), - # -16, - # paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32'), - # paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32')] - # static_model = paddle.jit.to_static(infer_model.forward_encoder_chunk, input_spec=input_spec) - # paddle.jit.save(static_model, self.args.export_path) - # static_model = paddle.jit.load(self.args.export_path) - - # offset = paddle.to_tensor([0], dtype='int32') - # att_cache = paddle.zeros([0, 0, 0, 0]) - # cnn_cache=paddle.zeros([0, 0, 0, 0]) - # xs, att_cache, cnn_cache = static_model(xs1, offset, att_cache, cnn_cache) - # xs = paddle.rand(shape=[1, 67, 80], dtype='float32') - # offset = paddle.to_tensor([16], dtype='int32') - # out2 = static_model(xs2, offset, att_cache, cnn_cache) - # # print(out2) - - # out1 = flatten(out1) - # out2 = flatten(out2) - # for i in range(len(out1)): - # print(np.equal(out1[i].numpy(), out2[i].numpy()).all()) - - - - - - - - ###################### save/load combine ######################## - paddle.jit.save(infer_model, '/workspace/conformer/PaddleSpeech-conformer/conformer/conformer', combine_params=True) + ######################### infer_model.forward_encoder_chunk zero Tensor online ######################## + input_spec = [ + paddle.static.InputSpec(shape=[1, None, 80], dtype='float32'), + paddle.static.InputSpec(shape=[1], dtype='int32'), -1, + paddle.static.InputSpec( + shape=[None, None, None, None], + dtype='float32'), paddle.static.InputSpec( + shape=[None, None, None, None], dtype='float32') + ] + infer_model.forward_encoder_chunk = paddle.jit.to_static( + infer_model.forward_encoder_chunk, input_spec=input_spec) + # paddle.jit.save(static_model, self.args.export_path, combine_params=True) - # xs1 = paddle.rand(shape=[1, 67, 80], dtype='float32') - # offset = paddle.to_tensor([0], dtype='int32') - # required_cache_size = -16 - # att_cache = paddle.zeros([0, 0, 0, 0]) - # cnn_cache=paddle.zeros([0, 0, 0, 0]) + ######################### infer_model.forward_attention_decoder ######################## + input_spec = [ + paddle.static.InputSpec(shape=[None, None], dtype='int64'), + paddle.static.InputSpec(shape=[None], dtype='int64'), + paddle.static.InputSpec(shape=[1, None, 512], dtype='float32') + ] + infer_model.forward_attention_decoder = paddle.jit.to_static( + infer_model.forward_attention_decoder, input_spec=input_spec) + # paddle.jit.save(static_model, self.args.export_path, combine_params=True) - # xs, att_cache, cnn_cache = 
infer_model.forward_encoder_chunk(xs1, offset, required_cache_size, att_cache, cnn_cache) - # xs2 = paddle.rand(shape=[1, 67, 80], dtype='float32') - # offset = paddle.to_tensor([16], dtype='int32') - # out1 = infer_model.forward_encoder_chunk(xs2, offset, required_cache_size, att_cache, cnn_cache) - # # print(out1) - - - # from paddle.jit.layer import Layer - # layer = Layer() - # layer.load('/workspace/conformer/PaddleSpeech-conformer/conformer/conformer', paddle.CUDAPlace(0)) - - # offset = paddle.to_tensor([0], dtype='int32') - # att_cache = paddle.zeros([0, 0, 0, 0]) - # cnn_cache=paddle.zeros([0, 0, 0, 0]) - # xs, att_cache, cnn_cache = layer.forward_encoder_chunk(xs1, offset, att_cache, cnn_cache) - # offset = paddle.to_tensor([16], dtype='int32') - # out2 = layer.forward_encoder_chunk(xs2, offset, att_cache, cnn_cache) - # # print(out2) - - # out1 = flatten(out1) - # out2 = flatten(out2) - # for i in range(len(out1)): - # print(np.equal(out1[i].numpy(), out2[i].numpy()).all()) \ No newline at end of file + paddle.jit.save(infer_model, './export.jit', combine_params=True) diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 9148c7372..432162aae 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -29,6 +29,9 @@ import paddle from paddle import jit from paddle import nn +from paddlespeech.audio.utils.tensor_utils import add_sos_eos +from paddlespeech.audio.utils.tensor_utils import pad_sequence +from paddlespeech.audio.utils.tensor_utils import th_accuracy from paddlespeech.s2t.decoders.scorers.ctc import CTCPrefixScorer from paddlespeech.s2t.frontend.utility import IGNORE_ID from paddlespeech.s2t.frontend.utility import load_cmvn @@ -48,9 +51,6 @@ from paddlespeech.s2t.utils import checkpoint from paddlespeech.s2t.utils import layer_tools from paddlespeech.s2t.utils.ctc_utils import remove_duplicates_and_blank from paddlespeech.s2t.utils.log import Log -from paddlespeech.audio.utils.tensor_utils import add_sos_eos -from paddlespeech.audio.utils.tensor_utils import pad_sequence -from paddlespeech.audio.utils.tensor_utils import th_accuracy from paddlespeech.s2t.utils.utility import log_add from paddlespeech.s2t.utils.utility import UpdateConfig @@ -59,20 +59,6 @@ __all__ = ["U2Model", "U2InferModel"] logger = Log(__name__).getlog() -# input_spec1 = [paddle.static.InputSpec(shape=[None, None], dtype='int64'), -# paddle.static.InputSpec(shape=[None], dtype='int64'), -# paddle.static.InputSpec(shape=[1, None, 512], dtype='float32')] - -# input_spec2 = [ -# paddle.static.InputSpec(shape=[1, None, 80], dtype='float32'), -# paddle.static.InputSpec(shape=[1], dtype='int32'), -# -16, -# paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32'), -# paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32')] - -# input_spec3 = [paddle.static.InputSpec(shape=[1, 1, 1], dtype='int64'), -# paddle.static.InputSpec(shape=[1], dtype='int64')] - class U2BaseModel(ASRInterface, nn.Layer): """CTC-Attention hybrid Encoder-Decoder model""" @@ -588,44 +574,44 @@ class U2BaseModel(ASRInterface, nn.Layer): best_index = i return hyps[best_index][0] - #@jit.to_static + @jit.to_static(property=True) def subsampling_rate(self) -> int: """ Export interface for c++ call, return subsampling_rate of the model """ return self.encoder.embed.subsampling_rate - #@jit.to_static + @jit.to_static(property=True) def right_context(self) -> int: """ Export interface for c++ call, return right_context of the model """ return 
self.encoder.embed.right_context - #@jit.to_static + @jit.to_static(property=True) def sos_symbol(self) -> int: """ Export interface for c++ call, return sos symbol id of the model """ return self.sos - #@jit.to_static + @jit.to_static(property=True) def eos_symbol(self) -> int: """ Export interface for c++ call, return eos symbol id of the model """ return self.eos - @jit.to_static(input_spec=[ - paddle.static.InputSpec(shape=[1, None, 80], dtype='float32'), - paddle.static.InputSpec(shape=[1], dtype='int32'), - -16, - paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32'), - paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32')]) + # @jit.to_static(input_spec=[ + # paddle.static.InputSpec(shape=[1, None, 80], dtype='float32'), + # paddle.static.InputSpec(shape=[1], dtype='int32'), + # -1, + # paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32'), + # paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32')]) def forward_encoder_chunk( self, xs: paddle.Tensor, offset: int, required_cache_size: int, - att_cache: paddle.Tensor = paddle.zeros([0, 0, 0, 0]), - cnn_cache: paddle.Tensor = paddle.zeros([0, 0, 0, 0]), + att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), + cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: """ Export interface for c++ call, give input chunk xs, and return output from time 0 to current chunk. @@ -660,8 +646,8 @@ class U2BaseModel(ASRInterface, nn.Layer): paddle.Tensor: new conformer cnn cache required for next chunk, with same shape as the original cnn_cache. """ - return self.encoder.forward_chunk( - xs, offset, required_cache_size, att_cache, cnn_cache) + return self.encoder.forward_chunk(xs, offset, required_cache_size, + att_cache, cnn_cache) # @jit.to_static def ctc_activation(self, xs: paddle.Tensor) -> paddle.Tensor: @@ -674,10 +660,10 @@ class U2BaseModel(ASRInterface, nn.Layer): """ return self.ctc.log_softmax(xs) - @jit.to_static(input_spec=[ - paddle.static.InputSpec(shape=[None, None], dtype='int64'), - paddle.static.InputSpec(shape=[None], dtype='int64'), - paddle.static.InputSpec(shape=[1, None, 512], dtype='float32')]) + # @jit.to_static(input_spec=[ + # paddle.static.InputSpec(shape=[None, None], dtype='int64'), + # paddle.static.InputSpec(shape=[None], dtype='int64'), + # paddle.static.InputSpec(shape=[1, None, 512], dtype='float32')]) def forward_attention_decoder( self, hyps: paddle.Tensor, @@ -941,8 +927,9 @@ class U2InferModel(U2Model): super().__init__(configs) @jit.to_static(input_spec=[ - paddle.static.InputSpec(shape=[1, 1, 1], dtype='int64'), - paddle.static.InputSpec(shape=[1], dtype='int64')]) + paddle.static.InputSpec(shape=[1, 1, 1], dtype='int64'), + paddle.static.InputSpec(shape=[1], dtype='int64') + ]) def forward(self, feats, feats_lengths, @@ -958,6 +945,7 @@ class U2InferModel(U2Model): Returns: List[List[int]]: best path result """ + # dummy code for dy2st # return self.ctc_greedy_search( # feats, # feats_lengths, From d3572be0bb37cd2265691bbfe73c6c550d33f162 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 1 Aug 2022 08:06:25 +0000 Subject: [PATCH 03/57] add ws export.sh --- examples/wenetspeech/asr1/local/export.sh | 28 +++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100755 examples/wenetspeech/asr1/local/export.sh diff --git a/examples/wenetspeech/asr1/local/export.sh b/examples/wenetspeech/asr1/local/export.sh new file mode 100755 index 000000000..6b646b469 --- /dev/null 
+++ b/examples/wenetspeech/asr1/local/export.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+if [ $# != 3 ];then
+    echo "usage: $0 config_path ckpt_prefix jit_model_path"
+    exit -1
+fi
+
+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+echo "using $ngpu gpus..."
+
+config_path=$1
+ckpt_path_prefix=$2
+jit_model_export_path=$3
+
+python3 -u ${BIN_DIR}/export.py \
+--ngpu ${ngpu} \
+--config ${config_path} \
+--checkpoint_path ${ckpt_path_prefix} \
+--export_path ${jit_model_export_path}
+
+
+if [ $? -ne 0 ]; then
+    echo "Failed in export!"
+    exit 1
+fi
+
+
+exit 0

From 6149daa22142d7be2f252b9590b2728a5ec72a10 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Wed, 3 Aug 2022 08:38:43 +0000
Subject: [PATCH 04/57] export ctc_activation

---
 paddlespeech/s2t/exps/u2/model.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py
index 141e83bce..fdccdf159 100644
--- a/paddlespeech/s2t/exps/u2/model.py
+++ b/paddlespeech/s2t/exps/u2/model.py
@@ -478,7 +478,8 @@ class U2Tester(U2Trainer):
         del input_spec
         infer_model.eval()
 
-        ######################### infer_model.forward_encoder_chunk zero Tensor online ########################
+        ######################### infer_model.forward_encoder_chunk zero Tensor online ############
+        # TODO: 80(feature dim) be configable
         input_spec = [
             paddle.static.InputSpec(shape=[1, None, 80], dtype='float32'),
             paddle.static.InputSpec(shape=[1], dtype='int32'), -1,
@@ -492,6 +493,7 @@ class U2Tester(U2Trainer):
         # paddle.jit.save(static_model, self.args.export_path, combine_params=True)
 
         ######################### infer_model.forward_attention_decoder ########################
+        # TODO: 512(encoder_output) be configable. 1 for B
         input_spec = [
             paddle.static.InputSpec(shape=[None, None], dtype='int64'),
             paddle.static.InputSpec(shape=[None], dtype='int64'),
@@ -501,4 +503,12 @@ class U2Tester(U2Trainer):
             infer_model.forward_attention_decoder, input_spec=input_spec)
         # paddle.jit.save(static_model, self.args.export_path, combine_params=True)
 
+        ######################### infer_model.ctc_activation ########################
+        # TODO: 512(encoder_output) be configable
+        input_spec = [
+            paddle.static.InputSpec(shape=[1, None, 512], dtype='float32')
+        ]
+        infer_model.ctc_activation = paddle.jit.to_static(
+            infer_model.ctc_activation, input_spec=input_spec)
+
         paddle.jit.save(infer_model, './export.jit', combine_params=True)

From 05bc25883333d80a7ee1a5ec1314a1b81f57a81c Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Wed, 3 Aug 2022 09:17:23 +0000
Subject: [PATCH 05/57] update docstring

---
 paddlespeech/s2t/models/u2/u2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py
index ca83ca170..e4c667e00 100644
--- a/paddlespeech/s2t/models/u2/u2.py
+++ b/paddlespeech/s2t/models/u2/u2.py
@@ -654,7 +654,7 @@ class U2BaseModel(ASRInterface, nn.Layer):
         Args:
             xs (paddle.Tensor): encoder output, (B, T, D)
         Returns:
-            paddle.Tensor: activation before ctc
+            paddle.Tensor: activation before ctc.
(B, Tmax, odim) """ return self.ctc.log_softmax(xs) From c1fbfe928ec386eefa805c9215a369fc83b9b9fc Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 4 Aug 2022 03:22:14 +0000 Subject: [PATCH 06/57] add test --- paddlespeech/s2t/exps/u2/model.py | 49 +++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index fdccdf159..5ce5f50bf 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -512,3 +512,52 @@ class U2Tester(U2Trainer): infer_model.ctc_activation, input_spec=input_spec) paddle.jit.save(infer_model, './export.jit', combine_params=True) + + def flatten(out): + if isinstance(out, paddle.Tensor): + return [out] + + flatten_out = [] + for var in out: + if isinstance(var, (list, tuple)): + flatten_out.extend(flatten(var)) + else: + flatten_out.append(var) + return flatten_out + + xs1 = paddle.rand(shape=[1, 67, 80], dtype='float32') + offset = paddle.to_tensor([0], dtype='int32') + required_cache_size = -16 + att_cache = paddle.zeros([0, 0, 0, 0]) + cnn_cache = paddle.zeros([0, 0, 0, 0]) + + # xs, att_cache, cnn_cache = infer_model.forward_encoder_chunk(xs1, offset, required_cache_size, att_cache, cnn_cache) + # xs2 = paddle.rand(shape=[1, 67, 80], dtype='float32') + # offset = paddle.to_tensor([16], dtype='int32') + # out1 = infer_model.forward_encoder_chunk(xs2, offset, required_cache_size, att_cache, cnn_cache) + # print(out1) + + xs, att_cache, cnn_cache = infer_model.forward_encoder_chunk( + xs1, offset, att_cache, cnn_cache) + xs2 = paddle.rand(shape=[1, 67, 80], dtype='float32') + offset = paddle.to_tensor([16], dtype='int32') + out1 = infer_model.forward_encoder_chunk(xs2, offset, att_cache, + cnn_cache) + print(out1) + + # from paddle.jit.layer import Layer + # layer = Layer() + # layer.load('./export.jit', paddle.CPUPlace()) + + # offset = paddle.to_tensor([0], dtype='int32') + # att_cache = paddle.zeros([0, 0, 0, 0]) + # cnn_cache=paddle.zeros([0, 0, 0, 0]) + # xs, att_cache, cnn_cache = layer.forward_encoder_chunk(xs1, offset, att_cache, cnn_cache) + # offset = paddle.to_tensor([16], dtype='int32') + # out2 = layer.forward_encoder_chunk(xs2, offset, att_cache, cnn_cache) + # # print(out2) + + # out1 = flatten(out1) + # out2 = flatten(out2) + # for i in range(len(out1)): + # print(np.equal(out1[i].numpy(), out2[i].numpy()).all()) From d638325c46e7092fcdb48ee7605c9c79f498bb1f Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 9 Sep 2022 15:09:29 +0000 Subject: [PATCH 07/57] do not jit save forward; using slice for zeros([0,0,0,0]) tensor --- paddlespeech/s2t/exps/u2/model.py | 51 +++++++++++------------------ paddlespeech/s2t/models/u2/u2.py | 4 --- paddlespeech/s2t/modules/encoder.py | 5 +-- 3 files changed, 23 insertions(+), 37 deletions(-) diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index 5ce5f50bf..66b95f63c 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -482,10 +482,12 @@ class U2Tester(U2Trainer): # TODO: 80(feature dim) be configable input_spec = [ paddle.static.InputSpec(shape=[1, None, 80], dtype='float32'), - paddle.static.InputSpec(shape=[1], dtype='int32'), -1, + paddle.static.InputSpec(shape=[1], dtype='int32'), + -1, paddle.static.InputSpec( shape=[None, None, None, None], - dtype='float32'), paddle.static.InputSpec( + dtype='float32'), + paddle.static.InputSpec( shape=[None, None, None, None], dtype='float32') ] infer_model.forward_encoder_chunk = 
paddle.jit.to_static( @@ -511,7 +513,7 @@ class U2Tester(U2Trainer): infer_model.ctc_activation = paddle.jit.to_static( infer_model.ctc_activation, input_spec=input_spec) - paddle.jit.save(infer_model, './export.jit', combine_params=True) + paddle.jit.save(infer_model, './export.jit', combine_params=True, skip_forward=True) def flatten(out): if isinstance(out, paddle.Tensor): @@ -531,33 +533,20 @@ class U2Tester(U2Trainer): att_cache = paddle.zeros([0, 0, 0, 0]) cnn_cache = paddle.zeros([0, 0, 0, 0]) - # xs, att_cache, cnn_cache = infer_model.forward_encoder_chunk(xs1, offset, required_cache_size, att_cache, cnn_cache) - # xs2 = paddle.rand(shape=[1, 67, 80], dtype='float32') - # offset = paddle.to_tensor([16], dtype='int32') - # out1 = infer_model.forward_encoder_chunk(xs2, offset, required_cache_size, att_cache, cnn_cache) - # print(out1) - - xs, att_cache, cnn_cache = infer_model.forward_encoder_chunk( - xs1, offset, att_cache, cnn_cache) + xs, att_cache, cnn_cache = infer_model.forward_encoder_chunk(xs1, offset, required_cache_size, att_cache, cnn_cache) xs2 = paddle.rand(shape=[1, 67, 80], dtype='float32') offset = paddle.to_tensor([16], dtype='int32') - out1 = infer_model.forward_encoder_chunk(xs2, offset, att_cache, - cnn_cache) - print(out1) - - # from paddle.jit.layer import Layer - # layer = Layer() - # layer.load('./export.jit', paddle.CPUPlace()) - - # offset = paddle.to_tensor([0], dtype='int32') - # att_cache = paddle.zeros([0, 0, 0, 0]) - # cnn_cache=paddle.zeros([0, 0, 0, 0]) - # xs, att_cache, cnn_cache = layer.forward_encoder_chunk(xs1, offset, att_cache, cnn_cache) - # offset = paddle.to_tensor([16], dtype='int32') - # out2 = layer.forward_encoder_chunk(xs2, offset, att_cache, cnn_cache) - # # print(out2) - - # out1 = flatten(out1) - # out2 = flatten(out2) - # for i in range(len(out1)): - # print(np.equal(out1[i].numpy(), out2[i].numpy()).all()) + out1 = infer_model.forward_encoder_chunk(xs2, offset, required_cache_size, att_cache, cnn_cache) + print('py encoder', out1) + + from paddle.jit.layer import Layer + layer = Layer() + layer.load('./export.jit', paddle.CPUPlace()) + + xs1 = paddle.full([1, 7, 80], 0.1, dtype='float32') + offset = paddle.to_tensor([0], dtype='int32') + att_cache = paddle.zeros([0, 0, 0, 0]) + cnn_cache=paddle.zeros([0, 0, 0, 0]) + func = getattr(layer, 'forward_encoder_chunk') + xs, att_cache, cnn_cache = func(xs1, offset, att_cache, cnn_cache) + print('py static encoder', xs) diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index e4c667e00..a1daccf18 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -924,10 +924,6 @@ class U2InferModel(U2Model): def __init__(self, configs: dict): super().__init__(configs) - @jit.to_static(input_spec=[ - paddle.static.InputSpec(shape=[1, 1, 1], dtype='int64'), - paddle.static.InputSpec(shape=[1], dtype='int64') - ]) def forward(self, feats, feats_lengths, diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py index bff2d69bb..a7919bca4 100644 --- a/paddlespeech/s2t/modules/encoder.py +++ b/paddlespeech/s2t/modules/encoder.py @@ -251,10 +251,11 @@ class BaseEncoder(nn.Layer): for i, layer in enumerate(self.encoders): # att_cache[i:i+1] = (1, head, cache_t1, d_k*2) # cnn_cache[i:i+1] = (1, B=1, hidden-dim, cache_t2) + # zeros([0,0,0,0]) support [i:i+1] slice xs, _, new_att_cache, new_cnn_cache = layer( xs, att_mask, pos_emb, - att_cache=att_cache[i:i+1] if elayers > 0 else att_cache, - cnn_cache=cnn_cache[i:i+1] if 
paddle.shape(cnn_cache)[0] > 0 else cnn_cache, + att_cache=att_cache[i:i+1], + cnn_cache=cnn_cache[i:i+1], ) # new_att_cache = (1, head, attention_key_size, d_k*2) # new_cnn_cache = (B=1, hidden-dim, cache_t2) From a7c6c54e75575ffddcae18ae353c858006653cb9 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 9 Sep 2022 15:20:28 +0000 Subject: [PATCH 08/57] fix --- .../server/engine/asr/online/python/asr_engine.py | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/paddlespeech/server/engine/asr/online/python/asr_engine.py b/paddlespeech/server/engine/asr/online/python/asr_engine.py index cd50f157a..e3cbd38f3 100644 --- a/paddlespeech/server/engine/asr/online/python/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/python/asr_engine.py @@ -80,9 +80,6 @@ class PaddleASRConnectionHanddler: self.init_decoder() self.reset() - from paddle.jit.layer import Layer - self.jit_layer = Layer() - self.jit_layer.load('/workspace/conformer/PaddleSpeech-conformer/conformer/conformer', paddle.CUDAPlace(1)) def init_decoder(self): if "deepspeech2" in self.model_type: @@ -478,15 +475,9 @@ class PaddleASRConnectionHanddler: # cur chunk chunk_xs = self.cached_feat[:, cur:end, :] # forward chunk - # (y, self.att_cache, self.cnn_cache) = self.model.encoder.forward_chunk( - # chunk_xs, self.offset, required_cache_size, - # self.att_cache, self.cnn_cache) - - (y, self.att_cache, self.cnn_cache) = self.jit_layer.forward_encoder_chunk( - chunk_xs, - paddle.to_tensor([self.offset], dtype='int32'), - self.att_cache, - self.cnn_cache) + (y, self.att_cache, self.cnn_cache) = self.model.encoder.forward_chunk( + chunk_xs, self.offset, required_cache_size, + self.att_cache, self.cnn_cache) outputs.append(y) From 63aeb747b0be474140fc4b9f6808403b05d1cf84 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 9 Sep 2022 15:29:55 +0000 Subject: [PATCH 09/57] more comment --- paddlespeech/s2t/exps/u2/model.py | 10 +++++----- paddlespeech/s2t/modules/encoder.py | 7 ++++++- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index 66b95f63c..1d813761d 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -492,10 +492,9 @@ class U2Tester(U2Trainer): ] infer_model.forward_encoder_chunk = paddle.jit.to_static( infer_model.forward_encoder_chunk, input_spec=input_spec) - # paddle.jit.save(static_model, self.args.export_path, combine_params=True) ######################### infer_model.forward_attention_decoder ######################## - # TODO: 512(encoder_output) be configable. 1 for B + # TODO: 512(encoder_output) be configable. 
1 for BatchSize input_spec = [ paddle.static.InputSpec(shape=[None, None], dtype='int64'), paddle.static.InputSpec(shape=[None], dtype='int64'), @@ -503,7 +502,6 @@ class U2Tester(U2Trainer): ] infer_model.forward_attention_decoder = paddle.jit.to_static( infer_model.forward_attention_decoder, input_spec=input_spec) - # paddle.jit.save(static_model, self.args.export_path, combine_params=True) ######################### infer_model.ctc_activation ######################## # TODO: 512(encoder_output) be configable @@ -513,8 +511,10 @@ class U2Tester(U2Trainer): infer_model.ctc_activation = paddle.jit.to_static( infer_model.ctc_activation, input_spec=input_spec) - paddle.jit.save(infer_model, './export.jit', combine_params=True, skip_forward=True) + # jit save + paddle.jit.save(infer_model, self.args.export_path, combine_params=True, skip_forward=True) + # test dy2static def flatten(out): if isinstance(out, paddle.Tensor): return [out] @@ -541,7 +541,7 @@ class U2Tester(U2Trainer): from paddle.jit.layer import Layer layer = Layer() - layer.load('./export.jit', paddle.CPUPlace()) + layer.load(self.args.export_path, paddle.CPUPlace()) xs1 = paddle.full([1, 7, 80], 0.1, dtype='float32') offset = paddle.to_tensor([0], dtype='int32') diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py index a7919bca4..230894d50 100644 --- a/paddlespeech/s2t/modules/encoder.py +++ b/paddlespeech/s2t/modules/encoder.py @@ -251,7 +251,12 @@ class BaseEncoder(nn.Layer): for i, layer in enumerate(self.encoders): # att_cache[i:i+1] = (1, head, cache_t1, d_k*2) # cnn_cache[i:i+1] = (1, B=1, hidden-dim, cache_t2) - # zeros([0,0,0,0]) support [i:i+1] slice + + # WARNING: eliminate if-else cond op in graph + # tensor zeros([0,0,0,0]) support [i:i+1] slice, will return zeros([0,0,0,0]) tensor + # raw code as below: + # att_cache=att_cache[i:i+1] if elayers > 0 else att_cache, + # cnn_cache=cnn_cache[i:i+1] if paddle.shape(cnn_cache)[0] > 0 else cnn_cache, xs, _, new_att_cache, new_cnn_cache = layer( xs, att_mask, pos_emb, att_cache=att_cache[i:i+1], From 1c9f238ba09e55b26b3b0c46033436ed27eb9613 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 9 Sep 2022 15:45:26 +0000 Subject: [PATCH 10/57] configurable export --- paddlespeech/s2t/exps/u2/model.py | 37 +++++++++++++++++++------------ 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index 1d813761d..45fbcb404 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -462,31 +462,37 @@ class U2Tester(U2Trainer): infer_model = U2InferModel.from_pretrained(self.test_loader, self.config.clone(), self.args.checkpoint_path) + + batch_size = 1 feat_dim = self.test_loader.feat_dim - input_spec = [ - paddle.static.InputSpec(shape=[1, None, feat_dim], - dtype='float32'), # audio, [B,T,D] - paddle.static.InputSpec(shape=[1], - dtype='int64'), # audio_length, [B] - ] - return infer_model, input_spec + model_size = 512 + num_left_chunks = -1 + + return infer_model, (batch_size, feat_dim, model_size, num_left_chunks) @paddle.no_grad() def export(self): infer_model, input_spec = self.load_inferspec() - assert isinstance(input_spec, list), type(input_spec) - del input_spec infer_model.eval() - ######################### infer_model.forward_encoder_chunk zero Tensor online ############ + assert isinstance(input_spec, list), type(input_spec) + batch_size, feat_dim, model_size, num_left_chunks = input_spec + + + ######################### 
infer_model.forward_encoder_chunk zero tensor online ############ # TODO: 80(feature dim) be configable input_spec = [ - paddle.static.InputSpec(shape=[1, None, 80], dtype='float32'), + # xs, (B, T, D) + paddle.static.InputSpec(shape=[batch_size, None, feat_dim], dtype='float32'), + # offset, int, but need be tensor paddle.static.InputSpec(shape=[1], dtype='int32'), - -1, + # required_cache_size, int + num_left_chunks, + # att_cache paddle.static.InputSpec( shape=[None, None, None, None], dtype='float32'), + # cnn_cache paddle.static.InputSpec( shape=[None, None, None, None], dtype='float32') ] @@ -496,9 +502,12 @@ class U2Tester(U2Trainer): ######################### infer_model.forward_attention_decoder ######################## # TODO: 512(encoder_output) be configable. 1 for BatchSize input_spec = [ + # hyps, (B, U) paddle.static.InputSpec(shape=[None, None], dtype='int64'), + # hyps_lens, (B,) paddle.static.InputSpec(shape=[None], dtype='int64'), - paddle.static.InputSpec(shape=[1, None, 512], dtype='float32') + # encoder_out, (B,T,D) + paddle.static.InputSpec(shape=[batch_size, None, model_size], dtype='float32') ] infer_model.forward_attention_decoder = paddle.jit.to_static( infer_model.forward_attention_decoder, input_spec=input_spec) @@ -529,7 +538,7 @@ class U2Tester(U2Trainer): xs1 = paddle.rand(shape=[1, 67, 80], dtype='float32') offset = paddle.to_tensor([0], dtype='int32') - required_cache_size = -16 + required_cache_size = num_left_chunks att_cache = paddle.zeros([0, 0, 0, 0]) cnn_cache = paddle.zeros([0, 0, 0, 0]) From 3a8869fba496ecfbb153a094feae18ac1ce28fc9 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 9 Sep 2022 15:50:11 +0000 Subject: [PATCH 11/57] rm to_static decarator; configure jit save for ctc_activation --- paddlespeech/s2t/exps/u2/model.py | 4 ++-- paddlespeech/s2t/models/u2/u2.py | 12 ++---------- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index 45fbcb404..dae618db6 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -513,9 +513,9 @@ class U2Tester(U2Trainer): infer_model.forward_attention_decoder, input_spec=input_spec) ######################### infer_model.ctc_activation ######################## - # TODO: 512(encoder_output) be configable input_spec = [ - paddle.static.InputSpec(shape=[1, None, 512], dtype='float32') + # encoder_out, (B,T,D) + paddle.static.InputSpec(shape=[batch_size, None, model_size], dtype='float32') ] infer_model.ctc_activation = paddle.jit.to_static( infer_model.ctc_activation, input_spec=input_spec) diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index a1daccf18..149170ed6 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -599,12 +599,7 @@ class U2BaseModel(ASRInterface, nn.Layer): """ return self.eos - # @jit.to_static(input_spec=[ - # paddle.static.InputSpec(shape=[1, None, 80], dtype='float32'), - # paddle.static.InputSpec(shape=[1], dtype='int32'), - # -1, - # paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32'), - # paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32')]) + # @jit.to_static def forward_encoder_chunk( self, xs: paddle.Tensor, @@ -658,10 +653,7 @@ class U2BaseModel(ASRInterface, nn.Layer): """ return self.ctc.log_softmax(xs) - # @jit.to_static(input_spec=[ - # paddle.static.InputSpec(shape=[None, None], dtype='int64'), - # paddle.static.InputSpec(shape=[None], dtype='int64'), - # 
paddle.static.InputSpec(shape=[1, None, 512], dtype='float32')])
+    # @jit.to_static
     def forward_attention_decoder(
             self,
             hyps: paddle.Tensor,

From 8690a00bd8d66c7d1358a8ac370967ddb4bd1ec5 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Tue, 13 Sep 2022 08:29:21 +0000
Subject: [PATCH 12/57] add chunk conformer config from release model

---
 examples/wenetspeech/asr1/conf/chunk_conformer.yaml     | 99 +++++++++++++++
 examples/wenetspeech/asr1/conf/preprocess.yaml          |  2 +-
 examples/wenetspeech/asr1/conf/tuning/chunk_decode.yaml | 11 +++
 3 files changed, 111 insertions(+), 1 deletion(-)
 create mode 100644 examples/wenetspeech/asr1/conf/chunk_conformer.yaml
 create mode 100644 examples/wenetspeech/asr1/conf/tuning/chunk_decode.yaml

diff --git a/examples/wenetspeech/asr1/conf/chunk_conformer.yaml b/examples/wenetspeech/asr1/conf/chunk_conformer.yaml
new file mode 100644
index 000000000..69fa223a1
--- /dev/null
+++ b/examples/wenetspeech/asr1/conf/chunk_conformer.yaml
@@ -0,0 +1,99 @@
+############################################
+# Network Architecture                     #
+############################################
+cmvn_file:
+cmvn_file_type: "json"
+# encoder related
+encoder: conformer
+encoder_conf:
+    output_size: 512    # dimension of attention
+    attention_heads: 8
+    linear_units: 2048  # the number of units of position-wise feed forward
+    num_blocks: 12      # the number of encoder blocks
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.0
+    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+    normalize_before: True
+    use_cnn_module: True
+    cnn_module_kernel: 15
+    activation_type: swish
+    pos_enc_layer_type: rel_pos
+    selfattention_layer_type: rel_selfattn
+    causal: true
+    use_dynamic_chunk: true
+    cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
+    use_dynamic_left_chunk: false
+# decoder related
+decoder: transformer
+decoder_conf:
+    attention_heads: 8
+    linear_units: 2048
+    num_blocks: 6
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.0
+    src_attention_dropout_rate: 0.0
+
+# hybrid CTC/attention
+model_conf:
+    ctc_weight: 0.3
+    lsm_weight: 0.1     # label smoothing option
+    length_normalized_loss: false
+    init_type: 'kaiming_uniform'
+
+# https://yaml.org/type/float.html
+###########################################
+# Data                                    #
+###########################################
+train_manifest: data/train_l/data.list
+dev_manifest: data/dev/data.list
+test_manifest: data/test_meeting/data.list
+
+###########################################
+# Dataloader                              #
+###########################################
+vocab_filepath: data/lang_char/vocab.txt
+unit_type: 'char'
+preprocess_config: conf/preprocess.yaml
+spm_model_prefix: ''
+feat_dim: 80
+stride_ms: 10.0
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+batch_size: 32
+do_filter: True
+maxlen_in: 1200 # if do_filter == False && input length > maxlen-in, batchsize is automatically reduced
+maxlen_out: 100 # if do_filter == False && output length > maxlen-out, batchsize is automatically reduced
+minlen_in: 10
+minlen_out: 0
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 0
+subsampling_factor: 1
+num_encs: 1
+
+
+###########################################
+# Training                                #
+###########################################
+n_epoch: 26
+accum_grad: 32
+global_grad_clip: 5.0
+dist_sampler: True
+log_interval: 1
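+# NOTE: batch_size (32) x accum_grad (32) is an effective batch of 1024
+# utterances per optimizer step on each GPU, assuming the usual gradient
+# accumulation semantics; the optimizer and scheduler settings below come
+# from the released model.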
+checkpoint:
+    kbest_n: 50
+    latest_n: 5
+optim: adam
+optim_conf:
+    lr: 0.001
+    weight_decay: 1.0e-6
+scheduler: warmuplr
+scheduler_conf:
+    warmup_steps: 5000
+    lr_decay: 1.0
diff --git a/examples/wenetspeech/asr1/conf/preprocess.yaml b/examples/wenetspeech/asr1/conf/preprocess.yaml
index f7f4c58d5..c7ccc522d 100644
--- a/examples/wenetspeech/asr1/conf/preprocess.yaml
+++ b/examples/wenetspeech/asr1/conf/preprocess.yaml
@@ -5,7 +5,7 @@ process:
     n_mels: 80
     n_shift: 160
     win_length: 400
-    dither: 0.1
+    dither: 1.0
   - type: cmvn_json
     cmvn_path: data/mean_std.json
   # these three processes are a.k.a. SpecAugment
diff --git a/examples/wenetspeech/asr1/conf/tuning/chunk_decode.yaml b/examples/wenetspeech/asr1/conf/tuning/chunk_decode.yaml
new file mode 100644
index 000000000..7e8afb7a8
--- /dev/null
+++ b/examples/wenetspeech/asr1/conf/tuning/chunk_decode.yaml
@@ -0,0 +1,11 @@
+beam_size: 10
+decode_batch_size: 128
+error_rate_type: cer
+decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
+ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
+decoding_chunk_size: 16 # decoding chunk size. Defaults to -1.
+      # <0: for decoding, use full chunk.
+      # >0: for decoding, use fixed chunk size as set.
+      # 0: used for training, it's prohibited here.
+num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
+simulate_streaming: True # simulate streaming inference. Defaults to False.

From 67709155e9f17e03579c7360882e2e92b65ad7c1 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Tue, 13 Sep 2022 09:54:48 +0000
Subject: [PATCH 13/57] add feature pipeline layer(cmvn, fbank), but to_static and jit.layer output is not equal

---
 paddlespeech/audio/compliance/kaldi.py   | 22 +++----
 paddlespeech/s2t/exps/u2/bin/test_wav.py |  3 +
 paddlespeech/s2t/exps/u2/model.py        | 75 ++++++++++++++--------
 paddlespeech/s2t/models/u2/u2.py         | 58 ++++++++++++++++++
 paddlespeech/s2t/modules/cmvn.py         | 10 +++-
 paddlespeech/s2t/modules/fbank.py        | 74 +++++++++++++++++++++++
 6 files changed, 206 insertions(+), 36 deletions(-)
 create mode 100644 paddlespeech/s2t/modules/fbank.py

diff --git a/paddlespeech/audio/compliance/kaldi.py b/paddlespeech/audio/compliance/kaldi.py
index 538be0196..beb2d86b9 100644
--- a/paddlespeech/audio/compliance/kaldi.py
+++ b/paddlespeech/audio/compliance/kaldi.py
@@ -74,16 +74,16 @@ def _feature_window_function(
         window_size: int,
         blackman_coeff: float,
         dtype: int, ) -> Tensor:
-    if window_type == HANNING:
+    if window_type == "hann":
        return get_window('hann', window_size, fftbins=False, dtype=dtype)
-    elif window_type == HAMMING:
+    elif window_type == "hamming":
        return get_window('hamming', window_size, fftbins=False, dtype=dtype)
-    elif window_type == POVEY:
+    elif window_type == "povey":
        return get_window(
            'hann', window_size, fftbins=False, dtype=dtype).pow(0.85)
-    elif window_type == RECTANGULAR:
+    elif window_type == "rect":
        return paddle.ones([window_size], dtype=dtype)
-    elif window_type == BLACKMAN:
+    elif window_type == "blackman":
        a = 2 * math.pi / (window_size - 1)
        window_function = paddle.arange(window_size, dtype=dtype)
        return (blackman_coeff - 0.5 * paddle.cos(a * window_function) +
@@ -216,7 +216,7 @@ def spectrogram(waveform: Tensor,
                sr: int=16000,
                snip_edges: bool=True,
                subtract_mean: bool=False,
-                window_type: str=POVEY) -> Tensor:
+                window_type: str="povey") -> Tensor:
    """Compute and return a spectrogram from a waveform. The output is identical to Kaldi's.
Args: @@ -236,7 +236,7 @@ def spectrogram(waveform: Tensor, snip_edges (bool, optional): Drop samples in the end of waveform that cann't fit a singal frame when it is set True. Otherwise performs reflect padding to the end of waveform. Defaults to True. subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False. - window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY. + window_type (str, optional): Choose type of window for FFT computation. Defaults to "povey". Returns: Tensor: A spectrogram tensor with shape `(m, padded_window_size // 2 + 1)` where m is the number of frames @@ -418,11 +418,11 @@ def fbank(waveform: Tensor, vtln_high: float=-500.0, vtln_low: float=100.0, vtln_warp: float=1.0, - window_type: str=POVEY) -> Tensor: + window_type: str="povey") -> Tensor: """Compute and return filter banks from a waveform. The output is identical to Kaldi's. Args: - waveform (Tensor): A waveform tensor with shape `(C, T)`. + waveform (Tensor): A waveform tensor with shape `(C, T)`. `C` is in the range [0,1]. blackman_coeff (float, optional): Coefficient for Blackman window.. Defaults to 0.42. channel (int, optional): Select the channel of waveform. Defaults to -1. dither (float, optional): Dithering constant . Defaults to 0.0. @@ -448,7 +448,7 @@ def fbank(waveform: Tensor, vtln_high (float, optional): High inflection point in piecewise linear VTLN warping function. Defaults to -500.0. vtln_low (float, optional): Low inflection point in piecewise linear VTLN warping function. Defaults to 100.0. vtln_warp (float, optional): Vtln warp factor. Defaults to 1.0. - window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY. + window_type (str, optional): Choose type of window for FFT computation. Defaults to "povey". Returns: Tensor: A filter banks tensor with shape `(m, n_mels)`. @@ -537,7 +537,7 @@ def mfcc(waveform: Tensor, vtln_high: float=-500.0, vtln_low: float=100.0, vtln_warp: float=1.0, - window_type: str=POVEY) -> Tensor: + window_type: str="povey") -> Tensor: """Compute and return mel frequency cepstral coefficients from a waveform. The output is identical to Kaldi's. 
diff --git a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py index 887ec7a6d..c04e3ae47 100644 --- a/paddlespeech/s2t/exps/u2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py @@ -18,6 +18,7 @@ from pathlib import Path import paddle import soundfile +import numpy as np from yacs.config import CfgNode from paddlespeech.audio.transform.transformation import Transformation @@ -77,6 +78,8 @@ class U2Infer(): feat = self.preprocessing(audio, **self.preprocess_args) logger.info(f"feat shape: {feat.shape}") + np.savetxt("feat.transform.txt", feat) + ilen = paddle.to_tensor(feat.shape[0]) xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(axis=0) decode_config = self.config.decode diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index dae618db6..ee4df9cb9 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -474,13 +474,20 @@ class U2Tester(U2Trainer): def export(self): infer_model, input_spec = self.load_inferspec() infer_model.eval() + paddle.set_device('cpu') - assert isinstance(input_spec, list), type(input_spec) + assert isinstance(input_spec, (list, tuple)), type(input_spec) batch_size, feat_dim, model_size, num_left_chunks = input_spec - ######################### infer_model.forward_encoder_chunk zero tensor online ############ - # TODO: 80(feature dim) be configable + ######################## infer_model.forward_encoder_chunk ############ + input_spec = [ + # (T,), int16 + paddle.static.InputSpec(shape=[None], dtype='int16'), + ] + infer_model.forward_feature = paddle.jit.to_static(infer_model.forward_feature, input_spec=input_spec) + + ######################### infer_model.forward_encoder_chunk ############ input_spec = [ # xs, (B, T, D) paddle.static.InputSpec(shape=[batch_size, None, feat_dim], dtype='float32'), @@ -499,8 +506,16 @@ class U2Tester(U2Trainer): infer_model.forward_encoder_chunk = paddle.jit.to_static( infer_model.forward_encoder_chunk, input_spec=input_spec) + ######################### infer_model.ctc_activation ######################## + input_spec = [ + # encoder_out, (B,T,D) + paddle.static.InputSpec(shape=[batch_size, None, model_size], dtype='float32') + ] + infer_model.ctc_activation = paddle.jit.to_static( + infer_model.ctc_activation, input_spec=input_spec) + + ######################### infer_model.forward_attention_decoder ######################## - # TODO: 512(encoder_output) be configable. 
1 for BatchSize input_spec = [ # hyps, (B, U) paddle.static.InputSpec(shape=[None, None], dtype='int64'), @@ -512,17 +527,11 @@ class U2Tester(U2Trainer): infer_model.forward_attention_decoder = paddle.jit.to_static( infer_model.forward_attention_decoder, input_spec=input_spec) - ######################### infer_model.ctc_activation ######################## - input_spec = [ - # encoder_out, (B,T,D) - paddle.static.InputSpec(shape=[batch_size, None, model_size], dtype='float32') - ] - infer_model.ctc_activation = paddle.jit.to_static( - infer_model.ctc_activation, input_spec=input_spec) - # jit save + logger.info(f"export save: {self.args.export_path}") paddle.jit.save(infer_model, self.args.export_path, combine_params=True, skip_forward=True) + # test dy2static def flatten(out): if isinstance(out, paddle.Tensor): @@ -536,26 +545,44 @@ class U2Tester(U2Trainer): flatten_out.append(var) return flatten_out - xs1 = paddle.rand(shape=[1, 67, 80], dtype='float32') + # forward_encoder_chunk dygraph + xs1 = paddle.full([1, 67, 80], 0.1, dtype='float32') offset = paddle.to_tensor([0], dtype='int32') required_cache_size = num_left_chunks att_cache = paddle.zeros([0, 0, 0, 0]) cnn_cache = paddle.zeros([0, 0, 0, 0]) - - xs, att_cache, cnn_cache = infer_model.forward_encoder_chunk(xs1, offset, required_cache_size, att_cache, cnn_cache) - xs2 = paddle.rand(shape=[1, 67, 80], dtype='float32') - offset = paddle.to_tensor([16], dtype='int32') - out1 = infer_model.forward_encoder_chunk(xs2, offset, required_cache_size, att_cache, cnn_cache) - print('py encoder', out1) - + xs_d, att_cache_d, cnn_cache_d = infer_model.forward_encoder_chunk(xs1, offset, required_cache_size, att_cache, cnn_cache) + + import soundfile + audio, sample_rate = soundfile.read( + './zh.wav', dtype="int16", always_2d=True) + audio = audio[:, 0] + logger.info(f"audio shape: {audio.shape}") + audio = paddle.to_tensor(audio, paddle.int16) + feat_d = infer_model.forward_feature(audio) + logger.info(f"{feat_d}") + np.savetxt("feat.tostatic.txt", feat_d) + + + # load static model from paddle.jit.layer import Layer layer = Layer() layer.load(self.args.export_path, paddle.CPUPlace()) - xs1 = paddle.full([1, 7, 80], 0.1, dtype='float32') + # forward_encoder_chunk static + xs1 = paddle.full([1, 67, 80], 0.1, dtype='float32') offset = paddle.to_tensor([0], dtype='int32') att_cache = paddle.zeros([0, 0, 0, 0]) - cnn_cache=paddle.zeros([0, 0, 0, 0]) + cnn_cache = paddle.zeros([0, 0, 0, 0]) func = getattr(layer, 'forward_encoder_chunk') - xs, att_cache, cnn_cache = func(xs1, offset, att_cache, cnn_cache) - print('py static encoder', xs) + xs_s, att_cache_s, cnn_cache_s = func(xs1, offset, att_cache, cnn_cache) + np.testing.assert_allclose(xs_d, xs_s, atol=1e-5) + np.testing.assert_allclose(att_cache_d, att_cache_s, atol=1e-4) + np.testing.assert_allclose(cnn_cache_d, cnn_cache_s, atol=1e-4) + # logger.info(f"forward_encoder_chunk output: {xs_s}") + + # forward_feature static + func = getattr(layer, 'forward_feature') + feat_s = func(audio)[0] + logger.info(f"{feat_s}") + np.testing.assert_allclose(feat_d, feat_s, atol=1e-5) diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 149170ed6..d7b8630a3 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -916,6 +916,50 @@ class U2InferModel(U2Model): def __init__(self, configs: dict): super().__init__(configs) + from paddlespeech.s2t.modules.fbank import KaldiFbank + import yaml + import json + import numpy as np + + input_dim = 
configs['input_dim'] + process = configs['preprocess_config'] + with open(process, encoding="utf-8") as f: + conf = yaml.safe_load(f) + assert isinstance(conf, dict), type(self.conf) + + for idx, process in enumerate(conf['process']): + assert isinstance(process, dict), type(process) + opts = dict(process) + process_type = opts.pop("type") + + if process_type == 'fbank_kaldi': + opts.update({'n_mels': input_dim}) + opts['dither'] = 0.0 + self.fbank = KaldiFbank( + **opts + ) + logger.info(f"{self.__class__.__name__} export: {self.fbank}") + if process_type == 'cmvn_json': + # align with paddlespeech.audio.transform.cmvn:GlobalCMVN + std_floor = 1.0e-20 + + cmvn = opts['cmvn_path'] + if isinstance(cmvn, dict): + cmvn_stats = cmvn + else: + with open(cmvn) as f: + cmvn_stats = json.load(f) + count = cmvn_stats['frame_num'] + mean = np.array(cmvn_stats['mean_stat']) / count + square_sums = np.array(cmvn_stats['var_stat']) + var = square_sums / count - mean**2 + std = np.maximum(np.sqrt(var), std_floor) + istd = 1.0 / std + self.global_cmvn = GlobalCMVN( + paddle.to_tensor(mean, dtype=paddle.float), + paddle.to_tensor(istd, dtype=paddle.float)) + logger.info(f"{self.__class__.__name__} export: {self.global_cmvn}") + def forward(self, feats, feats_lengths, @@ -939,3 +983,17 @@ class U2InferModel(U2Model): # num_decoding_left_chunks=num_decoding_left_chunks, # simulate_streaming=simulate_streaming) return feats, feats_lengths + + def forward_feature(self, x): + """feature pipeline. + + Args: + x (paddle.Tensor): waveform (T,). + + Return: + feat (paddle.Tensor): feature (T, D) + """ + x = paddle.cast(x, paddle.float32) + feat = self.fbank(x) + feat = self.global_cmvn(feat) + return feat \ No newline at end of file diff --git a/paddlespeech/s2t/modules/cmvn.py b/paddlespeech/s2t/modules/cmvn.py index 67f71b667..53c508f1a 100644 --- a/paddlespeech/s2t/modules/cmvn.py +++ b/paddlespeech/s2t/modules/cmvn.py @@ -40,6 +40,14 @@ class GlobalCMVN(nn.Layer): self.register_buffer("mean", mean) self.register_buffer("istd", istd) + def __repr__(self): + return ( + "{name}(mean={mean}, istd={istd}, norm_var={norm_var})".format( + name=self.__class__.__name__, + mean=self.mean, + istd=self.istd, + norm_var=self.norm_var)) + def forward(self, x: paddle.Tensor): """ Args: @@ -50,4 +58,4 @@ class GlobalCMVN(nn.Layer): x = x - self.mean if self.norm_var: x = x * self.istd - return x + return x \ No newline at end of file diff --git a/paddlespeech/s2t/modules/fbank.py b/paddlespeech/s2t/modules/fbank.py new file mode 100644 index 000000000..4ec620a79 --- /dev/null +++ b/paddlespeech/s2t/modules/fbank.py @@ -0,0 +1,74 @@ + + + +import paddle +from paddle import nn + +from paddlespeech.audio.compliance import kaldi + +from paddlespeech.s2t.utils.log import Log + +logger = Log(__name__).getlog() + +__all__ = ['KaldiFbank'] + +class KaldiFbank(nn.Layer): + def __init__(self, + fs=16000, + n_mels=80, + n_shift=160, # unit:sample, 10ms + win_length=400, # unit:sample, 25ms + energy_floor=0.0, + dither=0.0): + """ + Args: + fs (int): sample rate of the audio + n_mels (int): number of mel filter banks + n_shift (int): number of points in a frame shift + win_length (int): number of points in a frame windows + energy_floor (float): Floor on energy in Spectrogram computation (absolute) + dither (float): Dithering constant. 
Default 0.0 + """ + super().__init__() + self.fs = fs + self.n_mels = n_mels + num_point_ms = fs / 1000 + self.n_frame_length = win_length / num_point_ms + self.n_frame_shift = n_shift / num_point_ms + self.energy_floor = energy_floor + self.dither = dither + + def __repr__(self): + return ( + "{name}(fs={fs}, n_mels={n_mels}, " + "n_frame_shift={n_frame_shift}, n_frame_length={n_frame_length}, " + "dither={dither}))".format( + name=self.__class__.__name__, + fs=self.fs, + n_mels=self.n_mels, + n_frame_shift=self.n_frame_shift, + n_frame_length=self.n_frame_length, + dither=self.dither, )) + + def forward(self, x: paddle.Tensor): + """ + Args: + x (paddle.Tensor): shape (Ti). + Not support: [Time, Channel] and Batch mode. + + Returns: + paddle.Tensor: (T, D) + """ + assert x.ndim == 1 + + feat = kaldi.fbank( + x.unsqueeze(0), # append channel dim, (C, Ti) + n_mels=self.n_mels, + frame_length=self.n_frame_length, + frame_shift=self.n_frame_shift, + dither=self.dither, + energy_floor=self.energy_floor, + sr=self.fs) + + assert feat.ndim == 2 # (T,D) + return feat From 0d7d87120b79b71259a2d42c8a33f0e93adf67ee Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 14 Sep 2022 16:44:12 +0000 Subject: [PATCH 14/57] simplify feature pipeline graph --- paddlespeech/audio/compliance/kaldi.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/paddlespeech/audio/compliance/kaldi.py b/paddlespeech/audio/compliance/kaldi.py index beb2d86b9..24415058c 100644 --- a/paddlespeech/audio/compliance/kaldi.py +++ b/paddlespeech/audio/compliance/kaldi.py @@ -357,10 +357,13 @@ def _get_mel_banks(num_bins: int, ('Bad values in options: vtln-low {} and vtln-high {}, versus ' 'low-freq {} and high-freq {}'.format(vtln_low, vtln_high, low_freq, high_freq)) - bin = paddle.arange(num_bins).unsqueeze(1) + bin = paddle.arange(num_bins, dtype=paddle.float32).unsqueeze(1) + # left_mel = mel_low_freq + bin * mel_freq_delta # (num_bins, 1) + # center_mel = mel_low_freq + (bin + 1.0) * mel_freq_delta # (num_bins, 1) + # right_mel = mel_low_freq + (bin + 2.0) * mel_freq_delta # (num_bins, 1) left_mel = mel_low_freq + bin * mel_freq_delta # (num_bins, 1) - center_mel = mel_low_freq + (bin + 1.0) * mel_freq_delta # (num_bins, 1) - right_mel = mel_low_freq + (bin + 2.0) * mel_freq_delta # (num_bins, 1) + center_mel = left_mel + mel_freq_delta + right_mel = center_mel + mel_freq_delta if vtln_warp_factor != 1.0: left_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq, @@ -373,7 +376,7 @@ def _get_mel_banks(num_bins: int, center_freqs = _inverse_mel_scale(center_mel) # (num_bins) # (1, num_fft_bins) - mel = _mel_scale(fft_bin_width * paddle.arange(num_fft_bins)).unsqueeze(0) + mel = _mel_scale(fft_bin_width * paddle.arange(num_fft_bins, dtype=paddle.float32)).unsqueeze(0) # (num_bins, num_fft_bins) up_slope = (mel - left_mel) / (center_mel - left_mel) @@ -472,7 +475,8 @@ def fbank(waveform: Tensor, # (n_mels, padded_window_size // 2) mel_energies, _ = _get_mel_banks(n_mels, padded_window_size, sr, low_freq, high_freq, vtln_low, vtln_high, vtln_warp) - mel_energies = mel_energies.astype(dtype) + # mel_energies = mel_energies.astype(dtype) + assert mel_energies.dtype == dtype # (n_mels, padded_window_size // 2 + 1) mel_energies = paddle.nn.functional.pad( From 260752aa2a3284a37c06b88da2fef3b6d0118280 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 19 Sep 2022 14:10:16 +0000 Subject: [PATCH 15/57] using forward_attention_decoder --- paddlespeech/s2t/exps/u2/bin/test_wav.py | 8 +++----- 
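The mel-bank rewrite in the previous patch trades two multiplies for two adds and pins the `arange` dtype to float32, shaving cast and mul ops off the exported graph. In isolation (names follow the diff; the scalar values are illustrative stand-ins):

```python
import paddle

num_bins = 80                              # illustrative
mel_low_freq, mel_freq_delta = 20.0, 35.0  # illustrative stand-ins

bin = paddle.arange(num_bins, dtype=paddle.float32).unsqueeze(1)  # (num_bins, 1)
left_mel = mel_low_freq + bin * mel_freq_delta  # left edge of each triangle
center_mel = left_mel + mel_freq_delta          # == mel_low_freq + (bin + 1) * delta
right_mel = center_mel + mel_freq_delta         # == mel_low_freq + (bin + 2) * delta
```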
paddlespeech/s2t/models/u2/u2.py | 14 ++++++-------- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py index c04e3ae47..a55a1eca0 100644 --- a/paddlespeech/s2t/exps/u2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py @@ -69,8 +69,7 @@ class U2Infer(): with paddle.no_grad(): # read audio, sample_rate = soundfile.read( - self.audio_file, dtype="int16", always_2d=True) - + self.audio_file, dtype="int16", always_2d=True) audio = audio[:, 0] logger.info(f"audio shape: {audio.shape}") @@ -78,11 +77,10 @@ class U2Infer(): feat = self.preprocessing(audio, **self.preprocess_args) logger.info(f"feat shape: {feat.shape}") - np.savetxt("feat.transform.txt", feat) - ilen = paddle.to_tensor(feat.shape[0]) - xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(axis=0) + xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(0) decode_config = self.config.decode + logger.debug(f"decode cfg: {decode_config}") result_transcripts = self.model.decode( xs, ilen, diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index d7b8630a3..b4ec6b033 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -545,17 +545,11 @@ class U2BaseModel(ASRInterface, nn.Layer): [len(hyp[0]) for hyp in hyps], place=device, dtype=paddle.long) # (beam_size,) hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) + logger.debug(f"hyps pad: {hyps_pad} {self.sos} {self.eos} {self.ignore_id}") hyps_lens = hyps_lens + 1 # Add at begining - encoder_out = encoder_out.repeat(beam_size, 1, 1) - encoder_mask = paddle.ones( - (beam_size, 1, encoder_out.shape[1]), dtype=paddle.bool) - decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, - hyps_lens) # (beam_size, max_hyps_len, vocab_size) # ctc score in ln domain - decoder_out = paddle.nn.functional.log_softmax(decoder_out, axis=-1) - decoder_out = decoder_out.numpy() + decoder_out = self.forward_attention_decoder(hyps_pad, hyps_lens, encoder_out) # Only use decoder score for rescoring best_score = -float('inf') @@ -567,11 +561,15 @@ class U2BaseModel(ASRInterface, nn.Layer): score += decoder_out[i][j][w] # last decoder output token is `eos`, for laste decoder input token. 
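            # i.e. add log P(eos | hyp): the eos column at position len(hyp[0])
            # closes the hypothesis, so truncated candidates are not favored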
score += decoder_out[i][len(hyp[0])][self.eos] + logger.debug(f"hyp {i} len {len(hyp[0])} l2r rescore_score: {score} ctc_score: {hyp[1]}") + # add ctc score (which in ln domain) score += hyp[1] * ctc_weight if score > best_score: best_score = score best_index = i + + logger.debug(f"result: {hyps[best_index]}") return hyps[best_index][0] @jit.to_static(property=True) From 4d5cfd400386bcd5be8729f8b3e1dfc5bae8365c Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 20 Sep 2022 03:23:50 +0000 Subject: [PATCH 16/57] export param from cnofig --- paddlespeech/s2t/exps/u2/model.py | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index ee4df9cb9..2b70f117b 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -462,13 +462,13 @@ class U2Tester(U2Trainer): infer_model = U2InferModel.from_pretrained(self.test_loader, self.config.clone(), self.args.checkpoint_path) - batch_size = 1 feat_dim = self.test_loader.feat_dim - model_size = 512 + model_size = self.config.encoder_conf.output_size num_left_chunks = -1 + logger.info(f"U2 Export Model Params: batch_size {batch_size}, feat_dim {feat_dim}, model_size {model_size}, num_left_chunks {num_left_chunks}") - return infer_model, (batch_size, feat_dim, model_size, num_left_chunks) + return infer_model, (batch_size, feat_dim, model_size, num_left_chunks) @paddle.no_grad() def export(self): @@ -553,20 +553,10 @@ class U2Tester(U2Trainer): cnn_cache = paddle.zeros([0, 0, 0, 0]) xs_d, att_cache_d, cnn_cache_d = infer_model.forward_encoder_chunk(xs1, offset, required_cache_size, att_cache, cnn_cache) - import soundfile - audio, sample_rate = soundfile.read( - './zh.wav', dtype="int16", always_2d=True) - audio = audio[:, 0] - logger.info(f"audio shape: {audio.shape}") - audio = paddle.to_tensor(audio, paddle.int16) - feat_d = infer_model.forward_feature(audio) - logger.info(f"{feat_d}") - np.savetxt("feat.tostatic.txt", feat_d) - - # load static model from paddle.jit.layer import Layer layer = Layer() + logger.info(f"load export model: {self.args.export_path}") layer.load(self.args.export_path, paddle.CPUPlace()) # forward_encoder_chunk static @@ -580,9 +570,3 @@ class U2Tester(U2Trainer): np.testing.assert_allclose(att_cache_d, att_cache_s, atol=1e-4) np.testing.assert_allclose(cnn_cache_d, cnn_cache_s, atol=1e-4) # logger.info(f"forward_encoder_chunk output: {xs_s}") - - # forward_feature static - func = getattr(layer, 'forward_feature') - feat_s = func(audio)[0] - logger.info(f"{feat_s}") - np.testing.assert_allclose(feat_d, feat_s, atol=1e-5) From 549d477592fbba8533c9e6a3e573918bdf9ca82a Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 20 Sep 2022 03:27:33 +0000 Subject: [PATCH 17/57] fix code style --- paddlespeech/s2t/exps/u2/bin/test_wav.py | 1 - 1 file changed, 1 deletion(-) diff --git a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py index a55a1eca0..e01d0e401 100644 --- a/paddlespeech/s2t/exps/u2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py @@ -18,7 +18,6 @@ from pathlib import Path import paddle import soundfile -import numpy as np from yacs.config import CfgNode from paddlespeech.audio.transform.transformation import Transformation From 53d6baff0be0e2e1d64c6b6b5772d064c24c2bf3 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 20 Sep 2022 03:33:35 +0000 Subject: [PATCH 18/57] format --- paddlespeech/audio/compliance/kaldi.py | 3 +- 
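Pulling the preceding patches together, the attention-rescoring selection reduces to a short loop. A hedged sketch in plain Python (`hyps` is the CTC prefix-beam output as `(token_ids, ctc_log_score)` pairs and `decoder_out` is the `(beam_size, max_hyps_len, vocab_size)` log-softmax from `forward_attention_decoder`; both are assumptions of this sketch):

```python
def attention_rescore(hyps, decoder_out, eos: int, ctc_weight: float = 0.5):
    """Pick the hypothesis with the best blended decoder + CTC score."""
    best_score, best_index = -float('inf'), 0
    for i, (tokens, ctc_score) in enumerate(hyps):
        # sum left-to-right decoder log-probs over the hypothesis tokens
        score = sum(decoder_out[i][j][w] for j, w in enumerate(tokens))
        score += decoder_out[i][len(tokens)][eos]  # close the hypothesis with eos
        score += ctc_score * ctc_weight            # both scores are in ln domain
        if score > best_score:
            best_score, best_index = score, i
    return hyps[best_index][0]
```

This mirrors the ordering of the exported entry points: `ctc_activation` feeds the prefix beam search, then `forward_attention_decoder` rescores the surviving hypotheses.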
paddlespeech/s2t/exps/u2/bin/test_wav.py | 2 +- paddlespeech/s2t/exps/u2/model.py | 37 +++++++++++-------- paddlespeech/s2t/models/u2/u2.py | 19 ++++++---- paddlespeech/s2t/modules/cmvn.py | 13 +++---- paddlespeech/s2t/modules/encoder.py | 9 +++-- paddlespeech/s2t/modules/fbank.py | 12 +++--- .../engine/asr/online/python/asr_engine.py | 1 - 8 files changed, 52 insertions(+), 44 deletions(-) diff --git a/paddlespeech/audio/compliance/kaldi.py b/paddlespeech/audio/compliance/kaldi.py index 24415058c..eb92ec1f2 100644 --- a/paddlespeech/audio/compliance/kaldi.py +++ b/paddlespeech/audio/compliance/kaldi.py @@ -376,7 +376,8 @@ def _get_mel_banks(num_bins: int, center_freqs = _inverse_mel_scale(center_mel) # (num_bins) # (1, num_fft_bins) - mel = _mel_scale(fft_bin_width * paddle.arange(num_fft_bins, dtype=paddle.float32)).unsqueeze(0) + mel = _mel_scale(fft_bin_width * paddle.arange( + num_fft_bins, dtype=paddle.float32)).unsqueeze(0) # (num_bins, num_fft_bins) up_slope = (mel - left_mel) / (center_mel - left_mel) diff --git a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py index e01d0e401..ccf44d6b4 100644 --- a/paddlespeech/s2t/exps/u2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py @@ -68,7 +68,7 @@ class U2Infer(): with paddle.no_grad(): # read audio, sample_rate = soundfile.read( - self.audio_file, dtype="int16", always_2d=True) + self.audio_file, dtype="int16", always_2d=True) audio = audio[:, 0] logger.info(f"audio shape: {audio.shape}") diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index 2b70f117b..68354ff68 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -462,11 +462,13 @@ class U2Tester(U2Trainer): infer_model = U2InferModel.from_pretrained(self.test_loader, self.config.clone(), self.args.checkpoint_path) - batch_size = 1 + batch_size = 1 feat_dim = self.test_loader.feat_dim model_size = self.config.encoder_conf.output_size num_left_chunks = -1 - logger.info(f"U2 Export Model Params: batch_size {batch_size}, feat_dim {feat_dim}, model_size {model_size}, num_left_chunks {num_left_chunks}") + logger.info( + f"U2 Export Model Params: batch_size {batch_size}, feat_dim {feat_dim}, model_size {model_size}, num_left_chunks {num_left_chunks}" + ) return infer_model, (batch_size, feat_dim, model_size, num_left_chunks) @@ -479,29 +481,29 @@ class U2Tester(U2Trainer): assert isinstance(input_spec, (list, tuple)), type(input_spec) batch_size, feat_dim, model_size, num_left_chunks = input_spec - ######################## infer_model.forward_encoder_chunk ############ input_spec = [ # (T,), int16 paddle.static.InputSpec(shape=[None], dtype='int16'), ] - infer_model.forward_feature = paddle.jit.to_static(infer_model.forward_feature, input_spec=input_spec) + infer_model.forward_feature = paddle.jit.to_static( + infer_model.forward_feature, input_spec=input_spec) ######################### infer_model.forward_encoder_chunk ############ input_spec = [ # xs, (B, T, D) - paddle.static.InputSpec(shape=[batch_size, None, feat_dim], dtype='float32'), + paddle.static.InputSpec( + shape=[batch_size, None, feat_dim], dtype='float32'), # offset, int, but need be tensor - paddle.static.InputSpec(shape=[1], dtype='int32'), + paddle.static.InputSpec(shape=[1], dtype='int32'), # required_cache_size, int num_left_chunks, # att_cache paddle.static.InputSpec( - shape=[None, None, None, None], - dtype='float32'), + shape=[None, None, None, None], dtype='float32'), # cnn_cache 
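            # shape (elayers, B=1, hidden_dim, cache_t2); every dim is left dynamic so
            # the empty paddle.zeros([0, 0, 0, 0]) passed on the first chunk also matches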
paddle.static.InputSpec( - shape=[None, None, None, None], dtype='float32') + shape=[None, None, None, None], dtype='float32') ] infer_model.forward_encoder_chunk = paddle.jit.to_static( infer_model.forward_encoder_chunk, input_spec=input_spec) @@ -509,12 +511,12 @@ class U2Tester(U2Trainer): ######################### infer_model.ctc_activation ######################## input_spec = [ # encoder_out, (B,T,D) - paddle.static.InputSpec(shape=[batch_size, None, model_size], dtype='float32') + paddle.static.InputSpec( + shape=[batch_size, None, model_size], dtype='float32') ] infer_model.ctc_activation = paddle.jit.to_static( infer_model.ctc_activation, input_spec=input_spec) - ######################### infer_model.forward_attention_decoder ######################## input_spec = [ # hyps, (B, U) @@ -522,15 +524,19 @@ class U2Tester(U2Trainer): # hyps_lens, (B,) paddle.static.InputSpec(shape=[None], dtype='int64'), # encoder_out, (B,T,D) - paddle.static.InputSpec(shape=[batch_size, None, model_size], dtype='float32') + paddle.static.InputSpec( + shape=[batch_size, None, model_size], dtype='float32') ] infer_model.forward_attention_decoder = paddle.jit.to_static( infer_model.forward_attention_decoder, input_spec=input_spec) # jit save logger.info(f"export save: {self.args.export_path}") - paddle.jit.save(infer_model, self.args.export_path, combine_params=True, skip_forward=True) - + paddle.jit.save( + infer_model, + self.args.export_path, + combine_params=True, + skip_forward=True) # test dy2static def flatten(out): @@ -551,7 +557,8 @@ class U2Tester(U2Trainer): required_cache_size = num_left_chunks att_cache = paddle.zeros([0, 0, 0, 0]) cnn_cache = paddle.zeros([0, 0, 0, 0]) - xs_d, att_cache_d, cnn_cache_d = infer_model.forward_encoder_chunk(xs1, offset, required_cache_size, att_cache, cnn_cache) + xs_d, att_cache_d, cnn_cache_d = infer_model.forward_encoder_chunk( + xs1, offset, required_cache_size, att_cache, cnn_cache) # load static model from paddle.jit.layer import Layer diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 135045aaa..32d0940d9 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -545,11 +545,13 @@ class U2BaseModel(ASRInterface, nn.Layer): [len(hyp[0]) for hyp in hyps], place=device, dtype=paddle.long) # (beam_size,) hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - logger.debug(f"hyps pad: {hyps_pad} {self.sos} {self.eos} {self.ignore_id}") + logger.debug( + f"hyps pad: {hyps_pad} {self.sos} {self.eos} {self.ignore_id}") hyps_lens = hyps_lens + 1 # Add at begining # ctc score in ln domain - decoder_out = self.forward_attention_decoder(hyps_pad, hyps_lens, encoder_out) + decoder_out = self.forward_attention_decoder(hyps_pad, hyps_lens, + encoder_out) # Only use decoder score for rescoring best_score = -float('inf') @@ -561,7 +563,9 @@ class U2BaseModel(ASRInterface, nn.Layer): score += decoder_out[i][j][w] # last decoder output token is `eos`, for laste decoder input token. 
score += decoder_out[i][len(hyp[0])][self.eos] - logger.debug(f"hyp {i} len {len(hyp[0])} l2r rescore_score: {score} ctc_score: {hyp[1]}") + logger.debug( + f"hyp {i} len {len(hyp[0])} l2r rescore_score: {score} ctc_score: {hyp[1]}" + ) # add ctc score (which in ln domain) score += hyp[1] * ctc_weight @@ -933,9 +937,7 @@ class U2InferModel(U2Model): if process_type == 'fbank_kaldi': opts.update({'n_mels': input_dim}) opts['dither'] = 0.0 - self.fbank = KaldiFbank( - **opts - ) + self.fbank = KaldiFbank(**opts) logger.info(f"{self.__class__.__name__} export: {self.fbank}") if process_type == 'cmvn_json': # align with paddlespeech.audio.transform.cmvn:GlobalCMVN @@ -956,7 +958,8 @@ class U2InferModel(U2Model): self.global_cmvn = GlobalCMVN( paddle.to_tensor(mean, dtype=paddle.float), paddle.to_tensor(istd, dtype=paddle.float)) - logger.info(f"{self.__class__.__name__} export: {self.global_cmvn}") + logger.info( + f"{self.__class__.__name__} export: {self.global_cmvn}") def forward(self, feats, @@ -994,4 +997,4 @@ class U2InferModel(U2Model): x = paddle.cast(x, paddle.float32) feat = self.fbank(x) feat = self.global_cmvn(feat) - return feat \ No newline at end of file + return feat diff --git a/paddlespeech/s2t/modules/cmvn.py b/paddlespeech/s2t/modules/cmvn.py index 53c508f1a..6a8c1660c 100644 --- a/paddlespeech/s2t/modules/cmvn.py +++ b/paddlespeech/s2t/modules/cmvn.py @@ -41,12 +41,11 @@ class GlobalCMVN(nn.Layer): self.register_buffer("istd", istd) def __repr__(self): - return ( - "{name}(mean={mean}, istd={istd}, norm_var={norm_var})".format( - name=self.__class__.__name__, - mean=self.mean, - istd=self.istd, - norm_var=self.norm_var)) + return ("{name}(mean={mean}, istd={istd}, norm_var={norm_var})".format( + name=self.__class__.__name__, + mean=self.mean, + istd=self.istd, + norm_var=self.norm_var)) def forward(self, x: paddle.Tensor): """ @@ -58,4 +57,4 @@ class GlobalCMVN(nn.Layer): x = x - self.mean if self.norm_var: x = x * self.istd - return x \ No newline at end of file + return x diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py index 458921b5a..87b83ef55 100644 --- a/paddlespeech/s2t/modules/encoder.py +++ b/paddlespeech/s2t/modules/encoder.py @@ -256,10 +256,11 @@ class BaseEncoder(nn.Layer): # att_cache=att_cache[i:i+1] if elayers > 0 else att_cache, # cnn_cache=cnn_cache[i:i+1] if paddle.shape(cnn_cache)[0] > 0 else cnn_cache, xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i:i+1], - cnn_cache=cnn_cache[i:i+1], - ) + xs, + att_mask, + pos_emb, + att_cache=att_cache[i:i + 1], + cnn_cache=cnn_cache[i:i + 1], ) # new_att_cache = (1, head, attention_key_size, d_k*2) # new_cnn_cache = (B=1, hidden-dim, cache_t2) r_att_cache.append(new_att_cache[:, :, next_cache_start:, :]) diff --git a/paddlespeech/s2t/modules/fbank.py b/paddlespeech/s2t/modules/fbank.py index 4ec620a79..8d76a4727 100644 --- a/paddlespeech/s2t/modules/fbank.py +++ b/paddlespeech/s2t/modules/fbank.py @@ -1,19 +1,17 @@ - - - import paddle from paddle import nn from paddlespeech.audio.compliance import kaldi - from paddlespeech.s2t.utils.log import Log logger = Log(__name__).getlog() __all__ = ['KaldiFbank'] + class KaldiFbank(nn.Layer): - def __init__(self, + def __init__( + self, fs=16000, n_mels=80, n_shift=160, # unit:sample, 10ms @@ -62,7 +60,7 @@ class KaldiFbank(nn.Layer): assert x.ndim == 1 feat = kaldi.fbank( - x.unsqueeze(0), # append channel dim, (C, Ti) + x.unsqueeze(0), # append channel dim, (C, Ti) n_mels=self.n_mels, 
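            # frame_length / frame_shift are milliseconds here; __init__ converted
            # them from samples via fs / 1000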
frame_length=self.n_frame_length, frame_shift=self.n_frame_shift, @@ -70,5 +68,5 @@ class KaldiFbank(nn.Layer): energy_floor=self.energy_floor, sr=self.fs) - assert feat.ndim == 2 # (T,D) + assert feat.ndim == 2 # (T,D) return feat diff --git a/paddlespeech/server/engine/asr/online/python/asr_engine.py b/paddlespeech/server/engine/asr/online/python/asr_engine.py index 1dc970891..5782d7035 100644 --- a/paddlespeech/server/engine/asr/online/python/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/python/asr_engine.py @@ -80,7 +80,6 @@ class PaddleASRConnectionHanddler: self.init_decoder() self.reset() - def init_decoder(self): if "deepspeech2" in self.model_type: assert self.continuous_decoding is False, "ds2 model not support endpoint" From 322301a6db2280f9358a37059db276de9fdcdc9a Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 20 Sep 2022 11:27:22 +0000 Subject: [PATCH 19/57] add reverse pad with sos and eos test --- tests/unit/asr/reverse_pad_list.py | 145 +++++++++++++++++++++++++++++ 1 file changed, 145 insertions(+) create mode 100644 tests/unit/asr/reverse_pad_list.py diff --git a/tests/unit/asr/reverse_pad_list.py b/tests/unit/asr/reverse_pad_list.py new file mode 100644 index 000000000..60e768bcf --- /dev/null +++ b/tests/unit/asr/reverse_pad_list.py @@ -0,0 +1,145 @@ + + + + +import paddle +import numpy as np +import unittest + +# from paddlespeech.audio.utils.tensor_utils import reverse_pad_list +import paddlespeech.s2t +from paddlespeech.audio.utils.tensor_utils import add_sos_eos +from paddlespeech.audio.utils.tensor_utils import pad_sequence + +def reverse_pad_list(ys_pad: paddle.Tensor, + ys_lens: paddle.Tensor, + pad_value: float=-1.0) -> paddle.Tensor: + """Reverse padding for the list of tensors. + Args: + ys_pad (tensor): The padded tensor (B, Tokenmax). + ys_lens (tensor): The lens of token seqs (B) + pad_value (int): Value for padding. + Returns: + Tensor: Padded tensor (B, Tokenmax). 
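+        Note: each row is reversed only over its first ys_lens[i] tokens; the
+            padded tail is refilled with pad_value.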
+ Examples: + >>> x + tensor([[1, 2, 3, 4], [5, 6, 7, 0], [8, 9, 0, 0]]) + >>> pad_list(x, 0) + tensor([[4, 3, 2, 1], + [7, 6, 5, 0], + [9, 8, 0, 0]]) + """ + r_ys_pad = pad_sequence([(paddle.flip(y[:i], [0])) + for y, i in zip(ys_pad, ys_lens)], True, pad_value) + return r_ys_pad + +def naive_reverse_pad_list_with_sos_eos(r_hyps, r_hyps_lens, sos=5000, eos=5000, ignore_id=-1): + r_hyps = reverse_pad_list(r_hyps, r_hyps_lens, float(ignore_id)) + r_hyps, _ = add_sos_eos(r_hyps, sos, eos, ignore_id) + return r_hyps + +def reverse_pad_list_with_sos_eos(r_hyps, r_hyps_lens, sos=5000, eos=5000, ignore_id=-1): + # >>> r_hyps = reverse_pad_list(r_hyps, r_hyps_lens, float(self.ignore_id)) + # >>> r_hyps, _ = add_sos_eos(r_hyps, self.sos, self.eos, self.ignore_id) + max_len = paddle.max(r_hyps_lens) + index_range = paddle.arange(0, max_len, 1) + seq_len_expand = r_hyps_lens.unsqueeze(1) + seq_mask = seq_len_expand > index_range # (beam, max_len) + + index = (seq_len_expand - 1) - index_range # (beam, max_len) + # >>> index + # >>> tensor([[ 2, 1, 0], + # >>> [ 2, 1, 0], + # >>> [ 0, -1, -2]]) + index = index * seq_mask + + # >>> index + # >>> tensor([[2, 1, 0], + # >>> [2, 1, 0], + # >>> [0, 0, 0]]) + def paddle_gather(x, dim, index): + index_shape = index.shape + index_flatten = index.flatten() + if dim < 0: + dim = len(x.shape) + dim + nd_index = [] + for k in range(len(x.shape)): + if k == dim: + nd_index.append(index_flatten) + else: + reshape_shape = [1] * len(x.shape) + reshape_shape[k] = x.shape[k] + x_arange = paddle.arange(x.shape[k], dtype=index.dtype) + x_arange = x_arange.reshape(reshape_shape) + dim_index = paddle.expand(x_arange, index_shape).flatten() + nd_index.append(dim_index) + ind2 = paddle.transpose(paddle.stack(nd_index), + [1, 0]).astype("int64") + paddle_out = paddle.gather_nd(x, ind2).reshape(index_shape) + return paddle_out + + r_hyps = paddle_gather(r_hyps, 1, index) + # >>> r_hyps + # >>> tensor([[3, 2, 1], + # >>> [4, 8, 9], + # >>> [2, 2, 2]]) + r_hyps = paddle.where(seq_mask, r_hyps, eos) + # >>> r_hyps + # >>> tensor([[3, 2, 1], + # >>> [4, 8, 9], + # >>> [2, eos, eos]]) + B = r_hyps.shape[0] + _sos = paddle.ones([B, 1], dtype=r_hyps.dtype) * sos + # r_hyps = paddle.concat([hyps[:, 0:1], r_hyps], axis=1) + r_hyps = paddle.concat([_sos, r_hyps], axis=1) + # >>> r_hyps + # >>> tensor([[sos, 3, 2, 1], + # >>> [sos, 4, 8, 9], + # >>> [sos, 2, eos, eos]]) + return r_hyps + + +class TestU2Model(unittest.TestCase): + def setUp(self): + paddle.set_device('cpu') + + self.sos=5000 + self.eos=5000 + self.ignore_id=-1 + self.reverse_hyps = paddle.to_tensor( + [[ 4, 3, 2, 1, -1], + [ 5, 4, 3, 2, 1]] + ) + self.reverse_hyps_sos_eos = paddle.to_tensor( + [[self.sos, 4 , 3 , 2 , 1 , self.eos], + [self.sos, 5 , 4 , 3 , 2 , 1 ]] + ) + + self.hyps = paddle.to_tensor( + [ + [1, 2, 3, 4, -1], + [1, 2, 3, 4, 5] + ] + ) + + + self.hyps_lens = paddle.to_tensor([4, 5], paddle.int32) + + def test_reverse_pad_list(self): + r_hyps = reverse_pad_list(self.hyps, self.hyps_lens) + self.assertSequenceEqual(r_hyps.tolist(), self.reverse_hyps.tolist()) + + def test_naive_reverse_pad_list_with_sos_eos(self): + r_hyps_sos_eos = naive_reverse_pad_list_with_sos_eos(self.hyps, self.hyps_lens) + self.assertSequenceEqual(r_hyps_sos_eos.tolist(), self.reverse_hyps_sos_eos.tolist()) + + def test_static_reverse_pad_list_with_sos_eos(self): + r_hyps_sos_eos_static = reverse_pad_list_with_sos_eos(self.hyps, self.hyps_lens) + self.assertSequenceEqual(r_hyps_sos_eos_static.tolist(), 
self.reverse_hyps_sos_eos.tolist()) + + + +if __name__ == '__main__': + unittest.main() + + From f95edc382c71e91b5c7fa10f10fc31e681a17169 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 20 Sep 2022 11:32:35 +0000 Subject: [PATCH 20/57] format --- tests/unit/asr/reverse_pad_list.py | 193 +++++++++++++++-------------- 1 file changed, 102 insertions(+), 91 deletions(-) diff --git a/tests/unit/asr/reverse_pad_list.py b/tests/unit/asr/reverse_pad_list.py index 60e768bcf..215ed5ceb 100644 --- a/tests/unit/asr/reverse_pad_list.py +++ b/tests/unit/asr/reverse_pad_list.py @@ -1,16 +1,27 @@ - - - +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import unittest import paddle -import numpy as np -import unittest -# from paddlespeech.audio.utils.tensor_utils import reverse_pad_list -import paddlespeech.s2t +import paddlespeech.s2t # noqa: F401 from paddlespeech.audio.utils.tensor_utils import add_sos_eos from paddlespeech.audio.utils.tensor_utils import pad_sequence +# from paddlespeech.audio.utils.tensor_utils import reverse_pad_list + + def reverse_pad_list(ys_pad: paddle.Tensor, ys_lens: paddle.Tensor, pad_value: float=-1.0) -> paddle.Tensor: @@ -33,95 +44,94 @@ def reverse_pad_list(ys_pad: paddle.Tensor, for y, i in zip(ys_pad, ys_lens)], True, pad_value) return r_ys_pad -def naive_reverse_pad_list_with_sos_eos(r_hyps, r_hyps_lens, sos=5000, eos=5000, ignore_id=-1): + +def naive_reverse_pad_list_with_sos_eos(r_hyps, + r_hyps_lens, + sos=5000, + eos=5000, + ignore_id=-1): r_hyps = reverse_pad_list(r_hyps, r_hyps_lens, float(ignore_id)) r_hyps, _ = add_sos_eos(r_hyps, sos, eos, ignore_id) return r_hyps -def reverse_pad_list_with_sos_eos(r_hyps, r_hyps_lens, sos=5000, eos=5000, ignore_id=-1): - # >>> r_hyps = reverse_pad_list(r_hyps, r_hyps_lens, float(self.ignore_id)) - # >>> r_hyps, _ = add_sos_eos(r_hyps, self.sos, self.eos, self.ignore_id) - max_len = paddle.max(r_hyps_lens) - index_range = paddle.arange(0, max_len, 1) - seq_len_expand = r_hyps_lens.unsqueeze(1) - seq_mask = seq_len_expand > index_range # (beam, max_len) - - index = (seq_len_expand - 1) - index_range # (beam, max_len) - # >>> index - # >>> tensor([[ 2, 1, 0], - # >>> [ 2, 1, 0], - # >>> [ 0, -1, -2]]) - index = index * seq_mask - - # >>> index - # >>> tensor([[2, 1, 0], - # >>> [2, 1, 0], - # >>> [0, 0, 0]]) - def paddle_gather(x, dim, index): - index_shape = index.shape - index_flatten = index.flatten() - if dim < 0: - dim = len(x.shape) + dim - nd_index = [] - for k in range(len(x.shape)): - if k == dim: - nd_index.append(index_flatten) - else: - reshape_shape = [1] * len(x.shape) - reshape_shape[k] = x.shape[k] - x_arange = paddle.arange(x.shape[k], dtype=index.dtype) - x_arange = x_arange.reshape(reshape_shape) - dim_index = paddle.expand(x_arange, index_shape).flatten() - nd_index.append(dim_index) - ind2 = paddle.transpose(paddle.stack(nd_index), - [1, 0]).astype("int64") - paddle_out = paddle.gather_nd(x, ind2).reshape(index_shape) - return paddle_out - - 
r_hyps = paddle_gather(r_hyps, 1, index) - # >>> r_hyps - # >>> tensor([[3, 2, 1], - # >>> [4, 8, 9], - # >>> [2, 2, 2]]) - r_hyps = paddle.where(seq_mask, r_hyps, eos) - # >>> r_hyps - # >>> tensor([[3, 2, 1], - # >>> [4, 8, 9], - # >>> [2, eos, eos]]) - B = r_hyps.shape[0] - _sos = paddle.ones([B, 1], dtype=r_hyps.dtype) * sos - # r_hyps = paddle.concat([hyps[:, 0:1], r_hyps], axis=1) - r_hyps = paddle.concat([_sos, r_hyps], axis=1) - # >>> r_hyps - # >>> tensor([[sos, 3, 2, 1], - # >>> [sos, 4, 8, 9], - # >>> [sos, 2, eos, eos]]) - return r_hyps + +def reverse_pad_list_with_sos_eos(r_hyps, + r_hyps_lens, + sos=5000, + eos=5000, + ignore_id=-1): + # >>> r_hyps = reverse_pad_list(r_hyps, r_hyps_lens, float(self.ignore_id)) + # >>> r_hyps, _ = add_sos_eos(r_hyps, self.sos, self.eos, self.ignore_id) + max_len = paddle.max(r_hyps_lens) + index_range = paddle.arange(0, max_len, 1) + seq_len_expand = r_hyps_lens.unsqueeze(1) + seq_mask = seq_len_expand > index_range # (beam, max_len) + + index = (seq_len_expand - 1) - index_range # (beam, max_len) + # >>> index + # >>> tensor([[ 2, 1, 0], + # >>> [ 2, 1, 0], + # >>> [ 0, -1, -2]]) + index = index * seq_mask + + # >>> index + # >>> tensor([[2, 1, 0], + # >>> [2, 1, 0], + # >>> [0, 0, 0]]) + def paddle_gather(x, dim, index): + index_shape = index.shape + index_flatten = index.flatten() + if dim < 0: + dim = len(x.shape) + dim + nd_index = [] + for k in range(len(x.shape)): + if k == dim: + nd_index.append(index_flatten) + else: + reshape_shape = [1] * len(x.shape) + reshape_shape[k] = x.shape[k] + x_arange = paddle.arange(x.shape[k], dtype=index.dtype) + x_arange = x_arange.reshape(reshape_shape) + dim_index = paddle.expand(x_arange, index_shape).flatten() + nd_index.append(dim_index) + ind2 = paddle.transpose(paddle.stack(nd_index), [1, 0]).astype("int64") + paddle_out = paddle.gather_nd(x, ind2).reshape(index_shape) + return paddle_out + + r_hyps = paddle_gather(r_hyps, 1, index) + # >>> r_hyps + # >>> tensor([[3, 2, 1], + # >>> [4, 8, 9], + # >>> [2, 2, 2]]) + r_hyps = paddle.where(seq_mask, r_hyps, eos) + # >>> r_hyps + # >>> tensor([[3, 2, 1], + # >>> [4, 8, 9], + # >>> [2, eos, eos]]) + B = r_hyps.shape[0] + _sos = paddle.ones([B, 1], dtype=r_hyps.dtype) * sos + # r_hyps = paddle.concat([hyps[:, 0:1], r_hyps], axis=1) + r_hyps = paddle.concat([_sos, r_hyps], axis=1) + # >>> r_hyps + # >>> tensor([[sos, 3, 2, 1], + # >>> [sos, 4, 8, 9], + # >>> [sos, 2, eos, eos]]) + return r_hyps class TestU2Model(unittest.TestCase): def setUp(self): paddle.set_device('cpu') - self.sos=5000 - self.eos=5000 - self.ignore_id=-1 - self.reverse_hyps = paddle.to_tensor( - [[ 4, 3, 2, 1, -1], - [ 5, 4, 3, 2, 1]] - ) + self.sos = 5000 + self.eos = 5000 + self.ignore_id = -1 + self.reverse_hyps = paddle.to_tensor([[4, 3, 2, 1, -1], + [5, 4, 3, 2, 1]]) self.reverse_hyps_sos_eos = paddle.to_tensor( - [[self.sos, 4 , 3 , 2 , 1 , self.eos], - [self.sos, 5 , 4 , 3 , 2 , 1 ]] - ) - - self.hyps = paddle.to_tensor( - [ - [1, 2, 3, 4, -1], - [1, 2, 3, 4, 5] - ] - ) + [[self.sos, 4, 3, 2, 1, self.eos], [self.sos, 5, 4, 3, 2, 1]]) + self.hyps = paddle.to_tensor([[1, 2, 3, 4, -1], [1, 2, 3, 4, 5]]) self.hyps_lens = paddle.to_tensor([4, 5], paddle.int32) @@ -130,16 +140,17 @@ class TestU2Model(unittest.TestCase): self.assertSequenceEqual(r_hyps.tolist(), self.reverse_hyps.tolist()) def test_naive_reverse_pad_list_with_sos_eos(self): - r_hyps_sos_eos = naive_reverse_pad_list_with_sos_eos(self.hyps, self.hyps_lens) - self.assertSequenceEqual(r_hyps_sos_eos.tolist(), 
self.reverse_hyps_sos_eos.tolist()) + r_hyps_sos_eos = naive_reverse_pad_list_with_sos_eos(self.hyps, + self.hyps_lens) + self.assertSequenceEqual(r_hyps_sos_eos.tolist(), + self.reverse_hyps_sos_eos.tolist()) def test_static_reverse_pad_list_with_sos_eos(self): - r_hyps_sos_eos_static = reverse_pad_list_with_sos_eos(self.hyps, self.hyps_lens) - self.assertSequenceEqual(r_hyps_sos_eos_static.tolist(), self.reverse_hyps_sos_eos.tolist()) - + r_hyps_sos_eos_static = reverse_pad_list_with_sos_eos(self.hyps, + self.hyps_lens) + self.assertSequenceEqual(r_hyps_sos_eos_static.tolist(), + self.reverse_hyps_sos_eos.tolist()) if __name__ == '__main__': unittest.main() - - From 6fc4b2809332ab8af057aa71b74baae7c7d06d2b Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 20 Sep 2022 12:42:49 +0000 Subject: [PATCH 21/57] add comment --- examples/wenetspeech/asr1/local/export.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/wenetspeech/asr1/local/export.sh b/examples/wenetspeech/asr1/local/export.sh index 6b646b469..735c4f8e5 100755 --- a/examples/wenetspeech/asr1/local/export.sh +++ b/examples/wenetspeech/asr1/local/export.sh @@ -12,9 +12,12 @@ config_path=$1 ckpt_path_prefix=$2 jit_model_export_path=$3 + +# export can not using StreamdataDataloader, set use_stream_dta False python3 -u ${BIN_DIR}/export.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--opts use_stream_data False \ --checkpoint_path ${ckpt_path_prefix} \ --export_path ${jit_model_export_path} From 309c8d70d9e7168eac597a5ffb030fc6703d7e87 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 20 Sep 2022 12:56:07 +0000 Subject: [PATCH 22/57] add reverse weight --- paddlespeech/s2t/exps/u2/model.py | 4 +++- paddlespeech/s2t/models/u2/u2.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index 54810f22f..64b6c8df6 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -520,6 +520,7 @@ class U2Tester(U2Trainer): infer_model.ctc_activation, input_spec=input_spec) ######################### infer_model.forward_attention_decoder ######################## + reverse_weight = 0.3 input_spec = [ # hyps, (B, U) paddle.static.InputSpec(shape=[None, None], dtype='int64'), @@ -527,7 +528,8 @@ class U2Tester(U2Trainer): paddle.static.InputSpec(shape=[None], dtype='int64'), # encoder_out, (B,T,D) paddle.static.InputSpec( - shape=[batch_size, None, model_size], dtype='float32') + shape=[batch_size, None, model_size], dtype='float32'), + reverse_weight ] infer_model.forward_attention_decoder = paddle.jit.to_static( infer_model.forward_attention_decoder, input_spec=input_spec) diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index d699b684b..1681bf1d9 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -706,7 +706,7 @@ class U2BaseModel(ASRInterface, nn.Layer): hyps: paddle.Tensor, hyps_lens: paddle.Tensor, encoder_out: paddle.Tensor, - reverse_weight: float=0.0, ) -> paddle.Tensor: + reverse_weight: float=0.0) -> paddle.Tensor: """ Export interface for c++ call, forward decoder with multiple hypothesis from ctc prefix beam search and one encoder output Args: From 00b2c1c8fb4fc81e723e8580cbc7ed6059378680 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 21 Sep 2022 07:50:02 +0000 Subject: [PATCH 23/57] fix forward attention decoder caller --- paddlespeech/s2t/exps/u2/bin/test_wav.py | 2 +- paddlespeech/s2t/models/u2/u2.py | 15 ++++++++------- 
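The caller fix below also threads `reverse_weight` through `forward_attention_decoder`, completing the U2++ bidirectional rescoring path. A hedged sketch of just the right-to-left half for one hypothesis (`r_decoder_out` is the log-softmax output of the reversed decoder; names mirror the hunks that follow):

```python
def add_r2l_score(score, tokens, i, r_decoder_out, eos: int,
                  reverse_weight: float):
    """Blend the right-to-left decoder score into a left-to-right score."""
    if reverse_weight <= 0:
        return score  # plain unidirectional rescoring
    # the r2l decoder consumed reversed tokens, so read them back to front
    r_score = sum(r_decoder_out[i][len(tokens) - j - 1][w]
                  for j, w in enumerate(tokens))
    r_score += r_decoder_out[i][len(tokens)][eos]
    return score * (1 - reverse_weight) + r_score * reverse_weight
```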
paddlespeech/s2t/modules/decoder.py | 2 +- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py index 9446884f8..31890cb19 100644 --- a/paddlespeech/s2t/exps/u2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py @@ -79,7 +79,7 @@ class U2Infer(): ilen = paddle.to_tensor(feat.shape[0]) xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(0) decode_config = self.config.decode - logger.debug(f"decode cfg: {decode_config}") + logger.info(f"decode cfg: {decode_config}") result_transcripts = self.model.decode( xs, ilen, diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 1681bf1d9..7609b71e0 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -565,18 +565,18 @@ class U2BaseModel(ASRInterface, nn.Layer): [len(hyp[0]) for hyp in hyps], place=device, dtype=paddle.long) # (beam_size,) hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - logger.debug( + logger.info( f"hyps pad: {hyps_pad} {self.sos} {self.eos} {self.ignore_id}") hyps_lens = hyps_lens + 1 # Add at begining # ctc score in ln domain # (beam_size, max_hyps_len, vocab_size) decoder_out, r_decoder_out = self.forward_attention_decoder(hyps_pad, hyps_lens, - encoder_out,reverse_weight ) + encoder_out, reverse_weight) + decoder_out = decoder_out.numpy() # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a # conventional transformer decoder. - r_decoder_out = paddle.nn.functional.log_softmax(r_decoder_out, axis=-1) r_decoder_out = r_decoder_out.numpy() # Only use decoder score for rescoring @@ -590,15 +590,16 @@ class U2BaseModel(ASRInterface, nn.Layer): # last decoder output token is `eos`, for laste decoder input token. 
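            # the l2r closing step; the r2l branch below mirrors it over the
            # reversed hypothesis before the two scores are interpolated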
score += decoder_out[i][len(hyp[0])][self.eos] - logger.debug( - f"hyp {i} len {len(hyp[0])} l2r rescore_score: {score} ctc_score: {hyp[1]}" - ) + logger.info(f"hyp {i} len {len(hyp[0])} l2r score: {score} ctc_score: {hyp[1]} reverse_weight: {reverse_weight}") if reverse_weight > 0: r_score = 0.0 for j, w in enumerate(hyp[0]): r_score += r_decoder_out[i][len(hyp[0]) - j - 1][w] r_score += r_decoder_out[i][len(hyp[0])][self.eos] + + logger.info(f"hyp {i} len {len(hyp[0])} r2l score: {score} ctc_score: {hyp[1]} reverse_weight: {reverse_weight}") + score = score * (1 - reverse_weight) + r_score * reverse_weight # add ctc score (which in ln domain) @@ -607,7 +608,7 @@ class U2BaseModel(ASRInterface, nn.Layer): best_score = score best_index = i - logger.debug(f"result: {hyps[best_index]}") + logger.info(f"result: {hyps[best_index]}") return hyps[best_index][0] @jit.to_static(property=True) diff --git a/paddlespeech/s2t/modules/decoder.py b/paddlespeech/s2t/modules/decoder.py index 3b1a7f23d..03b637b7c 100644 --- a/paddlespeech/s2t/modules/decoder.py +++ b/paddlespeech/s2t/modules/decoder.py @@ -343,7 +343,7 @@ class BiTransformerDecoder(BatchScorerInterface, nn.Layer): """ l_x, _, olens = self.left_decoder(memory, memory_mask, ys_in_pad, ys_in_lens) - r_x = paddle.to_tensor(0.0) + r_x = paddle.zeros([1]) if reverse_weight > 0.0: r_x, _, olens = self.right_decoder(memory, memory_mask, r_ys_in_pad, ys_in_lens) From b10512eb0e64d615621baa2cd203129f20dd1626 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 21 Sep 2022 09:16:32 +0000 Subject: [PATCH 24/57] more config or u2pp --- examples/wenetspeech/asr1/README.md | 31 ++++++ .../asr1/conf/chunk_conformer.yaml | 4 +- .../asr1/conf/chunk_conformer_u2pp.yaml | 100 ++++++++++++++++++ examples/wenetspeech/asr1/local/export.sh | 2 + paddlespeech/s2t/models/u2/u2.py | 8 +- 5 files changed, 140 insertions(+), 5 deletions(-) create mode 100644 examples/wenetspeech/asr1/conf/chunk_conformer_u2pp.yaml diff --git a/examples/wenetspeech/asr1/README.md b/examples/wenetspeech/asr1/README.md index c08b94e29..9fc2856ce 100644 --- a/examples/wenetspeech/asr1/README.md +++ b/examples/wenetspeech/asr1/README.md @@ -12,3 +12,34 @@ show model.tar.gz ``` tar tf model.tar.gz ``` + +other way is: + +```bash +tar cvzf asr1_chunk_conformer_u2_wenetspeech_ckpt_1.1.0.model.tar.gz model.yaml conf/tuning/ conf/chunk_conformer.yaml conf/preprocess.yaml data/mean_std.json exp/chunk_conformer/checkpoints/ +``` + +## Export Static Model + +>> `data/test_meeting/data.list` +>> {"input": [{"name": "input1", "shape": [3.2230625, 80], "feat": "/home/PaddleSpeech/dataset/aishell/data_aishell/wav/test/S0764/BAC009S0764W0163.wav", "filetype": "sound"}], "output": [{"name": "target1", "shape": [9, 5538], "text": "\u697c\u5e02\u8c03\u63a7\u5c06\u53bb\u5411\u4f55\u65b9", "token": "\u697c \u5e02 \u8c03 \u63a7 \u5c06 \u53bb \u5411 \u4f55 \u65b9", "tokenid": "1891 1121 3502 1543 1018 477 528 163 1657"}], "utt": "BAC009S0764W0163", "utt2spk": "S0764"} + +>> Test Wav: +>> wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav +### U2 chunk conformer +>> UiDecoder +>> Make sure `reverse_weight` in config is `0.0` +>> https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2_wenetspeech_ckpt_1.1.0.model.tar.gz +``` +tar zxvf asr1_chunk_conformer_u2_wenetspeech_ckpt_1.1.0.model.tar.gz +./local/export.sh conf/chunk_conformer.yaml exp/chunk_conformer/checkpoints/avg_10 ./export.ji +``` + +### U2++ chunk conformer +>> BiDecoder +>> 
https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.1.0.model.tar.gz +>> Make sure `reverse_weight` in config is not `0.0` + +``` +./local/export.sh conf/chunk_conformer_u2pp.yaml exp/chunk_conformer/checkpoints/avg_10 ./export.ji +``` diff --git a/examples/wenetspeech/asr1/conf/chunk_conformer.yaml b/examples/wenetspeech/asr1/conf/chunk_conformer.yaml index 69fa223a1..d2f43d873 100644 --- a/examples/wenetspeech/asr1/conf/chunk_conformer.yaml +++ b/examples/wenetspeech/asr1/conf/chunk_conformer.yaml @@ -39,6 +39,7 @@ decoder_conf: model_conf: ctc_weight: 0.3 lsm_weight: 0.1 # label smoothing option + reverse_weight: 0.0 # unidecoder length_normalized_loss: false init_type: 'kaiming_uniform' @@ -53,8 +54,9 @@ test_manifest: data/test_meeting/data.list ########################################### # Dataloader # ########################################### -vocab_filepath: data/lang_char/vocab.txt +use_streaming_data: True unit_type: 'char' +vocab_filepath: data/lang_char/vocab.txt preprocess_config: conf/preprocess.yaml spm_model_prefix: '' feat_dim: 80 diff --git a/examples/wenetspeech/asr1/conf/chunk_conformer_u2pp.yaml b/examples/wenetspeech/asr1/conf/chunk_conformer_u2pp.yaml new file mode 100644 index 000000000..2bb2006b5 --- /dev/null +++ b/examples/wenetspeech/asr1/conf/chunk_conformer_u2pp.yaml @@ -0,0 +1,100 @@ +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 512 # dimension of attention + attention_heads: 8 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.1 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + use_cnn_module: True + cnn_module_kernel: 15 + activation_type: swish + pos_enc_layer_type: rel_pos + selfattention_layer_type: rel_selfattn + causal: true + use_dynamic_chunk: true + cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster + use_dynamic_left_chunk: false +# decoder related +decoder: bitransformer +decoder_conf: + attention_heads: 8 + linear_units: 2048 + num_blocks: 3 # the number of encoder blocks + r_num_blocks: 3 #only for bitransformer + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.1 + src_attention_dropout_rate: 0.1 + +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false + reverse_weight: 0.3 # only for bitransformer decoder + init_type: 'kaiming_uniform' # !Warning: need to convergence + +########################################### +# Data # +########################################### +train_manifest: data/train_l/data.list +dev_manifest: data/dev/data.list +test_manifest: data/test_meeting/data.list + +########################################### +# Dataloader # +########################################### +use_stream_data: True +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'char' +preprocess_config: conf/preprocess.yaml +spm_model_prefix: '' +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 32 +do_filter: True +maxlen_in: 1200 # if do_filter == 
False && input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 100 # if do_filter == False && output length > maxlen-out, batchsize is automatically reduced +minlen_in: 10 +minlen_out: 0 +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 + +########################################### +# Training # +########################################### +n_epoch: 150 +accum_grad: 8 +global_grad_clip: 5.0 +dist_sampler: False +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/wenetspeech/asr1/local/export.sh b/examples/wenetspeech/asr1/local/export.sh index 735c4f8e5..1f89afd6b 100755 --- a/examples/wenetspeech/asr1/local/export.sh +++ b/examples/wenetspeech/asr1/local/export.sh @@ -14,6 +14,8 @@ jit_model_export_path=$3 # export can not using StreamdataDataloader, set use_stream_dta False +# u2: reverse_weight should be 0.0 +# u2pp: reverse_weight should be same with config file. e.g. 0.3 python3 -u ${BIN_DIR}/export.py \ --ngpu ${ngpu} \ --config ${config_path} \ diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 7609b71e0..2279812ba 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -565,7 +565,7 @@ class U2BaseModel(ASRInterface, nn.Layer): [len(hyp[0]) for hyp in hyps], place=device, dtype=paddle.long) # (beam_size,) hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - logger.info( + logger.debug( f"hyps pad: {hyps_pad} {self.sos} {self.eos} {self.ignore_id}") hyps_lens = hyps_lens + 1 # Add at begining @@ -590,7 +590,7 @@ class U2BaseModel(ASRInterface, nn.Layer): # last decoder output token is `eos`, for laste decoder input token. 
score += decoder_out[i][len(hyp[0])][self.eos] - logger.info(f"hyp {i} len {len(hyp[0])} l2r score: {score} ctc_score: {hyp[1]} reverse_weight: {reverse_weight}") + logger.debug(f"hyp {i} len {len(hyp[0])} l2r score: {score} ctc_score: {hyp[1]} reverse_weight: {reverse_weight}") if reverse_weight > 0: r_score = 0.0 @@ -598,7 +598,7 @@ class U2BaseModel(ASRInterface, nn.Layer): r_score += r_decoder_out[i][len(hyp[0]) - j - 1][w] r_score += r_decoder_out[i][len(hyp[0])][self.eos] - logger.info(f"hyp {i} len {len(hyp[0])} r2l score: {score} ctc_score: {hyp[1]} reverse_weight: {reverse_weight}") + logger.info(f"hyp {i} len {len(hyp[0])} r2l score: {r_score} ctc_score: {hyp[1]} reverse_weight: {reverse_weight}") score = score * (1 - reverse_weight) + r_score * reverse_weight @@ -608,7 +608,7 @@ class U2BaseModel(ASRInterface, nn.Layer): best_score = score best_index = i - logger.info(f"result: {hyps[best_index]}") + logger.debug(f"result: {hyps[best_index]}") return hyps[best_index][0] @jit.to_static(property=True) From d25871a7b090fc76f7c1780eb3bf2fabb606aa14 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 21 Sep 2022 09:18:48 +0000 Subject: [PATCH 25/57] format --- paddlespeech/s2t/models/u2/u2.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 2279812ba..93c5d9106 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -571,8 +571,8 @@ class U2BaseModel(ASRInterface, nn.Layer): # ctc score in ln domain # (beam_size, max_hyps_len, vocab_size) - decoder_out, r_decoder_out = self.forward_attention_decoder(hyps_pad, hyps_lens, - encoder_out, reverse_weight) + decoder_out, r_decoder_out = self.forward_attention_decoder( + hyps_pad, hyps_lens, encoder_out, reverse_weight) decoder_out = decoder_out.numpy() # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a @@ -590,7 +590,9 @@ class U2BaseModel(ASRInterface, nn.Layer): # last decoder output token is `eos`, for laste decoder input token. 
                score += decoder_out[i][len(hyp[0])][self.eos]
-            logger.debug(f"hyp {i} len {len(hyp[0])} l2r score: {score} ctc_score: {hyp[1]} reverse_weight: {reverse_weight}")
+            logger.debug(
+                f"hyp {i} len {len(hyp[0])} l2r score: {score} ctc_score: {hyp[1]} reverse_weight: {reverse_weight}"
+            )

             if reverse_weight > 0:
                 r_score = 0.0
@@ -598,7 +600,9 @@ class U2BaseModel(ASRInterface, nn.Layer):
                     r_score += r_decoder_out[i][len(hyp[0]) - j - 1][w]
                 r_score += r_decoder_out[i][len(hyp[0])][self.eos]
-            logger.info(f"hyp {i} len {len(hyp[0])} r2l score: {r_score} ctc_score: {hyp[1]} reverse_weight: {reverse_weight}")
+            logger.info(
+                f"hyp {i} len {len(hyp[0])} r2l score: {r_score} ctc_score: {hyp[1]} reverse_weight: {reverse_weight}"
+            )

             score = score * (1 - reverse_weight) + r_score * reverse_weight

@@ -702,12 +706,11 @@ class U2BaseModel(ASRInterface, nn.Layer):
         return self.ctc.log_softmax(xs)

     # @jit.to_static
-    def forward_attention_decoder(
-            self,
-            hyps: paddle.Tensor,
-            hyps_lens: paddle.Tensor,
-            encoder_out: paddle.Tensor,
-            reverse_weight: float=0.0) -> paddle.Tensor:
+    def forward_attention_decoder(self,
+                                  hyps: paddle.Tensor,
+                                  hyps_lens: paddle.Tensor,
+                                  encoder_out: paddle.Tensor,
+                                  reverse_weight: float=0.0) -> paddle.Tensor:
         """ Export interface for c++ call, forward decoder with multiple
             hypothesis from ctc prefix beam search and one encoder output
         Args:

From 7382050e21990ae2b4dac0cd86a6dbac4d84d485 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Wed, 21 Sep 2022 11:15:00 +0000
Subject: [PATCH 26/57] fix bug on win

---
 paddlespeech/audio/utils/tensor_utils.py | 5 +++--
 paddlespeech/s2t/models/u2/u2.py | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/paddlespeech/audio/utils/tensor_utils.py b/paddlespeech/audio/utils/tensor_utils.py
index 44dcb52ec..b2436a121 100644
--- a/paddlespeech/audio/utils/tensor_utils.py
+++ b/paddlespeech/audio/utils/tensor_utils.py
@@ -237,7 +237,7 @@ def st_reverse_pad_list(ys_pad: paddle.Tensor,
     # >>> r_hyps = reverse_pad_list(r_hyps, r_hyps_lens, float(self.ignore_id))
     # >>> r_hyps, _ = add_sos_eos(r_hyps, self.sos, self.eos, self.ignore_id)
     B = ys_pad.shape[0]
-    _sos = paddle.ones([B, 1], dtype=ys_pad.dtype) * sos
+    _sos = paddle.full([B, 1], sos, dtype=ys_pad.dtype)
     max_len = paddle.max(ys_lens)
     index_range = paddle.arange(0, max_len, 1)
     seq_len_expand = ys_lens.unsqueeze(1)
@@ -279,7 +279,8 @@ def st_reverse_pad_list(ys_pad: paddle.Tensor,
     # >>> tensor([[3, 2, 1],
     # >>>         [4, 8, 9],
     # >>>         [2, 2, 2]])
-    r_hyps = paddle.where(seq_mask, r_hyps, eos)
+    _eos = paddle.full([1], eos, dtype=r_hyps.dtype)
+    r_hyps = paddle.where(seq_mask, r_hyps, _eos)
     # >>> r_hyps
     # >>> tensor([[3, 2, 1],
     # >>>         [4, 8, 9],
     # >>>         [2, 2, 2]])
diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py
index 93c5d9106..207e470a6 100644
--- a/paddlespeech/s2t/models/u2/u2.py
+++ b/paddlespeech/s2t/models/u2/u2.py
@@ -600,7 +600,7 @@ class U2BaseModel(ASRInterface, nn.Layer):
                     r_score += r_decoder_out[i][len(hyp[0]) - j - 1][w]
                 r_score += r_decoder_out[i][len(hyp[0])][self.eos]
-            logger.info(
+            logger.debug(
                 f"hyp {i} len {len(hyp[0])} r2l score: {r_score} ctc_score: {hyp[1]} reverse_weight: {reverse_weight}"
             )

From b7388ce25afc6da37b6011405141c0c9eb2ee99f Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Thu, 22 Sep 2022 11:42:20 +0000
Subject: [PATCH 27/57] eliminate useless unsqueeze

---
 paddlespeech/s2t/modules/embedding.py | 7 +++----
 paddlespeech/s2t/modules/encoder.py | 6 +++---
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git
a/paddlespeech/s2t/modules/embedding.py b/paddlespeech/s2t/modules/embedding.py index 3aeebd29b..54324c2f6 100644 --- a/paddlespeech/s2t/modules/embedding.py +++ b/paddlespeech/s2t/modules/embedding.py @@ -89,7 +89,7 @@ class PositionalEncoding(nn.Layer, PositionalEncodingInterface): self.max_len = max_len self.xscale = paddle.to_tensor(math.sqrt(self.d_model)) self.dropout = nn.Dropout(p=dropout_rate) - self.pe = paddle.zeros([self.max_len, self.d_model]) #[T,D] + self.pe = paddle.zeros([1, self.max_len, self.d_model]) #[B=1,T,D] position = paddle.arange( 0, self.max_len, dtype=paddle.float32).unsqueeze(1) #[T, 1] @@ -97,9 +97,8 @@ class PositionalEncoding(nn.Layer, PositionalEncodingInterface): paddle.arange(0, self.d_model, 2, dtype=paddle.float32) * -(math.log(10000.0) / self.d_model)) - self.pe[:, 0::2] = paddle.sin(position * div_term) - self.pe[:, 1::2] = paddle.cos(position * div_term) - self.pe = self.pe.unsqueeze(0) #[1, T, D] + self.pe[:, :, 0::2] = paddle.sin(position * div_term) + self.pe[:, :, 1::2] = paddle.cos(position * div_term) def forward(self, x: paddle.Tensor, offset: int=0) -> Tuple[paddle.Tensor, paddle.Tensor]: diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py index 87b83ef55..2e76ccb05 100644 --- a/paddlespeech/s2t/modules/encoder.py +++ b/paddlespeech/s2t/modules/encoder.py @@ -264,15 +264,15 @@ class BaseEncoder(nn.Layer): # new_att_cache = (1, head, attention_key_size, d_k*2) # new_cnn_cache = (B=1, hidden-dim, cache_t2) r_att_cache.append(new_att_cache[:, :, next_cache_start:, :]) - r_cnn_cache.append(new_cnn_cache.unsqueeze(0)) # add elayer dim + r_cnn_cache.append(new_cnn_cache) # add elayer dim if self.normalize_before: xs = self.after_norm(xs) # r_att_cache (elayers, head, T, d_k*2) - # r_cnn_cache (elayers, B=1, hidden-dim, cache_t2) + # r_cnn_cache (elayers, B=1, hidden-dim, cache_t2) r_att_cache = paddle.concat(r_att_cache, axis=0) - r_cnn_cache = paddle.concat(r_cnn_cache, axis=0) + r_cnn_cache = paddle.stack(r_cnn_cache, axis=0) return xs, r_att_cache, r_cnn_cache def forward_chunk_by_chunk( From c4a5ae382524cc1461f172e8659ef39b8a310081 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 26 Sep 2022 08:34:45 +0000 Subject: [PATCH 28/57] eliminate mul --- paddlespeech/audio/utils/tensor_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddlespeech/audio/utils/tensor_utils.py b/paddlespeech/audio/utils/tensor_utils.py index b2436a121..93883c94d 100644 --- a/paddlespeech/audio/utils/tensor_utils.py +++ b/paddlespeech/audio/utils/tensor_utils.py @@ -152,8 +152,8 @@ def add_sos_eos(ys_pad: paddle.Tensor, sos: int, eos: int, # return pad_sequence(ys_in, padding_value=eos).transpose([1,0]), pad_sequence(ys_out, padding_value=ignore_id).transpose([1,0]) B = ys_pad.shape[0] - _sos = paddle.ones([B, 1], dtype=ys_pad.dtype) * sos - _eos = paddle.ones([B, 1], dtype=ys_pad.dtype) * eos + _sos = paddle.full([B, 1], sos, dtype=ys_pad.dtype) + _eos = paddle.full([B, 1], eos, dtype=ys_pad.dtype) ys_in = paddle.cat([_sos, ys_pad], dim=1) mask_pad = (ys_in == ignore_id) ys_in = ys_in.masked_fill(mask_pad, eos) From 8e7a315e00806f54d320136467b9104d802bdc78 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 26 Sep 2022 11:02:28 +0000 Subject: [PATCH 29/57] remove comment --- paddlespeech/s2t/__init__.py | 1 - paddlespeech/s2t/exps/u2/bin/test.py | 2 -- paddlespeech/s2t/exps/u2_st/bin/test.py | 2 -- 3 files changed, 5 deletions(-) diff --git a/paddlespeech/s2t/__init__.py b/paddlespeech/s2t/__init__.py 
index 5fe2e16b9..3c704b272 100644
--- a/paddlespeech/s2t/__init__.py
+++ b/paddlespeech/s2t/__init__.py
@@ -22,7 +22,6 @@ from paddle.nn import functional as F

 from paddlespeech.s2t.utils.log import Log

-#TODO(Hui Zhang): remove fluid import
 logger = Log(__name__).getlog()

 ########### hack logging #############
diff --git a/paddlespeech/s2t/exps/u2/bin/test.py b/paddlespeech/s2t/exps/u2/bin/test.py
index f14d804f1..b13fd0d3f 100644
--- a/paddlespeech/s2t/exps/u2/bin/test.py
+++ b/paddlespeech/s2t/exps/u2/bin/test.py
@@ -20,8 +20,6 @@ from paddlespeech.s2t.exps.u2.model import U2Tester as Tester
 from paddlespeech.s2t.training.cli import default_argument_parser
 from paddlespeech.s2t.utils.utility import print_arguments

-# TODO(hui zhang): dynamic load
-

 def main_sp(config, args):
     exp = Tester(config, args)
diff --git a/paddlespeech/s2t/exps/u2_st/bin/test.py b/paddlespeech/s2t/exps/u2_st/bin/test.py
index 1d70a3103..c07c95bd5 100644
--- a/paddlespeech/s2t/exps/u2_st/bin/test.py
+++ b/paddlespeech/s2t/exps/u2_st/bin/test.py
@@ -20,8 +20,6 @@ from paddlespeech.s2t.exps.u2_st.model import U2STTester as Tester
 from paddlespeech.s2t.training.cli import default_argument_parser
 from paddlespeech.s2t.utils.utility import print_arguments

-# TODO(hui zhang): dynamic load
-

 def main_sp(config, args):
     exp = Tester(config, args)

From 6de81d74d9b00c0ec4e6163d9b74bbba5ac20ff0 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Mon, 26 Sep 2022 11:06:17 +0000
Subject: [PATCH 30/57] eliminate cast dtype for bool op

---
 paddlespeech/s2t/models/u2/u2.py | 12 +++---------
 paddlespeech/s2t/models/u2_st/u2_st.py | 5 +----
 paddlespeech/s2t/modules/decoder.py | 8 ++------
 paddlespeech/s2t/modules/encoder.py | 26 +++++++------------------
 paddlespeech/s2t/modules/mask.py | 9 ++-------
 paddlespeech/s2t/utils/tensor_utils.py | 11 +++--------
 6 files changed, 18 insertions(+), 53 deletions(-)

diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py
index 207e470a6..c25c2186d 100644
--- a/paddlespeech/s2t/models/u2/u2.py
+++ b/paddlespeech/s2t/models/u2/u2.py
@@ -124,10 +124,7 @@ class U2BaseModel(ASRInterface, nn.Layer):
         encoder_out, encoder_mask = self.encoder(speech, speech_lengths)
         encoder_time = time.time() - start
         #logger.debug(f"encoder time: {encoder_time}")
-        #TODO(Hui Zhang): sum not support bool type
-        #encoder_out_lens = encoder_mask.squeeze(1).sum(1) #[B, 1, T] -> [B]
-        encoder_out_lens = encoder_mask.squeeze(1).cast(paddle.int64).sum(
-            1) #[B, 1, T] -> [B]
+        encoder_out_lens = encoder_mask.squeeze(1).sum(1) #[B, 1, T] -> [B]

         # 2a. Attention-decoder branch
         loss_att = None
@@ -291,8 +288,7 @@ class U2BaseModel(ASRInterface, nn.Layer):
         # 2.
Decoder forward step by step for i in range(1, maxlen + 1): # Stop if all batch and all beam produce eos - # TODO(Hui Zhang): if end_flag.sum() == running_size: - if end_flag.cast(paddle.int64).sum() == running_size: + if end_flag.sum() == running_size: break # 2.1 Forward decoder step @@ -378,9 +374,7 @@ class U2BaseModel(ASRInterface, nn.Layer): speech, speech_lengths, decoding_chunk_size, num_decoding_left_chunks, simulate_streaming) maxlen = encoder_out.shape[1] - # (TODO Hui Zhang): bool no support reduce_sum - # encoder_out_lens = encoder_mask.squeeze(1).sum(1) - encoder_out_lens = encoder_mask.squeeze(1).astype(paddle.int).sum(1) + encoder_out_lens = encoder_mask.squeeze(1).sum(1) ctc_probs = self.ctc.log_softmax(encoder_out) # (B, maxlen, vocab_size) topk_prob, topk_index = ctc_probs.topk(1, axis=2) # (B, maxlen, 1) diff --git a/paddlespeech/s2t/models/u2_st/u2_st.py b/paddlespeech/s2t/models/u2_st/u2_st.py index e8b61bc0d..31defbbaf 100644 --- a/paddlespeech/s2t/models/u2_st/u2_st.py +++ b/paddlespeech/s2t/models/u2_st/u2_st.py @@ -111,10 +111,7 @@ class U2STBaseModel(nn.Layer): encoder_out, encoder_mask = self.encoder(speech, speech_lengths) encoder_time = time.time() - start #logger.debug(f"encoder time: {encoder_time}") - #TODO(Hui Zhang): sum not support bool type - #encoder_out_lens = encoder_mask.squeeze(1).sum(1) #[B, 1, T] -> [B] - encoder_out_lens = encoder_mask.squeeze(1).cast(paddle.int64).sum( - 1) #[B, 1, T] -> [B] + encoder_out_lens = encoder_mask.squeeze(1).sum(1) #[B, 1, T] -> [B] # 2a. ST-decoder branch start = time.time() diff --git a/paddlespeech/s2t/modules/decoder.py b/paddlespeech/s2t/modules/decoder.py index 03b637b7c..5e1b4c92b 100644 --- a/paddlespeech/s2t/modules/decoder.py +++ b/paddlespeech/s2t/modules/decoder.py @@ -140,9 +140,7 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer): # m: (1, L, L) m = subsequent_mask(tgt_mask.shape[-1]).unsqueeze(0) # tgt_mask: (B, L, L) - # TODO(Hui Zhang): not support & for tensor - # tgt_mask = tgt_mask & m - tgt_mask = tgt_mask.logical_and(m) + tgt_mask = tgt_mask & m x, _ = self.embed(tgt) for layer in self.decoders: @@ -153,9 +151,7 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer): if self.use_output_layer: x = self.output_layer(x) - # TODO(Hui Zhang): reduce_sum not support bool type - # olens = tgt_mask.sum(1) - olens = tgt_mask.astype(paddle.int).sum(1) + olens = tgt_mask.sum(1) return x, paddle.to_tensor(0.0), olens def forward_one_step( diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py index 2e76ccb05..db5848847 100644 --- a/paddlespeech/s2t/modules/encoder.py +++ b/paddlespeech/s2t/modules/encoder.py @@ -164,12 +164,8 @@ class BaseEncoder(nn.Layer): if self.global_cmvn is not None: xs = self.global_cmvn(xs) - #TODO(Hui Zhang): self.embed(xs, masks, offset=0), stride_slice not support bool tensor - xs, pos_emb, masks = self.embed(xs, masks.astype(xs.dtype), offset=0) - #TODO(Hui Zhang): remove mask.astype, stride_slice not support bool tensor - masks = masks.astype(paddle.bool) - #TODO(Hui Zhang): mask_pad = ~masks - mask_pad = masks.logical_not() + xs, pos_emb, masks = self.embed(xs, masks, offset=0) + mask_pad = ~masks chunk_masks = add_optional_chunk_mask( xs, masks, self.use_dynamic_chunk, self.use_dynamic_left_chunk, decoding_chunk_size, self.static_chunk_size, @@ -215,11 +211,8 @@ class BaseEncoder(nn.Layer): same shape as the original cnn_cache """ assert xs.shape[0] == 1 # batch size must be one - # tmp_masks is just for interface compatibility - 
# TODO(Hui Zhang): stride_slice not support bool tensor - # tmp_masks = paddle.ones([1, paddle.shape(xs)[1]], dtype=paddle.bool) - tmp_masks = paddle.ones([1, xs.shape[1]], dtype=paddle.int32) - tmp_masks = tmp_masks.unsqueeze(1) #[B=1, C=1, T] + # tmp_masks is just for interface compatibility, [B=1, C=1, T] + tmp_masks = paddle.ones([1, 1, xs.shape[1]], dtype=paddle.bool) if self.global_cmvn is not None: xs = self.global_cmvn(xs) @@ -228,9 +221,8 @@ class BaseEncoder(nn.Layer): xs, pos_emb, _ = self.embed(xs, tmp_masks, offset=offset) # after embed, xs=(B=1, chunk_size, hidden-dim) - elayers = paddle.shape(att_cache)[0] - cache_t1 = paddle.shape(att_cache)[2] - chunk_size = paddle.shape(xs)[1] + elayers, _, cache_t1, _ = att_cache.shape + chunk_size = xs.shape[1] attention_key_size = cache_t1 + chunk_size # only used when using `RelPositionMultiHeadedAttention` @@ -402,11 +394,7 @@ class TransformerEncoder(BaseEncoder): if self.global_cmvn is not None: xs = self.global_cmvn(xs) - #TODO(Hui Zhang): self.embed(xs, masks, offset=0), stride_slice not support bool tensor - xs, pos_emb, masks = self.embed(xs, masks.astype(xs.dtype), offset=0) - #TODO(Hui Zhang): remove mask.astype, stride_slice not support bool tensor - masks = masks.astype(paddle.bool) - + xs, pos_emb, masks = self.embed(xs, masks, offset=0) if cache is None: cache = [None for _ in range(len(self.encoders))] new_cache = [] diff --git a/paddlespeech/s2t/modules/mask.py b/paddlespeech/s2t/modules/mask.py index 1f66c015a..787a06528 100644 --- a/paddlespeech/s2t/modules/mask.py +++ b/paddlespeech/s2t/modules/mask.py @@ -109,13 +109,8 @@ def subsequent_mask(size: int) -> paddle.Tensor: [1, 1, 1]] """ ret = paddle.ones([size, size], dtype=paddle.bool) - #TODO(Hui Zhang): tril not support bool - #return paddle.tril(ret) - ret = ret.astype(paddle.float) - ret = paddle.tril(ret) - ret = ret.astype(paddle.bool) - return ret - + return paddle.tril(ret) + def subsequent_chunk_mask( size: int, diff --git a/paddlespeech/s2t/utils/tensor_utils.py b/paddlespeech/s2t/utils/tensor_utils.py index 422d4f82a..3ac102f3c 100644 --- a/paddlespeech/s2t/utils/tensor_utils.py +++ b/paddlespeech/s2t/utils/tensor_utils.py @@ -184,13 +184,8 @@ def th_accuracy(pad_outputs: paddle.Tensor, pad_pred = pad_outputs.view(pad_targets.shape[0], pad_targets.shape[1], pad_outputs.shape[1]).argmax(2) mask = pad_targets != ignore_label - #TODO(Hui Zhang): sum not support bool type - # numerator = paddle.sum( - # pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) - numerator = ( + + numerator = paddle.sum( pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) - numerator = paddle.sum(numerator.type_as(pad_targets)) - #TODO(Hui Zhang): sum not support bool type - # denominator = paddle.sum(mask) - denominator = paddle.sum(mask.type_as(pad_targets)) + denominator = paddle.sum(mask) return float(numerator) / float(denominator) From c2c8a662b14b09dc6fc0079ee074ead8d192c549 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 26 Sep 2022 11:07:29 +0000 Subject: [PATCH 31/57] refactor reshape --- paddlespeech/s2t/modules/embedding.py | 5 +---- paddlespeech/s2t/modules/subsampling.py | 4 ++-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/paddlespeech/s2t/modules/embedding.py b/paddlespeech/s2t/modules/embedding.py index 54324c2f6..f41a7b5d4 100644 --- a/paddlespeech/s2t/modules/embedding.py +++ b/paddlespeech/s2t/modules/embedding.py @@ -110,12 +110,10 @@ class PositionalEncoding(nn.Layer, PositionalEncodingInterface): paddle.Tensor: 
Encoded tensor. Its shape is (batch, time, ...) paddle.Tensor: for compatibility to RelPositionalEncoding, (batch=1, time, ...) """ - T = x.shape[1] assert offset + x.shape[ 1] < self.max_len, "offset: {} + x.shape[1]: {} is larger than the max_len: {}".format( offset, x.shape[1], self.max_len) - #TODO(Hui Zhang): using T = paddle.shape(x)[1], __getitem__ not support Tensor - pos_emb = self.pe[:, offset:offset + T] + pos_emb = self.pe[:, offset:offset + x.shape[1]] x = x * self.xscale + pos_emb return self.dropout(x), self.dropout(pos_emb) @@ -164,6 +162,5 @@ class RelPositionalEncoding(PositionalEncoding): 1] < self.max_len, "offset: {} + x.shape[1]: {} is larger than the max_len: {}".format( offset, x.shape[1], self.max_len) x = x * self.xscale - #TODO(Hui Zhang): using paddle.shape(x)[1], __getitem__ not support Tensor pos_emb = self.pe[:, offset:offset + x.shape[1]] return self.dropout(x), self.dropout(pos_emb) diff --git a/paddlespeech/s2t/modules/subsampling.py b/paddlespeech/s2t/modules/subsampling.py index 88451ddd7..2775988a7 100644 --- a/paddlespeech/s2t/modules/subsampling.py +++ b/paddlespeech/s2t/modules/subsampling.py @@ -139,8 +139,8 @@ class Conv2dSubsampling4(Conv2dSubsampling): """ x = x.unsqueeze(1) # (b, c=1, t, f) x = self.conv(x) - b, c, t, f = paddle.shape(x) - x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f])) + b, c, t, f = x.shape + x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, -1, c * f])) x, pos_emb = self.pos_enc(x, offset) return x, pos_emb, x_mask[:, :, :-2:2][:, :, :-2:2] From 3d7ca93861124b27ac390fa5bcaf2b4aef644f86 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 26 Sep 2022 11:07:48 +0000 Subject: [PATCH 32/57] bool type slice --- paddlespeech/s2t/modules/decoder_layer.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/paddlespeech/s2t/modules/decoder_layer.py b/paddlespeech/s2t/modules/decoder_layer.py index 37b124e84..cb7261107 100644 --- a/paddlespeech/s2t/modules/decoder_layer.py +++ b/paddlespeech/s2t/modules/decoder_layer.py @@ -114,10 +114,7 @@ class DecoderLayer(nn.Layer): ], f"{cache.shape} == {[tgt.shape[0], tgt.shape[1] - 1, self.size]}" tgt_q = tgt[:, -1:, :] residual = residual[:, -1:, :] - # TODO(Hui Zhang): slice not support bool type - # tgt_q_mask = tgt_mask[:, -1:, :] - tgt_q_mask = tgt_mask.cast(paddle.int64)[:, -1:, :].cast( - paddle.bool) + tgt_q_mask = tgt_mask[:, -1:, :] if self.concat_after: tgt_concat = paddle.cat( From f9e3eaa024218a5310c24bd504d4468826867bbd Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 26 Sep 2022 11:55:26 +0000 Subject: [PATCH 33/57] transpose in matmul --- paddlespeech/s2t/modules/attention.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/paddlespeech/s2t/modules/attention.py b/paddlespeech/s2t/modules/attention.py index 2d236743a..c02de15e8 100644 --- a/paddlespeech/s2t/modules/attention.py +++ b/paddlespeech/s2t/modules/attention.py @@ -188,8 +188,9 @@ class MultiHeadedAttention(nn.Layer): # non-trivial to calculate `next_cache_start` here. 
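# A quick standalone check (toy shapes assumed purely for illustration; not
# part of the patch) of the matmul(transpose_y=True) rewrite applied in the
# score computations just below: folding the key transpose into the matmul
# produces identical attention scores while skipping the explicit transpose.
import math
import paddle

q = paddle.randn([1, 2, 4, 8])  # (batch, head, time1, d_k)
k = paddle.randn([1, 2, 6, 8])  # (batch, head, time2, d_k)
ref = paddle.matmul(q, k.transpose([0, 1, 3, 2])) / math.sqrt(8)
fused = paddle.matmul(q, k, transpose_y=True) / math.sqrt(8)
assert bool(paddle.allclose(ref, fused))  # same scores, one fewer transpose kernel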
        new_cache = paddle.concat((k, v), axis=-1)

-        scores = paddle.matmul(q,
-                               k.transpose([0, 1, 3, 2])) / math.sqrt(self.d_k)
+        # scores = paddle.matmul(q,
+        #                        k.transpose([0, 1, 3, 2])) / math.sqrt(self.d_k)
+        scores = paddle.matmul(q, k, transpose_y=True) / math.sqrt(self.d_k)
         return self.forward_attention(v, scores, mask), new_cache

@@ -309,11 +310,13 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
         # first compute matrix a and matrix c
         # as described in https://arxiv.org/abs/1901.02860 Section 3.3
         # (batch, head, time1, time2)
-        matrix_ac = paddle.matmul(q_with_bias_u, k.transpose([0, 1, 3, 2]))
+        # matrix_ac = paddle.matmul(q_with_bias_u, k.transpose([0, 1, 3, 2]))
+        matrix_ac = paddle.matmul(q_with_bias_u, k, transpose_y=True)

         # compute matrix b and matrix d
         # (batch, head, time1, time2)
-        matrix_bd = paddle.matmul(q_with_bias_v, p.transpose([0, 1, 3, 2]))
+        # matrix_bd = paddle.matmul(q_with_bias_v, p.transpose([0, 1, 3, 2]))
+        matrix_bd = paddle.matmul(q_with_bias_v, p, transpose_y=True)
         # Remove rel_shift since it is useless in speech recognition,
         # and it requires special attention for streaming.
         # matrix_bd = self.rel_shift(matrix_bd)

From 46088c0a16aa1476c095b80fee551c7df4a8ce71 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Mon, 26 Sep 2022 12:19:30 +0000
Subject: [PATCH 34/57] eliminate attn transpose

---
 paddlespeech/s2t/modules/attention.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/paddlespeech/s2t/modules/attention.py b/paddlespeech/s2t/modules/attention.py
index c02de15e8..67bb869ed 100644
--- a/paddlespeech/s2t/modules/attention.py
+++ b/paddlespeech/s2t/modules/attention.py
@@ -271,7 +271,7 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
             and `head * d_k == size`
         """
         q, k, v = self.forward_qkv(query, key, value)
-        q = q.transpose([0, 2, 1, 3])  # (batch, time1, head, d_k)
+        # q = q.transpose([0, 2, 1, 3])  # (batch, time1, head, d_k)

         # when export onnx model, for 1st chunk, we feed
         # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode)
@@ -302,9 +302,11 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
         p = p.transpose([0, 2, 1, 3])  # (batch, head, time1, d_k)

         # (batch, head, time1, d_k)
-        q_with_bias_u = (q + self.pos_bias_u).transpose([0, 2, 1, 3])
+        # q_with_bias_u = (q + self.pos_bias_u).transpose([0, 2, 1, 3])
+        q_with_bias_u = q + self.pos_bias_u.unsqueeze(1)
         # (batch, head, time1, d_k)
-        q_with_bias_v = (q + self.pos_bias_v).transpose([0, 2, 1, 3])
+        # q_with_bias_v = (q + self.pos_bias_v).transpose([0, 2, 1, 3])
+        q_with_bias_v = q + self.pos_bias_v.unsqueeze(1)

         # compute attention score
         # first compute matrix a and matrix c
         # as described in https://arxiv.org/abs/1901.02860 Section 3.3
         # (batch, head, time1, time2)

From 3adb20b468fa40a138316e66d59be12c4d20314e Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Mon, 26 Sep 2022 15:50:06 +0000
Subject: [PATCH 35/57] eliminate shape and slice

---
 paddlespeech/s2t/modules/conformer_convolution.py | 6 +++---
 paddlespeech/s2t/modules/decoder.py | 2 +-
 paddlespeech/s2t/modules/encoder.py | 2 +-
 paddlespeech/s2t/modules/loss.py | 4 ++--
 paddlespeech/s2t/modules/subsampling.py | 7 ++++---
 5 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/paddlespeech/s2t/modules/conformer_convolution.py b/paddlespeech/s2t/modules/conformer_convolution.py
index be6056546..09d903eee 100644
--- a/paddlespeech/s2t/modules/conformer_convolution.py
+++ b/paddlespeech/s2t/modules/conformer_convolution.py
@@ -127,11 +127,11 @@ class ConvolutionModule(nn.Layer):
         x = x.transpose([0, 2, 1])  # [B, C, T]

         # mask batch padding
-        if paddle.shape(mask_pad)[2] > 0:  #
time > 0 + if mask_pad.shape[2] > 0: # time > 0 x = x.masked_fill(mask_pad, 0.0) if self.lorder > 0: - if paddle.shape(cache)[2] == 0: # cache_t == 0 + if cache.shape[2] == 0: # cache_t == 0 x = nn.functional.pad( x, [self.lorder, 0], 'constant', 0.0, data_format='NCL') else: @@ -161,7 +161,7 @@ class ConvolutionModule(nn.Layer): x = self.pointwise_conv2(x) # mask batch padding - if paddle.shape(mask_pad)[2] > 0: # time > 0 + if mask_pad.shape[2] > 0: # time > 0 x = x.masked_fill(mask_pad, 0.0) x = x.transpose([0, 2, 1]) # [B, T, C] diff --git a/paddlespeech/s2t/modules/decoder.py b/paddlespeech/s2t/modules/decoder.py index 5e1b4c92b..4ddf057b6 100644 --- a/paddlespeech/s2t/modules/decoder.py +++ b/paddlespeech/s2t/modules/decoder.py @@ -243,7 +243,7 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer): ] # batch decoding - ys_mask = subsequent_mask(paddle.shape(ys)[-1]).unsqueeze(0) # (B,L,L) + ys_mask = subsequent_mask(ys.shape[-1]).unsqueeze(0) # (B,L,L) xs_mask = make_xs_mask(xs).unsqueeze(1) # (B,1,T) logp, states = self.forward_one_step( xs, xs_mask, ys, ys_mask, cache=batch_state) diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py index db5848847..f23d3f140 100644 --- a/paddlespeech/s2t/modules/encoder.py +++ b/paddlespeech/s2t/modules/encoder.py @@ -246,7 +246,7 @@ class BaseEncoder(nn.Layer): # tensor zeros([0,0,0,0]) support [i:i+1] slice, will return zeros([0,0,0,0]) tensor # raw code as below: # att_cache=att_cache[i:i+1] if elayers > 0 else att_cache, - # cnn_cache=cnn_cache[i:i+1] if paddle.shape(cnn_cache)[0] > 0 else cnn_cache, + # cnn_cache=cnn_cache[i:i+1] if cnn_cache.shape[0] > 0 else cnn_cache, xs, _, new_att_cache, new_cnn_cache = layer( xs, att_mask, diff --git a/paddlespeech/s2t/modules/loss.py b/paddlespeech/s2t/modules/loss.py index 884fb70c1..afd5201aa 100644 --- a/paddlespeech/s2t/modules/loss.py +++ b/paddlespeech/s2t/modules/loss.py @@ -85,7 +85,7 @@ class CTCLoss(nn.Layer): Returns: [paddle.Tensor]: scalar. If reduction is 'none', then (N), where N = \text{batch size}. 
""" - B = paddle.shape(logits)[0] + B = logits.shape[0] # warp-ctc need logits, and do softmax on logits by itself # warp-ctc need activation with shape [T, B, V + 1] # logits: (B, L, D) -> (L, B, D) @@ -158,7 +158,7 @@ class LabelSmoothingLoss(nn.Layer): Returns: loss (paddle.Tensor) : The KL loss, scalar float value """ - B, T, D = paddle.shape(x) + B, T, D = x.shape assert D == self.size x = x.reshape((-1, self.size)) target = target.reshape([-1]) diff --git a/paddlespeech/s2t/modules/subsampling.py b/paddlespeech/s2t/modules/subsampling.py index 2775988a7..782a437ee 100644 --- a/paddlespeech/s2t/modules/subsampling.py +++ b/paddlespeech/s2t/modules/subsampling.py @@ -192,8 +192,8 @@ class Conv2dSubsampling6(Conv2dSubsampling): """ x = x.unsqueeze(1) # (b, c, t, f) x = self.conv(x) - b, c, t, f = paddle.shape(x) - x = self.linear(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f])) + b, c, t, f = x.shape + x = self.linear(x.transpose([0, 2, 1, 3]).reshape([b, -1, c * f])) x, pos_emb = self.pos_enc(x, offset) return x, pos_emb, x_mask[:, :, :-2:2][:, :, :-4:3] @@ -245,6 +245,7 @@ class Conv2dSubsampling8(Conv2dSubsampling): """ x = x.unsqueeze(1) # (b, c, t, f) x = self.conv(x) - x = self.linear(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f])) + b, c, t, f = x.shape + x = self.linear(x.transpose([0, 2, 1, 3]).reshape([b, -1, c * f])) x, pos_emb = self.pos_enc(x, offset) return x, pos_emb, x_mask[:, :, :-2:2][:, :, :-2:2][:, :, :-2:2] From feb27e2a8483cacc3c9200805986937bb2cfc6cd Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 26 Sep 2022 15:54:31 +0000 Subject: [PATCH 36/57] fuse linear kv --- paddlespeech/s2t/modules/attention.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/paddlespeech/s2t/modules/attention.py b/paddlespeech/s2t/modules/attention.py index 67bb869ed..2166ca8bf 100644 --- a/paddlespeech/s2t/modules/attention.py +++ b/paddlespeech/s2t/modules/attention.py @@ -20,6 +20,7 @@ from typing import Tuple import paddle from paddle import nn from paddle.nn import initializer as I +from paddle.nn import functional as F from paddlespeech.s2t.modules.align import Linear from paddlespeech.s2t.utils.log import Log @@ -45,6 +46,7 @@ class MultiHeadedAttention(nn.Layer): """ super().__init__() assert n_feat % n_head == 0 + self.n_feat = n_feat # We assume d_v always equals d_k self.d_k = n_feat // n_head self.h = n_head @@ -54,6 +56,15 @@ class MultiHeadedAttention(nn.Layer): self.linear_out = Linear(n_feat, n_feat) self.dropout = nn.Dropout(p=dropout_rate) + + def _build_once(self, *args, **kwargs): + super()._build_once(*args, **kwargs) + # if self.self_att: + # self.linear_kv = Linear(self.n_feat, self.n_feat*2) + self.weight = paddle.concat([self.linear_k.weight, self.linear_v.weight], axis=-1) + self.bias = paddle.concat([self.linear_k.bias, self.linear_v.bias]) + self._built = True + def forward_qkv(self, query: paddle.Tensor, key: paddle.Tensor, @@ -73,9 +84,12 @@ class MultiHeadedAttention(nn.Layer): (#batch, n_head, time2, d_k). 
""" n_batch = query.shape[0] + q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k) - k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k) - v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k) + # k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k) + # v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k) + k, v = F.linear(key, self.weight, self.bias).view(n_batch, -1, 2 * self.h, self.d_k).split(2, axis=2) + q = q.transpose([0, 2, 1, 3]) # (batch, head, time1, d_k) k = k.transpose([0, 2, 1, 3]) # (batch, head, time2, d_k) v = v.transpose([0, 2, 1, 3]) # (batch, head, time2, d_k) @@ -108,10 +122,10 @@ class MultiHeadedAttention(nn.Layer): # When will `if mask.size(2) > 0` be False? # 1. onnx(16/-1, -1/-1, 16/0) # 2. jit (16/-1, -1/-1, 16/0, 16/4) - if paddle.shape(mask)[2] > 0: # time2 > 0 + if mask.shape[2] > 0: # time2 > 0 mask = mask.unsqueeze(1).equal(0) # (batch, 1, *, time2) # for last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :paddle.shape(scores)[-1]] + mask = mask[:, :, :, :scores.shape[-1]] scores = scores.masked_fill(mask, -float('inf')) attn = paddle.softmax( scores, axis=-1).masked_fill(mask, @@ -179,7 +193,7 @@ class MultiHeadedAttention(nn.Layer): # >>> torch.equal(b, c) # True # >>> d = torch.split(a, 2, dim=-1) # >>> torch.equal(d[0], d[1]) # True - if paddle.shape(cache)[0] > 0: + if cache.shape[0] > 0: # last dim `d_k * 2` for (key, val) key_cache, value_cache = paddle.split(cache, 2, axis=-1) k = paddle.concat([key_cache, k], axis=2) @@ -288,7 +302,7 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention): # >>> torch.equal(b, c) # True # >>> d = torch.split(a, 2, dim=-1) # >>> torch.equal(d[0], d[1]) # True - if paddle.shape(cache)[0] > 0: + if cache.shape[0] > 0: # last dim `d_k * 2` for (key, val) key_cache, value_cache = paddle.split(cache, 2, axis=-1) k = paddle.concat([key_cache, k], axis=2) From b20bf7d5dee23eef82ef4a810db2eafe8752e6d8 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 27 Sep 2022 08:47:22 +0000 Subject: [PATCH 37/57] masked_fill by multiply, remove while --- paddlespeech/s2t/__init__.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/paddlespeech/s2t/__init__.py b/paddlespeech/s2t/__init__.py index 3c704b272..b67322cdc 100644 --- a/paddlespeech/s2t/__init__.py +++ b/paddlespeech/s2t/__init__.py @@ -166,15 +166,9 @@ def broadcast_shape(shp1, shp2): def masked_fill(xs: paddle.Tensor, mask: paddle.Tensor, value: Union[float, int]): - bshape = broadcast_shape(xs.shape, mask.shape) mask.stop_gradient = True - tmp = paddle.ones(shape=[len(bshape)], dtype='int32') - for index in range(len(bshape)): - tmp[index] = bshape[index] - mask = mask.broadcast_to(tmp) - trues = paddle.ones_like(xs) * value - xs = paddle.where(mask, trues, xs) - return xs + mask = mask.astype(xs.dtype) + return xs * (1.0 - mask) + mask * value if not hasattr(paddle.Tensor, 'masked_fill'): From afda7ed7d1f0fad154e1984bc6aa32980b98b368 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 28 Sep 2022 06:36:12 +0000 Subject: [PATCH 38/57] remove useless code --- paddlespeech/s2t/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/paddlespeech/s2t/__init__.py b/paddlespeech/s2t/__init__.py index b67322cdc..4507365d6 100644 --- a/paddlespeech/s2t/__init__.py +++ b/paddlespeech/s2t/__init__.py @@ -166,7 +166,6 @@ def broadcast_shape(shp1, shp2): def masked_fill(xs: paddle.Tensor, mask: paddle.Tensor, value: Union[float, int]): - mask.stop_gradient = True mask = 
mask.astype(xs.dtype) return xs * (1.0 - mask) + mask * value From 3ed24474d2ee85d3aee71de37c9b84c97094f5ef Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Sat, 8 Oct 2022 02:34:10 +0000 Subject: [PATCH 39/57] wenetspeech asr1 quant --- examples/wenetspeech/asr1/local/quant.sh | 59 ++++++ paddlespeech/s2t/exps/u2/bin/quant.py | 220 +++++++++++++++++++++++ 2 files changed, 279 insertions(+) create mode 100755 examples/wenetspeech/asr1/local/quant.sh create mode 100644 paddlespeech/s2t/exps/u2/bin/quant.py diff --git a/examples/wenetspeech/asr1/local/quant.sh b/examples/wenetspeech/asr1/local/quant.sh new file mode 100755 index 000000000..9dfea9045 --- /dev/null +++ b/examples/wenetspeech/asr1/local/quant.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_file" + exit -1 +fi + +ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +echo "using $ngpu gpus..." + +config_path=$1 +decode_config_path=$2 +ckpt_prefix=$3 +audio_file=$4 + +mkdir -p data +wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/ +if [ $? -ne 0 ]; then + exit 1 +fi + +if [ ! -f ${audio_file} ]; then + echo "Plase input the right audio_file path" + exit 1 +fi + + +chunk_mode=false +if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then + chunk_mode=true +fi + +# download language model +#bash local/download_lm_ch.sh +#if [ $? -ne 0 ]; then +# exit 1 +#fi + +for type in attention_rescoring; do + echo "decoding ${type}" + batch_size=1 + output_dir=${ckpt_prefix} + mkdir -p ${output_dir} + python3 -u ${BIN_DIR}/quant.py \ + --ngpu ${ngpu} \ + --config ${config_path} \ + --decode_cfg ${decode_config_path} \ + --result_file ${output_dir}/${type}.rsl \ + --checkpoint_path ${ckpt_prefix} \ + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} \ + --audio_file ${audio_file} + + if [ $? -ne 0 ]; then + echo "Failed in evaluation!" + exit 1 + fi +done +exit 0 diff --git a/paddlespeech/s2t/exps/u2/bin/quant.py b/paddlespeech/s2t/exps/u2/bin/quant.py new file mode 100644 index 000000000..de7c27e79 --- /dev/null +++ b/paddlespeech/s2t/exps/u2/bin/quant.py @@ -0,0 +1,220 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
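# A condensed sketch of the PaddleSlim post-training-quantization flow that the
# new script below drives. Only PTQ(), ptq.quantize() and
# ptq.save_quantized_model() mirror calls that actually appear in the script;
# the helper name, model argument and export path are illustrative placeholders.
import paddle
from paddleslim import PTQ

def quantize_and_export(model: paddle.nn.Layer, export_path: str):
    model.eval()
    ptq = PTQ()                   # default post-training quantization config
    qmodel = ptq.quantize(model)  # wrap the dygraph model with quant/observer ops
    # ...feed a few representative batches through qmodel here to calibrate...
    ptq.save_quantized_model(qmodel, export_path)  # jit-save the quantized graph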
+"""Evaluation for U2 model.""" +import os +import sys +from pathlib import Path + +import paddle +import soundfile +from yacs.config import CfgNode + +from paddlespeech.audio.transform.transformation import Transformation +from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer +from paddlespeech.s2t.models.u2 import U2Model +from paddlespeech.s2t.training.cli import default_argument_parser +from paddlespeech.s2t.utils.log import Log +from paddlespeech.s2t.utils.utility import UpdateConfig +from paddleslim import PTQ +logger = Log(__name__).getlog() + + +class U2Infer(): + def __init__(self, config, args): + self.args = args + self.config = config + self.audio_file = args.audio_file + + self.preprocess_conf = config.preprocess_config + self.preprocess_args = {"train": False} + self.preprocessing = Transformation(self.preprocess_conf) + self.reverse_weight = getattr(config.model_conf, 'reverse_weight', 0.0) + self.text_feature = TextFeaturizer( + unit_type=config.unit_type, + vocab=config.vocab_filepath, + spm_model_prefix=config.spm_model_prefix) + + paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu') + + # model + model_conf = config + with UpdateConfig(model_conf): + model_conf.input_dim = config.feat_dim + model_conf.output_dim = self.text_feature.vocab_size + model = U2Model.from_config(model_conf) + self.model = model + self.model.eval() + self.ptq = PTQ() + self.model = self.ptq.quantize(model) + + # load model + params_path = self.args.checkpoint_path + ".pdparams" + model_dict = paddle.load(params_path) + self.model.set_state_dict(model_dict) + logger.info(f"model_dict: {model_dict.keys()}") + + def run(self): + check(args.audio_file) + + with paddle.no_grad(): + # read + audio, sample_rate = soundfile.read( + self.audio_file, dtype="int16", always_2d=True) + audio = audio[:, 0] + logger.info(f"audio shape: {audio.shape}") + + # fbank + feat = self.preprocessing(audio, **self.preprocess_args) + logger.info(f"feat shape: {feat.shape}") + + ilen = paddle.to_tensor(feat.shape[0]) + xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(0) + decode_config = self.config.decode + logger.info(f"decode cfg: {decode_config}") + result_transcripts = self.model.decode( + xs, + ilen, + text_feature=self.text_feature, + decoding_method=decode_config.decoding_method, + beam_size=decode_config.beam_size, + ctc_weight=decode_config.ctc_weight, + decoding_chunk_size=decode_config.decoding_chunk_size, + num_decoding_left_chunks=decode_config.num_decoding_left_chunks, + simulate_streaming=decode_config.simulate_streaming, + reverse_weight=self.reverse_weight) + rsl = result_transcripts[0][0] + utt = Path(self.audio_file).name + logger.info(f"hyp: {utt} {result_transcripts[0][0]}") + # print(self.model) + # print(self.model.forward_encoder_chunk) + # return rsl + + logger.info("-------------start export ----------------------") + batch_size = 1 + feat_dim = 80 + model_size = 512 + num_left_chunks = -1 + logger.info( + f"U2 Export Model Params: batch_size {batch_size}, feat_dim {feat_dim}, model_size {model_size}, num_left_chunks {num_left_chunks}" + ) + + # ######################## self.model.forward_encoder_chunk ############ + # input_spec = [ + # # (T,), int16 + # paddle.static.InputSpec(shape=[None], dtype='int16'), + # ] + # self.model.forward_feature = paddle.jit.to_static( + # self.model.forward_feature, input_spec=input_spec) + + ######################### self.model.forward_encoder_chunk ############ + input_spec = [ + # xs, (B, T, D) + paddle.static.InputSpec( + 
shape=[batch_size, None, feat_dim], dtype='float32'), + # offset, int, but need be tensor + paddle.static.InputSpec(shape=[1], dtype='int32'), + # required_cache_size, int + num_left_chunks, + # att_cache + paddle.static.InputSpec( + shape=[None, None, None, None], dtype='float32'), + # cnn_cache + paddle.static.InputSpec( + shape=[None, None, None, None], dtype='float32') + ] + self.model.forward_encoder_chunk = paddle.jit.to_static( + self.model.forward_encoder_chunk, input_spec=input_spec) + + ######################### self.model.ctc_activation ######################## + input_spec = [ + # encoder_out, (B,T,D) + paddle.static.InputSpec( + shape=[batch_size, None, model_size], dtype='float32') + ] + self.model.ctc_activation = paddle.jit.to_static( + self.model.ctc_activation, input_spec=input_spec) + + ######################### self.model.forward_attention_decoder ######################## + reverse_weight = 0.3 + input_spec = [ + # hyps, (B, U) + paddle.static.InputSpec(shape=[None, None], dtype='int64'), + # hyps_lens, (B,) + paddle.static.InputSpec(shape=[None], dtype='int64'), + # encoder_out, (B,T,D) + paddle.static.InputSpec( + shape=[batch_size, None, model_size], dtype='float32'), + reverse_weight + ] + self.model.forward_attention_decoder = paddle.jit.to_static( + self.model.forward_attention_decoder, input_spec=input_spec) + ################################################################################ + + # jit save + logger.info(f"export save: {self.args.export_path}") + config = {'is_static': True, 'combine_params':True, 'skip_forward':True} + self.ptq.save_quantized_model(self.model, self.args.export_path) + # paddle.jit.save( + # self.model, + # self.args.export_path, + # combine_params=True, + # skip_forward=True) + + + +def check(audio_file): + if not os.path.isfile(audio_file): + print("Please input the right audio file path") + sys.exit(-1) + + logger.info("checking the audio file format......") + try: + sig, sample_rate = soundfile.read(audio_file) + except Exception as e: + logger.error(str(e)) + logger.error( + "can not open the wav file, please check the audio file format") + sys.exit(-1) + logger.info("The sample rate is %d" % sample_rate) + assert (sample_rate == 16000) + logger.info("The audio file format is right") + + +def main(config, args): + U2Infer(config, args).run() + + +if __name__ == "__main__": + parser = default_argument_parser() + # save asr result to + parser.add_argument( + "--result_file", type=str, help="path of save the asr result") + parser.add_argument( + "--audio_file", type=str, help="path of the input audio file") + parser.add_argument( + "--export_path", type=str, default='export', help="path of the input audio file") + args = parser.parse_args() + + config = CfgNode(new_allowed=True) + + if args.config: + config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs + if args.opts: + config.merge_from_list(args.opts) + config.freeze() + main(config, args) From 925abcca2347851af3b90d9e1dca06eb13ab04a2 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Sat, 8 Oct 2022 03:44:13 +0000 Subject: [PATCH 40/57] format --- paddlespeech/s2t/modules/attention.py | 16 +++++++++------- paddlespeech/s2t/modules/encoder.py | 2 +- paddlespeech/s2t/modules/mask.py | 2 +- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/paddlespeech/s2t/modules/attention.py b/paddlespeech/s2t/modules/attention.py index 2166ca8bf..d9ee763f1 
100644
--- a/paddlespeech/s2t/modules/attention.py
+++ b/paddlespeech/s2t/modules/attention.py
@@ -19,8 +19,8 @@ from typing import Tuple

 import paddle
 from paddle import nn
-from paddle.nn import initializer as I
 from paddle.nn import functional as F
+from paddle.nn import initializer as I

 from paddlespeech.s2t.modules.align import Linear
 from paddlespeech.s2t.utils.log import Log
@@ -56,12 +56,12 @@ class MultiHeadedAttention(nn.Layer):
         self.linear_out = Linear(n_feat, n_feat)
         self.dropout = nn.Dropout(p=dropout_rate)

-
     def _build_once(self, *args, **kwargs):
         super()._build_once(*args, **kwargs)
         # if self.self_att:
         #     self.linear_kv = Linear(self.n_feat, self.n_feat*2)
-        self.weight = paddle.concat([self.linear_k.weight, self.linear_v.weight], axis=-1)
+        self.weight = paddle.concat(
+            [self.linear_k.weight, self.linear_v.weight], axis=-1)
         self.bias = paddle.concat([self.linear_k.bias, self.linear_v.bias])
         self._built = True

@@ -84,12 +84,14 @@ class MultiHeadedAttention(nn.Layer):
                 (#batch, n_head, time2, d_k).
         """
         n_batch = query.shape[0]
-
+
         q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k)
         # k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k)
         # v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k)
-        k, v = F.linear(key, self.weight, self.bias).view(n_batch, -1, 2 * self.h, self.d_k).split(2, axis=2)
-
+        k, v = F.linear(key, self.weight, self.bias).view(
+            n_batch, -1, 2 * self.h, self.d_k).split(
+                2, axis=2)
+
         q = q.transpose([0, 2, 1, 3])  # (batch, head, time1, d_k)
         k = k.transpose([0, 2, 1, 3])  # (batch, head, time2, d_k)
         v = v.transpose([0, 2, 1, 3])  # (batch, head, time2, d_k)
@@ -203,7 +205,7 @@ class MultiHeadedAttention(nn.Layer):
         new_cache = paddle.concat((k, v), axis=-1)

         # scores = paddle.matmul(q,
-        #                       k.transpose([0, 1, 3, 2])) / math.sqrt(self.d_k)
+        #                        k.transpose([0, 1, 3, 2])) / math.sqrt(self.d_k)
         scores = paddle.matmul(q, k, transpose_y=True) / math.sqrt(self.d_k)

         return self.forward_attention(v, scores, mask), new_cache
diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py
index f23d3f140..fd7bd7b9a 100644
--- a/paddlespeech/s2t/modules/encoder.py
+++ b/paddlespeech/s2t/modules/encoder.py
@@ -221,7 +221,7 @@ class BaseEncoder(nn.Layer):
         xs, pos_emb, _ = self.embed(xs, tmp_masks, offset=offset)
         # after embed, xs=(B=1, chunk_size, hidden-dim)

-        elayers, _, cache_t1, _ = att_cache.shape 
+        elayers, _, cache_t1, _ = att_cache.shape
         chunk_size = xs.shape[1]
         attention_key_size = cache_t1 + chunk_size

diff --git a/paddlespeech/s2t/modules/mask.py b/paddlespeech/s2t/modules/mask.py
index 787a06528..65619eb90 100644
--- a/paddlespeech/s2t/modules/mask.py
+++ b/paddlespeech/s2t/modules/mask.py
@@ -110,7 +110,7 @@ def subsequent_mask(size: int) -> paddle.Tensor:
     """
     ret = paddle.ones([size, size], dtype=paddle.bool)
     return paddle.tril(ret)
-    
+

 def subsequent_chunk_mask(
         size: int,

From abe22e56a48e96dee1b83b71c9ac3babf0afa62e Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Sat, 8 Oct 2022 03:45:37 +0000
Subject: [PATCH 41/57] paddle version for u2/u2pp export

---
 examples/wenetspeech/asr1/README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/examples/wenetspeech/asr1/README.md b/examples/wenetspeech/asr1/README.md
index 9fc2856ce..5a516f8ea 100644
--- a/examples/wenetspeech/asr1/README.md
+++ b/examples/wenetspeech/asr1/README.md
@@ -21,6 +21,8 @@ tar cvzf asr1_chunk_conformer_u2_wenetspeech_ckpt_1.1.0.model.tar.gz model.yaml

 ## Export Static Model

+>> Need Paddle >= 2.4
+
 >> `data/test_meeting/data.list`
 >>
{"input": [{"name": "input1", "shape": [3.2230625, 80], "feat": "/home/PaddleSpeech/dataset/aishell/data_aishell/wav/test/S0764/BAC009S0764W0163.wav", "filetype": "sound"}], "output": [{"name": "target1", "shape": [9, 5538], "text": "\u697c\u5e02\u8c03\u63a7\u5c06\u53bb\u5411\u4f55\u65b9", "token": "\u697c \u5e02 \u8c03 \u63a7 \u5c06 \u53bb \u5411 \u4f55 \u65b9", "tokenid": "1891 1121 3502 1543 1018 477 528 163 1657"}], "utt": "BAC009S0764W0163", "utt2spk": "S0764"} From e86337a4233d9bfa8b802a8cfd218e9c9637e158 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Sat, 8 Oct 2022 03:49:19 +0000 Subject: [PATCH 42/57] fix bug --- paddlespeech/s2t/exps/u2/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index 4208d389e..d093821d8 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -350,7 +350,7 @@ class U2Tester(U2Trainer): ctc_weight=decode_config.ctc_weight, decoding_chunk_size=decode_config.decoding_chunk_size, num_decoding_left_chunks=decode_config.num_decoding_left_chunks, - simulate_streaming=decode_config.simulate_streaming + simulate_streaming=decode_config.simulate_streaming, reverse_weight=decode_config.reverse_weight) decode_time = time.time() - start_time From 1f4f98b171b490133c75e98e199e1ff4beb21962 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Sat, 8 Oct 2022 06:34:39 +0000 Subject: [PATCH 43/57] fix bug --- paddlespeech/s2t/exps/u2/bin/quant.py | 18 ++++++++++----- paddlespeech/s2t/models/u2/u2.py | 22 +++++++++---------- .../engine/asr/online/python/asr_engine.py | 1 - 3 files changed, 22 insertions(+), 19 deletions(-) diff --git a/paddlespeech/s2t/exps/u2/bin/quant.py b/paddlespeech/s2t/exps/u2/bin/quant.py index 907d79e5c..225bbf6db 100644 --- a/paddlespeech/s2t/exps/u2/bin/quant.py +++ b/paddlespeech/s2t/exps/u2/bin/quant.py @@ -18,6 +18,7 @@ from pathlib import Path import paddle import soundfile +from paddleslim import PTQ from yacs.config import CfgNode from paddlespeech.audio.transform.transformation import Transformation @@ -26,7 +27,6 @@ from paddlespeech.s2t.models.u2 import U2Model from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.log import Log from paddlespeech.s2t.utils.utility import UpdateConfig -from paddleslim import PTQ logger = Log(__name__).getlog() @@ -90,14 +90,14 @@ class U2Infer(): ctc_weight=decode_config.ctc_weight, decoding_chunk_size=decode_config.decoding_chunk_size, num_decoding_left_chunks=decode_config.num_decoding_left_chunks, - simulate_streaming=decode_config.simulate_streaming + simulate_streaming=decode_config.simulate_streaming, reverse_weight=decode_config.reverse_weight) rsl = result_transcripts[0][0] utt = Path(self.audio_file).name logger.info(f"hyp: {utt} {rsl}") # print(self.model) # print(self.model.forward_encoder_chunk) - + logger.info("-------------start quant ----------------------") batch_size = 1 feat_dim = 80 @@ -161,7 +161,11 @@ class U2Infer(): # jit save logger.info(f"export save: {self.args.export_path}") - config = {'is_static': True, 'combine_params':True, 'skip_forward':True} + config = { + 'is_static': True, + 'combine_params': True, + 'skip_forward': True + } self.ptq.save_quantized_model(self.model, self.args.export_path) # paddle.jit.save( # self.model, @@ -169,7 +173,6 @@ class U2Infer(): # combine_params=True, # skip_forward=True) - def check(audio_file): if not os.path.isfile(audio_file): @@ -201,7 +204,10 @@ if __name__ == "__main__": 
parser.add_argument( "--audio_file", type=str, help="path of the input audio file") parser.add_argument( - "--export_path", type=str, default='export', help="path of the input audio file") + "--export_path", + type=str, + default='export', + help="path of the input audio file") args = parser.parse_args() config = CfgNode(new_allowed=True) diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 5cdcae06f..544c1e836 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -131,7 +131,8 @@ class U2BaseModel(ASRInterface, nn.Layer): if self.ctc_weight != 1.0: start = time.time() loss_att, acc_att = self._calc_att_loss(encoder_out, encoder_mask, - text, text_lengths, self.reverse_weight) + text, text_lengths, + self.reverse_weight) decoder_time = time.time() - start #logger.debug(f"decoder time: {decoder_time}") @@ -152,13 +153,12 @@ class U2BaseModel(ASRInterface, nn.Layer): loss = self.ctc_weight * loss_ctc + (1 - self.ctc_weight) * loss_att return loss, loss_att, loss_ctc - def _calc_att_loss( - self, - encoder_out: paddle.Tensor, - encoder_mask: paddle.Tensor, - ys_pad: paddle.Tensor, - ys_pad_lens: paddle.Tensor, - reverse_weight: float) -> Tuple[paddle.Tensor, float]: + def _calc_att_loss(self, + encoder_out: paddle.Tensor, + encoder_mask: paddle.Tensor, + ys_pad: paddle.Tensor, + ys_pad_lens: paddle.Tensor, + reverse_weight: float) -> Tuple[paddle.Tensor, float]: """Calc attention loss. Args: @@ -188,8 +188,7 @@ class U2BaseModel(ASRInterface, nn.Layer): r_loss_att = paddle.to_tensor(0.0) if reverse_weight > 0.0: r_loss_att = self.criterion_att(r_decoder_out, r_ys_out_pad) - loss_att = loss_att * (1 - reverse_weight - ) + r_loss_att * reverse_weight + loss_att = loss_att * (1 - reverse_weight) + r_loss_att * reverse_weight acc_att = th_accuracy( decoder_out.view(-1, self.vocab_size), ys_out_pad, @@ -599,8 +598,7 @@ class U2BaseModel(ASRInterface, nn.Layer): f"hyp {i} len {len(hyp[0])} r2l score: {r_score} ctc_score: {hyp[1]} reverse_weight: {reverse_weight}" ) - score = score * (1 - reverse_weight - ) + r_score * reverse_weight + score = score * (1 - reverse_weight) + r_score * reverse_weight # add ctc score (which in ln domain) score += hyp[1] * ctc_weight if score > best_score: diff --git a/paddlespeech/server/engine/asr/online/python/asr_engine.py b/paddlespeech/server/engine/asr/online/python/asr_engine.py index 27eda7ef6..67bbb4d48 100644 --- a/paddlespeech/server/engine/asr/online/python/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/python/asr_engine.py @@ -22,7 +22,6 @@ from numpy import float32 from yacs.config import CfgNode from paddlespeech.audio.transform.transformation import Transformation -from paddlespeech.audio.utils.tensor_utils import st_reverse_pad_list from paddlespeech.cli.asr.infer import ASRExecutor from paddlespeech.cli.log import logger from paddlespeech.resource import CommonTaskResource From 9277fcb8a85d7a064f90eebdc7f9ba547abec13e Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Sat, 8 Oct 2022 08:15:51 +0000 Subject: [PATCH 44/57] fix attn can not train --- paddlespeech/s2t/modules/attention.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/paddlespeech/s2t/modules/attention.py b/paddlespeech/s2t/modules/attention.py index d9ee763f1..128f87c07 100644 --- a/paddlespeech/s2t/modules/attention.py +++ b/paddlespeech/s2t/modules/attention.py @@ -60,9 +60,10 @@ class MultiHeadedAttention(nn.Layer): super()._build_once(*args, **kwargs) # if self.self_att: 
        #     self.linear_kv = Linear(self.n_feat, self.n_feat*2)
-        self.weight = paddle.concat(
-            [self.linear_k.weight, self.linear_v.weight], axis=-1)
-        self.bias = paddle.concat([self.linear_k.bias, self.linear_v.bias])
+        if not self.training:
+            self.weight = paddle.concat(
+                [self.linear_k.weight, self.linear_v.weight], axis=-1)
+            self.bias = paddle.concat([self.linear_k.bias, self.linear_v.bias])
         self._built = True

     def forward_qkv(self,
@@ -86,11 +87,13 @@ class MultiHeadedAttention(nn.Layer):
         n_batch = query.shape[0]

         q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k)
-        # k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k)
-        # v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k)
-        k, v = F.linear(key, self.weight, self.bias).view(
-            n_batch, -1, 2 * self.h, self.d_k).split(
-                2, axis=2)
+        if self.training:
+            k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k)
+            v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k)
+        else:
+            k, v = F.linear(key, self.weight, self.bias).view(
+                n_batch, -1, 2 * self.h, self.d_k).split(
+                    2, axis=2)

         q = q.transpose([0, 2, 1, 3])  # (batch, head, time1, d_k)
         k = k.transpose([0, 2, 1, 3])  # (batch, head, time2, d_k)

From c98b5dd173ffce56eb58e23f4873c7afd7378c51 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Sat, 8 Oct 2022 09:07:30 +0000
Subject: [PATCH 45/57] fix masked_fill which produces nan in training

---
 paddlespeech/s2t/__init__.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/paddlespeech/s2t/__init__.py b/paddlespeech/s2t/__init__.py
index 4507365d6..6663bcf87 100644
--- a/paddlespeech/s2t/__init__.py
+++ b/paddlespeech/s2t/__init__.py
@@ -166,8 +166,19 @@ def broadcast_shape(shp1, shp2):

 def masked_fill(xs: paddle.Tensor,
                 mask: paddle.Tensor,
                 value: Union[float, int]):
-    mask = mask.astype(xs.dtype)
-    return xs * (1.0 - mask) + mask * value
+    # will be nan when value is `inf`.
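# A toy reproduction (values chosen only for illustration) of the failure the
# rewrite below addresses: with the multiply form, every *unmasked* position
# hits 0 * inf == nan once `value` is -inf.
import paddle

xs = paddle.to_tensor([1.0, 2.0])
mask = paddle.to_tensor([0.0, 1.0])  # 1.0 marks the positions to fill
bad = xs * (1.0 - mask) + mask * float('-inf')
# bad -> [nan, -inf]: slot 0 is corrupted by 0 * (-inf)
good = paddle.where(mask.astype('bool'), paddle.full_like(xs, float('-inf')), xs)
# good -> [1.0, -inf], matching the paddle.where-based masked_fill below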
+ # mask = mask.astype(xs.dtype) + # return xs * (1.0 - mask) + mask * value + + bshape = broadcast_shape(xs.shape, mask.shape) + mask.stop_gradient = True + # tmp = paddle.ones(shape=[len(bshape)], dtype='int32') + # for index in range(len(bshape)): + # tmp[index] = bshape[index] + mask = mask.broadcast_to(bshape) + trues = paddle.full_like(xs, fill_value=value) + xs = paddle.where(mask, trues, xs) + return xs if not hasattr(paddle.Tensor, 'masked_fill'): From cda440e6f0bfcc964727cf4b652ffe5a97f072d7 Mon Sep 17 00:00:00 2001 From: tianhao zhang <15600919271@163.com> Date: Sun, 9 Oct 2022 01:46:44 +0000 Subject: [PATCH 46/57] use reverse_weight in decode.yaml --- docs/source/released_model.md | 2 +- paddlespeech/resource/pretrained_models.py | 12 ++++++------ paddlespeech/s2t/exps/u2/bin/quant.py | 4 ++-- paddlespeech/s2t/exps/u2/bin/test_wav.py | 3 ++- paddlespeech/s2t/exps/u2/model.py | 3 ++- .../server/engine/asr/online/python/asr_engine.py | 8 ++++---- 6 files changed, 17 insertions(+), 15 deletions(-) diff --git a/docs/source/released_model.md b/docs/source/released_model.md index bdac2c5bb..a2456f1fe 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -9,7 +9,7 @@ Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | [Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz) | Aishell Dataset | Char-based | 491 MB | 2 Conv + 5 LSTM layers | 0.0666 |-| 151 h | [D2 Online Aishell ASR0](../../examples/aishell/asr0) | onnx/inference/python | [Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_offline_aishell_ckpt_1.0.1.model.tar.gz)| Aishell Dataset | Char-based | 1.4 GB | 2 Conv + 5 bidirectional LSTM layers| 0.0554 |-| 151 h | [Ds2 Offline Aishell ASR0](../../examples/aishell/asr0) | inference/python | [Conformer Online Wenetspeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar.gz) | WenetSpeech Dataset | Char-based | 457 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.11 (test\_net) 0.1879 (test\_meeting) |-| 10000 h |- | python | -[Conformer U2PP Online Wenetspeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.1.1.model.tar.gz) | WenetSpeech Dataset | Char-based | 476 MB | Encoder:Conformer, Decoder:BiTransformer, Decoding method: Attention rescoring| 0.047198 (aishell test\_-1) 0.059212 (aishell test\_16) |-| 10000 h |- | python | +[Conformer U2PP Online Wenetspeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.1.4.model.tar.gz) | WenetSpeech Dataset | Char-based | 476 MB | Encoder:Conformer, Decoder:BiTransformer, Decoding method: Attention rescoring| 0.047198 (aishell test\_-1) 0.059212 (aishell test\_16) |-| 10000 h |- | python | [Conformer Online Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_chunk_conformer_aishell_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.0544 |-| 151 h | [Conformer Online Aishell ASR1](../../examples/aishell/asr1) | python | [Conformer Offline Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_1.0.1.model.tar.gz) | Aishell Dataset | 
Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0460 |-| 151 h | [Conformer Offline Aishell ASR1](../../examples/aishell/asr1) | python | [Transformer Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_transformer_aishell_ckpt_0.1.1.model.tar.gz) | Aishell Dataset | Char-based | 128 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0523 || 151 h | [Transformer Aishell ASR1](../../examples/aishell/asr1) | python | diff --git a/paddlespeech/resource/pretrained_models.py b/paddlespeech/resource/pretrained_models.py index 0103651bc..55f7eff19 100644 --- a/paddlespeech/resource/pretrained_models.py +++ b/paddlespeech/resource/pretrained_models.py @@ -69,11 +69,11 @@ asr_dynamic_pretrained_models = { }, }, "conformer_u2pp_wenetspeech-zh-16k": { - '1.1': { + '1.3': { 'url': - 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.1.1.model.tar.gz', + 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.1.3.model.tar.gz', 'md5': - 'eae678c04ed3b3f89672052fdc0c5e10', + '662b347e1d2131b7a4dc5398365e2134', 'cfg_path': 'model.yaml', 'ckpt_path': @@ -89,11 +89,11 @@ asr_dynamic_pretrained_models = { }, }, "conformer_u2pp_online_wenetspeech-zh-16k": { - '1.1': { + '1.4': { 'url': - 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.1.2.model.tar.gz', + 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.1.4.model.tar.gz', 'md5': - '925d047e9188dea7f421a718230c9ae3', + '3100fc1eac5779486cab859366992d0b', 'cfg_path': 'model.yaml', 'ckpt_path': diff --git a/paddlespeech/s2t/exps/u2/bin/quant.py b/paddlespeech/s2t/exps/u2/bin/quant.py index 225bbf6db..c38134c57 100644 --- a/paddlespeech/s2t/exps/u2/bin/quant.py +++ b/paddlespeech/s2t/exps/u2/bin/quant.py @@ -39,7 +39,6 @@ class U2Infer(): self.preprocess_conf = config.preprocess_config self.preprocess_args = {"train": False} self.preprocessing = Transformation(self.preprocess_conf) - self.reverse_weight = getattr(config.model_conf, 'reverse_weight', 0.0) self.text_feature = TextFeaturizer( unit_type=config.unit_type, vocab=config.vocab_filepath, @@ -81,6 +80,7 @@ class U2Infer(): xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(0) decode_config = self.config.decode logger.info(f"decode cfg: {decode_config}") + reverse_weight = getattr(decode_config, 'reverse_weight', 0.0) result_transcripts = self.model.decode( xs, ilen, @@ -91,7 +91,7 @@ class U2Infer(): decoding_chunk_size=decode_config.decoding_chunk_size, num_decoding_left_chunks=decode_config.num_decoding_left_chunks, simulate_streaming=decode_config.simulate_streaming, - reverse_weight=decode_config.reverse_weight) + reverse_weight=reverse_weight) rsl = result_transcripts[0][0] utt = Path(self.audio_file).name logger.info(f"hyp: {utt} {rsl}") diff --git a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py index 2e067ab6b..d12ea3646 100644 --- a/paddlespeech/s2t/exps/u2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py @@ -79,6 +79,7 @@ class U2Infer(): xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(0) decode_config = self.config.decode logger.info(f"decode cfg: {decode_config}") + reverse_weight = getattr(decode_config, 'reverse_weight', 0.0) result_transcripts = self.model.decode( xs, ilen, @@ -89,7 +90,7 @@ class U2Infer(): 
decoding_chunk_size=decode_config.decoding_chunk_size, num_decoding_left_chunks=decode_config.num_decoding_left_chunks, simulate_streaming=decode_config.simulate_streaming, - reverse_weight=decode_config.reverse_weight) + reverse_weight=reverse_weight) rsl = result_transcripts[0][0] utt = Path(self.audio_file).name logger.info(f"hyp: {utt} {result_transcripts[0][0]}") diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index d093821d8..5b7654d4a 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -337,6 +337,7 @@ class U2Tester(U2Trainer): errors_sum, len_refs, num_ins = 0.0, 0, 0 errors_func = error_rate.char_errors if decode_config.error_rate_type == 'cer' else error_rate.word_errors error_rate_func = error_rate.cer if decode_config.error_rate_type == 'cer' else error_rate.wer + reverse_weight = getattr(decode_config, 'reverse_weight', 0.0) start_time = time.time() target_transcripts = self.id2token(texts, texts_len, self.text_feature) @@ -351,7 +352,7 @@ class U2Tester(U2Trainer): decoding_chunk_size=decode_config.decoding_chunk_size, num_decoding_left_chunks=decode_config.num_decoding_left_chunks, simulate_streaming=decode_config.simulate_streaming, - reverse_weight=decode_config.reverse_weight) + reverse_weight=reverse_weight) decode_time = time.time() - start_time for utt, target, result, rec_tids in zip( diff --git a/paddlespeech/server/engine/asr/online/python/asr_engine.py b/paddlespeech/server/engine/asr/online/python/asr_engine.py index 67bbb4d48..536ffe0a9 100644 --- a/paddlespeech/server/engine/asr/online/python/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/python/asr_engine.py @@ -580,6 +580,7 @@ class PaddleASRConnectionHanddler: self.update_result() beam_size = self.ctc_decode_config.beam_size + reverse_weight = getattr(self.ctc_decode_config, 'reverse_weight', 0.0) hyps = self.searcher.get_hyps() if hyps is None or len(hyps) == 0: logger.info("No Hyps!") @@ -613,7 +614,7 @@ class PaddleASRConnectionHanddler: # ctc score in ln domain # (beam_size, max_hyps_len, vocab_size) decoder_out, r_decoder_out = self.model.forward_attention_decoder( - hyps_pad, hyps_lens, self.encoder_out, self.model.reverse_weight) + hyps_pad, hyps_lens, self.encoder_out, reverse_weight) decoder_out = decoder_out.numpy() # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a @@ -631,13 +632,12 @@ class PaddleASRConnectionHanddler: # last decoder output token is `eos`, for laste decoder input token. 
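            # i.e. the log-prob that the decoder emits `eos` right after the
            # hypothesis's final real token, closing the sequence.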
score += decoder_out[i][len(hyp[0])][self.model.eos] - if self.model.reverse_weight > 0: + if reverse_weight > 0: r_score = 0.0 for j, w in enumerate(hyp[0]): r_score += r_decoder_out[i][len(hyp[0]) - j - 1][w] r_score += r_decoder_out[i][len(hyp[0])][self.model.eos] - score = score * (1 - self.model.reverse_weight - ) + r_score * self.model.reverse_weight + score = score * (1 - reverse_weight) + r_score * reverse_weight # add ctc score (which in ln domain) score += hyp[1] * self.ctc_decode_config.ctc_weight From 5a66a14659b5839e93afc315fc0d8b1ff4efeba8 Mon Sep 17 00:00:00 2001 From: tianhao zhang <15600919271@163.com> Date: Sun, 9 Oct 2022 02:31:14 +0000 Subject: [PATCH 47/57] fix u2pp model version number --- paddlespeech/resource/pretrained_models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddlespeech/resource/pretrained_models.py b/paddlespeech/resource/pretrained_models.py index 55f7eff19..efd6bb3f2 100644 --- a/paddlespeech/resource/pretrained_models.py +++ b/paddlespeech/resource/pretrained_models.py @@ -69,7 +69,7 @@ asr_dynamic_pretrained_models = { }, }, "conformer_u2pp_wenetspeech-zh-16k": { - '1.3': { + '1.1': { 'url': 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.1.3.model.tar.gz', 'md5': @@ -89,7 +89,7 @@ asr_dynamic_pretrained_models = { }, }, "conformer_u2pp_online_wenetspeech-zh-16k": { - '1.4': { + '1.1': { 'url': 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.1.4.model.tar.gz', 'md5': From d2999ba21dd2480e51f5ef892d24557ff780d468 Mon Sep 17 00:00:00 2001 From: tianhao zhang <15600919271@163.com> Date: Sun, 9 Oct 2022 11:39:32 +0000 Subject: [PATCH 48/57] update install.md --- README.md | 6 +++--- docs/source/install.md | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 72db64b7d..d3eccdc92 100644 --- a/README.md +++ b/README.md @@ -183,19 +183,19 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision ## Installation -We strongly recommend our users to install PaddleSpeech in **Linux** with *python>=3.7* and *paddlepaddle>=2.3.1*. +We strongly recommend our users to install PaddleSpeech in **Linux** with *python>=3.7* and *paddlepaddle>=2.4rc*. ### **Dependency Introduction** + gcc >= 4.8.5 -+ paddlepaddle >= 2.3.1 ++ paddlepaddle >= 2.4rc + python >= 3.7 + OS support: Linux(recommend), Windows, Mac OSX PaddleSpeech depends on paddlepaddle. For installation, please refer to the official website of [paddlepaddle](https://www.paddlepaddle.org.cn/en) and choose according to your own machine. Here is an example of the cpu version. ```bash -pip install paddlepaddle -i https://mirror.baidu.com/pypi/simple +pip install paddlepaddle==0.0.0 -f https://www.paddlepaddle.org.cn/whl/linux/cpu-mkl/develop.html ``` There are two quick installation methods for PaddleSpeech, one is pip installation, and the other is source code compilation (recommended). 
diff --git a/docs/source/install.md b/docs/source/install.md index 6a9ff3bc8..f789b37d2 100644 --- a/docs/source/install.md +++ b/docs/source/install.md @@ -58,7 +58,7 @@ pip install pytest-runner -i https://pypi.tuna.tsinghua.edu.cn/simple ``` Then you can use the following commands: ```bash -pip install paddlepaddle -i https://mirror.baidu.com/pypi/simple +pip install paddlepaddle==0.0.0 -f https://www.paddlepaddle.org.cn/whl/linux/cpu-mkl/develop.html pip install paddlespeech -i https://pypi.tuna.tsinghua.edu.cn/simple ``` > If you encounter problem with downloading **nltk_data** while using paddlespeech, it maybe due to your poor network, we suggest you download the [nltk_data](https://paddlespeech.bj.bcebos.com/Parakeet/tools/nltk_data.tar.gz) provided by us, and extract it to your `${HOME}`. @@ -117,9 +117,9 @@ conda install -y -c gcc_linux-64=8.4.0 gxx_linux-64=8.4.0 ``` (Hip: Do not use the last script if you want to install by **Hard** way): ### Install PaddlePaddle -You can choose the `PaddlePaddle` version based on your system. For example, for CUDA 10.2, CuDNN7.5 install paddlepaddle-gpu 2.3.1: +You can choose the `PaddlePaddle` version based on your system. For example, for CUDA 10.2, CuDNN7.5 install paddlepaddle-gpu develop: ```bash -python3 -m pip install paddlepaddle-gpu==2.3.1 -i https://mirror.baidu.com/pypi/simple +python3 -m pip install paddlepaddle-gpu==0.0.0.post102 -f https://www.paddlepaddle.org.cn/whl/linux/gpu/develop.html ``` ### Install PaddleSpeech You can install `paddlespeech` by the following command,then you can use the `ready-made` examples in `paddlespeech` : @@ -180,9 +180,9 @@ Some users may fail to install `kaldiio` due to the default download source, you ```bash pip install pytest-runner -i https://pypi.tuna.tsinghua.edu.cn/simple ``` -Make sure you have GPU and the paddlepaddle version is right. For example, for CUDA 10.2, CuDNN7.5 install paddle 2.3.1: +Make sure you have GPU and the paddlepaddle version is right. For example, for CUDA 10.2, CuDNN7.5 install paddle develop: ```bash -python3 -m pip install paddlepaddle-gpu==2.3.1 -i https://mirror.baidu.com/pypi/simple +python3 -m pip install paddlepaddle-gpu==0.0.0.post102 -f https://www.paddlepaddle.org.cn/whl/linux/gpu/develop.html ``` ### Install PaddleSpeech in Developing Mode ```bash From e3672427650f451faae87dd6e226ad9fc6c9793e Mon Sep 17 00:00:00 2001 From: tianhao zhang <15600919271@163.com> Date: Mon, 10 Oct 2022 06:11:22 +0000 Subject: [PATCH 49/57] update dependency of paddle --- README.md | 2 +- README_cn.md | 4 ++-- demos/speech_server/README.md | 2 +- demos/speech_server/README_cn.md | 2 +- demos/streaming_asr_server/README.md | 2 +- demos/streaming_asr_server/README_cn.md | 2 +- demos/streaming_tts_server/README.md | 2 +- demos/streaming_tts_server/README_cn.md | 2 +- docker/ubuntu16-gpu/Dockerfile | 2 +- docs/source/install.md | 14 ++++++++++--- docs/source/install_cn.md | 20 +++++++++++++------ .../server/tests/asr/online/README.md | 4 ++-- .../server/tests/asr/online/README_cn.md | 2 +- 13 files changed, 38 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index d3eccdc92..c80a31fde 100644 --- a/README.md +++ b/README.md @@ -195,7 +195,7 @@ We strongly recommend our users to install PaddleSpeech in **Linux** with *pytho PaddleSpeech depends on paddlepaddle. For installation, please refer to the official website of [paddlepaddle](https://www.paddlepaddle.org.cn/en) and choose according to your own machine. Here is an example of the cpu version. 
```bash -pip install paddlepaddle==0.0.0 -f https://www.paddlepaddle.org.cn/whl/linux/cpu-mkl/develop.html +pip install paddlepaddle==2.4.0rc0 -i https://mirror.baidu.com/pypi/simple ``` There are two quick installation methods for PaddleSpeech, one is pip installation, and the other is source code compilation (recommended). diff --git a/README_cn.md b/README_cn.md index 725f7eda1..49f42dae5 100644 --- a/README_cn.md +++ b/README_cn.md @@ -215,14 +215,14 @@ ### 相关依赖 + gcc >= 4.8.5 -+ paddlepaddle >= 2.3.1 ++ paddlepaddle >= 2.4rc + python >= 3.7 + linux(推荐), mac, windows PaddleSpeech 依赖于 paddlepaddle,安装可以参考[ paddlepaddle 官网](https://www.paddlepaddle.org.cn/),根据自己机器的情况进行选择。这里给出 cpu 版本示例,其它版本大家可以根据自己机器的情况进行安装。 ```shell -pip install paddlepaddle -i https://mirror.baidu.com/pypi/simple +pip install paddlepaddle==2.4.0rc0 -i https://mirror.baidu.com/pypi/simple ``` PaddleSpeech 快速安装方式有两种,一种是 pip 安装,一种是源码编译(推荐)。 diff --git a/demos/speech_server/README.md b/demos/speech_server/README.md index e400f7e74..7e7d4b2c5 100644 --- a/demos/speech_server/README.md +++ b/demos/speech_server/README.md @@ -13,7 +13,7 @@ For service interface definition, please check: ### 1. Installation see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). -It is recommended to use **paddlepaddle 2.3.1** or above. +It is recommended to use **paddlepaddle 2.4rc** or above. You can choose one way from easy, meduim and hard to install paddlespeech. diff --git a/demos/speech_server/README_cn.md b/demos/speech_server/README_cn.md index 628468c83..594928281 100644 --- a/demos/speech_server/README_cn.md +++ b/demos/speech_server/README_cn.md @@ -14,7 +14,7 @@ ### 1. 安装 请看 [安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). -推荐使用 **paddlepaddle 2.3.1** 或以上版本。 +推荐使用 **paddlepaddle 2.4rc** 或以上版本。 你可以从简单,中等,困难 几种方式中选择一种方式安装 PaddleSpeech。 diff --git a/demos/streaming_asr_server/README.md b/demos/streaming_asr_server/README.md index a97486757..5eef82866 100644 --- a/demos/streaming_asr_server/README.md +++ b/demos/streaming_asr_server/README.md @@ -14,7 +14,7 @@ Streaming ASR server only support `websocket` protocol, and doesn't support `htt ### 1. Installation see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). -It is recommended to use **paddlepaddle 2.3.1** or above. +It is recommended to use **paddlepaddle 2.4rc** or above. You can choose one way from easy, meduim and hard to install paddlespeech. diff --git a/demos/streaming_asr_server/README_cn.md b/demos/streaming_asr_server/README_cn.md index 267367729..1902a2fa9 100644 --- a/demos/streaming_asr_server/README_cn.md +++ b/demos/streaming_asr_server/README_cn.md @@ -14,7 +14,7 @@ ### 1. 安装 安装 PaddleSpeech 的详细过程请看 [安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md)。 -推荐使用 **paddlepaddle 2.3.1** 或以上版本。 +推荐使用 **paddlepaddle 2.4rc** 或以上版本。 你可以从简单,中等,困难 几种方式中选择一种方式安装 PaddleSpeech。 diff --git a/demos/streaming_tts_server/README.md b/demos/streaming_tts_server/README.md index 15448a46f..ca5d6f1f8 100644 --- a/demos/streaming_tts_server/README.md +++ b/demos/streaming_tts_server/README.md @@ -13,7 +13,7 @@ For service interface definition, please check: ### 1. Installation see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). -It is recommended to use **paddlepaddle 2.3.1** or above. +It is recommended to use **paddlepaddle 2.4rc** or above. 
You can choose one way from easy, meduim and hard to install paddlespeech. diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md index b99155bca..125f37033 100644 --- a/demos/streaming_tts_server/README_cn.md +++ b/demos/streaming_tts_server/README_cn.md @@ -12,7 +12,7 @@ ### 1. 安装 请看 [安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). -推荐使用 **paddlepaddle 2.3.1** 或以上版本。 +推荐使用 **paddlepaddle 2.4rc** 或以上版本。 你可以从简单,中等,困难 几种方式中选择一种方式安装 PaddleSpeech。 diff --git a/docker/ubuntu16-gpu/Dockerfile b/docker/ubuntu16-gpu/Dockerfile index f275471ee..a8c11e37b 100644 --- a/docker/ubuntu16-gpu/Dockerfile +++ b/docker/ubuntu16-gpu/Dockerfile @@ -62,7 +62,7 @@ RUN mkdir -p ~/.pip && echo "[global]" > ~/.pip/pip.conf && \ echo "index-url=https://mirror.baidu.com/pypi/simple" >> ~/.pip/pip.conf && \ echo "trusted-host=mirror.baidu.com" >> ~/.pip/pip.conf && \ python3 -m pip install --upgrade pip && \ - pip install paddlepaddle-gpu==2.3.1.post112 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html && \ + pip install paddlepaddle-gpu==2.4.0rc0.post112 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html && \ rm -rf ~/.cache/pip RUN git clone https://github.com/PaddlePaddle/PaddleSpeech.git && cd PaddleSpeech && \ diff --git a/docs/source/install.md b/docs/source/install.md index f789b37d2..187bd4ea0 100644 --- a/docs/source/install.md +++ b/docs/source/install.md @@ -58,7 +58,7 @@ pip install pytest-runner -i https://pypi.tuna.tsinghua.edu.cn/simple ``` Then you can use the following commands: ```bash -pip install paddlepaddle==0.0.0 -f https://www.paddlepaddle.org.cn/whl/linux/cpu-mkl/develop.html +pip install paddlepaddle==2.4.0rc0 -i https://mirror.baidu.com/pypi/simple pip install paddlespeech -i https://pypi.tuna.tsinghua.edu.cn/simple ``` > If you encounter problem with downloading **nltk_data** while using paddlespeech, it maybe due to your poor network, we suggest you download the [nltk_data](https://paddlespeech.bj.bcebos.com/Parakeet/tools/nltk_data.tar.gz) provided by us, and extract it to your `${HOME}`. @@ -117,7 +117,11 @@ conda install -y -c gcc_linux-64=8.4.0 gxx_linux-64=8.4.0 ``` (Hip: Do not use the last script if you want to install by **Hard** way): ### Install PaddlePaddle -You can choose the `PaddlePaddle` version based on your system. For example, for CUDA 10.2, CuDNN7.5 install paddlepaddle-gpu develop: +You can choose the `PaddlePaddle` version based on your system. For example, for CUDA 10.2, CuDNN7.5 install paddlepaddle-gpu 2.4rc: +```bash +python3 -m pip install paddlepaddle-gpu==2.4.0rc0 -i https://mirror.baidu.com/pypi/simple +``` +You can also install the develop version of paddlepaddle. For example, for CUDA 10.2, CuDNN7.5 install paddlepaddle-gpu develop: ```bash python3 -m pip install paddlepaddle-gpu==0.0.0.post102 -f https://www.paddlepaddle.org.cn/whl/linux/gpu/develop.html ``` @@ -180,7 +184,11 @@ Some users may fail to install `kaldiio` due to the default download source, you ```bash pip install pytest-runner -i https://pypi.tuna.tsinghua.edu.cn/simple ``` -Make sure you have GPU and the paddlepaddle version is right. For example, for CUDA 10.2, CuDNN7.5 install paddle develop: +Make sure you have GPU and the paddlepaddle version is right. For example, for CUDA 10.2, CuDNN7.5 install paddle 2.4rc: +```bash +python3 -m pip install paddlepaddle-gpu==2.4.0rc0 -i https://mirror.baidu.com/pypi/simple +``` +You can also install the develop version of paddlepaddle. 
For example, for CUDA 10.2, CuDNN7.5 install paddlepaddle-gpu develop: ```bash python3 -m pip install paddlepaddle-gpu==0.0.0.post102 -f https://www.paddlepaddle.org.cn/whl/linux/gpu/develop.html ``` diff --git a/docs/source/install_cn.md b/docs/source/install_cn.md index 9f49ebad6..9936a214a 100644 --- a/docs/source/install_cn.md +++ b/docs/source/install_cn.md @@ -55,8 +55,8 @@ pip install pytest-runner -i https://pypi.tuna.tsinghua.edu.cn/simple ``` 然后你可以使用如下命令: ```bash -pip install paddlepaddle -i https://mirror.baidu.com/pypi/simple -pip install paddlespeech -i https://pypi.tuna.tsinghua.edu.cn/simple +pip install paddlepaddle==2.4.0rc0 -i https://mirror.baidu.com/pypi/simple +pip install paddlespeech -i https://pypi.tuna.tsinghua.edu.cn/simple ``` > 如果您在使用 paddlespeech 的过程中遇到关于下载 **nltk_data** 的问题,可能是您的网络不佳,我们建议您下载我们提供的 [nltk_data](https://paddlespeech.bj.bcebos.com/Parakeet/tools/nltk_data.tar.gz) 并解压缩到您的 `${HOME}` 目录下。 @@ -111,9 +111,13 @@ conda install -y -c gcc_linux-64=8.4.0 gxx_linux-64=8.4.0 ``` (提示: 如果你想使用**困难**方式完成安装,请不要使用最后一条命令) ### 安装 PaddlePaddle -你可以根据系统配置选择 PaddlePaddle 版本,例如系统使用 CUDA 10.2, CuDNN7.5 ,你可以安装 paddlepaddle-gpu 2.3.1: +你可以根据系统配置选择 PaddlePaddle 版本,例如系统使用 CUDA 10.2, CuDNN7.5 ,你可以安装 paddlepaddle-gpu 2.4rc: ```bash -python3 -m pip install paddlepaddle-gpu==2.3.1 -i https://mirror.baidu.com/pypi/simple +python3 -m pip install paddlepaddle-gpu==2.4.0rc0 -i https://mirror.baidu.com/pypi/simple +``` +你也可以安装 develop 版本的PaddlePaddle. 例如系统使用 CUDA 10.2, CuDNN7.5 ,你可以安装 paddlepaddle-gpu develop: +```bash +python3 -m pip install paddlepaddle-gpu==0.0.0.post102 -f https://www.paddlepaddle.org.cn/whl/linux/gpu/develop.html ``` ### 安装 PaddleSpeech 最后安装 `paddlespeech`,这样你就可以使用 `paddlespeech` 中已有的 examples: @@ -168,9 +172,13 @@ conda activate tools/venv conda install -y -c conda-forge sox libsndfile swig bzip2 libflac bc ``` ### 安装 PaddlePaddle -请确认你系统是否有 GPU,并且使用了正确版本的 paddlepaddle。例如系统使用 CUDA 10.2, CuDNN7.5 ,你可以安装 paddlepaddle-gpu 2.3.1: +请确认你系统是否有 GPU,并且使用了正确版本的 paddlepaddle。例如系统使用 CUDA 10.2, CuDNN7.5 ,你可以安装 paddlepaddle-gpu 2.4rc: +```bash +python3 -m pip install paddlepaddle-gpu==2.4.0rc0 -i https://mirror.baidu.com/pypi/simple +``` +你也可以安装 develop 版本的PaddlePaddle. 例如系统使用 CUDA 10.2, CuDNN7.5 ,你可以安装 paddlepaddle-gpu develop: ```bash -python3 -m pip install paddlepaddle-gpu==2.3.1 -i https://mirror.baidu.com/pypi/simple +python3 -m pip install paddlepaddle-gpu==0.0.0.post102 -f https://www.paddlepaddle.org.cn/whl/linux/gpu/develop.html ``` ### 用开发者模式安装 PaddleSpeech 部分用户系统由于默认源的问题,安装中会出现 kaldiio 安转出错的问题,建议首先安装 pytest-runner: diff --git a/paddlespeech/server/tests/asr/online/README.md b/paddlespeech/server/tests/asr/online/README.md index e1e4d9506..1d7fa8824 100644 --- a/paddlespeech/server/tests/asr/online/README.md +++ b/paddlespeech/server/tests/asr/online/README.md @@ -11,8 +11,8 @@ This document introduces a client for streaming asr service: microphone ### 1. Install Refer [Install](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). - **paddlepaddle 2.2.1** 或以上版本。 -It is recommended to use **paddlepaddle 2.2.1** or above. + **paddlepaddle 2.4rc** 或以上版本。 +It is recommended to use **paddlepaddle 2.4rc** or above. You can choose one way from meduim and hard to install paddlespeech. 
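All of these documentation hunks move the paddlepaddle requirement from 2.3.1 (or 2.2.1) to the 2.4 release candidate or the develop channel. A quick, generic way to confirm which build actually got installed (this check is only an illustration, not part of the patch):

```python
import paddle

# Release-candidate wheels report "2.4.0rc0"; develop-channel wheels report "0.0.0".
print(paddle.__version__)

# Sanity-check the installation; on GPU wheels this also exercises the CUDA runtime.
paddle.utils.run_check()
```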
diff --git a/paddlespeech/server/tests/asr/online/README_cn.md b/paddlespeech/server/tests/asr/online/README_cn.md index 46dff250e..403216369 100644 --- a/paddlespeech/server/tests/asr/online/README_cn.md +++ b/paddlespeech/server/tests/asr/online/README_cn.md @@ -10,7 +10,7 @@ ### 1. 安装 请看 [安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). -推荐使用 **paddlepaddle 2.2.1** 或以上版本。 +推荐使用 **paddlepaddle 2.4rc** 或以上版本。 你可以从 medium,hard 三中方式中选择一种方式安装 PaddleSpeech。 From 6e429f051316628f99ed5e68ccaa91f6d1a32cc0 Mon Sep 17 00:00:00 2001 From: tianhao zhang <15600919271@163.com> Date: Mon, 10 Oct 2022 11:42:44 +0000 Subject: [PATCH 50/57] support wav2vec2ASR on librispeech --- examples/librispeech/asr3/README.md | 191 +++ examples/librispeech/asr3/RESULTS.md | 8 + examples/librispeech/asr3/cmd.sh | 89 ++ .../librispeech/asr3/conf/preprocess.yaml | 4 + .../librispeech/asr3/conf/tuning/decode.yaml | 11 + .../librispeech/asr3/conf/wav2vec2ASR.yaml | 120 ++ examples/librispeech/asr3/local/test.sh | 84 ++ examples/librispeech/asr3/local/test_wav.sh | 58 + examples/librispeech/asr3/local/train.sh | 55 + examples/librispeech/asr3/path.sh | 15 + examples/librispeech/asr3/run.sh | 48 + examples/librispeech/asr3/utils | 1 + .../s2t/exps/wav2vec2/bin/__init__.py | 13 + paddlespeech/s2t/exps/wav2vec2/bin/test.py | 66 + .../s2t/exps/wav2vec2/bin/test_wav.py | 118 ++ paddlespeech/s2t/exps/wav2vec2/bin/train.py | 54 + paddlespeech/s2t/exps/wav2vec2/model.py | 435 +++++++ paddlespeech/s2t/models/wav2vec2/__init__.py | 0 .../s2t/models/wav2vec2/modules/VanillaNN.py | 45 + .../models/wav2vec2/modules/activations.py | 175 +++ .../s2t/models/wav2vec2/modules/containers.py | 131 ++ .../s2t/models/wav2vec2/modules/linear.py | 73 ++ .../wav2vec2/modules/modeling_outputs.py | 1129 ++++++++++++++++ .../wav2vec2/modules/modeling_wav2vec2.py | 1131 +++++++++++++++++ .../wav2vec2/processing/signal_processing.py | 242 ++++ .../processing/speech_augmentation.py | 727 +++++++++++ .../s2t/models/wav2vec2/wav2vec2_ASR.py | 247 ++++ 27 files changed, 5270 insertions(+) create mode 100644 examples/librispeech/asr3/README.md create mode 100644 examples/librispeech/asr3/RESULTS.md create mode 100644 examples/librispeech/asr3/cmd.sh create mode 100644 examples/librispeech/asr3/conf/preprocess.yaml create mode 100644 examples/librispeech/asr3/conf/tuning/decode.yaml create mode 100644 examples/librispeech/asr3/conf/wav2vec2ASR.yaml create mode 100644 examples/librispeech/asr3/local/test.sh create mode 100644 examples/librispeech/asr3/local/test_wav.sh create mode 100644 examples/librispeech/asr3/local/train.sh create mode 100644 examples/librispeech/asr3/path.sh create mode 100644 examples/librispeech/asr3/run.sh create mode 120000 examples/librispeech/asr3/utils create mode 100644 paddlespeech/s2t/exps/wav2vec2/bin/__init__.py create mode 100644 paddlespeech/s2t/exps/wav2vec2/bin/test.py create mode 100644 paddlespeech/s2t/exps/wav2vec2/bin/test_wav.py create mode 100644 paddlespeech/s2t/exps/wav2vec2/bin/train.py create mode 100644 paddlespeech/s2t/exps/wav2vec2/model.py create mode 100644 paddlespeech/s2t/models/wav2vec2/__init__.py create mode 100644 paddlespeech/s2t/models/wav2vec2/modules/VanillaNN.py create mode 100644 paddlespeech/s2t/models/wav2vec2/modules/activations.py create mode 100644 paddlespeech/s2t/models/wav2vec2/modules/containers.py create mode 100644 paddlespeech/s2t/models/wav2vec2/modules/linear.py create mode 100644 paddlespeech/s2t/models/wav2vec2/modules/modeling_outputs.py 
create mode 100644 paddlespeech/s2t/models/wav2vec2/modules/modeling_wav2vec2.py
 create mode 100644 paddlespeech/s2t/models/wav2vec2/processing/signal_processing.py
 create mode 100644 paddlespeech/s2t/models/wav2vec2/processing/speech_augmentation.py
 create mode 100644 paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py

diff --git a/examples/librispeech/asr3/README.md b/examples/librispeech/asr3/README.md
new file mode 100644
index 000000000..bd96af86f
--- /dev/null
+++ b/examples/librispeech/asr3/README.md
@@ -0,0 +1,191 @@
+# Wav2vec2ASR with Librispeech
+This example contains code used to finetune the [wav2vec2.0](https://arxiv.org/pdf/2006.11477.pdf) model with the [Librispeech dataset](http://www.openslr.org/resources/12).
+## Overview
+All the scripts you need are in `run.sh`. There are several stages in `run.sh`, and each stage has its function.
+| Stage | Function |
+|:---- |:----------------------------------------------------------- |
+| 0 | Process data. It includes: <br>
(1) Download the dataset <br> (2) Calculate the CMVN of the train dataset <br> (3) Get the vocabulary file <br> (4) Get the manifest files of the train, development and test dataset <br>
(5) Download the pretrained wav2vec2 model |
+| 1 | Train the model |
+| 2 | Get the final model by averaging the top-k models; setting k = 1 means choosing the best model |
+| 3 | Test the final model performance |
+| 4 | Infer the single audio file |
+
+
+You can choose to run a range of stages by setting `stage` and `stop_stage`.
+
+For example, if you want to execute the code in stage 2 and stage 3, you can run this script:
+```bash
+bash run.sh --stage 2 --stop_stage 3
+```
+Or you can set `stage` equal to `stop_stage` to only run one stage.
+For example, if you only want to run `stage 0`, you can use the script below:
+```bash
+bash run.sh --stage 0 --stop_stage 0
+```
+The document below will describe the scripts in `run.sh` in detail.
+## The Environment Variables
+The path.sh contains the environment variables.
+```bash
+. ./path.sh
+. ./cmd.sh
+```
+This script needs to be run first. And another script is also needed:
+```bash
+source ${MAIN_ROOT}/utils/parse_options.sh
+```
+It supports passing options in the form `--variable value` to the shell scripts.
+## The Local Variables
+Some local variables are set in `run.sh`.
+`gpus` denotes the GPU number you want to use. If you set `gpus=`, it means you only use CPU.
+`stage` denotes the number of the stage you want to start from in the experiments.
+`stop_stage` denotes the number of the stage you want to end at in the experiments.
+`conf_path` denotes the config path of the model.
+`avg_num` denotes the number K of top-K models you want to average to get the final model.
+`audio_file` denotes the file path of the single file you want to infer in stage 4.
+`ckpt` denotes the checkpoint prefix of the model, e.g. "wav2vec2ASR".
+
+You can set the local variables (except `ckpt`) when you use `run.sh`.
+
+For example, you can set the `gpus` and `avg_num` when you use the command line:
+```bash
+bash run.sh --gpus 0,1 --avg_num 20
+```
+## Stage 0: Data Processing
+To use this example, you need to process the data first, and you can use stage 0 in `run.sh` to do this. The code is shown below:
+```bash
+ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+     # prepare data
+     bash ./local/data.sh || exit -1
+ fi
+```
+Stage 0 is for processing the data.
+
+If you only want to process the data, you can run
+```bash
+bash run.sh --stage 0 --stop_stage 0
+```
+You can also just run these scripts in your command line.
+```bash
+. ./path.sh
+. ./cmd.sh
+bash ./local/data.sh
+```
+After processing the data, the `data` directory will look like this:
+```bash
+data/
+|-- dev.meta
+|-- lang_char
+|   `-- bpe_unigram_5000.model
+|   `-- bpe_unigram_5000.vocab
+|   `-- vocab.txt
+|-- manifest.dev
+|-- manifest.dev.raw
+|-- manifest.test
+|-- manifest.test.raw
+|-- manifest.train
+|-- manifest.train.raw
+|-- mean_std.json
+|-- test.meta
+`-- train.meta
+```
+## Stage 1: Model Training
+If you want to train the model, you can use stage 1 in `run.sh`. The code is shown below.
+```bash
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # train model, all `ckpt` under `exp` dir
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
+ fi
+```
+If you want to train the model, you can use the script below to execute stage 0 and stage 1:
+```bash
+bash run.sh --stage 0 --stop_stage 1
+```
+or you can run these scripts in the command line (only use CPU).
+```bash
+. ./path.sh
+. ./cmd.sh
+bash ./local/data.sh
+CUDA_VISIBLE_DEVICES= ./local/train.sh conf/wav2vec2ASR.yaml wav2vec2ASR
+```
+## Stage 2: Top-k Models Averaging
+After training the model, we need to get the final model for testing and inference. In every epoch, the model checkpoint is saved, so we can choose the best model from them based on the validation loss, or we can sort them and average the parameters of the top-k models to get the final model. We can use stage 2 to do this, and the code is shown below. Note: we only train one epoch for wav2vec2ASR, so `avg_num` is set to 1.
+```bash
+ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+     # avg n best model
+     avg.sh best exp/${ckpt}/checkpoints ${avg_num}
+ fi
+```
+`avg.sh` is in `../../../utils/`, which is defined in `path.sh`.
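+The averaging itself is conceptually simple. The sketch below illustrates the idea with hypothetical checkpoint paths; the recipe's actual selection and averaging logic lives in the `avg.sh` utility, so treat this only as an illustration:
+```python
+import paddle
+
+# Hypothetical checkpoint paths; `avg.sh best` picks the top-k by validation loss.
+ckpt_paths = [
+    "exp/wav2vec2ASR/checkpoints/10.pdparams",
+    "exp/wav2vec2ASR/checkpoints/11.pdparams",
+]
+
+# Sum every parameter tensor across the selected checkpoints.
+avg_state = {}
+for path in ckpt_paths:
+    for key, value in paddle.load(path).items():
+        if key in avg_state:
+            avg_state[key] += value
+        else:
+            avg_state[key] = value.clone()
+
+# Divide by the number of checkpoints and save the averaged model.
+avg_state = {key: value / len(ckpt_paths) for key, value in avg_state.items()}
+paddle.save(avg_state, "exp/wav2vec2ASR/checkpoints/avg_2.pdparams")
+```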
+If you want to get the final model, you can use the script below to execute stage 0, stage 1, and stage 2:
+```bash
+bash run.sh --stage 0 --stop_stage 2
+```
+or you can run these scripts in the command line (only use CPU).
+
+```bash
+. ./path.sh
+. ./cmd.sh
+bash ./local/data.sh
+CUDA_VISIBLE_DEVICES= ./local/train.sh conf/wav2vec2ASR.yaml wav2vec2ASR
+avg.sh best exp/wav2vec2ASR/checkpoints 1
+```
+## Stage 3: Model Testing
+The test stage is to evaluate the model performance. The code of the test stage is shown below:
+```bash
+ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+     # test ckpt avg_n
+     CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+ fi
+```
+If you want to train a model and test it, you can use the script below to execute stage 0, stage 1, stage 2, and stage 3:
+```bash
+bash run.sh --stage 0 --stop_stage 3
+```
+or you can run these scripts in the command line (only use CPU).
+```bash
+. ./path.sh
+. ./cmd.sh
+bash ./local/data.sh
+CUDA_VISIBLE_DEVICES= ./local/train.sh conf/wav2vec2ASR.yaml wav2vec2ASR
+avg.sh best exp/wav2vec2ASR/checkpoints 1
+CUDA_VISIBLE_DEVICES= ./local/test.sh conf/wav2vec2ASR.yaml conf/tuning/decode.yaml exp/wav2vec2ASR/checkpoints/avg_1
+```
+## Pretrained Model
+You can get the pretrained wav2vec2ASR model from [this page](../../../docs/source/released_model.md).
+
+Use the `tar` command to unpack the model, then you can use the scripts below to test it.
+
+For example:
+```bash
+wget https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr3/wav2vec2ASR-large-960h-librispeech_ckpt_1.3.0.model.tar.gz
+tar xzvf wav2vec2ASR-large-960h-librispeech_ckpt_1.3.0.model.tar.gz
+source path.sh
+# If you have processed the data and got the manifest files, you can skip the following 2 steps
+bash local/data.sh --stage -1 --stop_stage -1
+bash local/data.sh --stage 2 --stop_stage 2
+CUDA_VISIBLE_DEVICES= ./local/test.sh conf/wav2vec2ASR.yaml conf/tuning/decode.yaml exp/wav2vec2ASR/checkpoints/avg_1
+```
+The performance of the released models is shown [here](./RESULTS.md).
+
+
+## Stage 4: Single Audio File Inference
+In some situations, you want to use the trained model to do inference on a single audio file. You can use stage 4. The code is shown below:
+```bash
+ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+     # test a single .wav file
+     CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
+ fi
+```
+You can train the model by yourself using `bash run.sh --stage 0 --stop_stage 3`, or you can download the pretrained model through the script below:
+```bash
+wget https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr3/wav2vec2ASR-large-960h-librispeech_ckpt_1.3.0.model.tar.gz
+tar xzvf wav2vec2ASR-large-960h-librispeech_ckpt_1.3.0.model.tar.gz
+```
+You can download the audio demo:
+```bash
+wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/
+```
+You need to prepare an audio file or use the audio demo above; please confirm that the sample rate of the audio is 16 kHz. You can get the result of the audio demo by running the script below.
+```bash
+CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/wav2vec2ASR.yaml conf/tuning/decode.yaml exp/wav2vec2ASR/checkpoints/avg_1 data/demo_002_en.wav
+```
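+As a quick sanity check before decoding, you can verify the sample rate in Python. This is a generic `soundfile` snippet for illustration, not part of this example's scripts:
+```python
+import soundfile as sf
+
+# The recipe assumes 16 kHz, 16-bit mono WAV input.
+audio, sample_rate = sf.read("data/demo_002_en.wav", dtype="int16")
+assert sample_rate == 16000, f"expected 16 kHz audio, got {sample_rate} Hz"
+print(audio.shape, sample_rate)
+```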
diff --git a/examples/librispeech/asr3/RESULTS.md b/examples/librispeech/asr3/RESULTS.md
new file mode 100644
index 000000000..1c5626d9e
--- /dev/null
+++ b/examples/librispeech/asr3/RESULTS.md
@@ -0,0 +1,8 @@
+# LibriSpeech
+
+## Wav2VecASR
+train: Epoch 1, 1*V100-32G, batchsize: 10
+
+| Model | Params | Config | Augmentation | Test set | Decode method | WER |
+| --- | --- | --- | --- | --- | --- | --- |
+| wav2vec2ASR | 302.86 M | conf/wav2vec2ASR.yaml | spec_aug | test-clean | greedy search | 0.018887 |
diff --git a/examples/librispeech/asr3/cmd.sh b/examples/librispeech/asr3/cmd.sh
new file mode 100644
index 000000000..7b70ef5e0
--- /dev/null
+++ b/examples/librispeech/asr3/cmd.sh
@@ -0,0 +1,89 @@
+# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
+# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
+# e.g.
+# run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
+#
+# Options:
+# --time