From 4e7106d9e2a3eb9ee5ab870dcae3a3c59eac338e Mon Sep 17 00:00:00 2001 From: 0x45f Date: Wed, 27 Jul 2022 09:32:11 +0000 Subject: [PATCH 001/113] Support dy2st --- paddlespeech/s2t/exps/u2/model.py | 165 +++++++++++++++++- paddlespeech/s2t/models/u2/u2.py | 42 ++++- .../engine/asr/online/python/asr_engine.py | 17 +- 3 files changed, 210 insertions(+), 14 deletions(-) diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index cdad3b8f7..b41f320b4 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -471,6 +471,165 @@ class U2Tester(U2Trainer): infer_model, input_spec = self.load_inferspec() assert isinstance(input_spec, list), type(input_spec) infer_model.eval() - static_model = paddle.jit.to_static(infer_model, input_spec=input_spec) - logger.info(f"Export code: {static_model.forward.code}") - paddle.jit.save(static_model, self.args.export_path) + # static_model = paddle.jit.to_static(infer_model, input_spec=input_spec) + # logger.info(f"Export code: {static_model.forward.code}") + # paddle.jit.save(static_model, self.args.export_path) + + # # to check outputs + # def flatten(out): + # if isinstance(out, paddle.Tensor): + # return [out] + + # flatten_out = [] + # for var in out: + # if isinstance(var, (list, tuple)): + # flatten_out.extend(flatten(var)) + # else: + # flatten_out.append(var) + # return flatten_out + + + # ######################### infer_model.forward_attention_decoder ######################## + # a = paddle.full(shape=[10, 8], fill_value=10, dtype='int64') + # b = paddle.full(shape=[10], fill_value=8, dtype='int64') + # # c = paddle.rand(shape=[1, 20, 512], dtype='float32') + # c = paddle.full(shape=[1, 20, 512], fill_value=1, dtype='float32') + + # out1 = infer_model.forward_attention_decoder(a, b, c) + # print(out1) + + # input_spec = [paddle.static.InputSpec(shape=[None, None], dtype='int64'), + # paddle.static.InputSpec(shape=[None], dtype='int64'), + # paddle.static.InputSpec(shape=[1, None, 512], dtype='float32')] + # static_model = paddle.jit.to_static(infer_model.forward_attention_decoder, input_spec=input_spec) + # paddle.jit.save(static_model, self.args.export_path) + # static_model = paddle.jit.load(self.args.export_path) + # out2 = static_model(a, b, c) + # # print(out2) + + # out1 = flatten(out1) + # out2 = flatten(out2) + # for i in range(len(out1)): + # print(np.equal(out1[i].numpy(), out2[i].numpy()).all()) + + + + + + + # ######################### infer_model.forward_encoder_chunk ######################## + # xs = paddle.rand(shape=[1, 67, 80], dtype='float32') + # offset = paddle.to_tensor([80], dtype='int32') + # required_cache_size = -16 + # att_cache = paddle.randn(shape=[12, 8, 80, 128], dtype='float32') + # cnn_cache = paddle.randn(shape=[12, 1, 512, 14], dtype='float32') + # # out1 = infer_model.forward_encoder_chunk(xs, offset, required_cache_size, att_cache, cnn_cache) + # # print(out1) + # zero_out1 = infer_model.forward_encoder_chunk(xs, offset, required_cache_size, att_cache=paddle.zeros([0, 0, 0, 0]), cnn_cache=paddle.zeros([0, 0, 0, 0])) + # # print(zero_out1) + + # input_spec = [ + # paddle.static.InputSpec(shape=[1, None, 80], dtype='float32'), + # paddle.static.InputSpec(shape=[1], dtype='int32'), + # -16, + # paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32'), + # paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32')] + # static_model = paddle.jit.to_static(infer_model.forward_encoder_chunk, input_spec=input_spec) + # 
paddle.jit.save(static_model, self.args.export_path) + # static_model = paddle.jit.load(self.args.export_path) + # # out2 = static_model(xs, offset, att_cache, cnn_cache) + # # print(out2) + # zero_out2 = static_model(xs, offset, paddle.zeros([0, 0, 0, 0]), paddle.zeros([0, 0, 0, 0])) + + # # out1 = flatten(out1) + # # out2 = flatten(out2) + # # for i in range(len(out1)): + # # print(np.equal(out1[i].numpy(), out2[i].numpy()).all()) + + # zero_out1 = flatten(zero_out1) + # zero_out2 = flatten(zero_out2) + # for i in range(len(zero_out1)): + # print(np.equal(zero_out1[i].numpy(), zero_out2[i].numpy()).all()) + + + + + + + + # ######################### infer_model.forward_encoder_chunk zero Tensor online ######################## + # xs1 = paddle.rand(shape=[1, 67, 80], dtype='float32') + # offset = paddle.to_tensor([0], dtype='int32') + # required_cache_size = -16 + # att_cache = paddle.zeros([0, 0, 0, 0]) + # cnn_cache=paddle.zeros([0, 0, 0, 0]) + + # xs, att_cache, cnn_cache = infer_model.forward_encoder_chunk(xs1, offset, required_cache_size, att_cache, cnn_cache) + # xs2 = paddle.rand(shape=[1, 67, 80], dtype='float32') + # offset = paddle.to_tensor([16], dtype='int32') + # out1 = infer_model.forward_encoder_chunk(xs2, offset, required_cache_size, att_cache, cnn_cache) + # # print(out1) + + # input_spec = [ + # paddle.static.InputSpec(shape=[1, None, 80], dtype='float32'), + # paddle.static.InputSpec(shape=[1], dtype='int32'), + # -16, + # paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32'), + # paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32')] + # static_model = paddle.jit.to_static(infer_model.forward_encoder_chunk, input_spec=input_spec) + # paddle.jit.save(static_model, self.args.export_path) + # static_model = paddle.jit.load(self.args.export_path) + + # offset = paddle.to_tensor([0], dtype='int32') + # att_cache = paddle.zeros([0, 0, 0, 0]) + # cnn_cache=paddle.zeros([0, 0, 0, 0]) + # xs, att_cache, cnn_cache = static_model(xs1, offset, att_cache, cnn_cache) + # xs = paddle.rand(shape=[1, 67, 80], dtype='float32') + # offset = paddle.to_tensor([16], dtype='int32') + # out2 = static_model(xs2, offset, att_cache, cnn_cache) + # # print(out2) + + # out1 = flatten(out1) + # out2 = flatten(out2) + # for i in range(len(out1)): + # print(np.equal(out1[i].numpy(), out2[i].numpy()).all()) + + + + + + + + ###################### save/load combine ######################## + paddle.jit.save(infer_model, '/workspace/conformer/PaddleSpeech-conformer/conformer/conformer', combine_params=True) + + + # xs1 = paddle.rand(shape=[1, 67, 80], dtype='float32') + # offset = paddle.to_tensor([0], dtype='int32') + # required_cache_size = -16 + # att_cache = paddle.zeros([0, 0, 0, 0]) + # cnn_cache=paddle.zeros([0, 0, 0, 0]) + + # xs, att_cache, cnn_cache = infer_model.forward_encoder_chunk(xs1, offset, required_cache_size, att_cache, cnn_cache) + # xs2 = paddle.rand(shape=[1, 67, 80], dtype='float32') + # offset = paddle.to_tensor([16], dtype='int32') + # out1 = infer_model.forward_encoder_chunk(xs2, offset, required_cache_size, att_cache, cnn_cache) + # # print(out1) + + + # from paddle.jit.layer import Layer + # layer = Layer() + # layer.load('/workspace/conformer/PaddleSpeech-conformer/conformer/conformer', paddle.CUDAPlace(0)) + + # offset = paddle.to_tensor([0], dtype='int32') + # att_cache = paddle.zeros([0, 0, 0, 0]) + # cnn_cache=paddle.zeros([0, 0, 0, 0]) + # xs, att_cache, cnn_cache = layer.forward_encoder_chunk(xs1, offset, att_cache, cnn_cache) + # 
offset = paddle.to_tensor([16], dtype='int32') + # out2 = layer.forward_encoder_chunk(xs2, offset, att_cache, cnn_cache) + # # print(out2) + + # out1 = flatten(out1) + # out2 = flatten(out2) + # for i in range(len(out1)): + # print(np.equal(out1[i].numpy(), out2[i].numpy()).all()) \ No newline at end of file diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 76f698e64..9148c7372 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -59,6 +59,20 @@ __all__ = ["U2Model", "U2InferModel"] logger = Log(__name__).getlog() +# input_spec1 = [paddle.static.InputSpec(shape=[None, None], dtype='int64'), +# paddle.static.InputSpec(shape=[None], dtype='int64'), +# paddle.static.InputSpec(shape=[1, None, 512], dtype='float32')] + +# input_spec2 = [ +# paddle.static.InputSpec(shape=[1, None, 80], dtype='float32'), +# paddle.static.InputSpec(shape=[1], dtype='int32'), +# -16, +# paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32'), +# paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32')] + +# input_spec3 = [paddle.static.InputSpec(shape=[1, 1, 1], dtype='int64'), +# paddle.static.InputSpec(shape=[1], dtype='int64')] + class U2BaseModel(ASRInterface, nn.Layer): """CTC-Attention hybrid Encoder-Decoder model""" @@ -599,7 +613,12 @@ class U2BaseModel(ASRInterface, nn.Layer): """ return self.eos - @jit.to_static + @jit.to_static(input_spec=[ + paddle.static.InputSpec(shape=[1, None, 80], dtype='float32'), + paddle.static.InputSpec(shape=[1], dtype='int32'), + -16, + paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32'), + paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32')]) def forward_encoder_chunk( self, xs: paddle.Tensor, @@ -655,7 +674,10 @@ class U2BaseModel(ASRInterface, nn.Layer): """ return self.ctc.log_softmax(xs) - @jit.to_static + @jit.to_static(input_spec=[ + paddle.static.InputSpec(shape=[None, None], dtype='int64'), + paddle.static.InputSpec(shape=[None], dtype='int64'), + paddle.static.InputSpec(shape=[1, None, 512], dtype='float32')]) def forward_attention_decoder( self, hyps: paddle.Tensor, @@ -918,6 +940,9 @@ class U2InferModel(U2Model): def __init__(self, configs: dict): super().__init__(configs) + @jit.to_static(input_spec=[ + paddle.static.InputSpec(shape=[1, 1, 1], dtype='int64'), + paddle.static.InputSpec(shape=[1], dtype='int64')]) def forward(self, feats, feats_lengths, @@ -933,9 +958,10 @@ class U2InferModel(U2Model): Returns: List[List[int]]: best path result """ - return self.ctc_greedy_search( - feats, - feats_lengths, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks, - simulate_streaming=simulate_streaming) + # return self.ctc_greedy_search( + # feats, + # feats_lengths, + # decoding_chunk_size=decoding_chunk_size, + # num_decoding_left_chunks=num_decoding_left_chunks, + # simulate_streaming=simulate_streaming) + return feats, feats_lengths diff --git a/paddlespeech/server/engine/asr/online/python/asr_engine.py b/paddlespeech/server/engine/asr/online/python/asr_engine.py index 4df38f09d..cd50f157a 100644 --- a/paddlespeech/server/engine/asr/online/python/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/python/asr_engine.py @@ -80,6 +80,10 @@ class PaddleASRConnectionHanddler: self.init_decoder() self.reset() + from paddle.jit.layer import Layer + self.jit_layer = Layer() + self.jit_layer.load('/workspace/conformer/PaddleSpeech-conformer/conformer/conformer', paddle.CUDAPlace(1)) 
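+        # NOTE: paddle.jit.layer.Layer loads the functions saved by this
+        # patch's export() via paddle.jit.save(..., combine_params=True).
+        # The hard-coded model path and paddle.CUDAPlace(1) here are
+        # debug-only placeholders and should come from the engine config.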
+ def init_decoder(self): if "deepspeech2" in self.model_type: assert self.continuous_decoding is False, "ds2 model not support endpoint" @@ -474,9 +478,16 @@ class PaddleASRConnectionHanddler: # cur chunk chunk_xs = self.cached_feat[:, cur:end, :] # forward chunk - (y, self.att_cache, self.cnn_cache) = self.model.encoder.forward_chunk( - chunk_xs, self.offset, required_cache_size, - self.att_cache, self.cnn_cache) + # (y, self.att_cache, self.cnn_cache) = self.model.encoder.forward_chunk( + # chunk_xs, self.offset, required_cache_size, + # self.att_cache, self.cnn_cache) + + (y, self.att_cache, self.cnn_cache) = self.jit_layer.forward_encoder_chunk( + chunk_xs, + paddle.to_tensor([self.offset], dtype='int32'), + self.att_cache, + self.cnn_cache) + outputs.append(y) # update the global offset, in decoding frame unit From e5a6c243f1f53ea3d3d28a957010db98cdcd6db4 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 1 Aug 2022 08:03:04 +0000 Subject: [PATCH 002/113] fix jit save for conformer --- paddlespeech/s2t/exps/u2/model.py | 205 ++++++------------------------ paddlespeech/s2t/models/u2/u2.py | 62 ++++----- 2 files changed, 62 insertions(+), 205 deletions(-) diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index b41f320b4..141e83bce 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -25,8 +25,6 @@ import paddle from paddle import distributed as dist from paddlespeech.s2t.frontend.featurizer import TextFeaturizer -from paddlespeech.s2t.io.dataloader import BatchDataLoader -from paddlespeech.s2t.io.dataloader import StreamDataLoader from paddlespeech.s2t.io.dataloader import DataLoaderFactory from paddlespeech.s2t.models.u2 import U2Model from paddlespeech.s2t.training.optimizer import OptimizerFactory @@ -109,7 +107,8 @@ class U2Trainer(Trainer): def valid(self): self.model.eval() if not self.use_streamdata: - logger.info(f"Valid Total Examples: {len(self.valid_loader.dataset)}") + logger.info( + f"Valid Total Examples: {len(self.valid_loader.dataset)}") valid_losses = defaultdict(list) num_seen_utts = 1 total_loss = 0.0 @@ -136,7 +135,8 @@ class U2Trainer(Trainer): msg += "epoch: {}, ".format(self.epoch) msg += "step: {}, ".format(self.iteration) if not self.use_streamdata: - msg += "batch: {}/{}, ".format(i + 1, len(self.valid_loader)) + msg += "batch: {}/{}, ".format(i + 1, + len(self.valid_loader)) msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in valid_dump.items()) logger.info(msg) @@ -157,7 +157,8 @@ class U2Trainer(Trainer): self.before_train() if not self.use_streamdata: - logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") + logger.info( + f"Train Total Examples: {len(self.train_loader.dataset)}") while self.epoch < self.config.n_epoch: with Timer("Epoch-Train Time Cost: {}"): self.model.train() @@ -225,14 +226,18 @@ class U2Trainer(Trainer): config = self.config.clone() self.use_streamdata = config.get("use_stream_data", False) if self.train: - self.train_loader = DataLoaderFactory.get_dataloader('train', config, self.args) - self.valid_loader = DataLoaderFactory.get_dataloader('valid', config, self.args) + self.train_loader = DataLoaderFactory.get_dataloader( + 'train', config, self.args) + self.valid_loader = DataLoaderFactory.get_dataloader( + 'valid', config, self.args) logger.info("Setup train/valid Dataloader!") else: decode_batch_size = config.get('decode', dict()).get( 'decode_batch_size', 1) - self.test_loader = DataLoaderFactory.get_dataloader('test', config, self.args) - 
self.align_loader = DataLoaderFactory.get_dataloader('align', config, self.args) + self.test_loader = DataLoaderFactory.get_dataloader('test', config, + self.args) + self.align_loader = DataLoaderFactory.get_dataloader( + 'align', config, self.args) logger.info("Setup test/align Dataloader!") def setup_model(self): @@ -470,166 +475,30 @@ class U2Tester(U2Trainer): def export(self): infer_model, input_spec = self.load_inferspec() assert isinstance(input_spec, list), type(input_spec) + del input_spec infer_model.eval() - # static_model = paddle.jit.to_static(infer_model, input_spec=input_spec) - # logger.info(f"Export code: {static_model.forward.code}") - # paddle.jit.save(static_model, self.args.export_path) - - # # to check outputs - # def flatten(out): - # if isinstance(out, paddle.Tensor): - # return [out] - - # flatten_out = [] - # for var in out: - # if isinstance(var, (list, tuple)): - # flatten_out.extend(flatten(var)) - # else: - # flatten_out.append(var) - # return flatten_out - - - # ######################### infer_model.forward_attention_decoder ######################## - # a = paddle.full(shape=[10, 8], fill_value=10, dtype='int64') - # b = paddle.full(shape=[10], fill_value=8, dtype='int64') - # # c = paddle.rand(shape=[1, 20, 512], dtype='float32') - # c = paddle.full(shape=[1, 20, 512], fill_value=1, dtype='float32') - - # out1 = infer_model.forward_attention_decoder(a, b, c) - # print(out1) - - # input_spec = [paddle.static.InputSpec(shape=[None, None], dtype='int64'), - # paddle.static.InputSpec(shape=[None], dtype='int64'), - # paddle.static.InputSpec(shape=[1, None, 512], dtype='float32')] - # static_model = paddle.jit.to_static(infer_model.forward_attention_decoder, input_spec=input_spec) - # paddle.jit.save(static_model, self.args.export_path) - # static_model = paddle.jit.load(self.args.export_path) - # out2 = static_model(a, b, c) - # # print(out2) - - # out1 = flatten(out1) - # out2 = flatten(out2) - # for i in range(len(out1)): - # print(np.equal(out1[i].numpy(), out2[i].numpy()).all()) - - - - - - - # ######################### infer_model.forward_encoder_chunk ######################## - # xs = paddle.rand(shape=[1, 67, 80], dtype='float32') - # offset = paddle.to_tensor([80], dtype='int32') - # required_cache_size = -16 - # att_cache = paddle.randn(shape=[12, 8, 80, 128], dtype='float32') - # cnn_cache = paddle.randn(shape=[12, 1, 512, 14], dtype='float32') - # # out1 = infer_model.forward_encoder_chunk(xs, offset, required_cache_size, att_cache, cnn_cache) - # # print(out1) - # zero_out1 = infer_model.forward_encoder_chunk(xs, offset, required_cache_size, att_cache=paddle.zeros([0, 0, 0, 0]), cnn_cache=paddle.zeros([0, 0, 0, 0])) - # # print(zero_out1) - - # input_spec = [ - # paddle.static.InputSpec(shape=[1, None, 80], dtype='float32'), - # paddle.static.InputSpec(shape=[1], dtype='int32'), - # -16, - # paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32'), - # paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32')] - # static_model = paddle.jit.to_static(infer_model.forward_encoder_chunk, input_spec=input_spec) - # paddle.jit.save(static_model, self.args.export_path) - # static_model = paddle.jit.load(self.args.export_path) - # # out2 = static_model(xs, offset, att_cache, cnn_cache) - # # print(out2) - # zero_out2 = static_model(xs, offset, paddle.zeros([0, 0, 0, 0]), paddle.zeros([0, 0, 0, 0])) - - # # out1 = flatten(out1) - # # out2 = flatten(out2) - # # for i in range(len(out1)): - # # print(np.equal(out1[i].numpy(), 
out2[i].numpy()).all()) - - # zero_out1 = flatten(zero_out1) - # zero_out2 = flatten(zero_out2) - # for i in range(len(zero_out1)): - # print(np.equal(zero_out1[i].numpy(), zero_out2[i].numpy()).all()) - - - - - - - - # ######################### infer_model.forward_encoder_chunk zero Tensor online ######################## - # xs1 = paddle.rand(shape=[1, 67, 80], dtype='float32') - # offset = paddle.to_tensor([0], dtype='int32') - # required_cache_size = -16 - # att_cache = paddle.zeros([0, 0, 0, 0]) - # cnn_cache=paddle.zeros([0, 0, 0, 0]) - - # xs, att_cache, cnn_cache = infer_model.forward_encoder_chunk(xs1, offset, required_cache_size, att_cache, cnn_cache) - # xs2 = paddle.rand(shape=[1, 67, 80], dtype='float32') - # offset = paddle.to_tensor([16], dtype='int32') - # out1 = infer_model.forward_encoder_chunk(xs2, offset, required_cache_size, att_cache, cnn_cache) - # # print(out1) - - # input_spec = [ - # paddle.static.InputSpec(shape=[1, None, 80], dtype='float32'), - # paddle.static.InputSpec(shape=[1], dtype='int32'), - # -16, - # paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32'), - # paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32')] - # static_model = paddle.jit.to_static(infer_model.forward_encoder_chunk, input_spec=input_spec) - # paddle.jit.save(static_model, self.args.export_path) - # static_model = paddle.jit.load(self.args.export_path) - - # offset = paddle.to_tensor([0], dtype='int32') - # att_cache = paddle.zeros([0, 0, 0, 0]) - # cnn_cache=paddle.zeros([0, 0, 0, 0]) - # xs, att_cache, cnn_cache = static_model(xs1, offset, att_cache, cnn_cache) - # xs = paddle.rand(shape=[1, 67, 80], dtype='float32') - # offset = paddle.to_tensor([16], dtype='int32') - # out2 = static_model(xs2, offset, att_cache, cnn_cache) - # # print(out2) - - # out1 = flatten(out1) - # out2 = flatten(out2) - # for i in range(len(out1)): - # print(np.equal(out1[i].numpy(), out2[i].numpy()).all()) - - - - - - - - ###################### save/load combine ######################## - paddle.jit.save(infer_model, '/workspace/conformer/PaddleSpeech-conformer/conformer/conformer', combine_params=True) + ######################### infer_model.forward_encoder_chunk zero Tensor online ######################## + input_spec = [ + paddle.static.InputSpec(shape=[1, None, 80], dtype='float32'), + paddle.static.InputSpec(shape=[1], dtype='int32'), -1, + paddle.static.InputSpec( + shape=[None, None, None, None], + dtype='float32'), paddle.static.InputSpec( + shape=[None, None, None, None], dtype='float32') + ] + infer_model.forward_encoder_chunk = paddle.jit.to_static( + infer_model.forward_encoder_chunk, input_spec=input_spec) + # paddle.jit.save(static_model, self.args.export_path, combine_params=True) - # xs1 = paddle.rand(shape=[1, 67, 80], dtype='float32') - # offset = paddle.to_tensor([0], dtype='int32') - # required_cache_size = -16 - # att_cache = paddle.zeros([0, 0, 0, 0]) - # cnn_cache=paddle.zeros([0, 0, 0, 0]) + ######################### infer_model.forward_attention_decoder ######################## + input_spec = [ + paddle.static.InputSpec(shape=[None, None], dtype='int64'), + paddle.static.InputSpec(shape=[None], dtype='int64'), + paddle.static.InputSpec(shape=[1, None, 512], dtype='float32') + ] + infer_model.forward_attention_decoder = paddle.jit.to_static( + infer_model.forward_attention_decoder, input_spec=input_spec) + # paddle.jit.save(static_model, self.args.export_path, combine_params=True) - # xs, att_cache, cnn_cache = 
infer_model.forward_encoder_chunk(xs1, offset, required_cache_size, att_cache, cnn_cache) - # xs2 = paddle.rand(shape=[1, 67, 80], dtype='float32') - # offset = paddle.to_tensor([16], dtype='int32') - # out1 = infer_model.forward_encoder_chunk(xs2, offset, required_cache_size, att_cache, cnn_cache) - # # print(out1) - - - # from paddle.jit.layer import Layer - # layer = Layer() - # layer.load('/workspace/conformer/PaddleSpeech-conformer/conformer/conformer', paddle.CUDAPlace(0)) - - # offset = paddle.to_tensor([0], dtype='int32') - # att_cache = paddle.zeros([0, 0, 0, 0]) - # cnn_cache=paddle.zeros([0, 0, 0, 0]) - # xs, att_cache, cnn_cache = layer.forward_encoder_chunk(xs1, offset, att_cache, cnn_cache) - # offset = paddle.to_tensor([16], dtype='int32') - # out2 = layer.forward_encoder_chunk(xs2, offset, att_cache, cnn_cache) - # # print(out2) - - # out1 = flatten(out1) - # out2 = flatten(out2) - # for i in range(len(out1)): - # print(np.equal(out1[i].numpy(), out2[i].numpy()).all()) \ No newline at end of file + paddle.jit.save(infer_model, './export.jit', combine_params=True) diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 9148c7372..432162aae 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -29,6 +29,9 @@ import paddle from paddle import jit from paddle import nn +from paddlespeech.audio.utils.tensor_utils import add_sos_eos +from paddlespeech.audio.utils.tensor_utils import pad_sequence +from paddlespeech.audio.utils.tensor_utils import th_accuracy from paddlespeech.s2t.decoders.scorers.ctc import CTCPrefixScorer from paddlespeech.s2t.frontend.utility import IGNORE_ID from paddlespeech.s2t.frontend.utility import load_cmvn @@ -48,9 +51,6 @@ from paddlespeech.s2t.utils import checkpoint from paddlespeech.s2t.utils import layer_tools from paddlespeech.s2t.utils.ctc_utils import remove_duplicates_and_blank from paddlespeech.s2t.utils.log import Log -from paddlespeech.audio.utils.tensor_utils import add_sos_eos -from paddlespeech.audio.utils.tensor_utils import pad_sequence -from paddlespeech.audio.utils.tensor_utils import th_accuracy from paddlespeech.s2t.utils.utility import log_add from paddlespeech.s2t.utils.utility import UpdateConfig @@ -59,20 +59,6 @@ __all__ = ["U2Model", "U2InferModel"] logger = Log(__name__).getlog() -# input_spec1 = [paddle.static.InputSpec(shape=[None, None], dtype='int64'), -# paddle.static.InputSpec(shape=[None], dtype='int64'), -# paddle.static.InputSpec(shape=[1, None, 512], dtype='float32')] - -# input_spec2 = [ -# paddle.static.InputSpec(shape=[1, None, 80], dtype='float32'), -# paddle.static.InputSpec(shape=[1], dtype='int32'), -# -16, -# paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32'), -# paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32')] - -# input_spec3 = [paddle.static.InputSpec(shape=[1, 1, 1], dtype='int64'), -# paddle.static.InputSpec(shape=[1], dtype='int64')] - class U2BaseModel(ASRInterface, nn.Layer): """CTC-Attention hybrid Encoder-Decoder model""" @@ -588,44 +574,44 @@ class U2BaseModel(ASRInterface, nn.Layer): best_index = i return hyps[best_index][0] - #@jit.to_static + @jit.to_static(property=True) def subsampling_rate(self) -> int: """ Export interface for c++ call, return subsampling_rate of the model """ return self.encoder.embed.subsampling_rate - #@jit.to_static + @jit.to_static(property=True) def right_context(self) -> int: """ Export interface for c++ call, return right_context of the model """ return 
self.encoder.embed.right_context - #@jit.to_static + @jit.to_static(property=True) def sos_symbol(self) -> int: """ Export interface for c++ call, return sos symbol id of the model """ return self.sos - #@jit.to_static + @jit.to_static(property=True) def eos_symbol(self) -> int: """ Export interface for c++ call, return eos symbol id of the model """ return self.eos - @jit.to_static(input_spec=[ - paddle.static.InputSpec(shape=[1, None, 80], dtype='float32'), - paddle.static.InputSpec(shape=[1], dtype='int32'), - -16, - paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32'), - paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32')]) + # @jit.to_static(input_spec=[ + # paddle.static.InputSpec(shape=[1, None, 80], dtype='float32'), + # paddle.static.InputSpec(shape=[1], dtype='int32'), + # -1, + # paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32'), + # paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32')]) def forward_encoder_chunk( self, xs: paddle.Tensor, offset: int, required_cache_size: int, - att_cache: paddle.Tensor = paddle.zeros([0, 0, 0, 0]), - cnn_cache: paddle.Tensor = paddle.zeros([0, 0, 0, 0]), + att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), + cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: """ Export interface for c++ call, give input chunk xs, and return output from time 0 to current chunk. @@ -660,8 +646,8 @@ class U2BaseModel(ASRInterface, nn.Layer): paddle.Tensor: new conformer cnn cache required for next chunk, with same shape as the original cnn_cache. """ - return self.encoder.forward_chunk( - xs, offset, required_cache_size, att_cache, cnn_cache) + return self.encoder.forward_chunk(xs, offset, required_cache_size, + att_cache, cnn_cache) # @jit.to_static def ctc_activation(self, xs: paddle.Tensor) -> paddle.Tensor: @@ -674,10 +660,10 @@ class U2BaseModel(ASRInterface, nn.Layer): """ return self.ctc.log_softmax(xs) - @jit.to_static(input_spec=[ - paddle.static.InputSpec(shape=[None, None], dtype='int64'), - paddle.static.InputSpec(shape=[None], dtype='int64'), - paddle.static.InputSpec(shape=[1, None, 512], dtype='float32')]) + # @jit.to_static(input_spec=[ + # paddle.static.InputSpec(shape=[None, None], dtype='int64'), + # paddle.static.InputSpec(shape=[None], dtype='int64'), + # paddle.static.InputSpec(shape=[1, None, 512], dtype='float32')]) def forward_attention_decoder( self, hyps: paddle.Tensor, @@ -941,8 +927,9 @@ class U2InferModel(U2Model): super().__init__(configs) @jit.to_static(input_spec=[ - paddle.static.InputSpec(shape=[1, 1, 1], dtype='int64'), - paddle.static.InputSpec(shape=[1], dtype='int64')]) + paddle.static.InputSpec(shape=[1, 1, 1], dtype='int64'), + paddle.static.InputSpec(shape=[1], dtype='int64') + ]) def forward(self, feats, feats_lengths, @@ -958,6 +945,7 @@ class U2InferModel(U2Model): Returns: List[List[int]]: best path result """ + # dummy code for dy2st # return self.ctc_greedy_search( # feats, # feats_lengths, From d3572be0bb37cd2265691bbfe73c6c550d33f162 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 1 Aug 2022 08:06:25 +0000 Subject: [PATCH 003/113] add ws export.sh --- examples/wenetspeech/asr1/local/export.sh | 28 +++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100755 examples/wenetspeech/asr1/local/export.sh diff --git a/examples/wenetspeech/asr1/local/export.sh b/examples/wenetspeech/asr1/local/export.sh new file mode 100755 index 000000000..6b646b469 --- 
/dev/null +++ b/examples/wenetspeech/asr1/local/export.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +if [ $# != 3 ];then + echo "usage: $0 config_path ckpt_prefix jit_model_path" + exit -1 +fi + +ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +echo "using $ngpu gpus..." + +config_path=$1 +ckpt_path_prefix=$2 +jit_model_export_path=$3 + +python3 -u ${BIN_DIR}/export.py \ +--ngpu ${ngpu} \ +--config ${config_path} \ +--checkpoint_path ${ckpt_path_prefix} \ +--export_path ${jit_model_export_path} + + +if [ $? -ne 0 ]; then + echo "Failed in export!" + exit 1 +fi + + +exit 0 From 6149daa22142d7be2f252b9590b2728a5ec72a10 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 3 Aug 2022 08:38:43 +0000 Subject: [PATCH 004/113] export ctc_activation --- paddlespeech/s2t/exps/u2/model.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index 141e83bce..fdccdf159 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -478,7 +478,8 @@ class U2Tester(U2Trainer): del input_spec infer_model.eval() - ######################### infer_model.forward_encoder_chunk zero Tensor online ######################## + ######################### infer_model.forward_encoder_chunk zero Tensor online ############ + # TODO: 80(feature dim) be configable input_spec = [ paddle.static.InputSpec(shape=[1, None, 80], dtype='float32'), paddle.static.InputSpec(shape=[1], dtype='int32'), -1, @@ -492,6 +493,7 @@ class U2Tester(U2Trainer): # paddle.jit.save(static_model, self.args.export_path, combine_params=True) ######################### infer_model.forward_attention_decoder ######################## + # TODO: 512(encoder_output) be configable. 1 for B input_spec = [ paddle.static.InputSpec(shape=[None, None], dtype='int64'), paddle.static.InputSpec(shape=[None], dtype='int64'), @@ -501,4 +503,12 @@ class U2Tester(U2Trainer): infer_model.forward_attention_decoder, input_spec=input_spec) # paddle.jit.save(static_model, self.args.export_path, combine_params=True) + ######################### infer_model.ctc_activation ######################## + # TODO: 512(encoder_output) be configable + input_spec = [ + paddle.static.InputSpec(shape=[1, None, 512], dtype='float32') + ] + infer_model.ctc_activation = paddle.jit.to_static( + infer_model.ctc_activation, input_spec=input_spec) + paddle.jit.save(infer_model, './export.jit', combine_params=True) From 05bc25883333d80a7ee1a5ec1314a1b81f57a81c Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 3 Aug 2022 09:17:23 +0000 Subject: [PATCH 005/113] update docstring --- paddlespeech/s2t/models/u2/u2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index ca83ca170..e4c667e00 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -654,7 +654,7 @@ class U2BaseModel(ASRInterface, nn.Layer): Args: xs (paddle.Tensor): encoder output, (B, T, D) Returns: - paddle.Tensor: activation before ctc + paddle.Tensor: activation before ctc. 
(B, Tmax, odim) """ return self.ctc.log_softmax(xs) From c1fbfe928ec386eefa805c9215a369fc83b9b9fc Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 4 Aug 2022 03:22:14 +0000 Subject: [PATCH 006/113] add test --- paddlespeech/s2t/exps/u2/model.py | 49 +++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index fdccdf159..5ce5f50bf 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -512,3 +512,52 @@ class U2Tester(U2Trainer): infer_model.ctc_activation, input_spec=input_spec) paddle.jit.save(infer_model, './export.jit', combine_params=True) + + def flatten(out): + if isinstance(out, paddle.Tensor): + return [out] + + flatten_out = [] + for var in out: + if isinstance(var, (list, tuple)): + flatten_out.extend(flatten(var)) + else: + flatten_out.append(var) + return flatten_out + + xs1 = paddle.rand(shape=[1, 67, 80], dtype='float32') + offset = paddle.to_tensor([0], dtype='int32') + required_cache_size = -16 + att_cache = paddle.zeros([0, 0, 0, 0]) + cnn_cache = paddle.zeros([0, 0, 0, 0]) + + # xs, att_cache, cnn_cache = infer_model.forward_encoder_chunk(xs1, offset, required_cache_size, att_cache, cnn_cache) + # xs2 = paddle.rand(shape=[1, 67, 80], dtype='float32') + # offset = paddle.to_tensor([16], dtype='int32') + # out1 = infer_model.forward_encoder_chunk(xs2, offset, required_cache_size, att_cache, cnn_cache) + # print(out1) + + xs, att_cache, cnn_cache = infer_model.forward_encoder_chunk( + xs1, offset, att_cache, cnn_cache) + xs2 = paddle.rand(shape=[1, 67, 80], dtype='float32') + offset = paddle.to_tensor([16], dtype='int32') + out1 = infer_model.forward_encoder_chunk(xs2, offset, att_cache, + cnn_cache) + print(out1) + + # from paddle.jit.layer import Layer + # layer = Layer() + # layer.load('./export.jit', paddle.CPUPlace()) + + # offset = paddle.to_tensor([0], dtype='int32') + # att_cache = paddle.zeros([0, 0, 0, 0]) + # cnn_cache=paddle.zeros([0, 0, 0, 0]) + # xs, att_cache, cnn_cache = layer.forward_encoder_chunk(xs1, offset, att_cache, cnn_cache) + # offset = paddle.to_tensor([16], dtype='int32') + # out2 = layer.forward_encoder_chunk(xs2, offset, att_cache, cnn_cache) + # # print(out2) + + # out1 = flatten(out1) + # out2 = flatten(out2) + # for i in range(len(out1)): + # print(np.equal(out1[i].numpy(), out2[i].numpy()).all()) From d638325c46e7092fcdb48ee7605c9c79f498bb1f Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 9 Sep 2022 15:09:29 +0000 Subject: [PATCH 007/113] do not jit save forward; using slice for zeros([0,0,0,0]) tensor --- paddlespeech/s2t/exps/u2/model.py | 51 +++++++++++------------------ paddlespeech/s2t/models/u2/u2.py | 4 --- paddlespeech/s2t/modules/encoder.py | 5 +-- 3 files changed, 23 insertions(+), 37 deletions(-) diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index 5ce5f50bf..66b95f63c 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -482,10 +482,12 @@ class U2Tester(U2Trainer): # TODO: 80(feature dim) be configable input_spec = [ paddle.static.InputSpec(shape=[1, None, 80], dtype='float32'), - paddle.static.InputSpec(shape=[1], dtype='int32'), -1, + paddle.static.InputSpec(shape=[1], dtype='int32'), + -1, paddle.static.InputSpec( shape=[None, None, None, None], - dtype='float32'), paddle.static.InputSpec( + dtype='float32'), + paddle.static.InputSpec( shape=[None, None, None, None], dtype='float32') ] infer_model.forward_encoder_chunk = 
paddle.jit.to_static( @@ -511,7 +513,7 @@ class U2Tester(U2Trainer): infer_model.ctc_activation = paddle.jit.to_static( infer_model.ctc_activation, input_spec=input_spec) - paddle.jit.save(infer_model, './export.jit', combine_params=True) + paddle.jit.save(infer_model, './export.jit', combine_params=True, skip_forward=True) def flatten(out): if isinstance(out, paddle.Tensor): @@ -531,33 +533,20 @@ class U2Tester(U2Trainer): att_cache = paddle.zeros([0, 0, 0, 0]) cnn_cache = paddle.zeros([0, 0, 0, 0]) - # xs, att_cache, cnn_cache = infer_model.forward_encoder_chunk(xs1, offset, required_cache_size, att_cache, cnn_cache) - # xs2 = paddle.rand(shape=[1, 67, 80], dtype='float32') - # offset = paddle.to_tensor([16], dtype='int32') - # out1 = infer_model.forward_encoder_chunk(xs2, offset, required_cache_size, att_cache, cnn_cache) - # print(out1) - - xs, att_cache, cnn_cache = infer_model.forward_encoder_chunk( - xs1, offset, att_cache, cnn_cache) + xs, att_cache, cnn_cache = infer_model.forward_encoder_chunk(xs1, offset, required_cache_size, att_cache, cnn_cache) xs2 = paddle.rand(shape=[1, 67, 80], dtype='float32') offset = paddle.to_tensor([16], dtype='int32') - out1 = infer_model.forward_encoder_chunk(xs2, offset, att_cache, - cnn_cache) - print(out1) - - # from paddle.jit.layer import Layer - # layer = Layer() - # layer.load('./export.jit', paddle.CPUPlace()) - - # offset = paddle.to_tensor([0], dtype='int32') - # att_cache = paddle.zeros([0, 0, 0, 0]) - # cnn_cache=paddle.zeros([0, 0, 0, 0]) - # xs, att_cache, cnn_cache = layer.forward_encoder_chunk(xs1, offset, att_cache, cnn_cache) - # offset = paddle.to_tensor([16], dtype='int32') - # out2 = layer.forward_encoder_chunk(xs2, offset, att_cache, cnn_cache) - # # print(out2) - - # out1 = flatten(out1) - # out2 = flatten(out2) - # for i in range(len(out1)): - # print(np.equal(out1[i].numpy(), out2[i].numpy()).all()) + out1 = infer_model.forward_encoder_chunk(xs2, offset, required_cache_size, att_cache, cnn_cache) + print('py encoder', out1) + + from paddle.jit.layer import Layer + layer = Layer() + layer.load('./export.jit', paddle.CPUPlace()) + + xs1 = paddle.full([1, 7, 80], 0.1, dtype='float32') + offset = paddle.to_tensor([0], dtype='int32') + att_cache = paddle.zeros([0, 0, 0, 0]) + cnn_cache=paddle.zeros([0, 0, 0, 0]) + func = getattr(layer, 'forward_encoder_chunk') + xs, att_cache, cnn_cache = func(xs1, offset, att_cache, cnn_cache) + print('py static encoder', xs) diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index e4c667e00..a1daccf18 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -924,10 +924,6 @@ class U2InferModel(U2Model): def __init__(self, configs: dict): super().__init__(configs) - @jit.to_static(input_spec=[ - paddle.static.InputSpec(shape=[1, 1, 1], dtype='int64'), - paddle.static.InputSpec(shape=[1], dtype='int64') - ]) def forward(self, feats, feats_lengths, diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py index bff2d69bb..a7919bca4 100644 --- a/paddlespeech/s2t/modules/encoder.py +++ b/paddlespeech/s2t/modules/encoder.py @@ -251,10 +251,11 @@ class BaseEncoder(nn.Layer): for i, layer in enumerate(self.encoders): # att_cache[i:i+1] = (1, head, cache_t1, d_k*2) # cnn_cache[i:i+1] = (1, B=1, hidden-dim, cache_t2) + # zeros([0,0,0,0]) support [i:i+1] slice xs, _, new_att_cache, new_cnn_cache = layer( xs, att_mask, pos_emb, - att_cache=att_cache[i:i+1] if elayers > 0 else att_cache, - cnn_cache=cnn_cache[i:i+1] if 
paddle.shape(cnn_cache)[0] > 0 else cnn_cache, + att_cache=att_cache[i:i+1], + cnn_cache=cnn_cache[i:i+1], ) # new_att_cache = (1, head, attention_key_size, d_k*2) # new_cnn_cache = (B=1, hidden-dim, cache_t2) From a7c6c54e75575ffddcae18ae353c858006653cb9 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 9 Sep 2022 15:20:28 +0000 Subject: [PATCH 008/113] fix --- .../server/engine/asr/online/python/asr_engine.py | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/paddlespeech/server/engine/asr/online/python/asr_engine.py b/paddlespeech/server/engine/asr/online/python/asr_engine.py index cd50f157a..e3cbd38f3 100644 --- a/paddlespeech/server/engine/asr/online/python/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/python/asr_engine.py @@ -80,9 +80,6 @@ class PaddleASRConnectionHanddler: self.init_decoder() self.reset() - from paddle.jit.layer import Layer - self.jit_layer = Layer() - self.jit_layer.load('/workspace/conformer/PaddleSpeech-conformer/conformer/conformer', paddle.CUDAPlace(1)) def init_decoder(self): if "deepspeech2" in self.model_type: @@ -478,15 +475,9 @@ class PaddleASRConnectionHanddler: # cur chunk chunk_xs = self.cached_feat[:, cur:end, :] # forward chunk - # (y, self.att_cache, self.cnn_cache) = self.model.encoder.forward_chunk( - # chunk_xs, self.offset, required_cache_size, - # self.att_cache, self.cnn_cache) - - (y, self.att_cache, self.cnn_cache) = self.jit_layer.forward_encoder_chunk( - chunk_xs, - paddle.to_tensor([self.offset], dtype='int32'), - self.att_cache, - self.cnn_cache) + (y, self.att_cache, self.cnn_cache) = self.model.encoder.forward_chunk( + chunk_xs, self.offset, required_cache_size, + self.att_cache, self.cnn_cache) outputs.append(y) From 63aeb747b0be474140fc4b9f6808403b05d1cf84 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 9 Sep 2022 15:29:55 +0000 Subject: [PATCH 009/113] more comment --- paddlespeech/s2t/exps/u2/model.py | 10 +++++----- paddlespeech/s2t/modules/encoder.py | 7 ++++++- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index 66b95f63c..1d813761d 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -492,10 +492,9 @@ class U2Tester(U2Trainer): ] infer_model.forward_encoder_chunk = paddle.jit.to_static( infer_model.forward_encoder_chunk, input_spec=input_spec) - # paddle.jit.save(static_model, self.args.export_path, combine_params=True) ######################### infer_model.forward_attention_decoder ######################## - # TODO: 512(encoder_output) be configable. 1 for B + # TODO: 512(encoder_output) be configable. 
1 for BatchSize input_spec = [ paddle.static.InputSpec(shape=[None, None], dtype='int64'), paddle.static.InputSpec(shape=[None], dtype='int64'), @@ -503,7 +502,6 @@ class U2Tester(U2Trainer): ] infer_model.forward_attention_decoder = paddle.jit.to_static( infer_model.forward_attention_decoder, input_spec=input_spec) - # paddle.jit.save(static_model, self.args.export_path, combine_params=True) ######################### infer_model.ctc_activation ######################## # TODO: 512(encoder_output) be configable @@ -513,8 +511,10 @@ class U2Tester(U2Trainer): infer_model.ctc_activation = paddle.jit.to_static( infer_model.ctc_activation, input_spec=input_spec) - paddle.jit.save(infer_model, './export.jit', combine_params=True, skip_forward=True) + # jit save + paddle.jit.save(infer_model, self.args.export_path, combine_params=True, skip_forward=True) + # test dy2static def flatten(out): if isinstance(out, paddle.Tensor): return [out] @@ -541,7 +541,7 @@ class U2Tester(U2Trainer): from paddle.jit.layer import Layer layer = Layer() - layer.load('./export.jit', paddle.CPUPlace()) + layer.load(self.args.export_path, paddle.CPUPlace()) xs1 = paddle.full([1, 7, 80], 0.1, dtype='float32') offset = paddle.to_tensor([0], dtype='int32') diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py index a7919bca4..230894d50 100644 --- a/paddlespeech/s2t/modules/encoder.py +++ b/paddlespeech/s2t/modules/encoder.py @@ -251,7 +251,12 @@ class BaseEncoder(nn.Layer): for i, layer in enumerate(self.encoders): # att_cache[i:i+1] = (1, head, cache_t1, d_k*2) # cnn_cache[i:i+1] = (1, B=1, hidden-dim, cache_t2) - # zeros([0,0,0,0]) support [i:i+1] slice + + # WARNING: eliminate if-else cond op in graph + # tensor zeros([0,0,0,0]) support [i:i+1] slice, will return zeros([0,0,0,0]) tensor + # raw code as below: + # att_cache=att_cache[i:i+1] if elayers > 0 else att_cache, + # cnn_cache=cnn_cache[i:i+1] if paddle.shape(cnn_cache)[0] > 0 else cnn_cache, xs, _, new_att_cache, new_cnn_cache = layer( xs, att_mask, pos_emb, att_cache=att_cache[i:i+1], From 1c9f238ba09e55b26b3b0c46033436ed27eb9613 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 9 Sep 2022 15:45:26 +0000 Subject: [PATCH 010/113] configurable export --- paddlespeech/s2t/exps/u2/model.py | 37 +++++++++++++++++++------------ 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index 1d813761d..45fbcb404 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -462,31 +462,37 @@ class U2Tester(U2Trainer): infer_model = U2InferModel.from_pretrained(self.test_loader, self.config.clone(), self.args.checkpoint_path) + + batch_size = 1 feat_dim = self.test_loader.feat_dim - input_spec = [ - paddle.static.InputSpec(shape=[1, None, feat_dim], - dtype='float32'), # audio, [B,T,D] - paddle.static.InputSpec(shape=[1], - dtype='int64'), # audio_length, [B] - ] - return infer_model, input_spec + model_size = 512 + num_left_chunks = -1 + + return infer_model, (batch_size, feat_dim, model_size, num_left_chunks) @paddle.no_grad() def export(self): infer_model, input_spec = self.load_inferspec() - assert isinstance(input_spec, list), type(input_spec) - del input_spec infer_model.eval() - ######################### infer_model.forward_encoder_chunk zero Tensor online ############ + assert isinstance(input_spec, list), type(input_spec) + batch_size, feat_dim, model_size, num_left_chunks = input_spec + + + ######################### 
infer_model.forward_encoder_chunk zero tensor online ############ # TODO: 80(feature dim) be configable input_spec = [ - paddle.static.InputSpec(shape=[1, None, 80], dtype='float32'), + # xs, (B, T, D) + paddle.static.InputSpec(shape=[batch_size, None, feat_dim], dtype='float32'), + # offset, int, but need be tensor paddle.static.InputSpec(shape=[1], dtype='int32'), - -1, + # required_cache_size, int + num_left_chunks, + # att_cache paddle.static.InputSpec( shape=[None, None, None, None], dtype='float32'), + # cnn_cache paddle.static.InputSpec( shape=[None, None, None, None], dtype='float32') ] @@ -496,9 +502,12 @@ class U2Tester(U2Trainer): ######################### infer_model.forward_attention_decoder ######################## # TODO: 512(encoder_output) be configable. 1 for BatchSize input_spec = [ + # hyps, (B, U) paddle.static.InputSpec(shape=[None, None], dtype='int64'), + # hyps_lens, (B,) paddle.static.InputSpec(shape=[None], dtype='int64'), - paddle.static.InputSpec(shape=[1, None, 512], dtype='float32') + # encoder_out, (B,T,D) + paddle.static.InputSpec(shape=[batch_size, None, model_size], dtype='float32') ] infer_model.forward_attention_decoder = paddle.jit.to_static( infer_model.forward_attention_decoder, input_spec=input_spec) @@ -529,7 +538,7 @@ class U2Tester(U2Trainer): xs1 = paddle.rand(shape=[1, 67, 80], dtype='float32') offset = paddle.to_tensor([0], dtype='int32') - required_cache_size = -16 + required_cache_size = num_left_chunks att_cache = paddle.zeros([0, 0, 0, 0]) cnn_cache = paddle.zeros([0, 0, 0, 0]) From 3a8869fba496ecfbb153a094feae18ac1ce28fc9 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 9 Sep 2022 15:50:11 +0000 Subject: [PATCH 011/113] rm to_static decarator; configure jit save for ctc_activation --- paddlespeech/s2t/exps/u2/model.py | 4 ++-- paddlespeech/s2t/models/u2/u2.py | 12 ++---------- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index 45fbcb404..dae618db6 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -513,9 +513,9 @@ class U2Tester(U2Trainer): infer_model.forward_attention_decoder, input_spec=input_spec) ######################### infer_model.ctc_activation ######################## - # TODO: 512(encoder_output) be configable input_spec = [ - paddle.static.InputSpec(shape=[1, None, 512], dtype='float32') + # encoder_out, (B,T,D) + paddle.static.InputSpec(shape=[batch_size, None, model_size], dtype='float32') ] infer_model.ctc_activation = paddle.jit.to_static( infer_model.ctc_activation, input_spec=input_spec) diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index a1daccf18..149170ed6 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -599,12 +599,7 @@ class U2BaseModel(ASRInterface, nn.Layer): """ return self.eos - # @jit.to_static(input_spec=[ - # paddle.static.InputSpec(shape=[1, None, 80], dtype='float32'), - # paddle.static.InputSpec(shape=[1], dtype='int32'), - # -1, - # paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32'), - # paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32')]) + # @jit.to_static def forward_encoder_chunk( self, xs: paddle.Tensor, @@ -658,10 +653,7 @@ class U2BaseModel(ASRInterface, nn.Layer): """ return self.ctc.log_softmax(xs) - # @jit.to_static(input_spec=[ - # paddle.static.InputSpec(shape=[None, None], dtype='int64'), - # paddle.static.InputSpec(shape=[None], dtype='int64'), - 
# paddle.static.InputSpec(shape=[1, None, 512], dtype='float32')]) + # @jit.to_static def forward_attention_decoder( self, hyps: paddle.Tensor, From 67709155e9f17e03579c7360882e2e92b65ad7c1 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 13 Sep 2022 08:29:21 +0000 Subject: [PATCH 012/113] add chunk conformer config from release model --- .../asr1/conf/chunk_conformer.yaml | 99 +++++++++++++++++++ .../wenetspeech/asr1/conf/preprocess.yaml | 2 +- .../asr1/conf/tuning/chunk_decode.yaml | 11 +++ 3 files changed, 111 insertions(+), 1 deletion(-) create mode 100644 examples/wenetspeech/asr1/conf/chunk_conformer.yaml create mode 100644 examples/wenetspeech/asr1/conf/tuning/chunk_decode.yaml diff --git a/examples/wenetspeech/asr1/conf/chunk_conformer.yaml b/examples/wenetspeech/asr1/conf/chunk_conformer.yaml new file mode 100644 index 000000000..69fa223a1 --- /dev/null +++ b/examples/wenetspeech/asr1/conf/chunk_conformer.yaml @@ -0,0 +1,99 @@ +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 512 # dimension of attention + attention_heads: 8 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + use_cnn_module: True + cnn_module_kernel: 15 + activation_type: swish + pos_enc_layer_type: rel_pos + selfattention_layer_type: rel_selfattn + causal: true + use_dynamic_chunk: true + cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster + use_dynamic_left_chunk: false +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 8 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 + +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false + init_type: 'kaiming_uniform' + +# https://yaml.org/type/float.html +########################################### +# Data # +########################################### +train_manifest: data/train_l/data.list +dev_manifest: data/dev/data.list +test_manifest: data/test_meeting/data.list + +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'char' +preprocess_config: conf/preprocess.yaml +spm_model_prefix: '' +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 32 +do_filter: True +maxlen_in: 1200 # if do_filter == False && input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 100 # if do_filter == False && output length > maxlen-out, batchsize is automatically reduced +minlen_in: 10 +minlen_out: 0 +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 + + +########################################### +# Training # +########################################### +n_epoch: 26 +accum_grad: 32 +global_grad_clip: 5.0 +dist_sampler: True +log_interval: 1 
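+# effective batch per update: batch_size (32) x accum_grad (32) = 1024
+# utterances per GPU, since gradients are accumulated over accum_grad steps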
+checkpoint: + kbest_n: 50 + latest_n: 5 +optim: adam +optim_conf: + lr: 0.001 + weight_decay: 1.0e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 5000 + lr_decay: 1.0 diff --git a/examples/wenetspeech/asr1/conf/preprocess.yaml b/examples/wenetspeech/asr1/conf/preprocess.yaml index f7f4c58d5..c7ccc522d 100644 --- a/examples/wenetspeech/asr1/conf/preprocess.yaml +++ b/examples/wenetspeech/asr1/conf/preprocess.yaml @@ -5,7 +5,7 @@ process: n_mels: 80 n_shift: 160 win_length: 400 - dither: 0.1 + dither: 1.0 - type: cmvn_json cmvn_path: data/mean_std.json # these three processes are a.k.a. SpecAugument diff --git a/examples/wenetspeech/asr1/conf/tuning/chunk_decode.yaml b/examples/wenetspeech/asr1/conf/tuning/chunk_decode.yaml new file mode 100644 index 000000000..7e8afb7a8 --- /dev/null +++ b/examples/wenetspeech/asr1/conf/tuning/chunk_decode.yaml @@ -0,0 +1,11 @@ +beam_size: 10 +decode_batch_size: 128 +error_rate_type: cer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: 16 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: True # simulate streaming inference. Defaults to False. From 8690a00bd8d66c7d1358a8ac370967ddb4bd1ec5 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 13 Sep 2022 09:54:48 +0000 Subject: [PATCH 013/113] add feature pipeline layer(cmvn, fbank), but to_static and jit.layer output is not equal --- paddlespeech/audio/compliance/kaldi.py | 22 +++---- paddlespeech/s2t/exps/u2/bin/test_wav.py | 3 + paddlespeech/s2t/exps/u2/model.py | 75 ++++++++++++++++-------- paddlespeech/s2t/models/u2/u2.py | 58 ++++++++++++++++++ paddlespeech/s2t/modules/cmvn.py | 10 +++- paddlespeech/s2t/modules/fbank.py | 74 +++++++++++++++++++++++ 6 files changed, 206 insertions(+), 36 deletions(-) create mode 100644 paddlespeech/s2t/modules/fbank.py diff --git a/paddlespeech/audio/compliance/kaldi.py b/paddlespeech/audio/compliance/kaldi.py index 538be0196..beb2d86b9 100644 --- a/paddlespeech/audio/compliance/kaldi.py +++ b/paddlespeech/audio/compliance/kaldi.py @@ -74,16 +74,16 @@ def _feature_window_function( window_size: int, blackman_coeff: float, dtype: int, ) -> Tensor: - if window_type == HANNING: + if window_type == "hann": return get_window('hann', window_size, fftbins=False, dtype=dtype) - elif window_type == HAMMING: + elif window_type == "hamming": return get_window('hamming', window_size, fftbins=False, dtype=dtype) - elif window_type == POVEY: + elif window_type == "povey": return get_window( 'hann', window_size, fftbins=False, dtype=dtype).pow(0.85) - elif window_type == RECTANGULAR: + elif window_type == "rect": return paddle.ones([window_size], dtype=dtype) - elif window_type == BLACKMAN: + elif window_type == "blackman": a = 2 * math.pi / (window_size - 1) window_function = paddle.arange(window_size, dtype=dtype) return (blackman_coeff - 0.5 * paddle.cos(a * window_function) + @@ -216,7 +216,7 @@ def spectrogram(waveform: Tensor, sr: int=16000, snip_edges: bool=True, subtract_mean: bool=False, - window_type: str=POVEY) -> Tensor: + window_type: str="povey") -> Tensor: """Compute and return a spectrogram from a waveform. The output is identical to Kaldi's. 
Args: @@ -236,7 +236,7 @@ def spectrogram(waveform: Tensor, snip_edges (bool, optional): Drop samples in the end of waveform that cann't fit a singal frame when it is set True. Otherwise performs reflect padding to the end of waveform. Defaults to True. subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False. - window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY. + window_type (str, optional): Choose type of window for FFT computation. Defaults to "povey". Returns: Tensor: A spectrogram tensor with shape `(m, padded_window_size // 2 + 1)` where m is the number of frames @@ -418,11 +418,11 @@ def fbank(waveform: Tensor, vtln_high: float=-500.0, vtln_low: float=100.0, vtln_warp: float=1.0, - window_type: str=POVEY) -> Tensor: + window_type: str="povey") -> Tensor: """Compute and return filter banks from a waveform. The output is identical to Kaldi's. Args: - waveform (Tensor): A waveform tensor with shape `(C, T)`. + waveform (Tensor): A waveform tensor with shape `(C, T)`. `C` is in the range [0,1]. blackman_coeff (float, optional): Coefficient for Blackman window.. Defaults to 0.42. channel (int, optional): Select the channel of waveform. Defaults to -1. dither (float, optional): Dithering constant . Defaults to 0.0. @@ -448,7 +448,7 @@ def fbank(waveform: Tensor, vtln_high (float, optional): High inflection point in piecewise linear VTLN warping function. Defaults to -500.0. vtln_low (float, optional): Low inflection point in piecewise linear VTLN warping function. Defaults to 100.0. vtln_warp (float, optional): Vtln warp factor. Defaults to 1.0. - window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY. + window_type (str, optional): Choose type of window for FFT computation. Defaults to "povey". Returns: Tensor: A filter banks tensor with shape `(m, n_mels)`. @@ -537,7 +537,7 @@ def mfcc(waveform: Tensor, vtln_high: float=-500.0, vtln_low: float=100.0, vtln_warp: float=1.0, - window_type: str=POVEY) -> Tensor: + window_type: str="povey") -> Tensor: """Compute and return mel frequency cepstral coefficients from a waveform. The output is identical to Kaldi's. 
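
# Illustrative sketch (not part of the patch): exercising the new string-valued
# window_type options end to end. The random 16 kHz waveform and n_mels=80 are
# assumptions for this example, not values taken from the diff.
import paddle
from paddlespeech.audio.compliance import kaldi

wav = paddle.rand([1, 16000])  # (C, T) mono waveform, values in [0, 1]
for win in ["hann", "hamming", "povey", "rect", "blackman"]:
    feat = kaldi.fbank(wav, n_mels=80, window_type=win)
    print(win, feat.shape)  # (num_frames, n_mels)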
diff --git a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py index 887ec7a6d..c04e3ae47 100644 --- a/paddlespeech/s2t/exps/u2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py @@ -18,6 +18,7 @@ from pathlib import Path import paddle import soundfile +import numpy as np from yacs.config import CfgNode from paddlespeech.audio.transform.transformation import Transformation @@ -77,6 +78,8 @@ class U2Infer(): feat = self.preprocessing(audio, **self.preprocess_args) logger.info(f"feat shape: {feat.shape}") + np.savetxt("feat.transform.txt", feat) + ilen = paddle.to_tensor(feat.shape[0]) xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(axis=0) decode_config = self.config.decode diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index dae618db6..ee4df9cb9 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -474,13 +474,20 @@ class U2Tester(U2Trainer): def export(self): infer_model, input_spec = self.load_inferspec() infer_model.eval() + paddle.set_device('cpu') - assert isinstance(input_spec, list), type(input_spec) + assert isinstance(input_spec, (list, tuple)), type(input_spec) batch_size, feat_dim, model_size, num_left_chunks = input_spec - ######################### infer_model.forward_encoder_chunk zero tensor online ############ - # TODO: 80(feature dim) be configable + ######################## infer_model.forward_encoder_chunk ############ + input_spec = [ + # (T,), int16 + paddle.static.InputSpec(shape=[None], dtype='int16'), + ] + infer_model.forward_feature = paddle.jit.to_static(infer_model.forward_feature, input_spec=input_spec) + + ######################### infer_model.forward_encoder_chunk ############ input_spec = [ # xs, (B, T, D) paddle.static.InputSpec(shape=[batch_size, None, feat_dim], dtype='float32'), @@ -499,8 +506,16 @@ class U2Tester(U2Trainer): infer_model.forward_encoder_chunk = paddle.jit.to_static( infer_model.forward_encoder_chunk, input_spec=input_spec) + ######################### infer_model.ctc_activation ######################## + input_spec = [ + # encoder_out, (B,T,D) + paddle.static.InputSpec(shape=[batch_size, None, model_size], dtype='float32') + ] + infer_model.ctc_activation = paddle.jit.to_static( + infer_model.ctc_activation, input_spec=input_spec) + + ######################### infer_model.forward_attention_decoder ######################## - # TODO: 512(encoder_output) be configable. 
1 for BatchSize input_spec = [ # hyps, (B, U) paddle.static.InputSpec(shape=[None, None], dtype='int64'), @@ -512,17 +527,11 @@ class U2Tester(U2Trainer): infer_model.forward_attention_decoder = paddle.jit.to_static( infer_model.forward_attention_decoder, input_spec=input_spec) - ######################### infer_model.ctc_activation ######################## - input_spec = [ - # encoder_out, (B,T,D) - paddle.static.InputSpec(shape=[batch_size, None, model_size], dtype='float32') - ] - infer_model.ctc_activation = paddle.jit.to_static( - infer_model.ctc_activation, input_spec=input_spec) - # jit save + logger.info(f"export save: {self.args.export_path}") paddle.jit.save(infer_model, self.args.export_path, combine_params=True, skip_forward=True) + # test dy2static def flatten(out): if isinstance(out, paddle.Tensor): @@ -536,26 +545,44 @@ class U2Tester(U2Trainer): flatten_out.append(var) return flatten_out - xs1 = paddle.rand(shape=[1, 67, 80], dtype='float32') + # forward_encoder_chunk dygraph + xs1 = paddle.full([1, 67, 80], 0.1, dtype='float32') offset = paddle.to_tensor([0], dtype='int32') required_cache_size = num_left_chunks att_cache = paddle.zeros([0, 0, 0, 0]) cnn_cache = paddle.zeros([0, 0, 0, 0]) - - xs, att_cache, cnn_cache = infer_model.forward_encoder_chunk(xs1, offset, required_cache_size, att_cache, cnn_cache) - xs2 = paddle.rand(shape=[1, 67, 80], dtype='float32') - offset = paddle.to_tensor([16], dtype='int32') - out1 = infer_model.forward_encoder_chunk(xs2, offset, required_cache_size, att_cache, cnn_cache) - print('py encoder', out1) - + xs_d, att_cache_d, cnn_cache_d = infer_model.forward_encoder_chunk(xs1, offset, required_cache_size, att_cache, cnn_cache) + + import soundfile + audio, sample_rate = soundfile.read( + './zh.wav', dtype="int16", always_2d=True) + audio = audio[:, 0] + logger.info(f"audio shape: {audio.shape}") + audio = paddle.to_tensor(audio, paddle.int16) + feat_d = infer_model.forward_feature(audio) + logger.info(f"{feat_d}") + np.savetxt("feat.tostatic.txt", feat_d) + + + # load static model from paddle.jit.layer import Layer layer = Layer() layer.load(self.args.export_path, paddle.CPUPlace()) - xs1 = paddle.full([1, 7, 80], 0.1, dtype='float32') + # forward_encoder_chunk static + xs1 = paddle.full([1, 67, 80], 0.1, dtype='float32') offset = paddle.to_tensor([0], dtype='int32') att_cache = paddle.zeros([0, 0, 0, 0]) - cnn_cache=paddle.zeros([0, 0, 0, 0]) + cnn_cache = paddle.zeros([0, 0, 0, 0]) func = getattr(layer, 'forward_encoder_chunk') - xs, att_cache, cnn_cache = func(xs1, offset, att_cache, cnn_cache) - print('py static encoder', xs) + xs_s, att_cache_s, cnn_cache_s = func(xs1, offset, att_cache, cnn_cache) + np.testing.assert_allclose(xs_d, xs_s, atol=1e-5) + np.testing.assert_allclose(att_cache_d, att_cache_s, atol=1e-4) + np.testing.assert_allclose(cnn_cache_d, cnn_cache_s, atol=1e-4) + # logger.info(f"forward_encoder_chunk output: {xs_s}") + + # forward_feature static + func = getattr(layer, 'forward_feature') + feat_s = func(audio)[0] + logger.info(f"{feat_s}") + np.testing.assert_allclose(feat_d, feat_s, atol=1e-5) diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 149170ed6..d7b8630a3 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -916,6 +916,50 @@ class U2InferModel(U2Model): def __init__(self, configs: dict): super().__init__(configs) + from paddlespeech.s2t.modules.fbank import KaldiFbank + import yaml + import json + import numpy as np + + input_dim = 
configs['input_dim']
+        process = configs['preprocess_config']
+        with open(process, encoding="utf-8") as f:
+            conf = yaml.safe_load(f)
+            assert isinstance(conf, dict), type(conf)
+
+        for idx, process in enumerate(conf['process']):
+            assert isinstance(process, dict), type(process)
+            opts = dict(process)
+            process_type = opts.pop("type")
+
+            if process_type == 'fbank_kaldi':
+                opts.update({'n_mels': input_dim})
+                opts['dither'] = 0.0
+                self.fbank = KaldiFbank(
+                    **opts
+                )
+                logger.info(f"{self.__class__.__name__} export: {self.fbank}")
+            if process_type == 'cmvn_json':
+                # align with paddlespeech.audio.transform.cmvn:GlobalCMVN
+                std_floor = 1.0e-20
+
+                cmvn = opts['cmvn_path']
+                if isinstance(cmvn, dict):
+                    cmvn_stats = cmvn
+                else:
+                    with open(cmvn) as f:
+                        cmvn_stats = json.load(f)
+                count = cmvn_stats['frame_num']
+                mean = np.array(cmvn_stats['mean_stat']) / count
+                square_sums = np.array(cmvn_stats['var_stat'])
+                var = square_sums / count - mean**2
+                std = np.maximum(np.sqrt(var), std_floor)
+                istd = 1.0 / std
+                self.global_cmvn = GlobalCMVN(
+                    paddle.to_tensor(mean, dtype=paddle.float),
+                    paddle.to_tensor(istd, dtype=paddle.float))
+                logger.info(f"{self.__class__.__name__} export: {self.global_cmvn}")
+
     def forward(self,
                 feats,
                 feats_lengths,
@@ -939,3 +983,17 @@ class U2InferModel(U2Model):
         #     num_decoding_left_chunks=num_decoding_left_chunks,
         #     simulate_streaming=simulate_streaming)
         return feats, feats_lengths
+
+    def forward_feature(self, x):
+        """feature pipeline.
+
+        Args:
+            x (paddle.Tensor): waveform (T,).
+
+        Return:
+            feat (paddle.Tensor): feature (T, D)
+        """
+        x = paddle.cast(x, paddle.float32)
+        feat = self.fbank(x)
+        feat = self.global_cmvn(feat)
+        return feat
\ No newline at end of file
diff --git a/paddlespeech/s2t/modules/cmvn.py b/paddlespeech/s2t/modules/cmvn.py
index 67f71b667..53c508f1a 100644
--- a/paddlespeech/s2t/modules/cmvn.py
+++ b/paddlespeech/s2t/modules/cmvn.py
@@ -40,6 +40,14 @@ class GlobalCMVN(nn.Layer):
         self.register_buffer("mean", mean)
         self.register_buffer("istd", istd)
 
+    def __repr__(self):
+        return (
+            "{name}(mean={mean}, istd={istd}, norm_var={norm_var})".format(
+                name=self.__class__.__name__,
+                mean=self.mean,
+                istd=self.istd,
+                norm_var=self.norm_var))
+
     def forward(self, x: paddle.Tensor):
         """
         Args:
@@ -50,4 +58,4 @@ class GlobalCMVN(nn.Layer):
         x = x - self.mean
         if self.norm_var:
             x = x * self.istd
-        return x
+        return x
\ No newline at end of file
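(Editor's note: a minimal standalone sketch of the statistics-to-(mean, istd) conversion implemented in `U2InferModel.__init__` above; the JSON keys follow the code, the numbers are invented:)

```python
import numpy as np

cmvn_stats = {
    'frame_num': 4,            # total frame count
    'mean_stat': [4.0, 8.0],   # per-dimension sums of features
    'var_stat': [8.0, 24.0],   # per-dimension sums of squared features
}
std_floor = 1.0e-20
count = cmvn_stats['frame_num']
mean = np.array(cmvn_stats['mean_stat']) / count           # E[x]
var = np.array(cmvn_stats['var_stat']) / count - mean**2   # E[x^2] - E[x]^2
istd = 1.0 / np.maximum(np.sqrt(var), std_floor)

x = np.array([[1.0, 2.0], [3.0, 4.0]])   # (T, D) features
normalized = (x - mean) * istd           # what GlobalCMVN.forward computes
```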
diff --git a/paddlespeech/s2t/modules/fbank.py b/paddlespeech/s2t/modules/fbank.py
new file mode 100644
index 000000000..4ec620a79
--- /dev/null
+++ b/paddlespeech/s2t/modules/fbank.py
@@ -0,0 +1,74 @@
+
+
+
+import paddle
+from paddle import nn
+
+from paddlespeech.audio.compliance import kaldi
+
+from paddlespeech.s2t.utils.log import Log
+
+logger = Log(__name__).getlog()
+
+__all__ = ['KaldiFbank']
+
+class KaldiFbank(nn.Layer):
+    def __init__(self,
+            fs=16000,
+            n_mels=80,
+            n_shift=160, # unit:sample, 10ms
+            win_length=400, # unit:sample, 25ms
+            energy_floor=0.0,
+            dither=0.0):
+        """
+        Args:
+            fs (int): sample rate of the audio
+            n_mels (int): number of mel filter banks
+            n_shift (int): number of points in a frame shift
+            win_length (int): number of points in a frame window
+            energy_floor (float): Floor on energy in Spectrogram computation (absolute)
+            dither (float): Dithering constant. Default 0.0
+        """
+        super().__init__()
+        self.fs = fs
+        self.n_mels = n_mels
+        num_point_ms = fs / 1000
+        self.n_frame_length = win_length / num_point_ms
+        self.n_frame_shift = n_shift / num_point_ms
+        self.energy_floor = energy_floor
+        self.dither = dither
+
+    def __repr__(self):
+        return (
+            "{name}(fs={fs}, n_mels={n_mels}, "
+            "n_frame_shift={n_frame_shift}, n_frame_length={n_frame_length}, "
+            "dither={dither})".format(
+                name=self.__class__.__name__,
+                fs=self.fs,
+                n_mels=self.n_mels,
+                n_frame_shift=self.n_frame_shift,
+                n_frame_length=self.n_frame_length,
+                dither=self.dither, ))
+
+    def forward(self, x: paddle.Tensor):
+        """
+        Args:
+            x (paddle.Tensor): shape (Ti).
+                Does not support [Time, Channel] input or batch mode.
+
+        Returns:
+            paddle.Tensor: (T, D)
+        """
+        assert x.ndim == 1
+
+        feat = kaldi.fbank(
+            x.unsqueeze(0), # append channel dim, (C, Ti)
+            n_mels=self.n_mels,
+            frame_length=self.n_frame_length,
+            frame_shift=self.n_frame_shift,
+            dither=self.dither,
+            energy_floor=self.energy_floor,
+            sr=self.fs)
+
+        assert feat.ndim == 2 # (T,D)
+        return feat
From 0d7d87120b79b71259a2d42c8a33f0e93adf67ee Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Wed, 14 Sep 2022 16:44:12 +0000
Subject: [PATCH 014/113] simplify feature pipeline graph

---
 paddlespeech/audio/compliance/kaldi.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/paddlespeech/audio/compliance/kaldi.py b/paddlespeech/audio/compliance/kaldi.py
index beb2d86b9..24415058c 100644
--- a/paddlespeech/audio/compliance/kaldi.py
+++ b/paddlespeech/audio/compliance/kaldi.py
@@ -357,10 +357,13 @@ def _get_mel_banks(num_bins: int,
         ('Bad values in options: vtln-low {} and vtln-high {}, versus '
          'low-freq {} and high-freq {}'.format(vtln_low, vtln_high, low_freq,
                                                high_freq))
-    bin = paddle.arange(num_bins).unsqueeze(1)
+    bin = paddle.arange(num_bins, dtype=paddle.float32).unsqueeze(1)
+    # left_mel = mel_low_freq + bin * mel_freq_delta  # (num_bins, 1)
+    # center_mel = mel_low_freq + (bin + 1.0) * mel_freq_delta  # (num_bins, 1)
+    # right_mel = mel_low_freq + (bin + 2.0) * mel_freq_delta  # (num_bins, 1)
     left_mel = mel_low_freq + bin * mel_freq_delta  # (num_bins, 1)
-    center_mel = mel_low_freq + (bin + 1.0) * mel_freq_delta  # (num_bins, 1)
-    right_mel = mel_low_freq + (bin + 2.0) * mel_freq_delta  # (num_bins, 1)
+    center_mel = left_mel + mel_freq_delta
+    right_mel = center_mel + mel_freq_delta
 
     if vtln_warp_factor != 1.0:
         left_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq,
@@ -373,7 +376,7 @@ def _get_mel_banks(num_bins: int,
     center_freqs = _inverse_mel_scale(center_mel)  # (num_bins)
 
     # (1, num_fft_bins)
-    mel = _mel_scale(fft_bin_width * paddle.arange(num_fft_bins)).unsqueeze(0)
+    mel = _mel_scale(fft_bin_width * paddle.arange(num_fft_bins, dtype=paddle.float32)).unsqueeze(0)
 
     # (num_bins, num_fft_bins)
     up_slope = (mel - left_mel) / (center_mel - left_mel)
@@ -472,7 +475,8 @@ def fbank(waveform: Tensor,
     # (n_mels, padded_window_size // 2)
     mel_energies, _ = _get_mel_banks(n_mels, padded_window_size, sr, low_freq,
                                      high_freq, vtln_low, vtln_high, vtln_warp)
-    mel_energies = mel_energies.astype(dtype)
+    # mel_energies = mel_energies.astype(dtype)
+    assert mel_energies.dtype == dtype
 
     # (n_mels, padded_window_size // 2 + 1)
     mel_energies = paddle.nn.functional.pad(
From 260752aa2a3284a37c06b88da2fef3b6d0118280 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Mon, 19 Sep 2022 14:10:16 +0000
Subject: [PATCH 015/113] using forward_attention_decoder

---
 paddlespeech/s2t/exps/u2/bin/test_wav.py |  8 
+++----- paddlespeech/s2t/models/u2/u2.py | 14 ++++++-------- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py index c04e3ae47..a55a1eca0 100644 --- a/paddlespeech/s2t/exps/u2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py @@ -69,8 +69,7 @@ class U2Infer(): with paddle.no_grad(): # read audio, sample_rate = soundfile.read( - self.audio_file, dtype="int16", always_2d=True) - + self.audio_file, dtype="int16", always_2d=True) audio = audio[:, 0] logger.info(f"audio shape: {audio.shape}") @@ -78,11 +77,10 @@ class U2Infer(): feat = self.preprocessing(audio, **self.preprocess_args) logger.info(f"feat shape: {feat.shape}") - np.savetxt("feat.transform.txt", feat) - ilen = paddle.to_tensor(feat.shape[0]) - xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(axis=0) + xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(0) decode_config = self.config.decode + logger.debug(f"decode cfg: {decode_config}") result_transcripts = self.model.decode( xs, ilen, diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index d7b8630a3..b4ec6b033 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -545,17 +545,11 @@ class U2BaseModel(ASRInterface, nn.Layer): [len(hyp[0]) for hyp in hyps], place=device, dtype=paddle.long) # (beam_size,) hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) + logger.debug(f"hyps pad: {hyps_pad} {self.sos} {self.eos} {self.ignore_id}") hyps_lens = hyps_lens + 1 # Add at begining - encoder_out = encoder_out.repeat(beam_size, 1, 1) - encoder_mask = paddle.ones( - (beam_size, 1, encoder_out.shape[1]), dtype=paddle.bool) - decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, - hyps_lens) # (beam_size, max_hyps_len, vocab_size) # ctc score in ln domain - decoder_out = paddle.nn.functional.log_softmax(decoder_out, axis=-1) - decoder_out = decoder_out.numpy() + decoder_out = self.forward_attention_decoder(hyps_pad, hyps_lens, encoder_out) # Only use decoder score for rescoring best_score = -float('inf') @@ -567,11 +561,15 @@ class U2BaseModel(ASRInterface, nn.Layer): score += decoder_out[i][j][w] # last decoder output token is `eos`, for laste decoder input token. 
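             # NOTE(editor): decoder_out has one extra time step; index
             # len(hyp[0]) holds the distribution predicted after the last
             # real token, so its <eos> log-prob closes out the hypothesis.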
            score += decoder_out[i][len(hyp[0])][self.eos]
+            logger.debug(f"hyp {i} len {len(hyp[0])} l2r rescore_score: {score} ctc_score: {hyp[1]}")
+
             # add ctc score (which in ln domain)
             score += hyp[1] * ctc_weight
             if score > best_score:
                 best_score = score
                 best_index = i
+
+        logger.debug(f"result: {hyps[best_index]}")
         return hyps[best_index][0]
 
     @jit.to_static(property=True)
From 4d5cfd400386bcd5be8729f8b3e1dfc5bae8365c Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Tue, 20 Sep 2022 03:23:50 +0000
Subject: [PATCH 016/113] export param from config

---
 paddlespeech/s2t/exps/u2/model.py | 24 ++++--------------------
 1 file changed, 4 insertions(+), 20 deletions(-)

diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py
index ee4df9cb9..2b70f117b 100644
--- a/paddlespeech/s2t/exps/u2/model.py
+++ b/paddlespeech/s2t/exps/u2/model.py
@@ -462,13 +462,13 @@ class U2Tester(U2Trainer):
         infer_model = U2InferModel.from_pretrained(self.test_loader,
                                                    self.config.clone(),
                                                    self.args.checkpoint_path)
-
         batch_size = 1
         feat_dim = self.test_loader.feat_dim
-        model_size = 512
+        model_size = self.config.encoder_conf.output_size
         num_left_chunks = -1
+        logger.info(f"U2 Export Model Params: batch_size {batch_size}, feat_dim {feat_dim}, model_size {model_size}, num_left_chunks {num_left_chunks}")
 
-        return infer_model, (batch_size, feat_dim, model_size, num_left_chunks)
+        return infer_model, (batch_size, feat_dim, model_size, num_left_chunks)
 
     @paddle.no_grad()
     def export(self):
@@ -553,20 +553,10 @@ class U2Tester(U2Trainer):
         cnn_cache = paddle.zeros([0, 0, 0, 0])
         xs_d, att_cache_d, cnn_cache_d = infer_model.forward_encoder_chunk(xs1, offset, required_cache_size, att_cache, cnn_cache)
 
-        import soundfile
-        audio, sample_rate = soundfile.read(
-            './zh.wav', dtype="int16", always_2d=True)
-        audio = audio[:, 0]
-        logger.info(f"audio shape: {audio.shape}")
-        audio = paddle.to_tensor(audio, paddle.int16)
-        feat_d = infer_model.forward_feature(audio)
-        logger.info(f"{feat_d}")
-        np.savetxt("feat.tostatic.txt", feat_d)
-
-
         # load static model
         from paddle.jit.layer import Layer
         layer = Layer()
+        logger.info(f"load export model: {self.args.export_path}")
         layer.load(self.args.export_path, paddle.CPUPlace())
 
         # forward_encoder_chunk static
@@ -580,9 +570,3 @@ class U2Tester(U2Trainer):
         np.testing.assert_allclose(att_cache_d, att_cache_s, atol=1e-4)
         np.testing.assert_allclose(cnn_cache_d, cnn_cache_s, atol=1e-4)
         # logger.info(f"forward_encoder_chunk output: {xs_s}")
-
-        # forward_feature static
-        func = getattr(layer, 'forward_feature')
-        feat_s = func(audio)[0]
-        logger.info(f"{feat_s}")
-        np.testing.assert_allclose(feat_d, feat_s, atol=1e-5)
From 549d477592fbba8533c9e6a3e573918bdf9ca82a Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Tue, 20 Sep 2022 03:27:33 +0000
Subject: [PATCH 017/113] fix code style

---
 paddlespeech/s2t/exps/u2/bin/test_wav.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py
index a55a1eca0..e01d0e401 100644
--- a/paddlespeech/s2t/exps/u2/bin/test_wav.py
+++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py
@@ -18,7 +18,6 @@ from pathlib import Path
 
 import paddle
 import soundfile
-import numpy as np
 from yacs.config import CfgNode
 
 from paddlespeech.audio.transform.transformation import Transformation
From 53d6baff0be0e2e1d64c6b6b5772d064c24c2bf3 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Tue, 20 Sep 2022 03:33:35 +0000
Subject: [PATCH 018/113] format

---
 paddlespeech/audio/compliance/kaldi.py        |  3 +-
paddlespeech/s2t/exps/u2/bin/test_wav.py | 2 +- paddlespeech/s2t/exps/u2/model.py | 37 +++++++++++-------- paddlespeech/s2t/models/u2/u2.py | 19 ++++++---- paddlespeech/s2t/modules/cmvn.py | 13 +++---- paddlespeech/s2t/modules/encoder.py | 9 +++-- paddlespeech/s2t/modules/fbank.py | 12 +++--- .../engine/asr/online/python/asr_engine.py | 1 - 8 files changed, 52 insertions(+), 44 deletions(-) diff --git a/paddlespeech/audio/compliance/kaldi.py b/paddlespeech/audio/compliance/kaldi.py index 24415058c..eb92ec1f2 100644 --- a/paddlespeech/audio/compliance/kaldi.py +++ b/paddlespeech/audio/compliance/kaldi.py @@ -376,7 +376,8 @@ def _get_mel_banks(num_bins: int, center_freqs = _inverse_mel_scale(center_mel) # (num_bins) # (1, num_fft_bins) - mel = _mel_scale(fft_bin_width * paddle.arange(num_fft_bins, dtype=paddle.float32)).unsqueeze(0) + mel = _mel_scale(fft_bin_width * paddle.arange( + num_fft_bins, dtype=paddle.float32)).unsqueeze(0) # (num_bins, num_fft_bins) up_slope = (mel - left_mel) / (center_mel - left_mel) diff --git a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py index e01d0e401..ccf44d6b4 100644 --- a/paddlespeech/s2t/exps/u2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py @@ -68,7 +68,7 @@ class U2Infer(): with paddle.no_grad(): # read audio, sample_rate = soundfile.read( - self.audio_file, dtype="int16", always_2d=True) + self.audio_file, dtype="int16", always_2d=True) audio = audio[:, 0] logger.info(f"audio shape: {audio.shape}") diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index 2b70f117b..68354ff68 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -462,11 +462,13 @@ class U2Tester(U2Trainer): infer_model = U2InferModel.from_pretrained(self.test_loader, self.config.clone(), self.args.checkpoint_path) - batch_size = 1 + batch_size = 1 feat_dim = self.test_loader.feat_dim model_size = self.config.encoder_conf.output_size num_left_chunks = -1 - logger.info(f"U2 Export Model Params: batch_size {batch_size}, feat_dim {feat_dim}, model_size {model_size}, num_left_chunks {num_left_chunks}") + logger.info( + f"U2 Export Model Params: batch_size {batch_size}, feat_dim {feat_dim}, model_size {model_size}, num_left_chunks {num_left_chunks}" + ) return infer_model, (batch_size, feat_dim, model_size, num_left_chunks) @@ -479,29 +481,29 @@ class U2Tester(U2Trainer): assert isinstance(input_spec, (list, tuple)), type(input_spec) batch_size, feat_dim, model_size, num_left_chunks = input_spec - ######################## infer_model.forward_encoder_chunk ############ input_spec = [ # (T,), int16 paddle.static.InputSpec(shape=[None], dtype='int16'), ] - infer_model.forward_feature = paddle.jit.to_static(infer_model.forward_feature, input_spec=input_spec) + infer_model.forward_feature = paddle.jit.to_static( + infer_model.forward_feature, input_spec=input_spec) ######################### infer_model.forward_encoder_chunk ############ input_spec = [ # xs, (B, T, D) - paddle.static.InputSpec(shape=[batch_size, None, feat_dim], dtype='float32'), + paddle.static.InputSpec( + shape=[batch_size, None, feat_dim], dtype='float32'), # offset, int, but need be tensor - paddle.static.InputSpec(shape=[1], dtype='int32'), + paddle.static.InputSpec(shape=[1], dtype='int32'), # required_cache_size, int num_left_chunks, # att_cache paddle.static.InputSpec( - shape=[None, None, None, None], - dtype='float32'), + shape=[None, None, None, None], dtype='float32'), # cnn_cache 
paddle.static.InputSpec( - shape=[None, None, None, None], dtype='float32') + shape=[None, None, None, None], dtype='float32') ] infer_model.forward_encoder_chunk = paddle.jit.to_static( infer_model.forward_encoder_chunk, input_spec=input_spec) @@ -509,12 +511,12 @@ class U2Tester(U2Trainer): ######################### infer_model.ctc_activation ######################## input_spec = [ # encoder_out, (B,T,D) - paddle.static.InputSpec(shape=[batch_size, None, model_size], dtype='float32') + paddle.static.InputSpec( + shape=[batch_size, None, model_size], dtype='float32') ] infer_model.ctc_activation = paddle.jit.to_static( infer_model.ctc_activation, input_spec=input_spec) - ######################### infer_model.forward_attention_decoder ######################## input_spec = [ # hyps, (B, U) @@ -522,15 +524,19 @@ class U2Tester(U2Trainer): # hyps_lens, (B,) paddle.static.InputSpec(shape=[None], dtype='int64'), # encoder_out, (B,T,D) - paddle.static.InputSpec(shape=[batch_size, None, model_size], dtype='float32') + paddle.static.InputSpec( + shape=[batch_size, None, model_size], dtype='float32') ] infer_model.forward_attention_decoder = paddle.jit.to_static( infer_model.forward_attention_decoder, input_spec=input_spec) # jit save logger.info(f"export save: {self.args.export_path}") - paddle.jit.save(infer_model, self.args.export_path, combine_params=True, skip_forward=True) - + paddle.jit.save( + infer_model, + self.args.export_path, + combine_params=True, + skip_forward=True) # test dy2static def flatten(out): @@ -551,7 +557,8 @@ class U2Tester(U2Trainer): required_cache_size = num_left_chunks att_cache = paddle.zeros([0, 0, 0, 0]) cnn_cache = paddle.zeros([0, 0, 0, 0]) - xs_d, att_cache_d, cnn_cache_d = infer_model.forward_encoder_chunk(xs1, offset, required_cache_size, att_cache, cnn_cache) + xs_d, att_cache_d, cnn_cache_d = infer_model.forward_encoder_chunk( + xs1, offset, required_cache_size, att_cache, cnn_cache) # load static model from paddle.jit.layer import Layer diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 135045aaa..32d0940d9 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -545,11 +545,13 @@ class U2BaseModel(ASRInterface, nn.Layer): [len(hyp[0]) for hyp in hyps], place=device, dtype=paddle.long) # (beam_size,) hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - logger.debug(f"hyps pad: {hyps_pad} {self.sos} {self.eos} {self.ignore_id}") + logger.debug( + f"hyps pad: {hyps_pad} {self.sos} {self.eos} {self.ignore_id}") hyps_lens = hyps_lens + 1 # Add at begining # ctc score in ln domain - decoder_out = self.forward_attention_decoder(hyps_pad, hyps_lens, encoder_out) + decoder_out = self.forward_attention_decoder(hyps_pad, hyps_lens, + encoder_out) # Only use decoder score for rescoring best_score = -float('inf') @@ -561,7 +563,9 @@ class U2BaseModel(ASRInterface, nn.Layer): score += decoder_out[i][j][w] # last decoder output token is `eos`, for laste decoder input token. 
score += decoder_out[i][len(hyp[0])][self.eos] - logger.debug(f"hyp {i} len {len(hyp[0])} l2r rescore_score: {score} ctc_score: {hyp[1]}") + logger.debug( + f"hyp {i} len {len(hyp[0])} l2r rescore_score: {score} ctc_score: {hyp[1]}" + ) # add ctc score (which in ln domain) score += hyp[1] * ctc_weight @@ -933,9 +937,7 @@ class U2InferModel(U2Model): if process_type == 'fbank_kaldi': opts.update({'n_mels': input_dim}) opts['dither'] = 0.0 - self.fbank = KaldiFbank( - **opts - ) + self.fbank = KaldiFbank(**opts) logger.info(f"{self.__class__.__name__} export: {self.fbank}") if process_type == 'cmvn_json': # align with paddlespeech.audio.transform.cmvn:GlobalCMVN @@ -956,7 +958,8 @@ class U2InferModel(U2Model): self.global_cmvn = GlobalCMVN( paddle.to_tensor(mean, dtype=paddle.float), paddle.to_tensor(istd, dtype=paddle.float)) - logger.info(f"{self.__class__.__name__} export: {self.global_cmvn}") + logger.info( + f"{self.__class__.__name__} export: {self.global_cmvn}") def forward(self, feats, @@ -994,4 +997,4 @@ class U2InferModel(U2Model): x = paddle.cast(x, paddle.float32) feat = self.fbank(x) feat = self.global_cmvn(feat) - return feat \ No newline at end of file + return feat diff --git a/paddlespeech/s2t/modules/cmvn.py b/paddlespeech/s2t/modules/cmvn.py index 53c508f1a..6a8c1660c 100644 --- a/paddlespeech/s2t/modules/cmvn.py +++ b/paddlespeech/s2t/modules/cmvn.py @@ -41,12 +41,11 @@ class GlobalCMVN(nn.Layer): self.register_buffer("istd", istd) def __repr__(self): - return ( - "{name}(mean={mean}, istd={istd}, norm_var={norm_var})".format( - name=self.__class__.__name__, - mean=self.mean, - istd=self.istd, - norm_var=self.norm_var)) + return ("{name}(mean={mean}, istd={istd}, norm_var={norm_var})".format( + name=self.__class__.__name__, + mean=self.mean, + istd=self.istd, + norm_var=self.norm_var)) def forward(self, x: paddle.Tensor): """ @@ -58,4 +57,4 @@ class GlobalCMVN(nn.Layer): x = x - self.mean if self.norm_var: x = x * self.istd - return x \ No newline at end of file + return x diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py index 458921b5a..87b83ef55 100644 --- a/paddlespeech/s2t/modules/encoder.py +++ b/paddlespeech/s2t/modules/encoder.py @@ -256,10 +256,11 @@ class BaseEncoder(nn.Layer): # att_cache=att_cache[i:i+1] if elayers > 0 else att_cache, # cnn_cache=cnn_cache[i:i+1] if paddle.shape(cnn_cache)[0] > 0 else cnn_cache, xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i:i+1], - cnn_cache=cnn_cache[i:i+1], - ) + xs, + att_mask, + pos_emb, + att_cache=att_cache[i:i + 1], + cnn_cache=cnn_cache[i:i + 1], ) # new_att_cache = (1, head, attention_key_size, d_k*2) # new_cnn_cache = (B=1, hidden-dim, cache_t2) r_att_cache.append(new_att_cache[:, :, next_cache_start:, :]) diff --git a/paddlespeech/s2t/modules/fbank.py b/paddlespeech/s2t/modules/fbank.py index 4ec620a79..8d76a4727 100644 --- a/paddlespeech/s2t/modules/fbank.py +++ b/paddlespeech/s2t/modules/fbank.py @@ -1,19 +1,17 @@ - - - import paddle from paddle import nn from paddlespeech.audio.compliance import kaldi - from paddlespeech.s2t.utils.log import Log logger = Log(__name__).getlog() __all__ = ['KaldiFbank'] + class KaldiFbank(nn.Layer): - def __init__(self, + def __init__( + self, fs=16000, n_mels=80, n_shift=160, # unit:sample, 10ms @@ -62,7 +60,7 @@ class KaldiFbank(nn.Layer): assert x.ndim == 1 feat = kaldi.fbank( - x.unsqueeze(0), # append channel dim, (C, Ti) + x.unsqueeze(0), # append channel dim, (C, Ti) n_mels=self.n_mels, 
frame_length=self.n_frame_length, frame_shift=self.n_frame_shift, @@ -70,5 +68,5 @@ class KaldiFbank(nn.Layer): energy_floor=self.energy_floor, sr=self.fs) - assert feat.ndim == 2 # (T,D) + assert feat.ndim == 2 # (T,D) return feat diff --git a/paddlespeech/server/engine/asr/online/python/asr_engine.py b/paddlespeech/server/engine/asr/online/python/asr_engine.py index 1dc970891..5782d7035 100644 --- a/paddlespeech/server/engine/asr/online/python/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/python/asr_engine.py @@ -80,7 +80,6 @@ class PaddleASRConnectionHanddler: self.init_decoder() self.reset() - def init_decoder(self): if "deepspeech2" in self.model_type: assert self.continuous_decoding is False, "ds2 model not support endpoint" From a02654660a270478a4d405a312dce4e090d17a76 Mon Sep 17 00:00:00 2001 From: Zhao Yuting <91456992+THUzyt21@users.noreply.github.com> Date: Tue, 20 Sep 2022 16:20:05 +0800 Subject: [PATCH 019/113] Update pretrained_models.py Add a new model for faster text process --- paddlespeech/resource/pretrained_models.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/paddlespeech/resource/pretrained_models.py b/paddlespeech/resource/pretrained_models.py index f049879a3..0a1ed15e1 100644 --- a/paddlespeech/resource/pretrained_models.py +++ b/paddlespeech/resource/pretrained_models.py @@ -529,7 +529,7 @@ text_dynamic_pretrained_models = { 'ckpt/model_state.pdparams', 'vocab_file': 'punc_vocab.txt', - }, + } }, "ernie_linear_p3_wudao-punc-zh": { '1.0': { @@ -543,10 +543,26 @@ text_dynamic_pretrained_models = { 'ckpt/model_state.pdparams', 'vocab_file': 'punc_vocab.txt', - }, + } }, + "ernie_linear_p3_wudao_fast-punc-zh": { + '1.0':{ + 'url': + 'https://paddlespeech.bj.bcebos.com/text/ernie_linear_p3_wudao_fast-punc-zh.tar.gz', + 'md5': + 'c93f9594119541a5dbd763381a751d08', + 'cfg_path': + 'ckpt/model_config.json', + 'ckpt_path': + 'ckpt/model_state.pdparams', + 'vocab_file': + 'punc_vocab.txt', + } + } } + + # --------------------------------- # -------------- TTS -------------- # --------------------------------- From b627666ce9fde479793e492a063d6c977f12cf60 Mon Sep 17 00:00:00 2001 From: Zhao Yuting <91456992+THUzyt21@users.noreply.github.com> Date: Tue, 20 Sep 2022 16:22:32 +0800 Subject: [PATCH 020/113] Update model_alias.py Add a new model for faster text process in cli --- paddlespeech/resource/model_alias.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/paddlespeech/resource/model_alias.py b/paddlespeech/resource/model_alias.py index 9c76dd4b3..85187a8d1 100644 --- a/paddlespeech/resource/model_alias.py +++ b/paddlespeech/resource/model_alias.py @@ -51,6 +51,10 @@ model_alias = { "paddlespeech.text.models:ErnieLinear", "paddlenlp.transformers:ErnieTokenizer" ], + "ernie_linear_p3_wudao": [ + "paddlespeech.text.models:ErnieLinear", + "paddlenlp.transformers:ErnieTokenizer" + ], # --------------------------------- # -------------- TTS -------------- From 57dcd0d17f559a5f22c83a0d321f4db9d57d08d9 Mon Sep 17 00:00:00 2001 From: Zhao Yuting <91456992+THUzyt21@users.noreply.github.com> Date: Tue, 20 Sep 2022 16:29:10 +0800 Subject: [PATCH 021/113] Update infer.py change the infer in order to implement the new faster model for text --- paddlespeech/cli/text/infer.py | 91 ++++++++++++++++++++++++++++++---- 1 file changed, 82 insertions(+), 9 deletions(-) diff --git a/paddlespeech/cli/text/infer.py b/paddlespeech/cli/text/infer.py index 24b8c9c25..ff822f674 100644 --- a/paddlespeech/cli/text/infer.py +++ 
b/paddlespeech/cli/text/infer.py @@ -20,10 +20,13 @@ from typing import Optional from typing import Union import paddle +import yaml +from yacs.config import CfgNode from ..executor import BaseExecutor from ..log import logger from ..utils import stats_wrapper +from paddlespeech.text.models.ernie_linear import ErnieLinear __all__ = ['TextExecutor'] @@ -139,6 +142,66 @@ class TextExecutor(BaseExecutor): self.model.eval() + #init new models + def _init_from_path_new(self, + task: str='punc', + model_type: str='ernie_linear_p7_wudao', + lang: str='zh', + cfg_path: Optional[os.PathLike]=None, + ckpt_path: Optional[os.PathLike]=None, + vocab_file: Optional[os.PathLike]=None): + if hasattr(self, 'model'): + logger.debug('Model had been initialized.') + return + + self.task = task + + if cfg_path is None or ckpt_path is None or vocab_file is None: + tag = '-'.join([model_type, task, lang]) + self.task_resource.set_task_model(tag, version=None) + self.cfg_path = os.path.join( + self.task_resource.res_dir, + self.task_resource.res_dict['cfg_path']) + self.ckpt_path = os.path.join( + self.task_resource.res_dir, + self.task_resource.res_dict['ckpt_path']) + self.vocab_file = os.path.join( + self.task_resource.res_dir, + self.task_resource.res_dict['vocab_file']) + else: + self.cfg_path = os.path.abspath(cfg_path) + self.ckpt_path = os.path.abspath(ckpt_path) + self.vocab_file = os.path.abspath(vocab_file) + + model_name = model_type[:model_type.rindex('_')] + + if self.task == 'punc': + # punc list + self._punc_list = [] + with open(self.vocab_file, 'r') as f: + for line in f: + self._punc_list.append(line.strip()) + + # model + with open(self.cfg_path) as f: + config = CfgNode(yaml.safe_load(f)) + self.model = ErnieLinear(**config["model"]) + + _, tokenizer_class = self.task_resource.get_model_class(model_name) + state_dict = paddle.load(self.ckpt_path) + self.model.set_state_dict(state_dict["main_params"]) + self.model.eval() + + #tokenizer: fast version: ernie-3.0-mini-zh slow version:ernie-1.0 + if 'fast' not in model_type: + self.tokenizer = tokenizer_class.from_pretrained('ernie-1.0') + else: + self.tokenizer = tokenizer_class.from_pretrained( + 'ernie-3.0-mini-zh') + + else: + raise NotImplementedError + def _clean_text(self, text): text = text.lower() text = re.sub('[^A-Za-z0-9\u4e00-\u9fa5]', '', text) @@ -179,7 +242,7 @@ class TextExecutor(BaseExecutor): else: raise NotImplementedError - def postprocess(self) -> Union[str, os.PathLike]: + def postprocess(self, isNewTrainer: bool=False) -> Union[str, os.PathLike]: """ Output postprocess and return human-readable results such as texts and audio files. """ @@ -192,13 +255,13 @@ class TextExecutor(BaseExecutor): input_ids[1:seq_len - 1]) labels = preds[1:seq_len - 1].tolist() assert len(tokens) == len(labels) - + if isNewTrainer: + self._punc_list = [0] + self._punc_list text = '' for t, l in zip(tokens, labels): text += t if l != 0: # Non punc. text += self._punc_list[l] - return text else: raise NotImplementedError @@ -255,10 +318,20 @@ class TextExecutor(BaseExecutor): """ Python API to call an executor. """ - paddle.set_device(device) - self._init_from_path(task, model, lang, config, ckpt_path, punc_vocab) - self.preprocess(text) - self.infer() - res = self.postprocess() # Retrieve result of text task. 
- + #Here is old version models + if model in ['ernie_linear_p7_wudao', 'ernie_linear_p3_wudao']: + paddle.set_device(device) + self._init_from_path(task, model, lang, config, ckpt_path, + punc_vocab) + self.preprocess(text) + self.infer() + res = self.postprocess() # Retrieve result of text task. + #Add new way to infer + else: + paddle.set_device(device) + self._init_from_path_new(task, model, lang, config, ckpt_path, + punc_vocab) + self.preprocess(text) + self.infer() + res = self.postprocess(isNewTrainer=True) return res From 92d09d5cce640300ac182852600217ac8796c34f Mon Sep 17 00:00:00 2001 From: Zhao Yuting <91456992+THUzyt21@users.noreply.github.com> Date: Tue, 20 Sep 2022 16:31:38 +0800 Subject: [PATCH 022/113] Update README_cn.md --- paddlespeech/cli/README_cn.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/paddlespeech/cli/README_cn.md b/paddlespeech/cli/README_cn.md index 4b15d6c7b..6464c598c 100644 --- a/paddlespeech/cli/README_cn.md +++ b/paddlespeech/cli/README_cn.md @@ -43,3 +43,7 @@ ```bash paddlespeech text --task punc --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭 ``` +- 快速标点恢复 + ```bash + paddlespeech text --task punc --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭 --model ernie_linear_p3_wudao_fast + ``` From fb7f04e021d495524878e79b9e12d675490e2e77 Mon Sep 17 00:00:00 2001 From: Zhao Yuting <91456992+THUzyt21@users.noreply.github.com> Date: Tue, 20 Sep 2022 16:32:45 +0800 Subject: [PATCH 023/113] Update README.md --- paddlespeech/cli/README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/paddlespeech/cli/README.md b/paddlespeech/cli/README.md index 19c822040..53c1ca3b2 100644 --- a/paddlespeech/cli/README.md +++ b/paddlespeech/cli/README.md @@ -42,3 +42,7 @@ ```bash paddlespeech text --task punc --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭 ``` +- Faster Punctuation Restoration + ```bash + paddlespeech text --task punc --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭 --model ernie_linear_p3_wudao_fast + ``` From 12a11394bd3f33f81e6a7e834c34993a2e1336d0 Mon Sep 17 00:00:00 2001 From: Zhao Yuting <91456992+THUzyt21@users.noreply.github.com> Date: Tue, 20 Sep 2022 16:53:44 +0800 Subject: [PATCH 024/113] Update infer.py add a new faster model to infer in cli --- paddlespeech/cli/text/infer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/paddlespeech/cli/text/infer.py b/paddlespeech/cli/text/infer.py index ff822f674..8433e6545 100644 --- a/paddlespeech/cli/text/infer.py +++ b/paddlespeech/cli/text/infer.py @@ -335,3 +335,4 @@ class TextExecutor(BaseExecutor): self.infer() res = self.postprocess(isNewTrainer=True) return res + From a63a0b13503b3bf2d8b752973739a68d7e16780e Mon Sep 17 00:00:00 2001 From: Zhao Yuting <91456992+THUzyt21@users.noreply.github.com> Date: Tue, 20 Sep 2022 16:58:16 +0800 Subject: [PATCH 025/113] Update pretrained_models.py --- paddlespeech/resource/pretrained_models.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/paddlespeech/resource/pretrained_models.py b/paddlespeech/resource/pretrained_models.py index 0a1ed15e1..b6ab7f01c 100644 --- a/paddlespeech/resource/pretrained_models.py +++ b/paddlespeech/resource/pretrained_models.py @@ -546,7 +546,7 @@ text_dynamic_pretrained_models = { } }, "ernie_linear_p3_wudao_fast-punc-zh": { - '1.0':{ + '1.0': { 'url': 'https://paddlespeech.bj.bcebos.com/text/ernie_linear_p3_wudao_fast-punc-zh.tar.gz', 'md5': @@ -561,8 +561,6 @@ text_dynamic_pretrained_models = { } } - - # --------------------------------- # -------------- TTS -------------- # --------------------------------- From 18b71dc1361030c47031e472f05c1664c79c4849 Mon 
Sep 17 00:00:00 2001
From: Zhao Yuting <91456992+THUzyt21@users.noreply.github.com>
Date: Tue, 20 Sep 2022 18:16:09 +0800
Subject: [PATCH 028/113] Update README.md

---
 paddlespeech/cli/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddlespeech/cli/README.md b/paddlespeech/cli/README.md
index 53c1ca3b2..1d10e0d79 100644
--- a/paddlespeech/cli/README.md
+++ b/paddlespeech/cli/README.md
@@ -45,4 +45,4 @@
 - Faster Punctuation Restoration
   ```bash
   paddlespeech text --task punc --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭 --model ernie_linear_p3_wudao_fast
-  ```
+  ```
From d5dec463365e6d000477b63a2d4d000d4d398b50 Mon Sep 17 00:00:00 2001
From: Zhao Yuting <91456992+THUzyt21@users.noreply.github.com>
Date: Tue, 20 Sep 2022 18:22:41 +0800
Subject: [PATCH 029/113] Update README.md

---
 paddlespeech/cli/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddlespeech/cli/README.md b/paddlespeech/cli/README.md
index 1d10e0d79..e6e216c0b 100644
--- a/paddlespeech/cli/README.md
+++ b/paddlespeech/cli/README.md
@@ -43,6 +43,6 @@
    paddlespeech text --task punc --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭
    ```
 - Faster Punctuation Restoration
-  ```bash
+   ```bash
    paddlespeech text --task punc --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭 --model ernie_linear_p3_wudao_fast
-  ```
+   ```
From bdbacd42499b39aba3d013002989bbe44da3588f Mon Sep 17 00:00:00 2001
From: THUzyt21
Date: Tue, 20 Sep 2022 10:48:34 +0000
Subject: [PATCH 030/113] pre-commit

---
 paddlespeech/cli/text/infer.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/paddlespeech/cli/text/infer.py b/paddlespeech/cli/text/infer.py
index 8433e6545..ff822f674 100644
--- a/paddlespeech/cli/text/infer.py
+++ b/paddlespeech/cli/text/infer.py
@@ -335,4 +335,3 @@ class TextExecutor(BaseExecutor):
         self.infer()
         res = self.postprocess(isNewTrainer=True)
         return res
-
From 322301a6db2280f9358a37059db276de9fdcdc9a Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Tue, 20 Sep 2022 11:27:22 +0000
Subject: [PATCH 031/113] add reverse pad with sos and eos test

---
 tests/unit/asr/reverse_pad_list.py | 145 +++++++++++++++++++++++++++++
 1 file changed, 145 insertions(+)
 create mode 100644 tests/unit/asr/reverse_pad_list.py

diff --git a/tests/unit/asr/reverse_pad_list.py b/tests/unit/asr/reverse_pad_list.py
new file mode 100644
index 000000000..60e768bcf
--- /dev/null
+++ b/tests/unit/asr/reverse_pad_list.py
@@ -0,0 +1,145 @@
+
+
+
+
+import paddle
+import numpy as np
+import unittest
+
+# from paddlespeech.audio.utils.tensor_utils import reverse_pad_list
+import paddlespeech.s2t
+from paddlespeech.audio.utils.tensor_utils import add_sos_eos
+from paddlespeech.audio.utils.tensor_utils import pad_sequence
+
+def reverse_pad_list(ys_pad: paddle.Tensor,
+                     ys_lens: paddle.Tensor,
+                     pad_value: float=-1.0) -> paddle.Tensor:
+    """Reverse padding for the list of tensors.
+    Args:
+        ys_pad (tensor): The padded tensor (B, Tokenmax).
+        ys_lens (tensor): The lens of token seqs (B)
+        pad_value (int): Value for padding.
+    Returns:
+        Tensor: Padded tensor (B, Tokenmax).
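+    Note:
+        Each row is flipped over its first ys_lens[b] tokens only; positions
+        past that length are filled with pad_value.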
+ Examples: + >>> x + tensor([[1, 2, 3, 4], [5, 6, 7, 0], [8, 9, 0, 0]]) + >>> pad_list(x, 0) + tensor([[4, 3, 2, 1], + [7, 6, 5, 0], + [9, 8, 0, 0]]) + """ + r_ys_pad = pad_sequence([(paddle.flip(y[:i], [0])) + for y, i in zip(ys_pad, ys_lens)], True, pad_value) + return r_ys_pad + +def naive_reverse_pad_list_with_sos_eos(r_hyps, r_hyps_lens, sos=5000, eos=5000, ignore_id=-1): + r_hyps = reverse_pad_list(r_hyps, r_hyps_lens, float(ignore_id)) + r_hyps, _ = add_sos_eos(r_hyps, sos, eos, ignore_id) + return r_hyps + +def reverse_pad_list_with_sos_eos(r_hyps, r_hyps_lens, sos=5000, eos=5000, ignore_id=-1): + # >>> r_hyps = reverse_pad_list(r_hyps, r_hyps_lens, float(self.ignore_id)) + # >>> r_hyps, _ = add_sos_eos(r_hyps, self.sos, self.eos, self.ignore_id) + max_len = paddle.max(r_hyps_lens) + index_range = paddle.arange(0, max_len, 1) + seq_len_expand = r_hyps_lens.unsqueeze(1) + seq_mask = seq_len_expand > index_range # (beam, max_len) + + index = (seq_len_expand - 1) - index_range # (beam, max_len) + # >>> index + # >>> tensor([[ 2, 1, 0], + # >>> [ 2, 1, 0], + # >>> [ 0, -1, -2]]) + index = index * seq_mask + + # >>> index + # >>> tensor([[2, 1, 0], + # >>> [2, 1, 0], + # >>> [0, 0, 0]]) + def paddle_gather(x, dim, index): + index_shape = index.shape + index_flatten = index.flatten() + if dim < 0: + dim = len(x.shape) + dim + nd_index = [] + for k in range(len(x.shape)): + if k == dim: + nd_index.append(index_flatten) + else: + reshape_shape = [1] * len(x.shape) + reshape_shape[k] = x.shape[k] + x_arange = paddle.arange(x.shape[k], dtype=index.dtype) + x_arange = x_arange.reshape(reshape_shape) + dim_index = paddle.expand(x_arange, index_shape).flatten() + nd_index.append(dim_index) + ind2 = paddle.transpose(paddle.stack(nd_index), + [1, 0]).astype("int64") + paddle_out = paddle.gather_nd(x, ind2).reshape(index_shape) + return paddle_out + + r_hyps = paddle_gather(r_hyps, 1, index) + # >>> r_hyps + # >>> tensor([[3, 2, 1], + # >>> [4, 8, 9], + # >>> [2, 2, 2]]) + r_hyps = paddle.where(seq_mask, r_hyps, eos) + # >>> r_hyps + # >>> tensor([[3, 2, 1], + # >>> [4, 8, 9], + # >>> [2, eos, eos]]) + B = r_hyps.shape[0] + _sos = paddle.ones([B, 1], dtype=r_hyps.dtype) * sos + # r_hyps = paddle.concat([hyps[:, 0:1], r_hyps], axis=1) + r_hyps = paddle.concat([_sos, r_hyps], axis=1) + # >>> r_hyps + # >>> tensor([[sos, 3, 2, 1], + # >>> [sos, 4, 8, 9], + # >>> [sos, 2, eos, eos]]) + return r_hyps + + +class TestU2Model(unittest.TestCase): + def setUp(self): + paddle.set_device('cpu') + + self.sos=5000 + self.eos=5000 + self.ignore_id=-1 + self.reverse_hyps = paddle.to_tensor( + [[ 4, 3, 2, 1, -1], + [ 5, 4, 3, 2, 1]] + ) + self.reverse_hyps_sos_eos = paddle.to_tensor( + [[self.sos, 4 , 3 , 2 , 1 , self.eos], + [self.sos, 5 , 4 , 3 , 2 , 1 ]] + ) + + self.hyps = paddle.to_tensor( + [ + [1, 2, 3, 4, -1], + [1, 2, 3, 4, 5] + ] + ) + + + self.hyps_lens = paddle.to_tensor([4, 5], paddle.int32) + + def test_reverse_pad_list(self): + r_hyps = reverse_pad_list(self.hyps, self.hyps_lens) + self.assertSequenceEqual(r_hyps.tolist(), self.reverse_hyps.tolist()) + + def test_naive_reverse_pad_list_with_sos_eos(self): + r_hyps_sos_eos = naive_reverse_pad_list_with_sos_eos(self.hyps, self.hyps_lens) + self.assertSequenceEqual(r_hyps_sos_eos.tolist(), self.reverse_hyps_sos_eos.tolist()) + + def test_static_reverse_pad_list_with_sos_eos(self): + r_hyps_sos_eos_static = reverse_pad_list_with_sos_eos(self.hyps, self.hyps_lens) + self.assertSequenceEqual(r_hyps_sos_eos_static.tolist(), 
self.reverse_hyps_sos_eos.tolist()) + + + +if __name__ == '__main__': + unittest.main() + + From f95edc382c71e91b5c7fa10f10fc31e681a17169 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 20 Sep 2022 11:32:35 +0000 Subject: [PATCH 032/113] format --- tests/unit/asr/reverse_pad_list.py | 193 +++++++++++++++-------------- 1 file changed, 102 insertions(+), 91 deletions(-) diff --git a/tests/unit/asr/reverse_pad_list.py b/tests/unit/asr/reverse_pad_list.py index 60e768bcf..215ed5ceb 100644 --- a/tests/unit/asr/reverse_pad_list.py +++ b/tests/unit/asr/reverse_pad_list.py @@ -1,16 +1,27 @@ - - - +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import unittest import paddle -import numpy as np -import unittest -# from paddlespeech.audio.utils.tensor_utils import reverse_pad_list -import paddlespeech.s2t +import paddlespeech.s2t # noqa: F401 from paddlespeech.audio.utils.tensor_utils import add_sos_eos from paddlespeech.audio.utils.tensor_utils import pad_sequence +# from paddlespeech.audio.utils.tensor_utils import reverse_pad_list + + def reverse_pad_list(ys_pad: paddle.Tensor, ys_lens: paddle.Tensor, pad_value: float=-1.0) -> paddle.Tensor: @@ -33,95 +44,94 @@ def reverse_pad_list(ys_pad: paddle.Tensor, for y, i in zip(ys_pad, ys_lens)], True, pad_value) return r_ys_pad -def naive_reverse_pad_list_with_sos_eos(r_hyps, r_hyps_lens, sos=5000, eos=5000, ignore_id=-1): + +def naive_reverse_pad_list_with_sos_eos(r_hyps, + r_hyps_lens, + sos=5000, + eos=5000, + ignore_id=-1): r_hyps = reverse_pad_list(r_hyps, r_hyps_lens, float(ignore_id)) r_hyps, _ = add_sos_eos(r_hyps, sos, eos, ignore_id) return r_hyps -def reverse_pad_list_with_sos_eos(r_hyps, r_hyps_lens, sos=5000, eos=5000, ignore_id=-1): - # >>> r_hyps = reverse_pad_list(r_hyps, r_hyps_lens, float(self.ignore_id)) - # >>> r_hyps, _ = add_sos_eos(r_hyps, self.sos, self.eos, self.ignore_id) - max_len = paddle.max(r_hyps_lens) - index_range = paddle.arange(0, max_len, 1) - seq_len_expand = r_hyps_lens.unsqueeze(1) - seq_mask = seq_len_expand > index_range # (beam, max_len) - - index = (seq_len_expand - 1) - index_range # (beam, max_len) - # >>> index - # >>> tensor([[ 2, 1, 0], - # >>> [ 2, 1, 0], - # >>> [ 0, -1, -2]]) - index = index * seq_mask - - # >>> index - # >>> tensor([[2, 1, 0], - # >>> [2, 1, 0], - # >>> [0, 0, 0]]) - def paddle_gather(x, dim, index): - index_shape = index.shape - index_flatten = index.flatten() - if dim < 0: - dim = len(x.shape) + dim - nd_index = [] - for k in range(len(x.shape)): - if k == dim: - nd_index.append(index_flatten) - else: - reshape_shape = [1] * len(x.shape) - reshape_shape[k] = x.shape[k] - x_arange = paddle.arange(x.shape[k], dtype=index.dtype) - x_arange = x_arange.reshape(reshape_shape) - dim_index = paddle.expand(x_arange, index_shape).flatten() - nd_index.append(dim_index) - ind2 = paddle.transpose(paddle.stack(nd_index), - [1, 0]).astype("int64") - paddle_out = paddle.gather_nd(x, ind2).reshape(index_shape) - return paddle_out - - 
r_hyps = paddle_gather(r_hyps, 1, index) - # >>> r_hyps - # >>> tensor([[3, 2, 1], - # >>> [4, 8, 9], - # >>> [2, 2, 2]]) - r_hyps = paddle.where(seq_mask, r_hyps, eos) - # >>> r_hyps - # >>> tensor([[3, 2, 1], - # >>> [4, 8, 9], - # >>> [2, eos, eos]]) - B = r_hyps.shape[0] - _sos = paddle.ones([B, 1], dtype=r_hyps.dtype) * sos - # r_hyps = paddle.concat([hyps[:, 0:1], r_hyps], axis=1) - r_hyps = paddle.concat([_sos, r_hyps], axis=1) - # >>> r_hyps - # >>> tensor([[sos, 3, 2, 1], - # >>> [sos, 4, 8, 9], - # >>> [sos, 2, eos, eos]]) - return r_hyps + +def reverse_pad_list_with_sos_eos(r_hyps, + r_hyps_lens, + sos=5000, + eos=5000, + ignore_id=-1): + # >>> r_hyps = reverse_pad_list(r_hyps, r_hyps_lens, float(self.ignore_id)) + # >>> r_hyps, _ = add_sos_eos(r_hyps, self.sos, self.eos, self.ignore_id) + max_len = paddle.max(r_hyps_lens) + index_range = paddle.arange(0, max_len, 1) + seq_len_expand = r_hyps_lens.unsqueeze(1) + seq_mask = seq_len_expand > index_range # (beam, max_len) + + index = (seq_len_expand - 1) - index_range # (beam, max_len) + # >>> index + # >>> tensor([[ 2, 1, 0], + # >>> [ 2, 1, 0], + # >>> [ 0, -1, -2]]) + index = index * seq_mask + + # >>> index + # >>> tensor([[2, 1, 0], + # >>> [2, 1, 0], + # >>> [0, 0, 0]]) + def paddle_gather(x, dim, index): + index_shape = index.shape + index_flatten = index.flatten() + if dim < 0: + dim = len(x.shape) + dim + nd_index = [] + for k in range(len(x.shape)): + if k == dim: + nd_index.append(index_flatten) + else: + reshape_shape = [1] * len(x.shape) + reshape_shape[k] = x.shape[k] + x_arange = paddle.arange(x.shape[k], dtype=index.dtype) + x_arange = x_arange.reshape(reshape_shape) + dim_index = paddle.expand(x_arange, index_shape).flatten() + nd_index.append(dim_index) + ind2 = paddle.transpose(paddle.stack(nd_index), [1, 0]).astype("int64") + paddle_out = paddle.gather_nd(x, ind2).reshape(index_shape) + return paddle_out + + r_hyps = paddle_gather(r_hyps, 1, index) + # >>> r_hyps + # >>> tensor([[3, 2, 1], + # >>> [4, 8, 9], + # >>> [2, 2, 2]]) + r_hyps = paddle.where(seq_mask, r_hyps, eos) + # >>> r_hyps + # >>> tensor([[3, 2, 1], + # >>> [4, 8, 9], + # >>> [2, eos, eos]]) + B = r_hyps.shape[0] + _sos = paddle.ones([B, 1], dtype=r_hyps.dtype) * sos + # r_hyps = paddle.concat([hyps[:, 0:1], r_hyps], axis=1) + r_hyps = paddle.concat([_sos, r_hyps], axis=1) + # >>> r_hyps + # >>> tensor([[sos, 3, 2, 1], + # >>> [sos, 4, 8, 9], + # >>> [sos, 2, eos, eos]]) + return r_hyps class TestU2Model(unittest.TestCase): def setUp(self): paddle.set_device('cpu') - self.sos=5000 - self.eos=5000 - self.ignore_id=-1 - self.reverse_hyps = paddle.to_tensor( - [[ 4, 3, 2, 1, -1], - [ 5, 4, 3, 2, 1]] - ) + self.sos = 5000 + self.eos = 5000 + self.ignore_id = -1 + self.reverse_hyps = paddle.to_tensor([[4, 3, 2, 1, -1], + [5, 4, 3, 2, 1]]) self.reverse_hyps_sos_eos = paddle.to_tensor( - [[self.sos, 4 , 3 , 2 , 1 , self.eos], - [self.sos, 5 , 4 , 3 , 2 , 1 ]] - ) - - self.hyps = paddle.to_tensor( - [ - [1, 2, 3, 4, -1], - [1, 2, 3, 4, 5] - ] - ) + [[self.sos, 4, 3, 2, 1, self.eos], [self.sos, 5, 4, 3, 2, 1]]) + self.hyps = paddle.to_tensor([[1, 2, 3, 4, -1], [1, 2, 3, 4, 5]]) self.hyps_lens = paddle.to_tensor([4, 5], paddle.int32) @@ -130,16 +140,17 @@ class TestU2Model(unittest.TestCase): self.assertSequenceEqual(r_hyps.tolist(), self.reverse_hyps.tolist()) def test_naive_reverse_pad_list_with_sos_eos(self): - r_hyps_sos_eos = naive_reverse_pad_list_with_sos_eos(self.hyps, self.hyps_lens) - self.assertSequenceEqual(r_hyps_sos_eos.tolist(), 
self.reverse_hyps_sos_eos.tolist())
 
 
 if __name__ == '__main__':
     unittest.main()
-
-
From 6fc4b2809332ab8af057aa71b74baae7c7d06d2b Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Tue, 20 Sep 2022 12:42:49 +0000
Subject: [PATCH 033/113] add comment

---
 examples/wenetspeech/asr1/local/export.sh | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/examples/wenetspeech/asr1/local/export.sh b/examples/wenetspeech/asr1/local/export.sh
index 6b646b469..735c4f8e5 100755
--- a/examples/wenetspeech/asr1/local/export.sh
+++ b/examples/wenetspeech/asr1/local/export.sh
@@ -12,9 +12,12 @@ config_path=$1
 ckpt_path_prefix=$2
 jit_model_export_path=$3
 
+
+# export can not use StreamDataDataLoader, set use_stream_data False
 python3 -u ${BIN_DIR}/export.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
+--opts use_stream_data False \
 --checkpoint_path ${ckpt_path_prefix} \
 --export_path ${jit_model_export_path}
 
From 309c8d70d9e7168eac597a5ffb030fc6703d7e87 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Tue, 20 Sep 2022 12:56:07 +0000
Subject: [PATCH 034/113] add reverse weight

---
 paddlespeech/s2t/exps/u2/model.py | 4 +++-
 paddlespeech/s2t/models/u2/u2.py  | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py
index 54810f22f..64b6c8df6 100644
--- a/paddlespeech/s2t/exps/u2/model.py
+++ b/paddlespeech/s2t/exps/u2/model.py
@@ -520,6 +520,7 @@ class U2Tester(U2Trainer):
             infer_model.ctc_activation, input_spec=input_spec)
 
         ######################### infer_model.forward_attention_decoder ########################
+        reverse_weight = 0.3
         input_spec = [
             # hyps, (B, U)
             paddle.static.InputSpec(shape=[None, None], dtype='int64'),
@@ -527,7 +528,8 @@
             paddle.static.InputSpec(shape=[None], dtype='int64'),
             # encoder_out, (B,T,D)
             paddle.static.InputSpec(
-                shape=[batch_size, None, model_size], dtype='float32')
+                shape=[batch_size, None, model_size], dtype='float32'),
+            reverse_weight
         ]
         infer_model.forward_attention_decoder = paddle.jit.to_static(
             infer_model.forward_attention_decoder, input_spec=input_spec)
diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py
index d699b684b..1681bf1d9 100644
--- a/paddlespeech/s2t/models/u2/u2.py
+++ b/paddlespeech/s2t/models/u2/u2.py
@@ -706,7 +706,7 @@ class U2BaseModel(ASRInterface, nn.Layer):
                       hyps: paddle.Tensor,
                       hyps_lens: paddle.Tensor,
                       encoder_out: paddle.Tensor,
-                      reverse_weight: float=0.0, ) -> paddle.Tensor:
+                      reverse_weight: float=0.0) -> paddle.Tensor:
         """ Export interface for c++ call, forward decoder with multiple
             hypothesis from ctc prefix beam search and one encoder output
         Args:
From f95edc382c71e91b5c7fa10f10fc31e681a17169 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Wed, 21 Sep 2022 07:50:02 +0000
Subject: [PATCH 035/113] fix forward attention decoder caller

---
 paddlespeech/s2t/exps/u2/bin/test_wav.py |  2 +-
 paddlespeech/s2t/models/u2/u2.py         | 15 ++++++-------
paddlespeech/s2t/modules/decoder.py | 2 +- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py index 9446884f8..31890cb19 100644 --- a/paddlespeech/s2t/exps/u2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py @@ -79,7 +79,7 @@ class U2Infer(): ilen = paddle.to_tensor(feat.shape[0]) xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(0) decode_config = self.config.decode - logger.debug(f"decode cfg: {decode_config}") + logger.info(f"decode cfg: {decode_config}") result_transcripts = self.model.decode( xs, ilen, diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 1681bf1d9..7609b71e0 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -565,18 +565,18 @@ class U2BaseModel(ASRInterface, nn.Layer): [len(hyp[0]) for hyp in hyps], place=device, dtype=paddle.long) # (beam_size,) hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - logger.debug( + logger.info( f"hyps pad: {hyps_pad} {self.sos} {self.eos} {self.ignore_id}") hyps_lens = hyps_lens + 1 # Add at begining # ctc score in ln domain # (beam_size, max_hyps_len, vocab_size) decoder_out, r_decoder_out = self.forward_attention_decoder(hyps_pad, hyps_lens, - encoder_out,reverse_weight ) + encoder_out, reverse_weight) + decoder_out = decoder_out.numpy() # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a # conventional transformer decoder. - r_decoder_out = paddle.nn.functional.log_softmax(r_decoder_out, axis=-1) r_decoder_out = r_decoder_out.numpy() # Only use decoder score for rescoring @@ -590,15 +590,16 @@ class U2BaseModel(ASRInterface, nn.Layer): # last decoder output token is `eos`, for laste decoder input token. 
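             # NOTE(editor): the fusion below computes
             #     score = (1 - reverse_weight) * l2r + reverse_weight * r2l
             # and then adds the CTC score scaled by ctc_weight (log domain).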
score += decoder_out[i][len(hyp[0])][self.eos] - logger.debug( - f"hyp {i} len {len(hyp[0])} l2r rescore_score: {score} ctc_score: {hyp[1]}" - ) + logger.info(f"hyp {i} len {len(hyp[0])} l2r score: {score} ctc_score: {hyp[1]} reverse_weight: {reverse_weight}") if reverse_weight > 0: r_score = 0.0 for j, w in enumerate(hyp[0]): r_score += r_decoder_out[i][len(hyp[0]) - j - 1][w] r_score += r_decoder_out[i][len(hyp[0])][self.eos] + + logger.info(f"hyp {i} len {len(hyp[0])} r2l score: {score} ctc_score: {hyp[1]} reverse_weight: {reverse_weight}") + score = score * (1 - reverse_weight) + r_score * reverse_weight # add ctc score (which in ln domain) @@ -607,7 +608,7 @@ class U2BaseModel(ASRInterface, nn.Layer): best_score = score best_index = i - logger.debug(f"result: {hyps[best_index]}") + logger.info(f"result: {hyps[best_index]}") return hyps[best_index][0] @jit.to_static(property=True) diff --git a/paddlespeech/s2t/modules/decoder.py b/paddlespeech/s2t/modules/decoder.py index 3b1a7f23d..03b637b7c 100644 --- a/paddlespeech/s2t/modules/decoder.py +++ b/paddlespeech/s2t/modules/decoder.py @@ -343,7 +343,7 @@ class BiTransformerDecoder(BatchScorerInterface, nn.Layer): """ l_x, _, olens = self.left_decoder(memory, memory_mask, ys_in_pad, ys_in_lens) - r_x = paddle.to_tensor(0.0) + r_x = paddle.zeros([1]) if reverse_weight > 0.0: r_x, _, olens = self.right_decoder(memory, memory_mask, r_ys_in_pad, ys_in_lens) From b10512eb0e64d615621baa2cd203129f20dd1626 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 21 Sep 2022 09:16:32 +0000 Subject: [PATCH 036/113] more config or u2pp --- examples/wenetspeech/asr1/README.md | 31 ++++++ .../asr1/conf/chunk_conformer.yaml | 4 +- .../asr1/conf/chunk_conformer_u2pp.yaml | 100 ++++++++++++++++++ examples/wenetspeech/asr1/local/export.sh | 2 + paddlespeech/s2t/models/u2/u2.py | 8 +- 5 files changed, 140 insertions(+), 5 deletions(-) create mode 100644 examples/wenetspeech/asr1/conf/chunk_conformer_u2pp.yaml diff --git a/examples/wenetspeech/asr1/README.md b/examples/wenetspeech/asr1/README.md index c08b94e29..9fc2856ce 100644 --- a/examples/wenetspeech/asr1/README.md +++ b/examples/wenetspeech/asr1/README.md @@ -12,3 +12,34 @@ show model.tar.gz ``` tar tf model.tar.gz ``` + +other way is: + +```bash +tar cvzf asr1_chunk_conformer_u2_wenetspeech_ckpt_1.1.0.model.tar.gz model.yaml conf/tuning/ conf/chunk_conformer.yaml conf/preprocess.yaml data/mean_std.json exp/chunk_conformer/checkpoints/ +``` + +## Export Static Model + +>> `data/test_meeting/data.list` +>> {"input": [{"name": "input1", "shape": [3.2230625, 80], "feat": "/home/PaddleSpeech/dataset/aishell/data_aishell/wav/test/S0764/BAC009S0764W0163.wav", "filetype": "sound"}], "output": [{"name": "target1", "shape": [9, 5538], "text": "\u697c\u5e02\u8c03\u63a7\u5c06\u53bb\u5411\u4f55\u65b9", "token": "\u697c \u5e02 \u8c03 \u63a7 \u5c06 \u53bb \u5411 \u4f55 \u65b9", "tokenid": "1891 1121 3502 1543 1018 477 528 163 1657"}], "utt": "BAC009S0764W0163", "utt2spk": "S0764"} + +>> Test Wav: +>> wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav +### U2 chunk conformer +>> UiDecoder +>> Make sure `reverse_weight` in config is `0.0` +>> https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2_wenetspeech_ckpt_1.1.0.model.tar.gz +``` +tar zxvf asr1_chunk_conformer_u2_wenetspeech_ckpt_1.1.0.model.tar.gz +./local/export.sh conf/chunk_conformer.yaml exp/chunk_conformer/checkpoints/avg_10 ./export.ji +``` + +### U2++ chunk conformer +>> BiDecoder +>> 
https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.1.0.model.tar.gz +>> Make sure `reverse_weight` in config is not `0.0` + +``` +./local/export.sh conf/chunk_conformer_u2pp.yaml exp/chunk_conformer/checkpoints/avg_10 ./export.ji +``` diff --git a/examples/wenetspeech/asr1/conf/chunk_conformer.yaml b/examples/wenetspeech/asr1/conf/chunk_conformer.yaml index 69fa223a1..d2f43d873 100644 --- a/examples/wenetspeech/asr1/conf/chunk_conformer.yaml +++ b/examples/wenetspeech/asr1/conf/chunk_conformer.yaml @@ -39,6 +39,7 @@ decoder_conf: model_conf: ctc_weight: 0.3 lsm_weight: 0.1 # label smoothing option + reverse_weight: 0.0 # unidecoder length_normalized_loss: false init_type: 'kaiming_uniform' @@ -53,8 +54,9 @@ test_manifest: data/test_meeting/data.list ########################################### # Dataloader # ########################################### -vocab_filepath: data/lang_char/vocab.txt +use_streaming_data: True unit_type: 'char' +vocab_filepath: data/lang_char/vocab.txt preprocess_config: conf/preprocess.yaml spm_model_prefix: '' feat_dim: 80 diff --git a/examples/wenetspeech/asr1/conf/chunk_conformer_u2pp.yaml b/examples/wenetspeech/asr1/conf/chunk_conformer_u2pp.yaml new file mode 100644 index 000000000..2bb2006b5 --- /dev/null +++ b/examples/wenetspeech/asr1/conf/chunk_conformer_u2pp.yaml @@ -0,0 +1,100 @@ +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 512 # dimension of attention + attention_heads: 8 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.1 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + use_cnn_module: True + cnn_module_kernel: 15 + activation_type: swish + pos_enc_layer_type: rel_pos + selfattention_layer_type: rel_selfattn + causal: true + use_dynamic_chunk: true + cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster + use_dynamic_left_chunk: false +# decoder related +decoder: bitransformer +decoder_conf: + attention_heads: 8 + linear_units: 2048 + num_blocks: 3 # the number of encoder blocks + r_num_blocks: 3 #only for bitransformer + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.1 + src_attention_dropout_rate: 0.1 + +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false + reverse_weight: 0.3 # only for bitransformer decoder + init_type: 'kaiming_uniform' # !Warning: need to convergence + +########################################### +# Data # +########################################### +train_manifest: data/train_l/data.list +dev_manifest: data/dev/data.list +test_manifest: data/test_meeting/data.list + +########################################### +# Dataloader # +########################################### +use_stream_data: True +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'char' +preprocess_config: conf/preprocess.yaml +spm_model_prefix: '' +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 32 +do_filter: True +maxlen_in: 1200 # if do_filter == 
False && input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 100 # if do_filter == False && output length > maxlen-out, batchsize is automatically reduced +minlen_in: 10 +minlen_out: 0 +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 + +########################################### +# Training # +########################################### +n_epoch: 150 +accum_grad: 8 +global_grad_clip: 5.0 +dist_sampler: False +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/wenetspeech/asr1/local/export.sh b/examples/wenetspeech/asr1/local/export.sh index 735c4f8e5..1f89afd6b 100755 --- a/examples/wenetspeech/asr1/local/export.sh +++ b/examples/wenetspeech/asr1/local/export.sh @@ -14,6 +14,8 @@ jit_model_export_path=$3 # export can not using StreamdataDataloader, set use_stream_dta False +# u2: reverse_weight should be 0.0 +# u2pp: reverse_weight should be same with config file. e.g. 0.3 python3 -u ${BIN_DIR}/export.py \ --ngpu ${ngpu} \ --config ${config_path} \ diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 7609b71e0..2279812ba 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -565,7 +565,7 @@ class U2BaseModel(ASRInterface, nn.Layer): [len(hyp[0]) for hyp in hyps], place=device, dtype=paddle.long) # (beam_size,) hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - logger.info( + logger.debug( f"hyps pad: {hyps_pad} {self.sos} {self.eos} {self.ignore_id}") hyps_lens = hyps_lens + 1 # Add at begining @@ -590,7 +590,7 @@ class U2BaseModel(ASRInterface, nn.Layer): # last decoder output token is `eos`, for laste decoder input token. 
score += decoder_out[i][len(hyp[0])][self.eos] - logger.info(f"hyp {i} len {len(hyp[0])} l2r score: {score} ctc_score: {hyp[1]} reverse_weight: {reverse_weight}") + logger.debug(f"hyp {i} len {len(hyp[0])} l2r score: {score} ctc_score: {hyp[1]} reverse_weight: {reverse_weight}") if reverse_weight > 0: r_score = 0.0 @@ -598,7 +598,7 @@ class U2BaseModel(ASRInterface, nn.Layer): r_score += r_decoder_out[i][len(hyp[0]) - j - 1][w] r_score += r_decoder_out[i][len(hyp[0])][self.eos] - logger.info(f"hyp {i} len {len(hyp[0])} r2l score: {score} ctc_score: {hyp[1]} reverse_weight: {reverse_weight}") + logger.info(f"hyp {i} len {len(hyp[0])} r2l score: {r_score} ctc_score: {hyp[1]} reverse_weight: {reverse_weight}") score = score * (1 - reverse_weight) + r_score * reverse_weight @@ -608,7 +608,7 @@ class U2BaseModel(ASRInterface, nn.Layer): best_score = score best_index = i - logger.info(f"result: {hyps[best_index]}") + logger.debug(f"result: {hyps[best_index]}") return hyps[best_index][0] @jit.to_static(property=True) From d25871a7b090fc76f7c1780eb3bf2fabb606aa14 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 21 Sep 2022 09:18:48 +0000 Subject: [PATCH 037/113] format --- paddlespeech/s2t/models/u2/u2.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 2279812ba..93c5d9106 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -571,8 +571,8 @@ class U2BaseModel(ASRInterface, nn.Layer): # ctc score in ln domain # (beam_size, max_hyps_len, vocab_size) - decoder_out, r_decoder_out = self.forward_attention_decoder(hyps_pad, hyps_lens, - encoder_out, reverse_weight) + decoder_out, r_decoder_out = self.forward_attention_decoder( + hyps_pad, hyps_lens, encoder_out, reverse_weight) decoder_out = decoder_out.numpy() # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a @@ -590,7 +590,9 @@ class U2BaseModel(ASRInterface, nn.Layer): # last decoder output token is `eos`, for laste decoder input token. 
score += decoder_out[i][len(hyp[0])][self.eos] - logger.debug(f"hyp {i} len {len(hyp[0])} l2r score: {score} ctc_score: {hyp[1]} reverse_weight: {reverse_weight}") + logger.debug( + f"hyp {i} len {len(hyp[0])} l2r score: {score} ctc_score: {hyp[1]} reverse_weight: {reverse_weight}" + ) if reverse_weight > 0: r_score = 0.0 @@ -598,7 +600,9 @@ class U2BaseModel(ASRInterface, nn.Layer): r_score += r_decoder_out[i][len(hyp[0]) - j - 1][w] r_score += r_decoder_out[i][len(hyp[0])][self.eos] - logger.info(f"hyp {i} len {len(hyp[0])} r2l score: {r_score} ctc_score: {hyp[1]} reverse_weight: {reverse_weight}") + logger.info( + f"hyp {i} len {len(hyp[0])} r2l score: {r_score} ctc_score: {hyp[1]} reverse_weight: {reverse_weight}" + ) score = score * (1 - reverse_weight) + r_score * reverse_weight @@ -702,12 +706,11 @@ class U2BaseModel(ASRInterface, nn.Layer): return self.ctc.log_softmax(xs) # @jit.to_static - def forward_attention_decoder( - self, - hyps: paddle.Tensor, - hyps_lens: paddle.Tensor, - encoder_out: paddle.Tensor, - reverse_weight: float=0.0) -> paddle.Tensor: + def forward_attention_decoder(self, + hyps: paddle.Tensor, + hyps_lens: paddle.Tensor, + encoder_out: paddle.Tensor, + reverse_weight: float=0.0) -> paddle.Tensor: """ Export interface for c++ call, forward decoder with multiple hypothesis from ctc prefix beam search and one encoder output Args: From 7382050e21990ae2b4dac0cd86a6dbac4d84d485 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 21 Sep 2022 11:15:00 +0000 Subject: [PATCH 038/113] fix bug on win --- paddlespeech/audio/utils/tensor_utils.py | 5 +++-- paddlespeech/s2t/models/u2/u2.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/paddlespeech/audio/utils/tensor_utils.py b/paddlespeech/audio/utils/tensor_utils.py index 44dcb52ec..b2436a121 100644 --- a/paddlespeech/audio/utils/tensor_utils.py +++ b/paddlespeech/audio/utils/tensor_utils.py @@ -237,7 +237,7 @@ def st_reverse_pad_list(ys_pad: paddle.Tensor, # >>> r_hyps = reverse_pad_list(r_hyps, r_hyps_lens, float(self.ignore_id)) # >>> r_hyps, _ = add_sos_eos(r_hyps, self.sos, self.eos, self.ignore_id) B = ys_pad.shape[0] - _sos = paddle.ones([B, 1], dtype=ys_pad.dtype) * sos + _sos = paddle.full([B, 1], sos, dtype=ys_pad.dtype) max_len = paddle.max(ys_lens) index_range = paddle.arange(0, max_len, 1) seq_len_expand = ys_lens.unsqueeze(1) @@ -279,7 +279,8 @@ def st_reverse_pad_list(ys_pad: paddle.Tensor, # >>> tensor([[3, 2, 1], # >>> [4, 8, 9], # >>> [2, 2, 2]]) - r_hyps = paddle.where(seq_mask, r_hyps, eos) + _eos = paddle.full([1], eos, dtype=r_hyps.dtype) + r_hyps = paddle.where(seq_mask, r_hyps, _eos) # >>> r_hyps # >>> tensor([[3, 2, 1], # >>> [4, 8, 9], diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 93c5d9106..207e470a6 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -600,7 +600,7 @@ class U2BaseModel(ASRInterface, nn.Layer): r_score += r_decoder_out[i][len(hyp[0]) - j - 1][w] r_score += r_decoder_out[i][len(hyp[0])][self.eos] - logger.info( + logger.debug( f"hyp {i} len {len(hyp[0])} r2l score: {r_score} ctc_score: {hyp[1]} reverse_weight: {reverse_weight}" ) From b7388ce25afc6da37b6011405141c0c9eb2ee99f Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 22 Sep 2022 11:42:20 +0000 Subject: [PATCH 039/113] eliminate useless unsqueese --- paddlespeech/s2t/modules/embedding.py | 7 +++---- paddlespeech/s2t/modules/encoder.py | 6 +++--- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git 
a/paddlespeech/s2t/modules/embedding.py b/paddlespeech/s2t/modules/embedding.py index 3aeebd29b..54324c2f6 100644 --- a/paddlespeech/s2t/modules/embedding.py +++ b/paddlespeech/s2t/modules/embedding.py @@ -89,7 +89,7 @@ class PositionalEncoding(nn.Layer, PositionalEncodingInterface): self.max_len = max_len self.xscale = paddle.to_tensor(math.sqrt(self.d_model)) self.dropout = nn.Dropout(p=dropout_rate) - self.pe = paddle.zeros([self.max_len, self.d_model]) #[T,D] + self.pe = paddle.zeros([1, self.max_len, self.d_model]) #[B=1,T,D] position = paddle.arange( 0, self.max_len, dtype=paddle.float32).unsqueeze(1) #[T, 1] @@ -97,9 +97,8 @@ class PositionalEncoding(nn.Layer, PositionalEncodingInterface): paddle.arange(0, self.d_model, 2, dtype=paddle.float32) * -(math.log(10000.0) / self.d_model)) - self.pe[:, 0::2] = paddle.sin(position * div_term) - self.pe[:, 1::2] = paddle.cos(position * div_term) - self.pe = self.pe.unsqueeze(0) #[1, T, D] + self.pe[:, :, 0::2] = paddle.sin(position * div_term) + self.pe[:, :, 1::2] = paddle.cos(position * div_term) def forward(self, x: paddle.Tensor, offset: int=0) -> Tuple[paddle.Tensor, paddle.Tensor]: diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py index 87b83ef55..2e76ccb05 100644 --- a/paddlespeech/s2t/modules/encoder.py +++ b/paddlespeech/s2t/modules/encoder.py @@ -264,15 +264,15 @@ class BaseEncoder(nn.Layer): # new_att_cache = (1, head, attention_key_size, d_k*2) # new_cnn_cache = (B=1, hidden-dim, cache_t2) r_att_cache.append(new_att_cache[:, :, next_cache_start:, :]) - r_cnn_cache.append(new_cnn_cache.unsqueeze(0)) # add elayer dim + r_cnn_cache.append(new_cnn_cache) # add elayer dim if self.normalize_before: xs = self.after_norm(xs) # r_att_cache (elayers, head, T, d_k*2) - # r_cnn_cache (elayers, B=1, hidden-dim, cache_t2) + # r_cnn_cache (elayers, B=1, hidden-dim, cache_t2) r_att_cache = paddle.concat(r_att_cache, axis=0) - r_cnn_cache = paddle.concat(r_cnn_cache, axis=0) + r_cnn_cache = paddle.stack(r_cnn_cache, axis=0) return xs, r_att_cache, r_cnn_cache def forward_chunk_by_chunk( From c4a5ae382524cc1461f172e8659ef39b8a310081 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 26 Sep 2022 08:34:45 +0000 Subject: [PATCH 040/113] eliminate mul --- paddlespeech/audio/utils/tensor_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddlespeech/audio/utils/tensor_utils.py b/paddlespeech/audio/utils/tensor_utils.py index b2436a121..93883c94d 100644 --- a/paddlespeech/audio/utils/tensor_utils.py +++ b/paddlespeech/audio/utils/tensor_utils.py @@ -152,8 +152,8 @@ def add_sos_eos(ys_pad: paddle.Tensor, sos: int, eos: int, # return pad_sequence(ys_in, padding_value=eos).transpose([1,0]), pad_sequence(ys_out, padding_value=ignore_id).transpose([1,0]) B = ys_pad.shape[0] - _sos = paddle.ones([B, 1], dtype=ys_pad.dtype) * sos - _eos = paddle.ones([B, 1], dtype=ys_pad.dtype) * eos + _sos = paddle.full([B, 1], sos, dtype=ys_pad.dtype) + _eos = paddle.full([B, 1], eos, dtype=ys_pad.dtype) ys_in = paddle.cat([_sos, ys_pad], dim=1) mask_pad = (ys_in == ignore_id) ys_in = ys_in.masked_fill(mask_pad, eos) From e65622548d8ca1320607fa5bfc1c28afbbeed73d Mon Sep 17 00:00:00 2001 From: Ming Date: Mon, 26 Sep 2022 16:48:06 +0800 Subject: [PATCH 041/113] update readme and fixed bug in ngpu (#2451) * update readme and fixed ngpu bug * update png in readme * update readme and FT web ttsText --- demos/speech_web/README.md | 38 ++- .../speech_web/speech_server/src/ernie_sat.py | 5 +- 
.../speech_web/speech_server/src/finetune.py | 8 +- .../speech_server/src/ge2e_clone.py | 11 +- .../speech_server/src/tdnn_clone.py | 10 +- demos/speech_web/speech_server/src/util.py | 9 + demos/speech_web/speech_server/vc.py | 11 +- .../web_client/src/components/Experience.vue | 2 +- .../SubMenu/ASR/RealTime/RealTime.vue | 5 +- .../src/components/SubMenu/ChatBot/Chat.vue | 298 ------------------ .../src/components/SubMenu/ChatBot/ChatT.vue | 4 + .../SubMenu/ENIRE_SAT/ENIRE_SAT.vue | 2 +- .../components/SubMenu/FineTune/FineTune.vue | 4 +- .../src/components/SubMenu/IE/IE.vue | 125 -------- .../src/components/SubMenu/TTS/TTST.vue | 4 + .../src/components/SubMenu/VPR/VPR.vue | 178 ----------- .../src/components/SubMenu/VPR/VPRT.vue | 7 +- .../SubMenu/VoiceClone/VoiceClone.vue | 9 +- 18 files changed, 95 insertions(+), 635 deletions(-) delete mode 100644 demos/speech_web/web_client/src/components/SubMenu/ChatBot/Chat.vue delete mode 100644 demos/speech_web/web_client/src/components/SubMenu/IE/IE.vue delete mode 100644 demos/speech_web/web_client/src/components/SubMenu/VPR/VPR.vue diff --git a/demos/speech_web/README.md b/demos/speech_web/README.md index e8c59ea8b..89d22382a 100644 --- a/demos/speech_web/README.md +++ b/demos/speech_web/README.md @@ -28,7 +28,7 @@ Paddle Speech Demo 是一个以 PaddleSpeech 的语音交互功能为主体开 运行效果: - ![效果](https://user-images.githubusercontent.com/30135920/191188766-12e7ca15-f7b4-45f8-9da5-0c0b0bbe5fcb.png) + ![效果](https://user-images.githubusercontent.com/30135920/192155349-9ef93d20-730b-413d-8d50-412fedf11d4b.png) @@ -36,6 +36,7 @@ Paddle Speech Demo 是一个以 PaddleSpeech 的语音交互功能为主体开 ### 后端环境安装 ```bash +# 需要先安装 PaddleSpeech cd speech_server pip install -r requirements.txt -i https://mirror.baidu.com/pypi/simple cd ../ @@ -44,6 +45,8 @@ cd ../ ### 前端环境安装 前端依赖 `node.js` ,需要提前安装,确保 `npm` 可用,`npm` 测试版本 `8.3.1`,建议下载[官网](https://nodejs.org/en/)稳定版的 `node.js` +如果因为网络问题,无法下载依赖库,可以参考 FAQ 部分,`npm / yarn 下载速度慢问题` + ```bash # 进入前端目录 cd web_client @@ -70,7 +73,7 @@ mkdir -p source/model cd source/model # 下载IE模型 wget https://bj.bcebos.com/paddlenlp/applications/speech-cmd-analysis/finetune/model_state.pdparams -cd ../../ +cd ../../../ ``` #### 启动后端服务 @@ -84,6 +87,10 @@ python main.py --port 8010 ### 启动 `vc.py` 后端服务 +参照下面的步骤自行配置项目所需环境。 + +Aistudio 在线体验小样本合成后端功能:[【PaddleSpeech进阶】PaddleSpeech小样本合成方案体验](https://aistudio.baidu.com/aistudio/projectdetail/4573549?sUid=2470186&shared=1&ts=1664174385948) + #### 下载相关模型和音频 ```bash @@ -172,8 +179,19 @@ cd web_client yarn dev --port 8011 ``` -默认配置下,前端中配置的后台地址信息是 localhost,确保后端服务器和打开页面的游览器在同一台机器上,不在一台机器的配置方式见下方的 FAQ:【后端如果部署在其它机器或者别的端口如何修改】 +默认配置下,前端配置的后台地址信息是 `localhost`,确保后端服务器和打开页面的游览器在同一台机器上,不在一台机器的配置方式见下方的 FAQ:【后端如果部署在其它机器或者别的端口如何修改】 + +#### 关于前端的一些说明 + +为了方便后期的维护,这里并没有给出打包好的 HTML 文件,而是 Vue3 的项目,使用 `yarn dev --port 8011` 的方式启动测试,方便大家debug,相当于是启动了一个前端服务器。 + +比如我们在本机启动的这个前端服务(运行 `yarn dev --port 8011` ),我们就可以通过在游览器中通过 `http://localhost:8011` 访问前端页面 + +如果我们在其它服务器上(例如:`*.*.*.*` )启动这个前端服务(运行 `yarn dev --port 8011` ),我们就可以通过在游览器中访问 `http://*.*.*.*:8011` 访问前端页面 +那前端跟后端是什么关系呢? 
两个是独立的,只要前端能够通过代理访问到后端的接口,那就没有问题。你可以在 A 机器上部署后端服务,然后在 B 机器上部署前端服务。我们在 `./web_client/vite.config.js` 中将 `/api` 映射到的是 `http://localhost:8010`,你可以把它配置成任意你想要访问后端地址。 + +当前端在以 `*.*.*.*` 这类以 IP 地址形式的网页中访问时,由于游览器的安全限制,会禁止录音,需要重新配置游览器的安全策略, 可以看下面 FAQ 部分: [【前端以IP地址的形式访问,无法录音】] ## FAQ @@ -210,12 +228,24 @@ ASR_SOCKET_RECORD: 'ws://localhost:8010/ws/asr/onlineStream', // Stream ASR 接 TTS_SOCKET_RECORD: 'ws://localhost:8010/ws/tts/online', // Stream TTS 接口 ``` -#### Q:后端以IP地址的形式,前端无法录音 +#### Q:前端以IP地址的形式访问,无法录音 A:这里主要是游览器安全策略的限制,需要配置游览器后重启。游览器修改配置可参考[使用js-audio-recorder报浏览器不支持getUserMedia](https://blog.csdn.net/YRY_LIKE_YOU/article/details/113745273) chrome设置地址: chrome://flags/#unsafely-treat-insecure-origin-as-secure +#### Q: npm / yarn 配置淘宝镜像源 + +A: 配置淘宝镜像源,详细可以参考 [【yarn npm 设置淘宝镜像】](https://www.jianshu.com/p/f6f43e8f9d6b) + +```bash +# npm 配置淘宝镜像源 +npm config set registry https://registry.npmmirror.com + +# yarn 配置淘宝镜像源 +yarn config set registry http://registry.npm.taobao.org/ +``` + ## 参考资料 vue实现录音参考资料:https://blog.csdn.net/qq_41619796/article/details/107865602#t1 diff --git a/demos/speech_web/speech_server/src/ernie_sat.py b/demos/speech_web/speech_server/src/ernie_sat.py index b74dd8e3f..02e1ed9d9 100644 --- a/demos/speech_web/speech_server/src/ernie_sat.py +++ b/demos/speech_web/speech_server/src/ernie_sat.py @@ -1,5 +1,6 @@ import os +from .util import get_ngpu from .util import MAIN_ROOT from .util import run_cmd @@ -171,6 +172,7 @@ class SAT: output_name: str, source_lang: str, target_lang: str): + ngpu = get_ngpu() cmd = f""" FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ @@ -189,7 +191,8 @@ class SAT: --voc_config={voc_config} \ --voc_ckpt={voc_ckpt} \ --voc_stat={voc_stat} \ - --output_name={output_name} + --output_name={output_name} \ + --ngpu={ngpu} """ return cmd diff --git a/demos/speech_web/speech_server/src/finetune.py b/demos/speech_web/speech_server/src/finetune.py index d7a440f9a..6ca99251b 100644 --- a/demos/speech_web/speech_server/src/finetune.py +++ b/demos/speech_web/speech_server/src/finetune.py @@ -1,5 +1,6 @@ import os +from .util import get_ngpu from .util import MAIN_ROOT from .util import run_cmd @@ -38,7 +39,7 @@ class FineTune: dump_dir = os.path.join(exp_dir, 'dump') output_dir = os.path.join(exp_dir, 'exp') lang = "zh" - ngpu = 1 + ngpu = get_ngpu() cmd = f""" # check oov @@ -91,7 +92,7 @@ class FineTune: output_dir = os.path.join(exp_dir, 'exp') text_path = os.path.join(exp_dir, 'sentences.txt') lang = "zh" - ngpu = 1 + ngpu = get_ngpu() model_path = f"{output_dir}/checkpoints" ckpt = find_max_ckpt(model_path) @@ -117,7 +118,8 @@ class FineTune: --output_dir={out_wav_dir} \ --phones_dict={dump_dir}/phone_id_map.txt \ --speaker_dict={dump_dir}/speaker_id_map.txt \ - --spk_id=0 + --spk_id=0 \ + --ngpu={ngpu} """ out_path = os.path.join(out_wav_dir, f"{wav_name}.wav") diff --git a/demos/speech_web/speech_server/src/ge2e_clone.py b/demos/speech_web/speech_server/src/ge2e_clone.py index d90013b98..83c2b3f35 100644 --- a/demos/speech_web/speech_server/src/ge2e_clone.py +++ b/demos/speech_web/speech_server/src/ge2e_clone.py @@ -1,6 +1,7 @@ import os import shutil +from .util import get_ngpu from .util import MAIN_ROOT from .util import run_cmd @@ -30,11 +31,12 @@ class VoiceCloneGE2E(): ref_audio_dir = os.path.realpath("tmp_dir/ge2e") if os.path.exists(ref_audio_dir): shutil.rmtree(ref_audio_dir) - else: - os.makedirs(ref_audio_dir, exist_ok=True) - shutil.copy(input_wav, ref_audio_dir) + + os.makedirs(ref_audio_dir, exist_ok=True) + 
shutil.copy(input_wav, ref_audio_dir) output_dir = os.path.dirname(out_wav) + ngpu = get_ngpu() cmd = f""" python3 {self.BIN_DIR}/voice_cloning.py \ @@ -50,7 +52,8 @@ class VoiceCloneGE2E(): --text="{text}" \ --input-dir={ref_audio_dir} \ --output-dir={output_dir} \ - --phones-dict={self.phones_dict} + --phones-dict={self.phones_dict} \ + --ngpu={ngpu} """ output_name = os.path.join(output_dir, full_file_name) diff --git a/demos/speech_web/speech_server/src/tdnn_clone.py b/demos/speech_web/speech_server/src/tdnn_clone.py index c24b9b077..53c5a3816 100644 --- a/demos/speech_web/speech_server/src/tdnn_clone.py +++ b/demos/speech_web/speech_server/src/tdnn_clone.py @@ -1,6 +1,7 @@ import os import shutil +from .util import get_ngpu from .util import MAIN_ROOT from .util import run_cmd @@ -27,11 +28,11 @@ class VoiceCloneTDNN(): ref_audio_dir = os.path.realpath("tmp_dir/tdnn") if os.path.exists(ref_audio_dir): shutil.rmtree(ref_audio_dir) - else: - os.makedirs(ref_audio_dir, exist_ok=True) - shutil.copy(input_wav, ref_audio_dir) + os.makedirs(ref_audio_dir, exist_ok=True) + shutil.copy(input_wav, ref_audio_dir) output_dir = os.path.dirname(out_wav) + ngpu = get_ngpu() cmd = f""" python3 {self.BIN_DIR}/voice_cloning.py \ @@ -47,7 +48,8 @@ class VoiceCloneTDNN(): --input-dir={ref_audio_dir} \ --output-dir={output_dir} \ --phones-dict={self.phones_dict} \ - --use_ecapa=True + --use_ecapa=True \ + --ngpu={ngpu} """ output_name = os.path.join(output_dir, full_file_name) diff --git a/demos/speech_web/speech_server/src/util.py b/demos/speech_web/speech_server/src/util.py index a69e6c42f..0188f0280 100644 --- a/demos/speech_web/speech_server/src/util.py +++ b/demos/speech_web/speech_server/src/util.py @@ -2,10 +2,19 @@ import os import random import subprocess +import paddle + NOW_FILE_PATH = os.path.dirname(__file__) MAIN_ROOT = os.path.realpath(os.path.join(NOW_FILE_PATH, "../../../../")) +def get_ngpu(): + if paddle.device.get_device() == "cpu": + return 0 + else: + return 1 + + def randName(n=5): return "".join(random.sample('zyxwvutsrqponmlkjihgfedcba', n)) diff --git a/demos/speech_web/speech_server/vc.py b/demos/speech_web/speech_server/vc.py index 99e56b404..d035c02a4 100644 --- a/demos/speech_web/speech_server/vc.py +++ b/demos/speech_web/speech_server/vc.py @@ -281,15 +281,18 @@ async def VcCloneG2P(base: VcBaseText): if base.func == 'ge2e': wavName = base.wavName wavPath = os.path.join(VC_OUT_PATH, wavName) - vc_model.vc( + wavPath = vc_model.vc( text=base.text, input_wav=base.wavPath, out_wav=wavPath) else: wavName = base.wavName wavPath = os.path.join(VC_OUT_PATH, wavName) - vc_model_tdnn.vc( + wavPath = vc_model_tdnn.vc( text=base.text, input_wav=base.wavPath, out_wav=wavPath) - res = {"wavName": wavName, "wavPath": wavPath} - return SuccessRequest(result=res) + if wavPath: + res = {"wavName": wavName, "wavPath": wavPath} + return SuccessRequest(result=res) + else: + return ErrorRequest(message="克隆失败,检查克隆脚本是否有效") except Exception as e: print(e) return ErrorRequest(message="克隆失败,合成过程报错") diff --git a/demos/speech_web/web_client/src/components/Experience.vue b/demos/speech_web/web_client/src/components/Experience.vue index 4f32faf95..f593c0c14 100644 --- a/demos/speech_web/web_client/src/components/Experience.vue +++ b/demos/speech_web/web_client/src/components/Experience.vue @@ -47,7 +47,7 @@ import FineTuneT from './SubMenu/FineTune/FineTune.vue' - + diff --git a/demos/speech_web/web_client/src/components/SubMenu/ASR/RealTime/RealTime.vue 
b/demos/speech_web/web_client/src/components/SubMenu/ASR/RealTime/RealTime.vue index 761a5c11f..5494bb8f8 100644 --- a/demos/speech_web/web_client/src/components/SubMenu/ASR/RealTime/RealTime.vue +++ b/demos/speech_web/web_client/src/components/SubMenu/ASR/RealTime/RealTime.vue @@ -58,9 +58,6 @@ export default { mounted () { this.wsUrl = apiURL.ASR_SOCKET_RECORD this.ws = new WebSocket(this.wsUrl) - if(this.ws.readyState === this.ws.CONNECTING){ - this.$message.success("实时识别 Websocket 连接成功") - } var _that = this this.ws.addEventListener('message', function (event) { var temp = JSON.parse(event.data); @@ -78,7 +75,7 @@ export default { // 检查 websocket 状态 // debugger if(this.ws.readyState != this.ws.OPEN){ - this.$message.error("websocket 链接失败,请检查链接地址是否正确") + this.$message.error("websocket 链接失败,请检查 Websocket 后端服务是否正确开启") return } diff --git a/demos/speech_web/web_client/src/components/SubMenu/ChatBot/Chat.vue b/demos/speech_web/web_client/src/components/SubMenu/ChatBot/Chat.vue deleted file mode 100644 index 9d356fc80..000000000 --- a/demos/speech_web/web_client/src/components/SubMenu/ChatBot/Chat.vue +++ /dev/null @@ -1,298 +0,0 @@ - - - - - \ No newline at end of file diff --git a/demos/speech_web/web_client/src/components/SubMenu/ChatBot/ChatT.vue b/demos/speech_web/web_client/src/components/SubMenu/ChatBot/ChatT.vue index c37c083ff..6db847706 100644 --- a/demos/speech_web/web_client/src/components/SubMenu/ChatBot/ChatT.vue +++ b/demos/speech_web/web_client/src/components/SubMenu/ChatBot/ChatT.vue @@ -91,6 +91,10 @@ export default { methods: { // 开始录音 startRecorder(){ + if(this.ws.readyState != this.ws.OPEN){ + this.$message.error("websocket 链接失败,请检查 Websocket 后端服务是否正确开启") + return + } this.allResultList = [] if(!this.onReco){ this.asrResult = this.speakingText diff --git a/demos/speech_web/web_client/src/components/SubMenu/ENIRE_SAT/ENIRE_SAT.vue b/demos/speech_web/web_client/src/components/SubMenu/ENIRE_SAT/ENIRE_SAT.vue index e1a4f2343..4a0aa2c63 100644 --- a/demos/speech_web/web_client/src/components/SubMenu/ENIRE_SAT/ENIRE_SAT.vue +++ b/demos/speech_web/web_client/src/components/SubMenu/ENIRE_SAT/ENIRE_SAT.vue @@ -98,7 +98,7 @@ 播放 - 播放 + 播放 下载 下载 diff --git a/demos/speech_web/web_client/src/components/SubMenu/FineTune/FineTune.vue b/demos/speech_web/web_client/src/components/SubMenu/FineTune/FineTune.vue index 895dd586d..abf203ae8 100644 --- a/demos/speech_web/web_client/src/components/SubMenu/FineTune/FineTune.vue +++ b/demos/speech_web/web_client/src/components/SubMenu/FineTune/FineTune.vue @@ -80,7 +80,7 @@ - 播放 + 播放 播放 下载 下载 @@ -126,7 +126,7 @@ expPath: '', wav: '', wav_base64: '', - ttsText: '', + ttsText: '欢迎使用飞桨语音套件', cloneWav: '', onEnrollRec: 0, // 录音状态 diff --git a/demos/speech_web/web_client/src/components/SubMenu/IE/IE.vue b/demos/speech_web/web_client/src/components/SubMenu/IE/IE.vue deleted file mode 100644 index c7dd04e9d..000000000 --- a/demos/speech_web/web_client/src/components/SubMenu/IE/IE.vue +++ /dev/null @@ -1,125 +0,0 @@ - - - - - \ No newline at end of file diff --git a/demos/speech_web/web_client/src/components/SubMenu/TTS/TTST.vue b/demos/speech_web/web_client/src/components/SubMenu/TTS/TTST.vue index 353221f7b..ef5591783 100644 --- a/demos/speech_web/web_client/src/components/SubMenu/TTS/TTST.vue +++ b/demos/speech_web/web_client/src/components/SubMenu/TTS/TTST.vue @@ -228,6 +228,10 @@ export default { }, // 基于WS的流式合成 async getTtsChunkWavWS(){ + if(this.ws.readyState != this.ws.OPEN){ + this.$message.error("websocket 链接失败,请检查 Websocket 后端服务是否正确开启") + 
return + } // 初始化 chunks chunks = [] chunk_index = 0 diff --git a/demos/speech_web/web_client/src/components/SubMenu/VPR/VPR.vue b/demos/speech_web/web_client/src/components/SubMenu/VPR/VPR.vue deleted file mode 100644 index 1fe71e4d8..000000000 --- a/demos/speech_web/web_client/src/components/SubMenu/VPR/VPR.vue +++ /dev/null @@ -1,178 +0,0 @@ - - - - - \ No newline at end of file diff --git a/demos/speech_web/web_client/src/components/SubMenu/VPR/VPRT.vue b/demos/speech_web/web_client/src/components/SubMenu/VPR/VPRT.vue index e398da00c..47eb41df5 100644 --- a/demos/speech_web/web_client/src/components/SubMenu/VPR/VPRT.vue +++ b/demos/speech_web/web_client/src/components/SubMenu/VPR/VPRT.vue @@ -214,14 +214,17 @@ export default { let formData = new FormData() formData.append('spk_id', this.enrollSpkId) formData.append('audio', this.wav) - + const result = await vprEnroll(formData) + if (!result){ + this.$message.error("请检查后端服务是否正确开启") + return + } if(result.data.status){ this.$message.success("声纹注册成功") } else { this.$message.error(result.data.msg) } - // console.log(result) this.GetList() this.wav = '' this.randomSpkId() diff --git a/demos/speech_web/web_client/src/components/SubMenu/VoiceClone/VoiceClone.vue b/demos/speech_web/web_client/src/components/SubMenu/VoiceClone/VoiceClone.vue index 1e380d288..afa572417 100644 --- a/demos/speech_web/web_client/src/components/SubMenu/VoiceClone/VoiceClone.vue +++ b/demos/speech_web/web_client/src/components/SubMenu/VoiceClone/VoiceClone.vue @@ -71,7 +71,7 @@ - 播放 + 播放 播放 下载 下载 @@ -270,6 +270,7 @@ export default { } else if (this.nowIndex >= this.vcDatas.length){ return this.$message.error("当前序号不可以超过音频个数") } + this.cloneWav = "" let func = '' if(this.func_radio === '1'){ func = 'ge2e' @@ -289,12 +290,12 @@ export default { } ); this.g2pOnSys = 0 - if(!result.data.code){ + if(result.data.code == 0){ this.cloneWav = result.data.result console.log("clone wav: ", this.cloneWav) - this.$message.success("音色克隆成功") + this.$message.success("音频合成成功") } else { - this.$message.error(result.data.msg) + this.$message.error("音频合成失败,请检查后台错误后重试!") } }, // 播放表格 From 8e7a315e00806f54d320136467b9104d802bdc78 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 26 Sep 2022 11:02:28 +0000 Subject: [PATCH 042/113] remove comment --- paddlespeech/s2t/__init__.py | 1 - paddlespeech/s2t/exps/u2/bin/test.py | 2 -- paddlespeech/s2t/exps/u2_st/bin/test.py | 2 -- 3 files changed, 5 deletions(-) diff --git a/paddlespeech/s2t/__init__.py b/paddlespeech/s2t/__init__.py index 5fe2e16b9..3c704b272 100644 --- a/paddlespeech/s2t/__init__.py +++ b/paddlespeech/s2t/__init__.py @@ -22,7 +22,6 @@ from paddle.nn import functional as F from paddlespeech.s2t.utils.log import Log -#TODO(Hui Zhang): remove fluid import logger = Log(__name__).getlog() ########### hack logging ############# diff --git a/paddlespeech/s2t/exps/u2/bin/test.py b/paddlespeech/s2t/exps/u2/bin/test.py index f14d804f1..b13fd0d3f 100644 --- a/paddlespeech/s2t/exps/u2/bin/test.py +++ b/paddlespeech/s2t/exps/u2/bin/test.py @@ -20,8 +20,6 @@ from paddlespeech.s2t.exps.u2.model import U2Tester as Tester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments -# TODO(hui zhang): dynamic load - def main_sp(config, args): exp = Tester(config, args) diff --git a/paddlespeech/s2t/exps/u2_st/bin/test.py b/paddlespeech/s2t/exps/u2_st/bin/test.py index 1d70a3103..c07c95bd5 100644 --- a/paddlespeech/s2t/exps/u2_st/bin/test.py +++ 
b/paddlespeech/s2t/exps/u2_st/bin/test.py @@ -20,8 +20,6 @@ from paddlespeech.s2t.exps.u2_st.model import U2STTester as Tester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments -# TODO(hui zhang): dynamic load - def main_sp(config, args): exp = Tester(config, args) From 6de81d74d9b00c0ec4e6163d9b74bbba5ac20ff0 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 26 Sep 2022 11:06:17 +0000 Subject: [PATCH 043/113] elimiete cast dtype for bool op --- paddlespeech/s2t/models/u2/u2.py | 12 +++--------- paddlespeech/s2t/models/u2_st/u2_st.py | 5 +---- paddlespeech/s2t/modules/decoder.py | 8 ++------ paddlespeech/s2t/modules/encoder.py | 26 +++++++------------------- paddlespeech/s2t/modules/mask.py | 9 ++------- paddlespeech/s2t/utils/tensor_utils.py | 11 +++-------- 6 files changed, 18 insertions(+), 53 deletions(-) diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 207e470a6..c25c2186d 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -124,10 +124,7 @@ class U2BaseModel(ASRInterface, nn.Layer): encoder_out, encoder_mask = self.encoder(speech, speech_lengths) encoder_time = time.time() - start #logger.debug(f"encoder time: {encoder_time}") - #TODO(Hui Zhang): sum not support bool type - #encoder_out_lens = encoder_mask.squeeze(1).sum(1) #[B, 1, T] -> [B] - encoder_out_lens = encoder_mask.squeeze(1).cast(paddle.int64).sum( - 1) #[B, 1, T] -> [B] + encoder_out_lens = encoder_mask.squeeze(1).sum(1) #[B, 1, T] -> [B] # 2a. Attention-decoder branch loss_att = None @@ -291,8 +288,7 @@ class U2BaseModel(ASRInterface, nn.Layer): # 2. Decoder forward step by step for i in range(1, maxlen + 1): # Stop if all batch and all beam produce eos - # TODO(Hui Zhang): if end_flag.sum() == running_size: - if end_flag.cast(paddle.int64).sum() == running_size: + if end_flag.sum() == running_size: break # 2.1 Forward decoder step @@ -378,9 +374,7 @@ class U2BaseModel(ASRInterface, nn.Layer): speech, speech_lengths, decoding_chunk_size, num_decoding_left_chunks, simulate_streaming) maxlen = encoder_out.shape[1] - # (TODO Hui Zhang): bool no support reduce_sum - # encoder_out_lens = encoder_mask.squeeze(1).sum(1) - encoder_out_lens = encoder_mask.squeeze(1).astype(paddle.int).sum(1) + encoder_out_lens = encoder_mask.squeeze(1).sum(1) ctc_probs = self.ctc.log_softmax(encoder_out) # (B, maxlen, vocab_size) topk_prob, topk_index = ctc_probs.topk(1, axis=2) # (B, maxlen, 1) diff --git a/paddlespeech/s2t/models/u2_st/u2_st.py b/paddlespeech/s2t/models/u2_st/u2_st.py index e8b61bc0d..31defbbaf 100644 --- a/paddlespeech/s2t/models/u2_st/u2_st.py +++ b/paddlespeech/s2t/models/u2_st/u2_st.py @@ -111,10 +111,7 @@ class U2STBaseModel(nn.Layer): encoder_out, encoder_mask = self.encoder(speech, speech_lengths) encoder_time = time.time() - start #logger.debug(f"encoder time: {encoder_time}") - #TODO(Hui Zhang): sum not support bool type - #encoder_out_lens = encoder_mask.squeeze(1).sum(1) #[B, 1, T] -> [B] - encoder_out_lens = encoder_mask.squeeze(1).cast(paddle.int64).sum( - 1) #[B, 1, T] -> [B] + encoder_out_lens = encoder_mask.squeeze(1).sum(1) #[B, 1, T] -> [B] # 2a. 
ST-decoder branch start = time.time() diff --git a/paddlespeech/s2t/modules/decoder.py b/paddlespeech/s2t/modules/decoder.py index 03b637b7c..5e1b4c92b 100644 --- a/paddlespeech/s2t/modules/decoder.py +++ b/paddlespeech/s2t/modules/decoder.py @@ -140,9 +140,7 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer): # m: (1, L, L) m = subsequent_mask(tgt_mask.shape[-1]).unsqueeze(0) # tgt_mask: (B, L, L) - # TODO(Hui Zhang): not support & for tensor - # tgt_mask = tgt_mask & m - tgt_mask = tgt_mask.logical_and(m) + tgt_mask = tgt_mask & m x, _ = self.embed(tgt) for layer in self.decoders: @@ -153,9 +151,7 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer): if self.use_output_layer: x = self.output_layer(x) - # TODO(Hui Zhang): reduce_sum not support bool type - # olens = tgt_mask.sum(1) - olens = tgt_mask.astype(paddle.int).sum(1) + olens = tgt_mask.sum(1) return x, paddle.to_tensor(0.0), olens def forward_one_step( diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py index 2e76ccb05..db5848847 100644 --- a/paddlespeech/s2t/modules/encoder.py +++ b/paddlespeech/s2t/modules/encoder.py @@ -164,12 +164,8 @@ class BaseEncoder(nn.Layer): if self.global_cmvn is not None: xs = self.global_cmvn(xs) - #TODO(Hui Zhang): self.embed(xs, masks, offset=0), stride_slice not support bool tensor - xs, pos_emb, masks = self.embed(xs, masks.astype(xs.dtype), offset=0) - #TODO(Hui Zhang): remove mask.astype, stride_slice not support bool tensor - masks = masks.astype(paddle.bool) - #TODO(Hui Zhang): mask_pad = ~masks - mask_pad = masks.logical_not() + xs, pos_emb, masks = self.embed(xs, masks, offset=0) + mask_pad = ~masks chunk_masks = add_optional_chunk_mask( xs, masks, self.use_dynamic_chunk, self.use_dynamic_left_chunk, decoding_chunk_size, self.static_chunk_size, @@ -215,11 +211,8 @@ class BaseEncoder(nn.Layer): same shape as the original cnn_cache """ assert xs.shape[0] == 1 # batch size must be one - # tmp_masks is just for interface compatibility - # TODO(Hui Zhang): stride_slice not support bool tensor - # tmp_masks = paddle.ones([1, paddle.shape(xs)[1]], dtype=paddle.bool) - tmp_masks = paddle.ones([1, xs.shape[1]], dtype=paddle.int32) - tmp_masks = tmp_masks.unsqueeze(1) #[B=1, C=1, T] + # tmp_masks is just for interface compatibility, [B=1, C=1, T] + tmp_masks = paddle.ones([1, 1, xs.shape[1]], dtype=paddle.bool) if self.global_cmvn is not None: xs = self.global_cmvn(xs) @@ -228,9 +221,8 @@ class BaseEncoder(nn.Layer): xs, pos_emb, _ = self.embed(xs, tmp_masks, offset=offset) # after embed, xs=(B=1, chunk_size, hidden-dim) - elayers = paddle.shape(att_cache)[0] - cache_t1 = paddle.shape(att_cache)[2] - chunk_size = paddle.shape(xs)[1] + elayers, _, cache_t1, _ = att_cache.shape + chunk_size = xs.shape[1] attention_key_size = cache_t1 + chunk_size # only used when using `RelPositionMultiHeadedAttention` @@ -402,11 +394,7 @@ class TransformerEncoder(BaseEncoder): if self.global_cmvn is not None: xs = self.global_cmvn(xs) - #TODO(Hui Zhang): self.embed(xs, masks, offset=0), stride_slice not support bool tensor - xs, pos_emb, masks = self.embed(xs, masks.astype(xs.dtype), offset=0) - #TODO(Hui Zhang): remove mask.astype, stride_slice not support bool tensor - masks = masks.astype(paddle.bool) - + xs, pos_emb, masks = self.embed(xs, masks, offset=0) if cache is None: cache = [None for _ in range(len(self.encoders))] new_cache = [] diff --git a/paddlespeech/s2t/modules/mask.py b/paddlespeech/s2t/modules/mask.py index 1f66c015a..787a06528 100644 --- 
a/paddlespeech/s2t/modules/mask.py +++ b/paddlespeech/s2t/modules/mask.py @@ -109,13 +109,8 @@ def subsequent_mask(size: int) -> paddle.Tensor: [1, 1, 1]] """ ret = paddle.ones([size, size], dtype=paddle.bool) - #TODO(Hui Zhang): tril not support bool - #return paddle.tril(ret) - ret = ret.astype(paddle.float) - ret = paddle.tril(ret) - ret = ret.astype(paddle.bool) - return ret - + return paddle.tril(ret) + def subsequent_chunk_mask( size: int, diff --git a/paddlespeech/s2t/utils/tensor_utils.py b/paddlespeech/s2t/utils/tensor_utils.py index 422d4f82a..3ac102f3c 100644 --- a/paddlespeech/s2t/utils/tensor_utils.py +++ b/paddlespeech/s2t/utils/tensor_utils.py @@ -184,13 +184,8 @@ def th_accuracy(pad_outputs: paddle.Tensor, pad_pred = pad_outputs.view(pad_targets.shape[0], pad_targets.shape[1], pad_outputs.shape[1]).argmax(2) mask = pad_targets != ignore_label - #TODO(Hui Zhang): sum not support bool type - # numerator = paddle.sum( - # pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) - numerator = ( + + numerator = paddle.sum( pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) - numerator = paddle.sum(numerator.type_as(pad_targets)) - #TODO(Hui Zhang): sum not support bool type - # denominator = paddle.sum(mask) - denominator = paddle.sum(mask.type_as(pad_targets)) + denominator = paddle.sum(mask) return float(numerator) / float(denominator) From c2c8a662b14b09dc6fc0079ee074ead8d192c549 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 26 Sep 2022 11:07:29 +0000 Subject: [PATCH 044/113] refactor reshape --- paddlespeech/s2t/modules/embedding.py | 5 +---- paddlespeech/s2t/modules/subsampling.py | 4 ++-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/paddlespeech/s2t/modules/embedding.py b/paddlespeech/s2t/modules/embedding.py index 54324c2f6..f41a7b5d4 100644 --- a/paddlespeech/s2t/modules/embedding.py +++ b/paddlespeech/s2t/modules/embedding.py @@ -110,12 +110,10 @@ class PositionalEncoding(nn.Layer, PositionalEncodingInterface): paddle.Tensor: Encoded tensor. Its shape is (batch, time, ...) paddle.Tensor: for compatibility to RelPositionalEncoding, (batch=1, time, ...) 
""" - T = x.shape[1] assert offset + x.shape[ 1] < self.max_len, "offset: {} + x.shape[1]: {} is larger than the max_len: {}".format( offset, x.shape[1], self.max_len) - #TODO(Hui Zhang): using T = paddle.shape(x)[1], __getitem__ not support Tensor - pos_emb = self.pe[:, offset:offset + T] + pos_emb = self.pe[:, offset:offset + x.shape[1]] x = x * self.xscale + pos_emb return self.dropout(x), self.dropout(pos_emb) @@ -164,6 +162,5 @@ class RelPositionalEncoding(PositionalEncoding): 1] < self.max_len, "offset: {} + x.shape[1]: {} is larger than the max_len: {}".format( offset, x.shape[1], self.max_len) x = x * self.xscale - #TODO(Hui Zhang): using paddle.shape(x)[1], __getitem__ not support Tensor pos_emb = self.pe[:, offset:offset + x.shape[1]] return self.dropout(x), self.dropout(pos_emb) diff --git a/paddlespeech/s2t/modules/subsampling.py b/paddlespeech/s2t/modules/subsampling.py index 88451ddd7..2775988a7 100644 --- a/paddlespeech/s2t/modules/subsampling.py +++ b/paddlespeech/s2t/modules/subsampling.py @@ -139,8 +139,8 @@ class Conv2dSubsampling4(Conv2dSubsampling): """ x = x.unsqueeze(1) # (b, c=1, t, f) x = self.conv(x) - b, c, t, f = paddle.shape(x) - x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f])) + b, c, t, f = x.shape + x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, -1, c * f])) x, pos_emb = self.pos_enc(x, offset) return x, pos_emb, x_mask[:, :, :-2:2][:, :, :-2:2] From 3d7ca93861124b27ac390fa5bcaf2b4aef644f86 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 26 Sep 2022 11:07:48 +0000 Subject: [PATCH 045/113] bool type slice --- paddlespeech/s2t/modules/decoder_layer.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/paddlespeech/s2t/modules/decoder_layer.py b/paddlespeech/s2t/modules/decoder_layer.py index 37b124e84..cb7261107 100644 --- a/paddlespeech/s2t/modules/decoder_layer.py +++ b/paddlespeech/s2t/modules/decoder_layer.py @@ -114,10 +114,7 @@ class DecoderLayer(nn.Layer): ], f"{cache.shape} == {[tgt.shape[0], tgt.shape[1] - 1, self.size]}" tgt_q = tgt[:, -1:, :] residual = residual[:, -1:, :] - # TODO(Hui Zhang): slice not support bool type - # tgt_q_mask = tgt_mask[:, -1:, :] - tgt_q_mask = tgt_mask.cast(paddle.int64)[:, -1:, :].cast( - paddle.bool) + tgt_q_mask = tgt_mask[:, -1:, :] if self.concat_after: tgt_concat = paddle.cat( From 63f70c9fd002594ada80c8e39875d5c24a1cdf25 Mon Sep 17 00:00:00 2001 From: liangym <34430015+lym0302@users.noreply.github.com> Date: Mon, 26 Sep 2022 19:09:49 +0800 Subject: [PATCH 046/113] fix finetune batch size (#2457) * fix batch_size, test=tts --- examples/other/tts_finetune/tts3/local/finetune.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/other/tts_finetune/tts3/local/finetune.py b/examples/other/tts_finetune/tts3/local/finetune.py index 496c2355b..814497aaa 100644 --- a/examples/other/tts_finetune/tts3/local/finetune.py +++ b/examples/other/tts_finetune/tts3/local/finetune.py @@ -131,10 +131,10 @@ def train_sp(args, config): converters=converters, ) # collate function and dataloader - + train_batch_size = min(len(train_metadata), config.batch_size) train_sampler = DistributedBatchSampler( train_dataset, - batch_size=config.batch_size, + batch_size=train_batch_size, shuffle=True, drop_last=True) From f9e3eaa024218a5310c24bd504d4468826867bbd Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 26 Sep 2022 11:55:26 +0000 Subject: [PATCH 047/113] transpose in matmul --- paddlespeech/s2t/modules/attention.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 
deletions(-) diff --git a/paddlespeech/s2t/modules/attention.py b/paddlespeech/s2t/modules/attention.py index 2d236743a..c02de15e8 100644 --- a/paddlespeech/s2t/modules/attention.py +++ b/paddlespeech/s2t/modules/attention.py @@ -188,8 +188,9 @@ class MultiHeadedAttention(nn.Layer): # non-trivial to calculate `next_cache_start` here. new_cache = paddle.concat((k, v), axis=-1) - scores = paddle.matmul(q, - k.transpose([0, 1, 3, 2])) / math.sqrt(self.d_k) + # scores = paddle.matmul(q, + # k.transpose([0, 1, 3, 2])) / math.sqrt(self.d_k) + scores = paddle.matmul(q, k, transpose_y=True) / math.sqrt(self.d_k) return self.forward_attention(v, scores, mask), new_cache @@ -309,11 +310,13 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention): # first compute matrix a and matrix c # as described in https://arxiv.org/abs/1901.02860 Section 3.3 # (batch, head, time1, time2) - matrix_ac = paddle.matmul(q_with_bias_u, k.transpose([0, 1, 3, 2])) + # matrix_ac = paddle.matmul(q_with_bias_u, k.transpose([0, 1, 3, 2])) + matrix_ac = paddle.matmul(q_with_bias_u, k, transpose_y=True) # compute matrix b and matrix d # (batch, head, time1, time2) - matrix_bd = paddle.matmul(q_with_bias_v, p.transpose([0, 1, 3, 2])) + # matrix_bd = paddle.matmul(q_with_bias_v, p.transpose([0, 1, 3, 2])) + matrix_bd = paddle.matmul(q_with_bias_v, p, transpose_y=True) # Remove rel_shift since it is useless in speech recognition, # and it requires special attention for streaming. # matrix_bd = self.rel_shift(matrix_bd) From 46088c0a16aa1476c095b80fee551c7df4a8ce71 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 26 Sep 2022 12:19:30 +0000 Subject: [PATCH 048/113] elimiate attn transpose --- paddlespeech/s2t/modules/attention.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/paddlespeech/s2t/modules/attention.py b/paddlespeech/s2t/modules/attention.py index c02de15e8..67bb869ed 100644 --- a/paddlespeech/s2t/modules/attention.py +++ b/paddlespeech/s2t/modules/attention.py @@ -271,7 +271,7 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention): and `head * d_k == size` """ q, k, v = self.forward_qkv(query, key, value) - q = q.transpose([0, 2, 1, 3]) # (batch, time1, head, d_k) + # q = q.transpose([0, 2, 1, 3]) # (batch, time1, head, d_k) # when export onnx model, for 1st chunk, we feed # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) @@ -302,9 +302,11 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention): p = p.transpose([0, 2, 1, 3]) # (batch, head, time1, d_k) # (batch, head, time1, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose([0, 2, 1, 3]) + # q_with_bias_u = (q + self.pos_bias_u).transpose([0, 2, 1, 3]) + q_with_bias_u = q + self.pos_bias_u.unsqueeze(1) # (batch, head, time1, d_k) - q_with_bias_v = (q + self.pos_bias_v).transpose([0, 2, 1, 3]) + # q_with_bias_v = (q + self.pos_bias_v).transpose([0, 2, 1, 3]) + q_with_bias_v = q + self.pos_bias_v.unsqueeze(1) # compute attention score # first compute matrix a and matrix c From 3adb20b468fa40a138316e66d59be12c4d20314e Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 26 Sep 2022 15:50:06 +0000 Subject: [PATCH 049/113] eliminate shape and slice --- paddlespeech/s2t/modules/conformer_convolution.py | 6 +++--- paddlespeech/s2t/modules/decoder.py | 2 +- paddlespeech/s2t/modules/encoder.py | 2 +- paddlespeech/s2t/modules/loss.py | 4 ++-- paddlespeech/s2t/modules/subsampling.py | 7 ++++--- 5 files changed, 11 insertions(+), 10 deletions(-) diff --git a/paddlespeech/s2t/modules/conformer_convolution.py 
b/paddlespeech/s2t/modules/conformer_convolution.py index be6056546..09d903eee 100644 --- a/paddlespeech/s2t/modules/conformer_convolution.py +++ b/paddlespeech/s2t/modules/conformer_convolution.py @@ -127,11 +127,11 @@ class ConvolutionModule(nn.Layer): x = x.transpose([0, 2, 1]) # [B, C, T] # mask batch padding - if paddle.shape(mask_pad)[2] > 0: # time > 0 + if mask_pad.shape[2] > 0: # time > 0 x = x.masked_fill(mask_pad, 0.0) if self.lorder > 0: - if paddle.shape(cache)[2] == 0: # cache_t == 0 + if cache.shape[2] == 0: # cache_t == 0 x = nn.functional.pad( x, [self.lorder, 0], 'constant', 0.0, data_format='NCL') else: @@ -161,7 +161,7 @@ class ConvolutionModule(nn.Layer): x = self.pointwise_conv2(x) # mask batch padding - if paddle.shape(mask_pad)[2] > 0: # time > 0 + if mask_pad.shape[2] > 0: # time > 0 x = x.masked_fill(mask_pad, 0.0) x = x.transpose([0, 2, 1]) # [B, T, C] diff --git a/paddlespeech/s2t/modules/decoder.py b/paddlespeech/s2t/modules/decoder.py index 5e1b4c92b..4ddf057b6 100644 --- a/paddlespeech/s2t/modules/decoder.py +++ b/paddlespeech/s2t/modules/decoder.py @@ -243,7 +243,7 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer): ] # batch decoding - ys_mask = subsequent_mask(paddle.shape(ys)[-1]).unsqueeze(0) # (B,L,L) + ys_mask = subsequent_mask(ys.shape[-1]).unsqueeze(0) # (B,L,L) xs_mask = make_xs_mask(xs).unsqueeze(1) # (B,1,T) logp, states = self.forward_one_step( xs, xs_mask, ys, ys_mask, cache=batch_state) diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py index db5848847..f23d3f140 100644 --- a/paddlespeech/s2t/modules/encoder.py +++ b/paddlespeech/s2t/modules/encoder.py @@ -246,7 +246,7 @@ class BaseEncoder(nn.Layer): # tensor zeros([0,0,0,0]) support [i:i+1] slice, will return zeros([0,0,0,0]) tensor # raw code as below: # att_cache=att_cache[i:i+1] if elayers > 0 else att_cache, - # cnn_cache=cnn_cache[i:i+1] if paddle.shape(cnn_cache)[0] > 0 else cnn_cache, + # cnn_cache=cnn_cache[i:i+1] if cnn_cache.shape[0] > 0 else cnn_cache, xs, _, new_att_cache, new_cnn_cache = layer( xs, att_mask, diff --git a/paddlespeech/s2t/modules/loss.py b/paddlespeech/s2t/modules/loss.py index 884fb70c1..afd5201aa 100644 --- a/paddlespeech/s2t/modules/loss.py +++ b/paddlespeech/s2t/modules/loss.py @@ -85,7 +85,7 @@ class CTCLoss(nn.Layer): Returns: [paddle.Tensor]: scalar. If reduction is 'none', then (N), where N = \text{batch size}. 
""" - B = paddle.shape(logits)[0] + B = logits.shape[0] # warp-ctc need logits, and do softmax on logits by itself # warp-ctc need activation with shape [T, B, V + 1] # logits: (B, L, D) -> (L, B, D) @@ -158,7 +158,7 @@ class LabelSmoothingLoss(nn.Layer): Returns: loss (paddle.Tensor) : The KL loss, scalar float value """ - B, T, D = paddle.shape(x) + B, T, D = x.shape assert D == self.size x = x.reshape((-1, self.size)) target = target.reshape([-1]) diff --git a/paddlespeech/s2t/modules/subsampling.py b/paddlespeech/s2t/modules/subsampling.py index 2775988a7..782a437ee 100644 --- a/paddlespeech/s2t/modules/subsampling.py +++ b/paddlespeech/s2t/modules/subsampling.py @@ -192,8 +192,8 @@ class Conv2dSubsampling6(Conv2dSubsampling): """ x = x.unsqueeze(1) # (b, c, t, f) x = self.conv(x) - b, c, t, f = paddle.shape(x) - x = self.linear(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f])) + b, c, t, f = x.shape + x = self.linear(x.transpose([0, 2, 1, 3]).reshape([b, -1, c * f])) x, pos_emb = self.pos_enc(x, offset) return x, pos_emb, x_mask[:, :, :-2:2][:, :, :-4:3] @@ -245,6 +245,7 @@ class Conv2dSubsampling8(Conv2dSubsampling): """ x = x.unsqueeze(1) # (b, c, t, f) x = self.conv(x) - x = self.linear(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f])) + b, c, t, f = x.shape + x = self.linear(x.transpose([0, 2, 1, 3]).reshape([b, -1, c * f])) x, pos_emb = self.pos_enc(x, offset) return x, pos_emb, x_mask[:, :, :-2:2][:, :, :-2:2][:, :, :-2:2] From feb27e2a8483cacc3c9200805986937bb2cfc6cd Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 26 Sep 2022 15:54:31 +0000 Subject: [PATCH 050/113] fuse linear kv --- paddlespeech/s2t/modules/attention.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/paddlespeech/s2t/modules/attention.py b/paddlespeech/s2t/modules/attention.py index 67bb869ed..2166ca8bf 100644 --- a/paddlespeech/s2t/modules/attention.py +++ b/paddlespeech/s2t/modules/attention.py @@ -20,6 +20,7 @@ from typing import Tuple import paddle from paddle import nn from paddle.nn import initializer as I +from paddle.nn import functional as F from paddlespeech.s2t.modules.align import Linear from paddlespeech.s2t.utils.log import Log @@ -45,6 +46,7 @@ class MultiHeadedAttention(nn.Layer): """ super().__init__() assert n_feat % n_head == 0 + self.n_feat = n_feat # We assume d_v always equals d_k self.d_k = n_feat // n_head self.h = n_head @@ -54,6 +56,15 @@ class MultiHeadedAttention(nn.Layer): self.linear_out = Linear(n_feat, n_feat) self.dropout = nn.Dropout(p=dropout_rate) + + def _build_once(self, *args, **kwargs): + super()._build_once(*args, **kwargs) + # if self.self_att: + # self.linear_kv = Linear(self.n_feat, self.n_feat*2) + self.weight = paddle.concat([self.linear_k.weight, self.linear_v.weight], axis=-1) + self.bias = paddle.concat([self.linear_k.bias, self.linear_v.bias]) + self._built = True + def forward_qkv(self, query: paddle.Tensor, key: paddle.Tensor, @@ -73,9 +84,12 @@ class MultiHeadedAttention(nn.Layer): (#batch, n_head, time2, d_k). 
""" n_batch = query.shape[0] + q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k) - k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k) - v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k) + # k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k) + # v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k) + k, v = F.linear(key, self.weight, self.bias).view(n_batch, -1, 2 * self.h, self.d_k).split(2, axis=2) + q = q.transpose([0, 2, 1, 3]) # (batch, head, time1, d_k) k = k.transpose([0, 2, 1, 3]) # (batch, head, time2, d_k) v = v.transpose([0, 2, 1, 3]) # (batch, head, time2, d_k) @@ -108,10 +122,10 @@ class MultiHeadedAttention(nn.Layer): # When will `if mask.size(2) > 0` be False? # 1. onnx(16/-1, -1/-1, 16/0) # 2. jit (16/-1, -1/-1, 16/0, 16/4) - if paddle.shape(mask)[2] > 0: # time2 > 0 + if mask.shape[2] > 0: # time2 > 0 mask = mask.unsqueeze(1).equal(0) # (batch, 1, *, time2) # for last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :paddle.shape(scores)[-1]] + mask = mask[:, :, :, :scores.shape[-1]] scores = scores.masked_fill(mask, -float('inf')) attn = paddle.softmax( scores, axis=-1).masked_fill(mask, @@ -179,7 +193,7 @@ class MultiHeadedAttention(nn.Layer): # >>> torch.equal(b, c) # True # >>> d = torch.split(a, 2, dim=-1) # >>> torch.equal(d[0], d[1]) # True - if paddle.shape(cache)[0] > 0: + if cache.shape[0] > 0: # last dim `d_k * 2` for (key, val) key_cache, value_cache = paddle.split(cache, 2, axis=-1) k = paddle.concat([key_cache, k], axis=2) @@ -288,7 +302,7 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention): # >>> torch.equal(b, c) # True # >>> d = torch.split(a, 2, dim=-1) # >>> torch.equal(d[0], d[1]) # True - if paddle.shape(cache)[0] > 0: + if cache.shape[0] > 0: # last dim `d_k * 2` for (key, val) key_cache, value_cache = paddle.split(cache, 2, axis=-1) k = paddle.concat([key_cache, k], axis=2) From 0cd01241dbcb03a7407902ab7b9cba91858aff17 Mon Sep 17 00:00:00 2001 From: Zhao Yuting <91456992+THUzyt21@users.noreply.github.com> Date: Tue, 27 Sep 2022 15:33:13 +0800 Subject: [PATCH 051/113] Update test_cli.sh update about text cli --- tests/unit/cli/test_cli.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh index 15604961d..c6837c303 100755 --- a/tests/unit/cli/test_cli.sh +++ b/tests/unit/cli/test_cli.sh @@ -7,7 +7,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav https://paddlespe paddlespeech cls --input ./cat.wav --topk 10 # Punctuation_restoration -paddlespeech text --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭 +paddlespeech text --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭 --model ernie_linear_p3_wudao_fast # Speech_recognition wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav From 9f8fbdbc09807a6b80416e846c3f7e394180df33 Mon Sep 17 00:00:00 2001 From: Zhao Yuting <91456992+THUzyt21@users.noreply.github.com> Date: Tue, 27 Sep 2022 16:09:36 +0800 Subject: [PATCH 052/113] Update punc_application.yaml change model --- demos/streaming_asr_server/conf/punc_application.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demos/streaming_asr_server/conf/punc_application.yaml b/demos/streaming_asr_server/conf/punc_application.yaml index f947525e1..8456e2329 100644 --- a/demos/streaming_asr_server/conf/punc_application.yaml +++ b/demos/streaming_asr_server/conf/punc_application.yaml @@ -22,7 +22,7 @@ engine_list: ['text_python'] ################### 
text task: punc; engine_type: python #######################
 text_python:
     task: punc
-    model_type: 'ernie_linear_p3_wudao'
+    model_type: 'ernie_linear_p3_wudao_fast'
     lang: 'zh'
     sample_rate: 16000
     cfg_path:  # [optional]

From 82f731c1530c6e46470d4497073438ec6ab25d5b Mon Sep 17 00:00:00 2001
From: Zhao Yuting <91456992+THUzyt21@users.noreply.github.com>
Date: Tue, 27 Sep 2022 16:13:11 +0800
Subject: [PATCH 053/113] Update application.yaml

change model
---
 paddlespeech/server/conf/application.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddlespeech/server/conf/application.yaml b/paddlespeech/server/conf/application.yaml
index 55f241ec7..47b8b178f 100644
--- a/paddlespeech/server/conf/application.yaml
+++ b/paddlespeech/server/conf/application.yaml
@@ -142,7 +142,7 @@ cls_inference:
 ################### text task: punc; engine_type: python #######################
 text_python:
     task: punc
-    model_type: 'ernie_linear_p3_wudao'
+    model_type: 'ernie_linear_p3_wudao_fast'
     lang: 'zh'
     sample_rate: 16000
     cfg_path:  # [optional]

From d2da7f50d2982704dfb59184906cca96bff0c95b Mon Sep 17 00:00:00 2001
From: Zhao Yuting <91456992+THUzyt21@users.noreply.github.com>
Date: Tue, 27 Sep 2022 16:27:49 +0800
Subject: [PATCH 054/113] Update text_engine.py

pre-committed already
---
 paddlespeech/server/engine/text/python/text_engine.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/paddlespeech/server/engine/text/python/text_engine.py b/paddlespeech/server/engine/text/python/text_engine.py
index 6167e7784..9f2a48d51 100644
--- a/paddlespeech/server/engine/text/python/text_engine.py
+++ b/paddlespeech/server/engine/text/python/text_engine.py
@@ -107,10 +107,11 @@ class PaddleTextConnectionHandler:
             assert len(tokens) == len(labels)
 
             text = ''
+            print(self._punc_list)
             for t, l in zip(tokens, labels):
                 text += t
                 if l != 0:  # Non punc. 
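+                    # NOTE: label 0 means "no punctuation"; the fast model's
+                    # _punc_list appears to omit that leading placeholder,
+                    # hence the index shift to l - 1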
- text += self._punc_list[l] + text += self._punc_list[l - 1] return text else: @@ -160,7 +161,7 @@ class TextEngine(BaseEngine): return False self.executor = TextServerExecutor() - self.executor._init_from_path( + self.executor._init_from_path_new( task=config.task, model_type=config.model_type, lang=config.lang, From b20bf7d5dee23eef82ef4a810db2eafe8752e6d8 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 27 Sep 2022 08:47:22 +0000 Subject: [PATCH 055/113] masked_fill by multiply, remove while --- paddlespeech/s2t/__init__.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/paddlespeech/s2t/__init__.py b/paddlespeech/s2t/__init__.py index 3c704b272..b67322cdc 100644 --- a/paddlespeech/s2t/__init__.py +++ b/paddlespeech/s2t/__init__.py @@ -166,15 +166,9 @@ def broadcast_shape(shp1, shp2): def masked_fill(xs: paddle.Tensor, mask: paddle.Tensor, value: Union[float, int]): - bshape = broadcast_shape(xs.shape, mask.shape) mask.stop_gradient = True - tmp = paddle.ones(shape=[len(bshape)], dtype='int32') - for index in range(len(bshape)): - tmp[index] = bshape[index] - mask = mask.broadcast_to(tmp) - trues = paddle.ones_like(xs) * value - xs = paddle.where(mask, trues, xs) - return xs + mask = mask.astype(xs.dtype) + return xs * (1.0 - mask) + mask * value if not hasattr(paddle.Tensor, 'masked_fill'): From 83cd15be0c8077139baaaf82db63fb5ad9697c07 Mon Sep 17 00:00:00 2001 From: Zhao Yuting <91456992+THUzyt21@users.noreply.github.com> Date: Tue, 27 Sep 2022 17:19:24 +0800 Subject: [PATCH 056/113] Create ernie-3.0.yaml config file of ernie-3.0-base-zh --- examples/iwslt2012/punc0/conf/ernie-3.0.yaml | 44 ++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 examples/iwslt2012/punc0/conf/ernie-3.0.yaml diff --git a/examples/iwslt2012/punc0/conf/ernie-3.0.yaml b/examples/iwslt2012/punc0/conf/ernie-3.0.yaml new file mode 100644 index 000000000..845b13fd8 --- /dev/null +++ b/examples/iwslt2012/punc0/conf/ernie-3.0.yaml @@ -0,0 +1,44 @@ +########################################################### +# DATA SETTING # +########################################################### +dataset_type: Ernie +train_path: data/iwslt2012_zh/train.txt +dev_path: data/iwslt2012_zh/dev.txt +test_path: data/iwslt2012_zh/test.txt +batch_size: 64 +num_workers: 2 +data_params: + pretrained_token: ernie-3.0-base-zh + punc_path: data/iwslt2012_zh/punc_vocab + seq_len: 100 + + +########################################################### +# MODEL SETTING # +########################################################### +model_type: ErnieLinear +model: + pretrained_token: ernie-3.0-base-zh + num_classes: 4 + +########################################################### +# OPTIMIZER SETTING # +########################################################### +optimizer_params: + weight_decay: 1.0e-6 # weight decay coefficient. + +scheduler_params: + learning_rate: 1.0e-5 # learning rate. + gamma: 0.9999 # scheduler gamma must between(0.0, 1.0) and closer to 1.0 is better. 
+ +########################################################### +# TRAINING SETTING # +########################################################### +max_epoch: 20 +num_snapshots: 5 + +########################################################### +# OTHER SETTING # +########################################################### +num_snapshots: 10 # max number of snapshots to keep while training +seed: 42 # random seed for paddle, random, and np.random From ae8076c7462b277cdb252f44dcb6e9616348fac3 Mon Sep 17 00:00:00 2001 From: Zhao Yuting <91456992+THUzyt21@users.noreply.github.com> Date: Tue, 27 Sep 2022 17:21:20 +0800 Subject: [PATCH 057/113] Rename ernie-3.0.yaml to ernie-3.0-base.yaml --- .../iwslt2012/punc0/conf/{ernie-3.0.yaml => ernie-3.0-base.yaml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename examples/iwslt2012/punc0/conf/{ernie-3.0.yaml => ernie-3.0-base.yaml} (100%) diff --git a/examples/iwslt2012/punc0/conf/ernie-3.0.yaml b/examples/iwslt2012/punc0/conf/ernie-3.0-base.yaml similarity index 100% rename from examples/iwslt2012/punc0/conf/ernie-3.0.yaml rename to examples/iwslt2012/punc0/conf/ernie-3.0-base.yaml From 7753a3bddc2f29ad5ac9cd14088c9db26aec0573 Mon Sep 17 00:00:00 2001 From: Zhao Yuting <91456992+THUzyt21@users.noreply.github.com> Date: Tue, 27 Sep 2022 17:22:26 +0800 Subject: [PATCH 058/113] Create ernie-3.0-medium-zh config file of ernie-3.0-medium-zh --- .../iwslt2012/punc0/conf/ernie-3.0-medium-zh | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 examples/iwslt2012/punc0/conf/ernie-3.0-medium-zh diff --git a/examples/iwslt2012/punc0/conf/ernie-3.0-medium-zh b/examples/iwslt2012/punc0/conf/ernie-3.0-medium-zh new file mode 100644 index 000000000..392ba011c --- /dev/null +++ b/examples/iwslt2012/punc0/conf/ernie-3.0-medium-zh @@ -0,0 +1,44 @@ +########################################################### +# DATA SETTING # +########################################################### +dataset_type: Ernie +train_path: data/iwslt2012_zh/train.txt +dev_path: data/iwslt2012_zh/dev.txt +test_path: data/iwslt2012_zh/test.txt +batch_size: 64 +num_workers: 2 +data_params: + pretrained_token: ernie-3.0-medium-zh + punc_path: data/iwslt2012_zh/punc_vocab + seq_len: 100 + + +########################################################### +# MODEL SETTING # +########################################################### +model_type: ErnieLinear +model: + pretrained_token: ernie-3.0-medium-zh + num_classes: 4 + +########################################################### +# OPTIMIZER SETTING # +########################################################### +optimizer_params: + weight_decay: 1.0e-6 # weight decay coefficient. + +scheduler_params: + learning_rate: 1.0e-5 # learning rate. + gamma: 0.9999 # scheduler gamma must between(0.0, 1.0) and closer to 1.0 is better. 
+ +########################################################### +# TRAINING SETTING # +########################################################### +max_epoch: 20 +num_snapshots: 5 + +########################################################### +# OTHER SETTING # +########################################################### +num_snapshots: 10 # max number of snapshots to keep while training +seed: 42 # random seed for paddle, random, and np.random From ca780d7edccea488ef50d2c1cb52e0c29c98a7e3 Mon Sep 17 00:00:00 2001 From: Zhao Yuting <91456992+THUzyt21@users.noreply.github.com> Date: Tue, 27 Sep 2022 17:22:47 +0800 Subject: [PATCH 059/113] Rename ernie-3.0-medium-zh to ernie-3.0-medium.yaml --- .../punc0/conf/{ernie-3.0-medium-zh => ernie-3.0-medium.yaml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename examples/iwslt2012/punc0/conf/{ernie-3.0-medium-zh => ernie-3.0-medium.yaml} (100%) diff --git a/examples/iwslt2012/punc0/conf/ernie-3.0-medium-zh b/examples/iwslt2012/punc0/conf/ernie-3.0-medium.yaml similarity index 100% rename from examples/iwslt2012/punc0/conf/ernie-3.0-medium-zh rename to examples/iwslt2012/punc0/conf/ernie-3.0-medium.yaml From 83fd9589a1a047bbdaef12b94e40cdb9ece0b9b7 Mon Sep 17 00:00:00 2001 From: Zhao Yuting <91456992+THUzyt21@users.noreply.github.com> Date: Tue, 27 Sep 2022 17:23:25 +0800 Subject: [PATCH 060/113] Create ernie-3.0-mini.yaml --- .../iwslt2012/punc0/conf/ernie-3.0-mini.yaml | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 examples/iwslt2012/punc0/conf/ernie-3.0-mini.yaml diff --git a/examples/iwslt2012/punc0/conf/ernie-3.0-mini.yaml b/examples/iwslt2012/punc0/conf/ernie-3.0-mini.yaml new file mode 100644 index 000000000..c57fd94a8 --- /dev/null +++ b/examples/iwslt2012/punc0/conf/ernie-3.0-mini.yaml @@ -0,0 +1,44 @@ +########################################################### +# DATA SETTING # +########################################################### +dataset_type: Ernie +train_path: data/iwslt2012_zh/train.txt +dev_path: data/iwslt2012_zh/dev.txt +test_path: data/iwslt2012_zh/test.txt +batch_size: 64 +num_workers: 2 +data_params: + pretrained_token: ernie-3.0-mini-zh + punc_path: data/iwslt2012_zh/punc_vocab + seq_len: 100 + + +########################################################### +# MODEL SETTING # +########################################################### +model_type: ErnieLinear +model: + pretrained_token: ernie-3.0-mini-zh + num_classes: 4 + +########################################################### +# OPTIMIZER SETTING # +########################################################### +optimizer_params: + weight_decay: 1.0e-6 # weight decay coefficient. + +scheduler_params: + learning_rate: 1.0e-5 # learning rate. + gamma: 0.9999 # scheduler gamma must between(0.0, 1.0) and closer to 1.0 is better. 
+ +########################################################### +# TRAINING SETTING # +########################################################### +max_epoch: 20 +num_snapshots: 5 + +########################################################### +# OTHER SETTING # +########################################################### +num_snapshots: 10 # max number of snapshots to keep while training +seed: 42 # random seed for paddle, random, and np.random From 402770f1933e21df5be244e3d39e1c28aa7945e0 Mon Sep 17 00:00:00 2001 From: Zhao Yuting <91456992+THUzyt21@users.noreply.github.com> Date: Tue, 27 Sep 2022 17:23:57 +0800 Subject: [PATCH 061/113] Create ernie-3.0-nano-zh --- .../iwslt2012/punc0/conf/ernie-3.0-nano-zh | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 examples/iwslt2012/punc0/conf/ernie-3.0-nano-zh diff --git a/examples/iwslt2012/punc0/conf/ernie-3.0-nano-zh b/examples/iwslt2012/punc0/conf/ernie-3.0-nano-zh new file mode 100644 index 000000000..a7a84c4c1 --- /dev/null +++ b/examples/iwslt2012/punc0/conf/ernie-3.0-nano-zh @@ -0,0 +1,44 @@ +########################################################### +# DATA SETTING # +########################################################### +dataset_type: Ernie +train_path: data/iwslt2012_zh/train.txt +dev_path: data/iwslt2012_zh/dev.txt +test_path: data/iwslt2012_zh/test.txt +batch_size: 64 +num_workers: 2 +data_params: + pretrained_token: ernie-3.0-nano-zh + punc_path: data/iwslt2012_zh/punc_vocab + seq_len: 100 + + +########################################################### +# MODEL SETTING # +########################################################### +model_type: ErnieLinear +model: + pretrained_token: ernie-3.0-nano-zh + num_classes: 4 + +########################################################### +# OPTIMIZER SETTING # +########################################################### +optimizer_params: + weight_decay: 1.0e-6 # weight decay coefficient. + +scheduler_params: + learning_rate: 1.0e-5 # learning rate. + gamma: 0.9999 # scheduler gamma must between(0.0, 1.0) and closer to 1.0 is better. 
+ +########################################################### +# TRAINING SETTING # +########################################################### +max_epoch: 20 +num_snapshots: 5 + +########################################################### +# OTHER SETTING # +########################################################### +num_snapshots: 10 # max number of snapshots to keep while training +seed: 42 # random seed for paddle, random, and np.random From d016584a3cabd48de3a59d180e7544b0bfcb512a Mon Sep 17 00:00:00 2001 From: Zhao Yuting <91456992+THUzyt21@users.noreply.github.com> Date: Tue, 27 Sep 2022 17:24:16 +0800 Subject: [PATCH 062/113] Rename ernie-3.0-nano-zh to ernie-3.0-nano-zh.yaml --- .../punc0/conf/{ernie-3.0-nano-zh => ernie-3.0-nano-zh.yaml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename examples/iwslt2012/punc0/conf/{ernie-3.0-nano-zh => ernie-3.0-nano-zh.yaml} (100%) diff --git a/examples/iwslt2012/punc0/conf/ernie-3.0-nano-zh b/examples/iwslt2012/punc0/conf/ernie-3.0-nano-zh.yaml similarity index 100% rename from examples/iwslt2012/punc0/conf/ernie-3.0-nano-zh rename to examples/iwslt2012/punc0/conf/ernie-3.0-nano-zh.yaml From bdf577b43af147c010218c9d6970a23e37835600 Mon Sep 17 00:00:00 2001 From: Zhao Yuting <91456992+THUzyt21@users.noreply.github.com> Date: Tue, 27 Sep 2022 17:24:48 +0800 Subject: [PATCH 063/113] Create Ernie-tiny.yaml --- examples/iwslt2012/punc0/conf/Ernie-tiny.yaml | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 examples/iwslt2012/punc0/conf/Ernie-tiny.yaml diff --git a/examples/iwslt2012/punc0/conf/Ernie-tiny.yaml b/examples/iwslt2012/punc0/conf/Ernie-tiny.yaml new file mode 100644 index 000000000..6a5b7fee2 --- /dev/null +++ b/examples/iwslt2012/punc0/conf/Ernie-tiny.yaml @@ -0,0 +1,44 @@ +########################################################### +# DATA SETTING # +########################################################### +dataset_type: Ernie +train_path: data/iwslt2012_zh/train.txt +dev_path: data/iwslt2012_zh/dev.txt +test_path: data/iwslt2012_zh/test.txt +batch_size: 64 +num_workers: 2 +data_params: + pretrained_token: ernie-tiny + punc_path: data/iwslt2012_zh/punc_vocab + seq_len: 100 + + +########################################################### +# MODEL SETTING # +########################################################### +model_type: ErnieLinear +model: + pretrained_token: ernie-tiny + num_classes: 4 + +########################################################### +# OPTIMIZER SETTING # +########################################################### +optimizer_params: + weight_decay: 1.0e-6 # weight decay coefficient. + +scheduler_params: + learning_rate: 1.0e-5 # learning rate. + gamma: 0.9999 # scheduler gamma must between(0.0, 1.0) and closer to 1.0 is better. 
+ +########################################################### +# TRAINING SETTING # +########################################################### +max_epoch: 20 +num_snapshots: 5 + +########################################################### +# OTHER SETTING # +########################################################### +num_snapshots: 10 # max number of snapshots to keep while training +seed: 42 # random seed for paddle, random, and np.random From 4c70f71671ac75d57dd4eb499580c68e83a35360 Mon Sep 17 00:00:00 2001 From: Zhao Yuting <91456992+THUzyt21@users.noreply.github.com> Date: Tue, 27 Sep 2022 17:25:06 +0800 Subject: [PATCH 064/113] Rename Ernie-tiny.yaml to ernie-tiny.yaml --- .../iwslt2012/punc0/conf/{Ernie-tiny.yaml => ernie-tiny.yaml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename examples/iwslt2012/punc0/conf/{Ernie-tiny.yaml => ernie-tiny.yaml} (100%) diff --git a/examples/iwslt2012/punc0/conf/Ernie-tiny.yaml b/examples/iwslt2012/punc0/conf/ernie-tiny.yaml similarity index 100% rename from examples/iwslt2012/punc0/conf/Ernie-tiny.yaml rename to examples/iwslt2012/punc0/conf/ernie-tiny.yaml From ae90c51bd6a8ecb4d0b25759b86179ebb88e6cc2 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Tue, 27 Sep 2022 12:13:05 +0000 Subject: [PATCH 065/113] add Speaker Diarization in readme, test=doc --- README.md | 28 ++++++++++++++++++++++++++-- README_cn.md | 34 +++++++++++++++++++++++++++++----- 2 files changed, 55 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 59c61f776..63466da84 100644 --- a/README.md +++ b/README.md @@ -19,8 +19,6 @@

Quick Start - | Quick Start Server - | Quick Start Streaming Server | Documents | Models List | AIStudio Courses @@ -714,6 +712,31 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r + + +**Speaker Diarization** + + + + + + + + + + + + + + + + + + +
Task Dataset Model Type Example
Speaker DiarizationAMIECAPA-TDNN + AHC / SC + ecapa-tdnn-ami +
+ **Punctuation Restoration** @@ -767,6 +790,7 @@ Normally, [Speech SoTA](https://paperswithcode.com/area/speech), [Audio SoTA](ht - [Text-to-Speech](#TextToSpeech) - [Audio Classification](#AudioClassification) - [Speaker Verification](#SpeakerVerification) + - [Speaker Diarization](#SpeakerDiarization) - [Punctuation Restoration](#PunctuationRestoration) - [Community](#Community) - [Welcome to contribute](#contribution) diff --git a/README_cn.md b/README_cn.md index 070a656a2..2b473091f 100644 --- a/README_cn.md +++ b/README_cn.md @@ -19,10 +19,8 @@

- 安装 + 安装 | 快速开始 - | 快速使用服务 - | 快速使用流式服务 | 教程文档 | 模型列表 | AIStudio 课程 @@ -717,8 +715,8 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 - Speaker Verification - VoxCeleb12 + 声纹识别 + VoxCeleb1/2 ECAPA-TDNN ecapa-tdnn-voxceleb12 @@ -727,6 +725,31 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 + + +**说话人日志** + + + + + + + + + + + + + + + + + + +
任务 数据集 模型类型 脚本
说话人日志AMIECAPA-TDNN + AHC / SC + ecapa-tdnn-ami +
+ **标点恢复** @@ -786,6 +809,7 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 - [语音合成](#语音合成模型) - [声音分类](#声音分类模型) - [声纹识别](#声纹识别模型) + - [说话人日志](#说话人日志模型) - [标点恢复](#标点恢复模型) - [技术交流群](#技术交流群) - [欢迎贡献](#欢迎贡献) From 1e4f4dc5d35364ec4754c06c3cf58ddb7e25f042 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Tue, 27 Sep 2022 21:25:37 +0800 Subject: [PATCH 066/113] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 63466da84..ca4071109 100644 --- a/README.md +++ b/README.md @@ -703,7 +703,7 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r Speaker Verification - VoxCeleb12 + VoxCeleb1/2 ECAPA-TDNN ecapa-tdnn-voxceleb12 From 4e55c2067f54d2747a06712562fa2ca0eda48e07 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Wed, 28 Sep 2022 11:39:07 +0800 Subject: [PATCH 067/113] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index ca4071109..d33287762 100644 --- a/README.md +++ b/README.md @@ -157,6 +157,8 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision - 🧩 *Cascaded models application*: as an extension of the typical traditional audio tasks, we combine the workflows of the aforementioned tasks with other fields like Natural language processing (NLP) and Computer Vision (CV). ### Recent Update +- 🔥 2022.09.26: Add Voice Cloning, TTS finetune, and ERNIE-SAT in [PaddleSpeech Web Demo](./demos/speech_web). +- ⚡ 2022.09.09: Add AISHELL-3 Voice Cloning with ECAPA-TDNN. - ⚡ 2022.08.25: Release TTS [finetune](./examples/other/tts_finetune/tts3) example. - 🔥 2022.08.22: Add ERNIE-SAT models: [ERNIE-SAT-vctk](./examples/vctk/ernie_sat)、[ERNIE-SAT-aishell3](./examples/aishell3/ernie_sat)、[ERNIE-SAT-zh_en](./examples/aishell3_vctk/ernie_sat). - 🔥 2022.08.15: Add [g2pW](https://github.com/GitYCC/g2pW) into TTS Chinese Text Frontend. From faa08085110bf4aacc8e4eab416635d7a69a2b05 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Wed, 28 Sep 2022 11:43:40 +0800 Subject: [PATCH 068/113] Update README_cn.md --- README_cn.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README_cn.md b/README_cn.md index 2b473091f..f3e176e33 100644 --- a/README_cn.md +++ b/README_cn.md @@ -179,6 +179,8 @@

### 近期更新 +- 🔥 2022.09.26: 新增 Voice Cloning, TTS finetune 和 ERNIE-SAT 到 [PaddleSpeech Web Demo](./demos/speech_web)。 +- ⚡ 2022.09.09: 新增基于 ECAPA-TDNN 声纹模型的 AISHELL-3 Voice Cloning [示例](./examples/aishell3/vc2)。 - ⚡ 2022.08.25: 发布 TTS [finetune](./examples/other/tts_finetune/tts3) 示例。 - 🔥 2022.08.22: 新增 ERNIE-SAT 模型: [ERNIE-SAT-vctk](./examples/vctk/ernie_sat)、[ERNIE-SAT-aishell3](./examples/aishell3/ernie_sat)、[ERNIE-SAT-zh_en](./examples/aishell3_vctk/ernie_sat)。 - 🔥 2022.08.15: 将 [g2pW](https://github.com/GitYCC/g2pW) 引入 TTS 中文文本前端。 From 175f0e7ba71535cc5d59b42f8cfd5842b0f4eda9 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Wed, 28 Sep 2022 11:46:14 +0800 Subject: [PATCH 069/113] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d33287762..72db64b7d 100644 --- a/README.md +++ b/README.md @@ -158,7 +158,7 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision ### Recent Update - 🔥 2022.09.26: Add Voice Cloning, TTS finetune, and ERNIE-SAT in [PaddleSpeech Web Demo](./demos/speech_web). -- ⚡ 2022.09.09: Add AISHELL-3 Voice Cloning with ECAPA-TDNN. +- ⚡ 2022.09.09: Add AISHELL-3 Voice Cloning [example](./examples/aishell3/vc2) with ECAPA-TDNN speaker encoder. - ⚡ 2022.08.25: Release TTS [finetune](./examples/other/tts_finetune/tts3) example. - 🔥 2022.08.22: Add ERNIE-SAT models: [ERNIE-SAT-vctk](./examples/vctk/ernie_sat)、[ERNIE-SAT-aishell3](./examples/aishell3/ernie_sat)、[ERNIE-SAT-zh_en](./examples/aishell3_vctk/ernie_sat). - 🔥 2022.08.15: Add [g2pW](https://github.com/GitYCC/g2pW) into TTS Chinese Text Frontend. From 764fa0a8599a6b20c6f719b70bb45a3b4d52b245 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Wed, 28 Sep 2022 11:47:27 +0800 Subject: [PATCH 070/113] Update README_cn.md --- README_cn.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README_cn.md b/README_cn.md index f3e176e33..725f7eda1 100644 --- a/README_cn.md +++ b/README_cn.md @@ -179,7 +179,7 @@

### 近期更新 -- 🔥 2022.09.26: 新增 Voice Cloning, TTS finetune 和 ERNIE-SAT 到 [PaddleSpeech Web Demo](./demos/speech_web)。 +- 🔥 2022.09.26: 新增 Voice Cloning, TTS finetune 和 ERNIE-SAT 到 [PaddleSpeech 网页应用](./demos/speech_web)。 - ⚡ 2022.09.09: 新增基于 ECAPA-TDNN 声纹模型的 AISHELL-3 Voice Cloning [示例](./examples/aishell3/vc2)。 - ⚡ 2022.08.25: 发布 TTS [finetune](./examples/other/tts_finetune/tts3) 示例。 - 🔥 2022.08.22: 新增 ERNIE-SAT 模型: [ERNIE-SAT-vctk](./examples/vctk/ernie_sat)、[ERNIE-SAT-aishell3](./examples/aishell3/ernie_sat)、[ERNIE-SAT-zh_en](./examples/aishell3_vctk/ernie_sat)。 From afda7ed7d1f0fad154e1984bc6aa32980b98b368 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 28 Sep 2022 06:36:12 +0000 Subject: [PATCH 071/113] remove useless code --- paddlespeech/s2t/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/paddlespeech/s2t/__init__.py b/paddlespeech/s2t/__init__.py index b67322cdc..4507365d6 100644 --- a/paddlespeech/s2t/__init__.py +++ b/paddlespeech/s2t/__init__.py @@ -166,7 +166,6 @@ def broadcast_shape(shp1, shp2): def masked_fill(xs: paddle.Tensor, mask: paddle.Tensor, value: Union[float, int]): - mask.stop_gradient = True mask = mask.astype(xs.dtype) return xs * (1.0 - mask) + mask * value From 8ecf6796f3673d2565ab2949c2e4d4f303c7c9ab Mon Sep 17 00:00:00 2001 From: Zhao Yuting <91456992+THUzyt21@users.noreply.github.com> Date: Wed, 28 Sep 2022 15:23:49 +0800 Subject: [PATCH 072/113] Update text_engine.py --- paddlespeech/server/engine/text/python/text_engine.py | 1 - 1 file changed, 1 deletion(-) diff --git a/paddlespeech/server/engine/text/python/text_engine.py b/paddlespeech/server/engine/text/python/text_engine.py index 9f2a48d51..b4ad95c64 100644 --- a/paddlespeech/server/engine/text/python/text_engine.py +++ b/paddlespeech/server/engine/text/python/text_engine.py @@ -107,7 +107,6 @@ class PaddleTextConnectionHandler: assert len(tokens) == len(labels) text = '' - print(self._punc_list) for t, l in zip(tokens, labels): text += t if l != 0: # Non punc. 
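
The masked_fill rewrite above (PATCH 055, tidied up in PATCH 071) replaces the
broadcast-plus-paddle.where implementation with plain arithmetic, which traces
to a static graph without data-dependent shape logic. A minimal sketch of the
equivalence, assuming a float tensor and a boolean mask of broadcastable shape;
a finite fill value is assumed, since with a non-finite fill such as -inf the
mask * value product becomes NaN wherever mask == 0:

    import paddle

    def masked_fill_ref(xs, mask, value):
        # keep xs where mask == 0, write `value` where mask == 1
        mask = mask.astype(xs.dtype)
        return xs * (1.0 - mask) + mask * value

    xs = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0]])
    mask = paddle.to_tensor([[True, False], [False, True]])
    print(masked_fill_ref(xs, mask, -1e9))
    # [[-1e9, 2.0], [3.0, -1e9]], matching the paddle.where-based version
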
From 404708c64006dcff731204f9d9cbf7e616cdd7dc Mon Sep 17 00:00:00 2001
From: tianhao zhang <15600919271@163.com>
Date: Wed, 28 Sep 2022 11:15:06 +0000
Subject: [PATCH 073/113] fix s2t gpu training hang

---
 examples/aishell/asr0/local/train.sh     | 4 ++++
 examples/aishell/asr1/local/train.sh     | 4 ++++
 examples/librispeech/asr0/local/train.sh | 4 ++++
 examples/librispeech/asr1/local/train.sh | 4 ++++
 examples/librispeech/asr2/local/train.sh | 4 ++++
 examples/timit/asr1/local/train.sh       | 4 ++++
 examples/tiny/asr0/local/train.sh        | 4 ++++
 examples/tiny/asr1/local/train.sh        | 4 ++++
 examples/wenetspeech/asr1/local/train.sh | 4 ++++
 9 files changed, 36 insertions(+)

diff --git a/examples/aishell/asr0/local/train.sh b/examples/aishell/asr0/local/train.sh
index 256b30d22..2b71b7f76 100755
--- a/examples/aishell/asr0/local/train.sh
+++ b/examples/aishell/asr0/local/train.sh
@@ -26,6 +26,10 @@ if [ ${seed} != 0 ]; then
     export FLAGS_cudnn_deterministic=True
 fi
 
+# the default memory allocator strategy may cause gpu training to hang
+# because no OOM is raised when memory is exhausted
+export FLAGS_allocator_strategy=naive_best_fit
+
 if [ ${ngpu} == 0 ]; then
 python3 -u ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \
diff --git a/examples/aishell/asr1/local/train.sh b/examples/aishell/asr1/local/train.sh
index f514de303..bfa8dd97d 100755
--- a/examples/aishell/asr1/local/train.sh
+++ b/examples/aishell/asr1/local/train.sh
@@ -35,6 +35,10 @@ echo ${ips_config}
 
 mkdir -p exp
 
+# the default memory allocator strategy may cause gpu training to hang
+# because no OOM is raised when memory is exhausted
+export FLAGS_allocator_strategy=naive_best_fit
+
 if [ ${ngpu} == 0 ]; then
 python3 -u ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \
diff --git a/examples/librispeech/asr0/local/train.sh b/examples/librispeech/asr0/local/train.sh
index 71659e28d..bb41fd554 100755
--- a/examples/librispeech/asr0/local/train.sh
+++ b/examples/librispeech/asr0/local/train.sh
@@ -26,6 +26,10 @@ if [ ${seed} != 0 ]; then
     export FLAGS_cudnn_deterministic=True
 fi
 
+# the default memory allocator strategy may cause gpu training to hang
+# because no OOM is raised when memory is exhausted
+export FLAGS_allocator_strategy=naive_best_fit
+
 if [ ${ngpu} == 0 ]; then
 python3 -u ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \
diff --git a/examples/librispeech/asr1/local/train.sh b/examples/librispeech/asr1/local/train.sh
index f729ed22c..e274b9133 100755
--- a/examples/librispeech/asr1/local/train.sh
+++ b/examples/librispeech/asr1/local/train.sh
@@ -29,6 +29,10 @@ fi
 # export FLAGS_cudnn_exhaustive_search=true
 # export FLAGS_conv_workspace_size_limit=4000
 
+# the default memory allocator strategy may cause gpu training to hang
+# because no OOM is raised when memory is exhausted
+export FLAGS_allocator_strategy=naive_best_fit
+
 if [ ${ngpu} == 0 ]; then
 python3 -u ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \
diff --git a/examples/librispeech/asr2/local/train.sh b/examples/librispeech/asr2/local/train.sh
index 1f414ad41..c2f2d4b65 100755
--- a/examples/librispeech/asr2/local/train.sh
+++ b/examples/librispeech/asr2/local/train.sh
@@ -26,6 +26,10 @@ if [ ${seed} != 0 ]; then
     export FLAGS_cudnn_deterministic=True
 fi
 
+# the default memory allocator strategy may cause gpu training to hang
+# because no OOM is raised when memory is exhausted
+export FLAGS_allocator_strategy=naive_best_fit
+
 if [ ${ngpu} == 0 ]; then
 python3 -u ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \
diff --git a/examples/timit/asr1/local/train.sh b/examples/timit/asr1/local/train.sh
index 661407582..1088c7ffa 100755
--- a/examples/timit/asr1/local/train.sh
+++ b/examples/timit/asr1/local/train.sh
@@ -19,6 +19,10 @@ if [ ${seed} != 0 ]; then
     export FLAGS_cudnn_deterministic=True
 fi
 
+# the default memory allocator strategy may cause gpu training to hang
+# because no OOM is raised when memory is exhausted
+export FLAGS_allocator_strategy=naive_best_fit
+
 if [ ${ngpu} == 0 ]; then
 python3 -u ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \
diff --git a/examples/tiny/asr0/local/train.sh b/examples/tiny/asr0/local/train.sh
index 8b67902fe..e233a0c0a 100755
--- a/examples/tiny/asr0/local/train.sh
+++ b/examples/tiny/asr0/local/train.sh
@@ -32,6 +32,10 @@ fi
 
 mkdir -p exp
 
+# the default memory allocator strategy may cause gpu training to hang
+# because no OOM is raised when memory is exhausted
+export FLAGS_allocator_strategy=naive_best_fit
+
 if [ ${ngpu} == 0 ]; then
 python3 -u ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \
diff --git a/examples/tiny/asr1/local/train.sh b/examples/tiny/asr1/local/train.sh
index 459f2e218..fbfb41f6f 100755
--- a/examples/tiny/asr1/local/train.sh
+++ b/examples/tiny/asr1/local/train.sh
@@ -34,6 +34,10 @@ fi
 
 mkdir -p exp
 
+# the default memory allocator strategy may cause gpu training to hang
+# because no OOM is raised when memory is exhausted
+export FLAGS_allocator_strategy=naive_best_fit
+
 if [ ${ngpu} == 0 ]; then
 python3 -u ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \
diff --git a/examples/wenetspeech/asr1/local/train.sh b/examples/wenetspeech/asr1/local/train.sh
index 01af00b61..6813d270c 100755
--- a/examples/wenetspeech/asr1/local/train.sh
+++ b/examples/wenetspeech/asr1/local/train.sh
@@ -35,6 +35,10 @@ echo ${ips_config}
 
 mkdir -p exp
 
+# the default memory allocator strategy may cause gpu training to hang
+# because no OOM is raised when memory is exhausted
+export FLAGS_allocator_strategy=naive_best_fit
+
 if [ ${ngpu} == 0 ]; then
 python3 -u ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \

From b9693a0e8e41636cdc1c141467a4fbee621119b7 Mon Sep 17 00:00:00 2001
From: Zhao Yuting <91456992+THUzyt21@users.noreply.github.com>
Date: Thu, 29 Sep 2022 13:08:20 +0800
Subject: [PATCH 074/113] Update text_engine.py

---
 paddlespeech/server/engine/text/python/text_engine.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/paddlespeech/server/engine/text/python/text_engine.py b/paddlespeech/server/engine/text/python/text_engine.py
index b4ad95c64..a871de35c 100644
--- a/paddlespeech/server/engine/text/python/text_engine.py
+++ b/paddlespeech/server/engine/text/python/text_engine.py
@@ -131,7 +131,6 @@ class TextEngine(BaseEngine):
         """
         super(TextEngine, self).__init__()
         logger.debug("Create the TextEngine Instance")
-
     def init(self, config: dict):
         """Init the Text Engine

From 8c945c073d6764b20b0ccad7b4cf5f00c1180bd6 Mon Sep 17 00:00:00 2001
From: Zhao Yuting <91456992+THUzyt21@users.noreply.github.com>
Date: Thu, 29 Sep 2022 14:05:49 +0800
Subject: [PATCH 075/113] Update application.yaml

---
 paddlespeech/server/conf/application.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddlespeech/server/conf/application.yaml b/paddlespeech/server/conf/application.yaml
index 47b8b178f..55f241ec7 100644
--- a/paddlespeech/server/conf/application.yaml
+++ b/paddlespeech/server/conf/application.yaml
@@ -142,7 +142,7 @@ cls_inference:
 ################### text task: punc; engine_type: python #######################
 text_python:
     task: punc
-    model_type: 'ernie_linear_p3_wudao_fast'
+    model_type: 'ernie_linear_p3_wudao'
     lang: 'zh'
     sample_rate: 16000
     cfg_path:  # [optional]

From 80837fd65812ddb64ce17c813ac1b05f27571458 Mon Sep 17 00:00:00 2001
From: Zhao Yuting <91456992+THUzyt21@users.noreply.github.com>
Date: Thu, 29 Sep 2022 14:06:57 +0800
Subject: [PATCH 076/113] 
Update punc_application.yaml --- demos/streaming_asr_server/conf/punc_application.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demos/streaming_asr_server/conf/punc_application.yaml b/demos/streaming_asr_server/conf/punc_application.yaml index 8456e2329..f947525e1 100644 --- a/demos/streaming_asr_server/conf/punc_application.yaml +++ b/demos/streaming_asr_server/conf/punc_application.yaml @@ -22,7 +22,7 @@ engine_list: ['text_python'] ################### text task: punc; engine_type: python ####################### text_python: task: punc - model_type: 'ernie_linear_p3_wudao_fast' + model_type: 'ernie_linear_p3_wudao' lang: 'zh' sample_rate: 16000 cfg_path: # [optional] From 304dc2603c583cda7d1bffb6f7d14eb7c40f96d0 Mon Sep 17 00:00:00 2001 From: Zhao Yuting <91456992+THUzyt21@users.noreply.github.com> Date: Thu, 29 Sep 2022 14:08:37 +0800 Subject: [PATCH 077/113] Update text_engine.py --- .../server/engine/text/python/text_engine.py | 33 +++++++++++++------ 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/paddlespeech/server/engine/text/python/text_engine.py b/paddlespeech/server/engine/text/python/text_engine.py index a871de35c..cc72c0543 100644 --- a/paddlespeech/server/engine/text/python/text_engine.py +++ b/paddlespeech/server/engine/text/python/text_engine.py @@ -107,11 +107,14 @@ class PaddleTextConnectionHandler: assert len(tokens) == len(labels) text = '' + is_fast_model = 'fast' in self.text_engine.config.model_type for t, l in zip(tokens, labels): text += t if l != 0: # Non punc. - text += self._punc_list[l - 1] - + if is_fast_model: + text += self._punc_list[l - 1] + else: + text += self._punc_list[l] return text else: raise NotImplementedError @@ -131,6 +134,7 @@ class TextEngine(BaseEngine): """ super(TextEngine, self).__init__() logger.debug("Create the TextEngine Instance") + def init(self, config: dict): """Init the Text Engine @@ -159,14 +163,23 @@ class TextEngine(BaseEngine): return False self.executor = TextServerExecutor() - self.executor._init_from_path_new( - task=config.task, - model_type=config.model_type, - lang=config.lang, - cfg_path=config.cfg_path, - ckpt_path=config.ckpt_path, - vocab_file=config.vocab_file) - + if 'fast' in config.model_type: + self.executor._init_from_path_new( + task=config.task, + model_type=config.model_type, + lang=config.lang, + cfg_path=config.cfg_path, + ckpt_path=config.ckpt_path, + vocab_file=config.vocab_file) + else: + self.executor._init_from_path( + task=config.task, + model_type=config.model_type, + lang=config.lang, + cfg_path=config.cfg_path, + ckpt_path=config.ckpt_path, + vocab_file=config.vocab_file) + logger.info("Using model: %s." % (config.model_type)) logger.info("Initialize Text server engine successfully on device: %s." 
% (self.device)) return True From 5bbe6e9897f7112fec0d06b08714fc26bde20ec5 Mon Sep 17 00:00:00 2001 From: tianhao zhang <15600919271@163.com> Date: Thu, 29 Sep 2022 13:41:16 +0000 Subject: [PATCH 078/113] support u2pp cli and server, optimiz code of u2pp decode, test=asr --- .../conf/application.yaml | 2 +- docs/source/released_model.md | 1 + paddlespeech/cli/asr/infer.py | 4 +- paddlespeech/resource/model_alias.py | 2 + paddlespeech/resource/pretrained_models.py | 40 +++++++++++++++++++ paddlespeech/s2t/exps/u2/bin/test_wav.py | 4 +- paddlespeech/s2t/exps/u2/model.py | 4 +- paddlespeech/s2t/models/u2/u2.py | 33 +++++++-------- .../server/conf/ws_conformer_application.yaml | 2 +- .../engine/asr/online/python/asr_engine.py | 23 +++++++++-- 10 files changed, 83 insertions(+), 32 deletions(-) diff --git a/demos/streaming_asr_server/conf/application.yaml b/demos/streaming_asr_server/conf/application.yaml index a89d312ab..d446e13b6 100644 --- a/demos/streaming_asr_server/conf/application.yaml +++ b/demos/streaming_asr_server/conf/application.yaml @@ -21,7 +21,7 @@ engine_list: ['asr_online'] ################################### ASR ######################################### ################### speech task: asr; engine_type: online ####################### asr_online: - model_type: 'conformer_online_wenetspeech' + model_type: 'conformer_u2pp_online_wenetspeech' am_model: # the pdmodel file of am static model [optional] am_params: # the pdiparams file of am static model [optional] lang: 'zh' diff --git a/docs/source/released_model.md b/docs/source/released_model.md index d6691812e..bdac2c5bb 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -9,6 +9,7 @@ Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | [Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz) | Aishell Dataset | Char-based | 491 MB | 2 Conv + 5 LSTM layers | 0.0666 |-| 151 h | [D2 Online Aishell ASR0](../../examples/aishell/asr0) | onnx/inference/python | [Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_offline_aishell_ckpt_1.0.1.model.tar.gz)| Aishell Dataset | Char-based | 1.4 GB | 2 Conv + 5 bidirectional LSTM layers| 0.0554 |-| 151 h | [Ds2 Offline Aishell ASR0](../../examples/aishell/asr0) | inference/python | [Conformer Online Wenetspeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar.gz) | WenetSpeech Dataset | Char-based | 457 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.11 (test\_net) 0.1879 (test\_meeting) |-| 10000 h |- | python | +[Conformer U2PP Online Wenetspeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.1.1.model.tar.gz) | WenetSpeech Dataset | Char-based | 476 MB | Encoder:Conformer, Decoder:BiTransformer, Decoding method: Attention rescoring| 0.047198 (aishell test\_-1) 0.059212 (aishell test\_16) |-| 10000 h |- | python | [Conformer Online Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_chunk_conformer_aishell_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.0544 |-| 151 h | [Conformer Online Aishell ASR1](../../examples/aishell/asr1) | python | [Conformer Offline Aishell ASR1 
Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_1.0.1.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0460 |-| 151 h | [Conformer Offline Aishell ASR1](../../examples/aishell/asr1) | python | [Transformer Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_transformer_aishell_ckpt_0.1.1.model.tar.gz) | Aishell Dataset | Char-based | 128 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0523 || 151 h | [Transformer Aishell ASR1](../../examples/aishell/asr1) | python | diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py index 7296776f9..4a7feaf0f 100644 --- a/paddlespeech/cli/asr/infer.py +++ b/paddlespeech/cli/asr/infer.py @@ -51,7 +51,7 @@ class ASRExecutor(BaseExecutor): self.parser.add_argument( '--model', type=str, - default='conformer_wenetspeech', + default='conformer_u2pp_wenetspeech', choices=[ tag[:tag.index('-')] for tag in self.task_resource.pretrained_models.keys() @@ -465,7 +465,7 @@ class ASRExecutor(BaseExecutor): @stats_wrapper def __call__(self, audio_file: os.PathLike, - model: str='conformer_wenetspeech', + model: str='conformer_u2pp_wenetspeech', lang: str='zh', sample_rate: int=16000, config: os.PathLike=None, diff --git a/paddlespeech/resource/model_alias.py b/paddlespeech/resource/model_alias.py index 9c76dd4b3..3f36f11f2 100644 --- a/paddlespeech/resource/model_alias.py +++ b/paddlespeech/resource/model_alias.py @@ -25,6 +25,8 @@ model_alias = { "deepspeech2online": ["paddlespeech.s2t.models.ds2:DeepSpeech2Model"], "conformer": ["paddlespeech.s2t.models.u2:U2Model"], "conformer_online": ["paddlespeech.s2t.models.u2:U2Model"], + "conformer_u2pp": ["paddlespeech.s2t.models.u2:U2Model"], + "conformer_u2pp_online": ["paddlespeech.s2t.models.u2:U2Model"], "transformer": ["paddlespeech.s2t.models.u2:U2Model"], "wenetspeech": ["paddlespeech.s2t.models.u2:U2Model"], diff --git a/paddlespeech/resource/pretrained_models.py b/paddlespeech/resource/pretrained_models.py index f049879a3..eecf21768 100644 --- a/paddlespeech/resource/pretrained_models.py +++ b/paddlespeech/resource/pretrained_models.py @@ -68,6 +68,46 @@ asr_dynamic_pretrained_models = { '', }, }, + "conformer_u2pp_wenetspeech-zh-16k": { + '1.0': { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.1.1.model.tar.gz', + 'md5': + 'eae678c04ed3b3f89672052fdc0c5e10', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/chunk_conformer_u2pp/checkpoints/avg_10', + 'model': + 'exp/chunk_conformer_u2pp/checkpoints/avg_10.pdparams', + 'params': + 'exp/chunk_conformer_u2pp/checkpoints/avg_10.pdparams', + 'lm_url': + '', + 'lm_md5': + '', + }, + }, + "conformer_u2pp_online_wenetspeech-zh-16k": { + '1.0': { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.1.2.model.tar.gz', + 'md5': + '925d047e9188dea7f421a718230c9ae3', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/chunk_conformer_u2pp/checkpoints/avg_10', + 'model': + 'exp/chunk_conformer_u2pp/checkpoints/avg_10.pdparams', + 'params': + 'exp/chunk_conformer_u2pp/checkpoints/avg_10.pdparams', + 'lm_url': + '', + 'lm_md5': + '', + }, + }, "conformer_online_multicn-zh-16k": { '1.0': { 'url': diff --git a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py index 4588def0b..46925faed 100644 --- 
a/paddlespeech/s2t/exps/u2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py @@ -40,7 +40,6 @@ class U2Infer(): self.preprocess_conf = config.preprocess_config self.preprocess_args = {"train": False} self.preprocessing = Transformation(self.preprocess_conf) - self.reverse_weight = getattr(config.model_conf, 'reverse_weight', 0.0) self.text_feature = TextFeaturizer( unit_type=config.unit_type, vocab=config.vocab_filepath, @@ -89,8 +88,7 @@ class U2Infer(): ctc_weight=decode_config.ctc_weight, decoding_chunk_size=decode_config.decoding_chunk_size, num_decoding_left_chunks=decode_config.num_decoding_left_chunks, - simulate_streaming=decode_config.simulate_streaming, - reverse_weight=self.reverse_weight) + simulate_streaming=decode_config.simulate_streaming) rsl = result_transcripts[0][0] utt = Path(self.audio_file).name logger.info(f"hyp: {utt} {result_transcripts[0][0]}") diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index a13a6385e..a6197d073 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -316,7 +316,6 @@ class U2Tester(U2Trainer): vocab=self.config.vocab_filepath, spm_model_prefix=self.config.spm_model_prefix) self.vocab_list = self.text_feature.vocab_list - self.reverse_weight = getattr(config.model_conf, 'reverse_weight', 0.0) def id2token(self, texts, texts_len, text_feature): """ ord() id to chr() chr """ @@ -351,8 +350,7 @@ class U2Tester(U2Trainer): ctc_weight=decode_config.ctc_weight, decoding_chunk_size=decode_config.decoding_chunk_size, num_decoding_left_chunks=decode_config.num_decoding_left_chunks, - simulate_streaming=decode_config.simulate_streaming, - reverse_weight=self.reverse_weight) + simulate_streaming=decode_config.simulate_streaming) decode_time = time.time() - start_time for utt, target, result, rec_tids in zip( diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 0a3e03b79..53c3bf555 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -507,16 +507,14 @@ class U2BaseModel(ASRInterface, nn.Layer): num_decoding_left_chunks, simulate_streaming) return hyps[0][0] - def attention_rescoring( - self, - speech: paddle.Tensor, - speech_lengths: paddle.Tensor, - beam_size: int, - decoding_chunk_size: int=-1, - num_decoding_left_chunks: int=-1, - ctc_weight: float=0.0, - simulate_streaming: bool=False, - reverse_weight: float=0.0, ) -> List[int]: + def attention_rescoring(self, + speech: paddle.Tensor, + speech_lengths: paddle.Tensor, + beam_size: int, + decoding_chunk_size: int=-1, + num_decoding_left_chunks: int=-1, + ctc_weight: float=0.0, + simulate_streaming: bool=False) -> List[int]: """ Apply attention rescoring decoding, CTC prefix beam search is applied first to get nbest, then we resoring the nbest on attention decoder with corresponding encoder out @@ -536,7 +534,7 @@ class U2BaseModel(ASRInterface, nn.Layer): """ assert speech.shape[0] == speech_lengths.shape[0] assert decoding_chunk_size != 0 - if reverse_weight > 0.0: + if self.reverse_weight > 0.0: # decoder should be a bitransformer decoder if reverse_weight > 0.0 assert hasattr(self.decoder, 'right_decoder') device = speech.place @@ -574,7 +572,7 @@ class U2BaseModel(ASRInterface, nn.Layer): self.eos) decoder_out, r_decoder_out, _ = self.decoder( encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - reverse_weight) # (beam_size, max_hyps_len, vocab_size) + self.reverse_weight) # (beam_size, max_hyps_len, vocab_size) # ctc score in ln domain 
decoder_out = paddle.nn.functional.log_softmax(decoder_out, axis=-1) decoder_out = decoder_out.numpy() @@ -594,12 +592,13 @@ class U2BaseModel(ASRInterface, nn.Layer): score += decoder_out[i][j][w] # last decoder output token is `eos`, for laste decoder input token. score += decoder_out[i][len(hyp[0])][self.eos] - if reverse_weight > 0: + if self.reverse_weight > 0: r_score = 0.0 for j, w in enumerate(hyp[0]): r_score += r_decoder_out[i][len(hyp[0]) - j - 1][w] r_score += r_decoder_out[i][len(hyp[0])][self.eos] - score = score * (1 - reverse_weight) + r_score * reverse_weight + score = score * (1 - self.reverse_weight + ) + r_score * self.reverse_weight # add ctc score (which in ln domain) score += hyp[1] * ctc_weight if score > best_score: @@ -748,8 +747,7 @@ class U2BaseModel(ASRInterface, nn.Layer): ctc_weight: float=0.0, decoding_chunk_size: int=-1, num_decoding_left_chunks: int=-1, - simulate_streaming: bool=False, - reverse_weight: float=0.0): + simulate_streaming: bool=False): """u2 decoding. Args: @@ -821,8 +819,7 @@ class U2BaseModel(ASRInterface, nn.Layer): decoding_chunk_size=decoding_chunk_size, num_decoding_left_chunks=num_decoding_left_chunks, ctc_weight=ctc_weight, - simulate_streaming=simulate_streaming, - reverse_weight=reverse_weight) + simulate_streaming=simulate_streaming) hyps = [hyp] else: raise ValueError(f"Not support decoding method: {decoding_method}") diff --git a/paddlespeech/server/conf/ws_conformer_application.yaml b/paddlespeech/server/conf/ws_conformer_application.yaml index d72eb2379..b6128118f 100644 --- a/paddlespeech/server/conf/ws_conformer_application.yaml +++ b/paddlespeech/server/conf/ws_conformer_application.yaml @@ -30,7 +30,7 @@ asr_online: decode_method: num_decoding_left_chunks: -1 force_yes: True - device: # cpu or gpu:id + device: gpu # cpu or gpu:id continuous_decoding: True # enable continue decoding when endpoint detected am_predictor_conf: diff --git a/paddlespeech/server/engine/asr/online/python/asr_engine.py b/paddlespeech/server/engine/asr/online/python/asr_engine.py index 4c7c4b37a..740f5270d 100644 --- a/paddlespeech/server/engine/asr/online/python/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/python/asr_engine.py @@ -22,6 +22,7 @@ from numpy import float32 from yacs.config import CfgNode from paddlespeech.audio.transform.transformation import Transformation +from paddlespeech.audio.utils.tensor_utils import st_reverse_pad_list from paddlespeech.cli.asr.infer import ASRExecutor from paddlespeech.cli.log import logger from paddlespeech.resource import CommonTaskResource @@ -603,24 +604,31 @@ class PaddleASRConnectionHanddler: hyps_pad = pad_sequence( hyp_list, batch_first=True, padding_value=self.model.ignore_id) + ori_hyps_pad = hyps_pad hyps_lens = paddle.to_tensor( [len(hyp[0]) for hyp in hyps], place=self.device, dtype=paddle.long) # (beam_size,) hyps_pad, _ = add_sos_eos(hyps_pad, self.model.sos, self.model.eos, self.model.ignore_id) hyps_lens = hyps_lens + 1 # Add at begining - encoder_out = self.encoder_out.repeat(beam_size, 1, 1) encoder_mask = paddle.ones( (beam_size, 1, encoder_out.shape[1]), dtype=paddle.bool) - decoder_out, _, _ = self.model.decoder( - encoder_out, encoder_mask, hyps_pad, - hyps_lens) # (beam_size, max_hyps_len, vocab_size) + r_hyps_pad = st_reverse_pad_list(ori_hyps_pad, hyps_lens - 1, + self.model.sos, self.model.eos) + decoder_out, r_decoder_out, _ = self.model.decoder( + encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, + self.model.reverse_weight) # (beam_size, max_hyps_len, 
vocab_size)
         # ctc score in ln domain
         decoder_out = paddle.nn.functional.log_softmax(decoder_out, axis=-1)
         decoder_out = decoder_out.numpy()
 
+        # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a
+        # conventional transformer decoder.
+        r_decoder_out = paddle.nn.functional.log_softmax(r_decoder_out, axis=-1)
+        r_decoder_out = r_decoder_out.numpy()
+
         # Only use decoder score for rescoring
         best_score = -float('inf')
         best_index = 0
@@ -632,6 +640,13 @@ class PaddleASRConnectionHanddler:
                 # last decoder output token is `eos`, for laste decoder input token.
                 score += decoder_out[i][len(hyp[0])][self.model.eos]
 
+                if self.model.reverse_weight > 0:
+                    r_score = 0.0
+                    for j, w in enumerate(hyp[0]):
+                        r_score += r_decoder_out[i][len(hyp[0]) - j - 1][w]
+                    r_score += r_decoder_out[i][len(hyp[0])][self.model.eos]
+                    score = score * (1 - self.model.reverse_weight
+                                     ) + r_score * self.model.reverse_weight
                 # add ctc score (which in ln domain)
                 score += hyp[1] * self.ctc_decode_config.ctc_weight

From 7a13b35fe6cec02b27ab9eb05e0ed47ef767a17b Mon Sep 17 00:00:00 2001
From: ZapBird <105480550+ZapBird@users.noreply.github.com>
Date: Fri, 30 Sep 2022 10:45:43 +0800
Subject: [PATCH 079/113] For a BytesIO input, seek back to the start so it
 can be read repeatedly, e.g. in __call__ (#2484)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* When the input is a BytesIO, it has to be reset to the initial position so
that it can be read more than once, e.g. in __call__.

When the audio_file argument of __call__ is a BytesIO, execution fails at
self.preprocess(model, audio_file); audio_file.seek(0) must be called
whenever audio_file is a BytesIO.
---
 paddlespeech/cli/asr/infer.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py
index 7296776f9..0c794a001 100644
--- a/paddlespeech/cli/asr/infer.py
+++ b/paddlespeech/cli/asr/infer.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License. 
import argparse +import io import os import sys import time @@ -229,6 +230,8 @@ class ASRExecutor(BaseExecutor): audio_file = input if isinstance(audio_file, (str, os.PathLike)): logger.debug("Preprocess audio_file:" + audio_file) + elif isinstance(audio_file, io.BytesIO): + audio_file.seek(0) # Get the object for feature extraction if "deepspeech2" in model_type or "conformer" in model_type or "transformer" in model_type: @@ -352,6 +355,8 @@ class ASRExecutor(BaseExecutor): if not os.path.isfile(audio_file): logger.error("Please input the right audio file path") return False + elif isinstance(audio_file, io.BytesIO): + audio_file.seek(0) logger.debug("checking the audio file format......") try: From 5b5167b58635c879da2ef36fa4283d99c321d6ce Mon Sep 17 00:00:00 2001 From: tianhao zhang <15600919271@163.com> Date: Fri, 30 Sep 2022 04:14:22 +0000 Subject: [PATCH 080/113] support u2pp cli and server, optimiz code of u2pp decode, test=asr --- paddlespeech/resource/pretrained_models.py | 4 ++-- paddlespeech/server/conf/ws_conformer_application.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/paddlespeech/resource/pretrained_models.py b/paddlespeech/resource/pretrained_models.py index eecf21768..d012a7d2d 100644 --- a/paddlespeech/resource/pretrained_models.py +++ b/paddlespeech/resource/pretrained_models.py @@ -69,7 +69,7 @@ asr_dynamic_pretrained_models = { }, }, "conformer_u2pp_wenetspeech-zh-16k": { - '1.0': { + '1.1': { 'url': 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.1.1.model.tar.gz', 'md5': @@ -89,7 +89,7 @@ asr_dynamic_pretrained_models = { }, }, "conformer_u2pp_online_wenetspeech-zh-16k": { - '1.0': { + '1.1': { 'url': 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.1.2.model.tar.gz', 'md5': diff --git a/paddlespeech/server/conf/ws_conformer_application.yaml b/paddlespeech/server/conf/ws_conformer_application.yaml index b6128118f..d5357c853 100644 --- a/paddlespeech/server/conf/ws_conformer_application.yaml +++ b/paddlespeech/server/conf/ws_conformer_application.yaml @@ -30,7 +30,7 @@ asr_online: decode_method: num_decoding_left_chunks: -1 force_yes: True - device: gpu # cpu or gpu:id + device: cpu # cpu or gpu:id continuous_decoding: True # enable continue decoding when endpoint detected am_predictor_conf: From 3ed24474d2ee85d3aee71de37c9b84c97094f5ef Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Sat, 8 Oct 2022 02:34:10 +0000 Subject: [PATCH 081/113] wenetspeech asr1 quant --- examples/wenetspeech/asr1/local/quant.sh | 59 ++++++ paddlespeech/s2t/exps/u2/bin/quant.py | 220 +++++++++++++++++++++++ 2 files changed, 279 insertions(+) create mode 100755 examples/wenetspeech/asr1/local/quant.sh create mode 100644 paddlespeech/s2t/exps/u2/bin/quant.py diff --git a/examples/wenetspeech/asr1/local/quant.sh b/examples/wenetspeech/asr1/local/quant.sh new file mode 100755 index 000000000..9dfea9045 --- /dev/null +++ b/examples/wenetspeech/asr1/local/quant.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_file" + exit -1 +fi + +ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +echo "using $ngpu gpus..." + +config_path=$1 +decode_config_path=$2 +ckpt_prefix=$3 +audio_file=$4 + +mkdir -p data +wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/ +if [ $? -ne 0 ]; then + exit 1 +fi + +if [ ! 
-f ${audio_file} ]; then
+    echo "Please input the right audio_file path"
+    exit 1
+fi
+
+
+chunk_mode=false
+if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
+    chunk_mode=true
+fi
+
+# download language model
+#bash local/download_lm_ch.sh
+#if [ $? -ne 0 ]; then
+#    exit 1
+#fi
+
+for type in attention_rescoring; do
+    echo "decoding ${type}"
+    batch_size=1
+    output_dir=${ckpt_prefix}
+    mkdir -p ${output_dir}
+    python3 -u ${BIN_DIR}/quant.py \
+    --ngpu ${ngpu} \
+    --config ${config_path} \
+    --decode_cfg ${decode_config_path} \
+    --result_file ${output_dir}/${type}.rsl \
+    --checkpoint_path ${ckpt_prefix} \
+    --opts decode.decoding_method ${type} \
+    --opts decode.decode_batch_size ${batch_size} \
+    --audio_file ${audio_file}
+
+    if [ $? -ne 0 ]; then
+        echo "Failed in evaluation!"
+        exit 1
+    fi
+done
+exit 0
diff --git a/paddlespeech/s2t/exps/u2/bin/quant.py b/paddlespeech/s2t/exps/u2/bin/quant.py
new file mode 100644
index 000000000..de7c27e79
--- /dev/null
+++ b/paddlespeech/s2t/exps/u2/bin/quant.py
@@ -0,0 +1,220 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Quantization (PTQ) and evaluation for U2 model."""
+import os
+import sys
+from pathlib import Path
+
+import paddle
+import soundfile
+from yacs.config import CfgNode
+
+from paddlespeech.audio.transform.transformation import Transformation
+from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
+from paddlespeech.s2t.models.u2 import U2Model
+from paddlespeech.s2t.training.cli import default_argument_parser
+from paddlespeech.s2t.utils.log import Log
+from paddlespeech.s2t.utils.utility import UpdateConfig
+from paddleslim import PTQ
+logger = Log(__name__).getlog()
+
+
+class U2Infer():
+    def __init__(self, config, args):
+        self.args = args
+        self.config = config
+        self.audio_file = args.audio_file
+
+        self.preprocess_conf = config.preprocess_config
+        self.preprocess_args = {"train": False}
+        self.preprocessing = Transformation(self.preprocess_conf)
+        self.reverse_weight = getattr(config.model_conf, 'reverse_weight', 0.0)
+        self.text_feature = TextFeaturizer(
+            unit_type=config.unit_type,
+            vocab=config.vocab_filepath,
+            spm_model_prefix=config.spm_model_prefix)
+
+        paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu')
+
+        # model
+        model_conf = config
+        with UpdateConfig(model_conf):
+            model_conf.input_dim = config.feat_dim
+            model_conf.output_dim = self.text_feature.vocab_size
+        model = U2Model.from_config(model_conf)
+        self.model = model
+        self.model.eval()
+        self.ptq = PTQ()
+        self.model = self.ptq.quantize(model)
+
+        # load model
+        params_path = self.args.checkpoint_path + ".pdparams"
+        model_dict = paddle.load(params_path)
+        self.model.set_state_dict(model_dict)
+        logger.info(f"model_dict: {model_dict.keys()}")
+
+    def run(self):
+        check(args.audio_file)
+
+        with paddle.no_grad():
+            # read
+            audio, sample_rate = soundfile.read(
+                self.audio_file, dtype="int16", always_2d=True)
+            audio = audio[:, 0]
+            logger.info(f"audio shape: {audio.shape}")
+
+            # 
fbank + feat = self.preprocessing(audio, **self.preprocess_args) + logger.info(f"feat shape: {feat.shape}") + + ilen = paddle.to_tensor(feat.shape[0]) + xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(0) + decode_config = self.config.decode + logger.info(f"decode cfg: {decode_config}") + result_transcripts = self.model.decode( + xs, + ilen, + text_feature=self.text_feature, + decoding_method=decode_config.decoding_method, + beam_size=decode_config.beam_size, + ctc_weight=decode_config.ctc_weight, + decoding_chunk_size=decode_config.decoding_chunk_size, + num_decoding_left_chunks=decode_config.num_decoding_left_chunks, + simulate_streaming=decode_config.simulate_streaming, + reverse_weight=self.reverse_weight) + rsl = result_transcripts[0][0] + utt = Path(self.audio_file).name + logger.info(f"hyp: {utt} {result_transcripts[0][0]}") + # print(self.model) + # print(self.model.forward_encoder_chunk) + # return rsl + + logger.info("-------------start export ----------------------") + batch_size = 1 + feat_dim = 80 + model_size = 512 + num_left_chunks = -1 + logger.info( + f"U2 Export Model Params: batch_size {batch_size}, feat_dim {feat_dim}, model_size {model_size}, num_left_chunks {num_left_chunks}" + ) + + # ######################## self.model.forward_encoder_chunk ############ + # input_spec = [ + # # (T,), int16 + # paddle.static.InputSpec(shape=[None], dtype='int16'), + # ] + # self.model.forward_feature = paddle.jit.to_static( + # self.model.forward_feature, input_spec=input_spec) + + ######################### self.model.forward_encoder_chunk ############ + input_spec = [ + # xs, (B, T, D) + paddle.static.InputSpec( + shape=[batch_size, None, feat_dim], dtype='float32'), + # offset, int, but need be tensor + paddle.static.InputSpec(shape=[1], dtype='int32'), + # required_cache_size, int + num_left_chunks, + # att_cache + paddle.static.InputSpec( + shape=[None, None, None, None], dtype='float32'), + # cnn_cache + paddle.static.InputSpec( + shape=[None, None, None, None], dtype='float32') + ] + self.model.forward_encoder_chunk = paddle.jit.to_static( + self.model.forward_encoder_chunk, input_spec=input_spec) + + ######################### self.model.ctc_activation ######################## + input_spec = [ + # encoder_out, (B,T,D) + paddle.static.InputSpec( + shape=[batch_size, None, model_size], dtype='float32') + ] + self.model.ctc_activation = paddle.jit.to_static( + self.model.ctc_activation, input_spec=input_spec) + + ######################### self.model.forward_attention_decoder ######################## + reverse_weight = 0.3 + input_spec = [ + # hyps, (B, U) + paddle.static.InputSpec(shape=[None, None], dtype='int64'), + # hyps_lens, (B,) + paddle.static.InputSpec(shape=[None], dtype='int64'), + # encoder_out, (B,T,D) + paddle.static.InputSpec( + shape=[batch_size, None, model_size], dtype='float32'), + reverse_weight + ] + self.model.forward_attention_decoder = paddle.jit.to_static( + self.model.forward_attention_decoder, input_spec=input_spec) + ################################################################################ + + # jit save + logger.info(f"export save: {self.args.export_path}") + config = {'is_static': True, 'combine_params':True, 'skip_forward':True} + self.ptq.save_quantized_model(self.model, self.args.export_path) + # paddle.jit.save( + # self.model, + # self.args.export_path, + # combine_params=True, + # skip_forward=True) + + + +def check(audio_file): + if not os.path.isfile(audio_file): + print("Please input the right audio file path") + sys.exit(-1) + + 
logger.info("checking the audio file format......") + try: + sig, sample_rate = soundfile.read(audio_file) + except Exception as e: + logger.error(str(e)) + logger.error( + "can not open the wav file, please check the audio file format") + sys.exit(-1) + logger.info("The sample rate is %d" % sample_rate) + assert (sample_rate == 16000) + logger.info("The audio file format is right") + + +def main(config, args): + U2Infer(config, args).run() + + +if __name__ == "__main__": + parser = default_argument_parser() + # save asr result to + parser.add_argument( + "--result_file", type=str, help="path of save the asr result") + parser.add_argument( + "--audio_file", type=str, help="path of the input audio file") + parser.add_argument( + "--export_path", type=str, default='export', help="path of the input audio file") + args = parser.parse_args() + + config = CfgNode(new_allowed=True) + + if args.config: + config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs + if args.opts: + config.merge_from_list(args.opts) + config.freeze() + main(config, args) From 925abcca2347851af3b90d9e1dca06eb13ab04a2 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Sat, 8 Oct 2022 03:44:13 +0000 Subject: [PATCH 082/113] format --- paddlespeech/s2t/modules/attention.py | 16 +++++++++------- paddlespeech/s2t/modules/encoder.py | 2 +- paddlespeech/s2t/modules/mask.py | 2 +- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/paddlespeech/s2t/modules/attention.py b/paddlespeech/s2t/modules/attention.py index 2166ca8bf..d9ee763f1 100644 --- a/paddlespeech/s2t/modules/attention.py +++ b/paddlespeech/s2t/modules/attention.py @@ -19,8 +19,8 @@ from typing import Tuple import paddle from paddle import nn -from paddle.nn import initializer as I from paddle.nn import functional as F +from paddle.nn import initializer as I from paddlespeech.s2t.modules.align import Linear from paddlespeech.s2t.utils.log import Log @@ -56,12 +56,12 @@ class MultiHeadedAttention(nn.Layer): self.linear_out = Linear(n_feat, n_feat) self.dropout = nn.Dropout(p=dropout_rate) - def _build_once(self, *args, **kwargs): super()._build_once(*args, **kwargs) # if self.self_att: # self.linear_kv = Linear(self.n_feat, self.n_feat*2) - self.weight = paddle.concat([self.linear_k.weight, self.linear_v.weight], axis=-1) + self.weight = paddle.concat( + [self.linear_k.weight, self.linear_v.weight], axis=-1) self.bias = paddle.concat([self.linear_k.bias, self.linear_v.bias]) self._built = True @@ -84,12 +84,14 @@ class MultiHeadedAttention(nn.Layer): (#batch, n_head, time2, d_k). 
""" n_batch = query.shape[0] - + q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k) # k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k) # v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k) - k, v = F.linear(key, self.weight, self.bias).view(n_batch, -1, 2 * self.h, self.d_k).split(2, axis=2) - + k, v = F.linear(key, self.weight, self.bias).view( + n_batch, -1, 2 * self.h, self.d_k).split( + 2, axis=2) + q = q.transpose([0, 2, 1, 3]) # (batch, head, time1, d_k) k = k.transpose([0, 2, 1, 3]) # (batch, head, time2, d_k) v = v.transpose([0, 2, 1, 3]) # (batch, head, time2, d_k) @@ -203,7 +205,7 @@ class MultiHeadedAttention(nn.Layer): new_cache = paddle.concat((k, v), axis=-1) # scores = paddle.matmul(q, - # k.transpose([0, 1, 3, 2])) / math.sqrt(self.d_k) + # k.transpose([0, 1, 3, 2])) / math.sqrt(self.d_k) scores = paddle.matmul(q, k, transpose_y=True) / math.sqrt(self.d_k) return self.forward_attention(v, scores, mask), new_cache diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py index f23d3f140..fd7bd7b9a 100644 --- a/paddlespeech/s2t/modules/encoder.py +++ b/paddlespeech/s2t/modules/encoder.py @@ -221,7 +221,7 @@ class BaseEncoder(nn.Layer): xs, pos_emb, _ = self.embed(xs, tmp_masks, offset=offset) # after embed, xs=(B=1, chunk_size, hidden-dim) - elayers, _, cache_t1, _ = att_cache.shape + elayers, _, cache_t1, _ = att_cache.shape chunk_size = xs.shape[1] attention_key_size = cache_t1 + chunk_size diff --git a/paddlespeech/s2t/modules/mask.py b/paddlespeech/s2t/modules/mask.py index 787a06528..65619eb90 100644 --- a/paddlespeech/s2t/modules/mask.py +++ b/paddlespeech/s2t/modules/mask.py @@ -110,7 +110,7 @@ def subsequent_mask(size: int) -> paddle.Tensor: """ ret = paddle.ones([size, size], dtype=paddle.bool) return paddle.tril(ret) - + def subsequent_chunk_mask( size: int, From abe22e56a48e96dee1b83b71c9ac3babf0afa62e Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Sat, 8 Oct 2022 03:45:37 +0000 Subject: [PATCH 083/113] paddele vertion for u2/u2pp export --- examples/wenetspeech/asr1/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/wenetspeech/asr1/README.md b/examples/wenetspeech/asr1/README.md index 9fc2856ce..5a516f8ea 100644 --- a/examples/wenetspeech/asr1/README.md +++ b/examples/wenetspeech/asr1/README.md @@ -21,6 +21,8 @@ tar cvzf asr1_chunk_conformer_u2_wenetspeech_ckpt_1.1.0.model.tar.gz model.yaml ## Export Static Model +>> Need Paddle >= 2.4 + >> `data/test_meeting/data.list` >> {"input": [{"name": "input1", "shape": [3.2230625, 80], "feat": "/home/PaddleSpeech/dataset/aishell/data_aishell/wav/test/S0764/BAC009S0764W0163.wav", "filetype": "sound"}], "output": [{"name": "target1", "shape": [9, 5538], "text": "\u697c\u5e02\u8c03\u63a7\u5c06\u53bb\u5411\u4f55\u65b9", "token": "\u697c \u5e02 \u8c03 \u63a7 \u5c06 \u53bb \u5411 \u4f55 \u65b9", "tokenid": "1891 1121 3502 1543 1018 477 528 163 1657"}], "utt": "BAC009S0764W0163", "utt2spk": "S0764"} From e86337a4233d9bfa8b802a8cfd218e9c9637e158 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Sat, 8 Oct 2022 03:49:19 +0000 Subject: [PATCH 084/113] fix bug --- paddlespeech/s2t/exps/u2/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index 4208d389e..d093821d8 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -350,7 +350,7 @@ class U2Tester(U2Trainer): ctc_weight=decode_config.ctc_weight, 
decoding_chunk_size=decode_config.decoding_chunk_size, num_decoding_left_chunks=decode_config.num_decoding_left_chunks, - simulate_streaming=decode_config.simulate_streaming + simulate_streaming=decode_config.simulate_streaming, reverse_weight=decode_config.reverse_weight) decode_time = time.time() - start_time From 0359c3f6b5f3f4567810ea1f1d01deaa7b8f9149 Mon Sep 17 00:00:00 2001 From: liangym <34430015+lym0302@users.noreply.github.com> Date: Sat, 8 Oct 2022 11:53:42 +0800 Subject: [PATCH 085/113] Fix mix front (#2493) * update mix frontend, test=tts --- paddlespeech/t2s/frontend/mix_frontend.py | 216 ++++------------------ 1 file changed, 39 insertions(+), 177 deletions(-) diff --git a/paddlespeech/t2s/frontend/mix_frontend.py b/paddlespeech/t2s/frontend/mix_frontend.py index 101a1e503..19c98d53f 100644 --- a/paddlespeech/t2s/frontend/mix_frontend.py +++ b/paddlespeech/t2s/frontend/mix_frontend.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import re from typing import Dict from typing import List @@ -30,7 +29,6 @@ class MixFrontend(): self.zh_frontend = Frontend( phone_vocab_path=phone_vocab_path, tone_vocab_path=tone_vocab_path) self.en_frontend = English(phone_vocab_path=phone_vocab_path) - self.SENTENCE_SPLITOR = re.compile(r'([:、,;。?!,;?!][”’]?)') self.sp_id = self.zh_frontend.vocab_phones["sp"] self.sp_id_tensor = paddle.to_tensor([self.sp_id]) @@ -47,188 +45,56 @@ class MixFrontend(): else: return False - def is_number(self, char): - if char >= '\u0030' and char <= '\u0039': - return True - else: - return False - def is_other(self, char): - if not (self.is_chinese(char) or self.is_number(char) or - self.is_alphabet(char)): + if not (self.is_chinese(char) or self.is_alphabet(char)): return True else: return False - def is_end(self, before_char, after_char) -> bool: - flag = 0 - for char in (before_char, after_char): - if self.is_alphabet(char) or char == " ": - flag += 1 - if flag == 2: - return True - else: - return False - - def _replace(self, text: str) -> str: - new_text = "" - - # get "." indexs - point = "." - point_indexs = [] - index = -1 - for i in range(text.count(point)): - index = text.find(".", index + 1, len(text)) - point_indexs.append(index) - - # replace "." -> "。" when English sentence ending - if len(point_indexs) == 0: - new_text = text - - elif len(point_indexs) == 1: - point_index = point_indexs[0] - if point_index == 0 or point_index == len(text) - 1: - new_text = text - else: - if not self.is_end(text[point_index - 1], text[point_index + - 1]): - new_text = text - else: - new_text = text[:point_index] + "。" + text[point_index + 1:] - - elif len(point_indexs) == 2: - first_index = point_indexs[0] - end_index = point_indexs[1] - - # first - if first_index != 0: - if not self.is_end(text[first_index - 1], text[first_index + - 1]): - new_text += (text[:first_index] + ".") - else: - new_text += (text[:first_index] + "。") - else: - new_text += "." - # last - if end_index != len(text) - 1: - if not self.is_end(text[end_index - 1], text[end_index + 1]): - new_text += text[point_indexs[-2] + 1:] - else: - new_text += (text[point_indexs[-2] + 1:end_index] + "。" + - text[end_index + 1:]) - else: - new_text += "." 
- - else: - first_index = point_indexs[0] - end_index = point_indexs[-1] - # first - if first_index != 0: - if not self.is_end(text[first_index - 1], text[first_index + - 1]): - new_text += (text[:first_index] + ".") - else: - new_text += (text[:first_index] + "。") - else: - new_text += "." - # middle - for j in range(1, len(point_indexs) - 1): - point_index = point_indexs[j] - if not self.is_end(text[point_index - 1], text[point_index + - 1]): - new_text += ( - text[point_indexs[j - 1] + 1:point_index] + ".") - else: - new_text += ( - text[point_indexs[j - 1] + 1:point_index] + "。") - # last - if end_index != len(text) - 1: - if not self.is_end(text[end_index - 1], text[end_index + 1]): - new_text += text[point_indexs[-2] + 1:] - else: - new_text += (text[point_indexs[-2] + 1:end_index] + "。" + - text[end_index + 1:]) - else: - new_text += "." - - return new_text - - def _split(self, text: str) -> List[str]: - text = re.sub(r'[《》【】<=>{}()()#&@“”^_|…\\]', '', text) - # 替换英文句子的句号 "." --> "。" 用于后续分句 - text = self._replace(text) - text = self.SENTENCE_SPLITOR.sub(r'\1\n', text) - text = text.strip() - sentences = [sentence.strip() for sentence in re.split(r'\n+', text)] - return sentences - - def _distinguish(self, text: str) -> List[str]: + def get_segment(self, text: str) -> List[str]: # sentence --> [ch_part, en_part, ch_part, ...] - segments = [] types = [] - flag = 0 temp_seg = "" temp_lang = "" # Determine the type of each character. type: blank, chinese, alphabet, number, unk and point. for ch in text: - if ch == ".": - types.append("point") - elif self.is_chinese(ch): + if self.is_chinese(ch): types.append("zh") elif self.is_alphabet(ch): types.append("en") - elif ch == " ": - types.append("blank") - elif self.is_number(ch): - types.append("num") else: - types.append("unk") + types.append("other") assert len(types) == len(text) for i in range(len(types)): - # find the first char of the seg if flag == 0: - # 首个字符是中文,英文或者数字 - if types[i] == "zh" or types[i] == "en" or types[i] == "num": - temp_seg += text[i] - temp_lang = types[i] - flag = 1 + temp_seg += text[i] + temp_lang = types[i] + flag = 1 else: - # 数字和小数点均与前面的字符合并,类型属于前面一个字符的类型 - if types[i] == temp_lang or types[i] == "num" or types[ - i] == "point": - temp_seg += text[i] - - # 数字与后面的任意字符都拼接 - elif temp_lang == "num": - temp_seg += text[i] - if types[i] == "zh" or types[i] == "en": + if temp_lang == "other": + if types[i] == temp_lang: + temp_seg += text[i] + else: + temp_seg += text[i] temp_lang = types[i] - # 如果是空格则与前面字符拼接 - elif types[i] == "blank": - temp_seg += text[i] - - elif types[i] == "unk": - pass - else: - segments.append((temp_seg, temp_lang)) - - if types[i] == "zh" or types[i] == "en": + if types[i] == temp_lang: + temp_seg += text[i] + elif types[i] == "other": + temp_seg += text[i] + else: + segments.append((temp_seg, temp_lang)) temp_seg = text[i] temp_lang = types[i] flag = 1 - else: - flag = 0 - temp_seg = "" - temp_lang = "" segments.append((temp_seg, temp_lang)) @@ -241,34 +107,30 @@ class MixFrontend(): add_sp: bool=True, to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]: - sentences = self._split(sentence) + segments = self.get_segment(sentence) + phones_list = [] result = {} - for text in sentences: - phones_seg = [] - segments = self._distinguish(text) - for seg in segments: - content = seg[0] - lang = seg[1] - if content != '': - if lang == "en": - input_ids = self.en_frontend.get_input_ids( - content, merge_sentences=True, to_tensor=to_tensor) - else: - input_ids = 
self.zh_frontend.get_input_ids( - content, - merge_sentences=True, - get_tone_ids=get_tone_ids, - to_tensor=to_tensor) - phones_seg.append(input_ids["phone_ids"][0]) - if add_sp: - phones_seg.append(self.sp_id_tensor) - - if phones_seg == []: - phones_seg.append(self.sp_id_tensor) - phones = paddle.concat(phones_seg) - phones_list.append(phones) + for seg in segments: + content = seg[0] + lang = seg[1] + if content != '': + if lang == "en": + input_ids = self.en_frontend.get_input_ids( + content, merge_sentences=False, to_tensor=to_tensor) + else: + input_ids = self.zh_frontend.get_input_ids( + content, + merge_sentences=False, + get_tone_ids=get_tone_ids, + to_tensor=to_tensor) + if add_sp: + input_ids["phone_ids"][-1] = paddle.concat( + [input_ids["phone_ids"][-1], self.sp_id_tensor]) + + for phones in input_ids["phone_ids"]: + phones_list.append(phones) if merge_sentences: merge_list = paddle.concat(phones_list) From 1f4f98b171b490133c75e98e199e1ff4beb21962 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Sat, 8 Oct 2022 06:34:39 +0000 Subject: [PATCH 086/113] fix bug --- paddlespeech/s2t/exps/u2/bin/quant.py | 18 ++++++++++----- paddlespeech/s2t/models/u2/u2.py | 22 +++++++++---------- .../engine/asr/online/python/asr_engine.py | 1 - 3 files changed, 22 insertions(+), 19 deletions(-) diff --git a/paddlespeech/s2t/exps/u2/bin/quant.py b/paddlespeech/s2t/exps/u2/bin/quant.py index 907d79e5c..225bbf6db 100644 --- a/paddlespeech/s2t/exps/u2/bin/quant.py +++ b/paddlespeech/s2t/exps/u2/bin/quant.py @@ -18,6 +18,7 @@ from pathlib import Path import paddle import soundfile +from paddleslim import PTQ from yacs.config import CfgNode from paddlespeech.audio.transform.transformation import Transformation @@ -26,7 +27,6 @@ from paddlespeech.s2t.models.u2 import U2Model from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.log import Log from paddlespeech.s2t.utils.utility import UpdateConfig -from paddleslim import PTQ logger = Log(__name__).getlog() @@ -90,14 +90,14 @@ class U2Infer(): ctc_weight=decode_config.ctc_weight, decoding_chunk_size=decode_config.decoding_chunk_size, num_decoding_left_chunks=decode_config.num_decoding_left_chunks, - simulate_streaming=decode_config.simulate_streaming + simulate_streaming=decode_config.simulate_streaming, reverse_weight=decode_config.reverse_weight) rsl = result_transcripts[0][0] utt = Path(self.audio_file).name logger.info(f"hyp: {utt} {rsl}") # print(self.model) # print(self.model.forward_encoder_chunk) - + logger.info("-------------start quant ----------------------") batch_size = 1 feat_dim = 80 @@ -161,7 +161,11 @@ class U2Infer(): # jit save logger.info(f"export save: {self.args.export_path}") - config = {'is_static': True, 'combine_params':True, 'skip_forward':True} + config = { + 'is_static': True, + 'combine_params': True, + 'skip_forward': True + } self.ptq.save_quantized_model(self.model, self.args.export_path) # paddle.jit.save( # self.model, @@ -169,7 +173,6 @@ class U2Infer(): # combine_params=True, # skip_forward=True) - def check(audio_file): if not os.path.isfile(audio_file): @@ -201,7 +204,10 @@ if __name__ == "__main__": parser.add_argument( "--audio_file", type=str, help="path of the input audio file") parser.add_argument( - "--export_path", type=str, default='export', help="path of the input audio file") + "--export_path", + type=str, + default='export', + help="path of the input audio file") args = parser.parse_args() config = CfgNode(new_allowed=True) diff --git 
a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 5cdcae06f..544c1e836 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -131,7 +131,8 @@ class U2BaseModel(ASRInterface, nn.Layer): if self.ctc_weight != 1.0: start = time.time() loss_att, acc_att = self._calc_att_loss(encoder_out, encoder_mask, - text, text_lengths, self.reverse_weight) + text, text_lengths, + self.reverse_weight) decoder_time = time.time() - start #logger.debug(f"decoder time: {decoder_time}") @@ -152,13 +153,12 @@ class U2BaseModel(ASRInterface, nn.Layer): loss = self.ctc_weight * loss_ctc + (1 - self.ctc_weight) * loss_att return loss, loss_att, loss_ctc - def _calc_att_loss( - self, - encoder_out: paddle.Tensor, - encoder_mask: paddle.Tensor, - ys_pad: paddle.Tensor, - ys_pad_lens: paddle.Tensor, - reverse_weight: float) -> Tuple[paddle.Tensor, float]: + def _calc_att_loss(self, + encoder_out: paddle.Tensor, + encoder_mask: paddle.Tensor, + ys_pad: paddle.Tensor, + ys_pad_lens: paddle.Tensor, + reverse_weight: float) -> Tuple[paddle.Tensor, float]: """Calc attention loss. Args: @@ -188,8 +188,7 @@ class U2BaseModel(ASRInterface, nn.Layer): r_loss_att = paddle.to_tensor(0.0) if reverse_weight > 0.0: r_loss_att = self.criterion_att(r_decoder_out, r_ys_out_pad) - loss_att = loss_att * (1 - reverse_weight - ) + r_loss_att * reverse_weight + loss_att = loss_att * (1 - reverse_weight) + r_loss_att * reverse_weight acc_att = th_accuracy( decoder_out.view(-1, self.vocab_size), ys_out_pad, @@ -599,8 +598,7 @@ class U2BaseModel(ASRInterface, nn.Layer): f"hyp {i} len {len(hyp[0])} r2l score: {r_score} ctc_score: {hyp[1]} reverse_weight: {reverse_weight}" ) - score = score * (1 - reverse_weight - ) + r_score * reverse_weight + score = score * (1 - reverse_weight) + r_score * reverse_weight # add ctc score (which in ln domain) score += hyp[1] * ctc_weight if score > best_score: diff --git a/paddlespeech/server/engine/asr/online/python/asr_engine.py b/paddlespeech/server/engine/asr/online/python/asr_engine.py index 27eda7ef6..67bbb4d48 100644 --- a/paddlespeech/server/engine/asr/online/python/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/python/asr_engine.py @@ -22,7 +22,6 @@ from numpy import float32 from yacs.config import CfgNode from paddlespeech.audio.transform.transformation import Transformation -from paddlespeech.audio.utils.tensor_utils import st_reverse_pad_list from paddlespeech.cli.asr.infer import ASRExecutor from paddlespeech.cli.log import logger from paddlespeech.resource import CommonTaskResource From 9277fcb8a85d7a064f90eebdc7f9ba547abec13e Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Sat, 8 Oct 2022 08:15:51 +0000 Subject: [PATCH 087/113] fix attn can not train --- paddlespeech/s2t/modules/attention.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/paddlespeech/s2t/modules/attention.py b/paddlespeech/s2t/modules/attention.py index d9ee763f1..128f87c07 100644 --- a/paddlespeech/s2t/modules/attention.py +++ b/paddlespeech/s2t/modules/attention.py @@ -60,9 +60,10 @@ class MultiHeadedAttention(nn.Layer): super()._build_once(*args, **kwargs) # if self.self_att: # self.linear_kv = Linear(self.n_feat, self.n_feat*2) - self.weight = paddle.concat( - [self.linear_k.weight, self.linear_v.weight], axis=-1) - self.bias = paddle.concat([self.linear_k.bias, self.linear_v.bias]) + if not self.training: + self.weight = paddle.concat( + [self.linear_k.weight, self.linear_v.weight], axis=-1) + self.bias = 
paddle.concat([self.linear_k.bias, self.linear_v.bias])
         self._built = True
 
     def forward_qkv(self,
@@ -86,11 +87,13 @@ class MultiHeadedAttention(nn.Layer):
         n_batch = query.shape[0]
 
         q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k)
-        # k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k)
-        # v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k)
-        k, v = F.linear(key, self.weight, self.bias).view(
-            n_batch, -1, 2 * self.h, self.d_k).split(
-                2, axis=2)
+        if self.training:
+            k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k)
+            v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k)
+        else:
+            k, v = F.linear(key, self.weight, self.bias).view(
+                n_batch, -1, 2 * self.h, self.d_k).split(
+                    2, axis=2)
 
         q = q.transpose([0, 2, 1, 3])  # (batch, head, time1, d_k)
         k = k.transpose([0, 2, 1, 3])  # (batch, head, time2, d_k)

From 9277fcb8a85d7a064f90eebdc7f9ba547abec13e Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Sat, 8 Oct 2022 09:07:30 +0000
Subject: [PATCH 088/113] fix masked_fill which produces nan in training

---
 paddlespeech/s2t/__init__.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/paddlespeech/s2t/__init__.py b/paddlespeech/s2t/__init__.py
index 4507365d6..6663bcf87 100644
--- a/paddlespeech/s2t/__init__.py
+++ b/paddlespeech/s2t/__init__.py
@@ -166,8 +166,19 @@ def broadcast_shape(shp1, shp2):
 def masked_fill(xs: paddle.Tensor,
                 mask: paddle.Tensor,
                 value: Union[float, int]):
-    mask = mask.astype(xs.dtype)
-    return xs * (1.0 - mask) + mask * value
+    # `xs * (1.0 - mask) + mask * value` will be nan when value is `inf`.
+    # mask = mask.astype(xs.dtype)
+    # return xs * (1.0 - mask) + mask * value
+
+    bshape = broadcast_shape(xs.shape, mask.shape)
+    mask.stop_gradient = True
+    # tmp = paddle.ones(shape=[len(bshape)], dtype='int32')
+    # for index in range(len(bshape)):
+    #     tmp[index] = bshape[index]
+    mask = mask.broadcast_to(bshape)
+    trues = paddle.full_like(xs, fill_value=value)
+    xs = paddle.where(mask, trues, xs)
+    return xs
 
 
 if not hasattr(paddle.Tensor, 'masked_fill'):

From cda440e6f0bfcc964727cf4b652ffe5a97f072d7 Mon Sep 17 00:00:00 2001
From: tianhao zhang <15600919271@163.com>
Date: Sun, 9 Oct 2022 01:46:44 +0000
Subject: [PATCH 089/113] use reverse_weight in decode.yaml

---
 docs/source/released_model.md                      |  2 +-
 paddlespeech/resource/pretrained_models.py         | 12 ++++++------
 paddlespeech/s2t/exps/u2/bin/quant.py              |  4 ++--
 paddlespeech/s2t/exps/u2/bin/test_wav.py           |  3 ++-
 paddlespeech/s2t/exps/u2/model.py                  |  3 ++-
 .../server/engine/asr/online/python/asr_engine.py  |  8 ++++----
 6 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/docs/source/released_model.md b/docs/source/released_model.md
index bdac2c5bb..a2456f1fe 100644
--- a/docs/source/released_model.md
+++ b/docs/source/released_model.md
@@ -9,7 +9,7 @@ Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER |
 [Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz) | Aishell Dataset | Char-based | 491 MB | 2 Conv + 5 LSTM layers | 0.0666 |-| 151 h | [D2 Online Aishell ASR0](../../examples/aishell/asr0) | onnx/inference/python |
 [Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_offline_aishell_ckpt_1.0.1.model.tar.gz)| Aishell Dataset | Char-based | 1.4 GB | 2 Conv + 5 bidirectional LSTM layers| 0.0554 |-| 151 h | [Ds2 Offline Aishell ASR0](../../examples/aishell/asr0) | inference/python |
 [Conformer Online Wenetspeech ASR1 
Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar.gz) | WenetSpeech Dataset | Char-based | 457 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.11 (test\_net) 0.1879 (test\_meeting) |-| 10000 h |- | python | -[Conformer U2PP Online Wenetspeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.1.1.model.tar.gz) | WenetSpeech Dataset | Char-based | 476 MB | Encoder:Conformer, Decoder:BiTransformer, Decoding method: Attention rescoring| 0.047198 (aishell test\_-1) 0.059212 (aishell test\_16) |-| 10000 h |- | python | +[Conformer U2PP Online Wenetspeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.1.4.model.tar.gz) | WenetSpeech Dataset | Char-based | 476 MB | Encoder:Conformer, Decoder:BiTransformer, Decoding method: Attention rescoring| 0.047198 (aishell test\_-1) 0.059212 (aishell test\_16) |-| 10000 h |- | python | [Conformer Online Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_chunk_conformer_aishell_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.0544 |-| 151 h | [Conformer Online Aishell ASR1](../../examples/aishell/asr1) | python | [Conformer Offline Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_1.0.1.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0460 |-| 151 h | [Conformer Offline Aishell ASR1](../../examples/aishell/asr1) | python | [Transformer Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_transformer_aishell_ckpt_0.1.1.model.tar.gz) | Aishell Dataset | Char-based | 128 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0523 || 151 h | [Transformer Aishell ASR1](../../examples/aishell/asr1) | python | diff --git a/paddlespeech/resource/pretrained_models.py b/paddlespeech/resource/pretrained_models.py index 0103651bc..55f7eff19 100644 --- a/paddlespeech/resource/pretrained_models.py +++ b/paddlespeech/resource/pretrained_models.py @@ -69,11 +69,11 @@ asr_dynamic_pretrained_models = { }, }, "conformer_u2pp_wenetspeech-zh-16k": { - '1.1': { + '1.3': { 'url': - 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.1.1.model.tar.gz', + 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.1.3.model.tar.gz', 'md5': - 'eae678c04ed3b3f89672052fdc0c5e10', + '662b347e1d2131b7a4dc5398365e2134', 'cfg_path': 'model.yaml', 'ckpt_path': @@ -89,11 +89,11 @@ asr_dynamic_pretrained_models = { }, }, "conformer_u2pp_online_wenetspeech-zh-16k": { - '1.1': { + '1.4': { 'url': - 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.1.2.model.tar.gz', + 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.1.4.model.tar.gz', 'md5': - '925d047e9188dea7f421a718230c9ae3', + '3100fc1eac5779486cab859366992d0b', 'cfg_path': 'model.yaml', 'ckpt_path': diff --git a/paddlespeech/s2t/exps/u2/bin/quant.py b/paddlespeech/s2t/exps/u2/bin/quant.py index 225bbf6db..c38134c57 100644 --- a/paddlespeech/s2t/exps/u2/bin/quant.py +++ 
b/paddlespeech/s2t/exps/u2/bin/quant.py @@ -39,7 +39,6 @@ class U2Infer(): self.preprocess_conf = config.preprocess_config self.preprocess_args = {"train": False} self.preprocessing = Transformation(self.preprocess_conf) - self.reverse_weight = getattr(config.model_conf, 'reverse_weight', 0.0) self.text_feature = TextFeaturizer( unit_type=config.unit_type, vocab=config.vocab_filepath, @@ -81,6 +80,7 @@ class U2Infer(): xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(0) decode_config = self.config.decode logger.info(f"decode cfg: {decode_config}") + reverse_weight = getattr(decode_config, 'reverse_weight', 0.0) result_transcripts = self.model.decode( xs, ilen, @@ -91,7 +91,7 @@ class U2Infer(): decoding_chunk_size=decode_config.decoding_chunk_size, num_decoding_left_chunks=decode_config.num_decoding_left_chunks, simulate_streaming=decode_config.simulate_streaming, - reverse_weight=decode_config.reverse_weight) + reverse_weight=reverse_weight) rsl = result_transcripts[0][0] utt = Path(self.audio_file).name logger.info(f"hyp: {utt} {rsl}") diff --git a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py index 2e067ab6b..d12ea3646 100644 --- a/paddlespeech/s2t/exps/u2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py @@ -79,6 +79,7 @@ class U2Infer(): xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(0) decode_config = self.config.decode logger.info(f"decode cfg: {decode_config}") + reverse_weight = getattr(decode_config, 'reverse_weight', 0.0) result_transcripts = self.model.decode( xs, ilen, @@ -89,7 +90,7 @@ class U2Infer(): decoding_chunk_size=decode_config.decoding_chunk_size, num_decoding_left_chunks=decode_config.num_decoding_left_chunks, simulate_streaming=decode_config.simulate_streaming, - reverse_weight=decode_config.reverse_weight) + reverse_weight=reverse_weight) rsl = result_transcripts[0][0] utt = Path(self.audio_file).name logger.info(f"hyp: {utt} {result_transcripts[0][0]}") diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index d093821d8..5b7654d4a 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -337,6 +337,7 @@ class U2Tester(U2Trainer): errors_sum, len_refs, num_ins = 0.0, 0, 0 errors_func = error_rate.char_errors if decode_config.error_rate_type == 'cer' else error_rate.word_errors error_rate_func = error_rate.cer if decode_config.error_rate_type == 'cer' else error_rate.wer + reverse_weight = getattr(decode_config, 'reverse_weight', 0.0) start_time = time.time() target_transcripts = self.id2token(texts, texts_len, self.text_feature) @@ -351,7 +352,7 @@ class U2Tester(U2Trainer): decoding_chunk_size=decode_config.decoding_chunk_size, num_decoding_left_chunks=decode_config.num_decoding_left_chunks, simulate_streaming=decode_config.simulate_streaming, - reverse_weight=decode_config.reverse_weight) + reverse_weight=reverse_weight) decode_time = time.time() - start_time for utt, target, result, rec_tids in zip( diff --git a/paddlespeech/server/engine/asr/online/python/asr_engine.py b/paddlespeech/server/engine/asr/online/python/asr_engine.py index 67bbb4d48..536ffe0a9 100644 --- a/paddlespeech/server/engine/asr/online/python/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/python/asr_engine.py @@ -580,6 +580,7 @@ class PaddleASRConnectionHanddler: self.update_result() beam_size = self.ctc_decode_config.beam_size + reverse_weight = getattr(self.ctc_decode_config, 'reverse_weight', 0.0) hyps = self.searcher.get_hyps() if hyps is None or 
len(hyps) == 0: logger.info("No Hyps!") @@ -613,7 +614,7 @@ class PaddleASRConnectionHanddler: # ctc score in ln domain # (beam_size, max_hyps_len, vocab_size) decoder_out, r_decoder_out = self.model.forward_attention_decoder( - hyps_pad, hyps_lens, self.encoder_out, self.model.reverse_weight) + hyps_pad, hyps_lens, self.encoder_out, reverse_weight) decoder_out = decoder_out.numpy() # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a @@ -631,13 +632,12 @@ class PaddleASRConnectionHanddler: # last decoder output token is `eos`, for laste decoder input token. score += decoder_out[i][len(hyp[0])][self.model.eos] - if self.model.reverse_weight > 0: + if reverse_weight > 0: r_score = 0.0 for j, w in enumerate(hyp[0]): r_score += r_decoder_out[i][len(hyp[0]) - j - 1][w] r_score += r_decoder_out[i][len(hyp[0])][self.model.eos] - score = score * (1 - self.model.reverse_weight - ) + r_score * self.model.reverse_weight + score = score * (1 - reverse_weight) + r_score * reverse_weight # add ctc score (which in ln domain) score += hyp[1] * self.ctc_decode_config.ctc_weight From 5a66a14659b5839e93afc315fc0d8b1ff4efeba8 Mon Sep 17 00:00:00 2001 From: tianhao zhang <15600919271@163.com> Date: Sun, 9 Oct 2022 02:31:14 +0000 Subject: [PATCH 090/113] fix u2pp model version number --- paddlespeech/resource/pretrained_models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddlespeech/resource/pretrained_models.py b/paddlespeech/resource/pretrained_models.py index 55f7eff19..efd6bb3f2 100644 --- a/paddlespeech/resource/pretrained_models.py +++ b/paddlespeech/resource/pretrained_models.py @@ -69,7 +69,7 @@ asr_dynamic_pretrained_models = { }, }, "conformer_u2pp_wenetspeech-zh-16k": { - '1.3': { + '1.1': { 'url': 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.1.3.model.tar.gz', 'md5': @@ -89,7 +89,7 @@ asr_dynamic_pretrained_models = { }, }, "conformer_u2pp_online_wenetspeech-zh-16k": { - '1.4': { + '1.1': { 'url': 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.1.4.model.tar.gz', 'md5': From d2999ba21dd2480e51f5ef892d24557ff780d468 Mon Sep 17 00:00:00 2001 From: tianhao zhang <15600919271@163.com> Date: Sun, 9 Oct 2022 11:39:32 +0000 Subject: [PATCH 091/113] update install.md --- README.md | 6 +++--- docs/source/install.md | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 72db64b7d..d3eccdc92 100644 --- a/README.md +++ b/README.md @@ -183,19 +183,19 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision ## Installation -We strongly recommend our users to install PaddleSpeech in **Linux** with *python>=3.7* and *paddlepaddle>=2.3.1*. +We strongly recommend our users to install PaddleSpeech in **Linux** with *python>=3.7* and *paddlepaddle>=2.4rc*. ### **Dependency Introduction** + gcc >= 4.8.5 -+ paddlepaddle >= 2.3.1 ++ paddlepaddle >= 2.4rc + python >= 3.7 + OS support: Linux(recommend), Windows, Mac OSX PaddleSpeech depends on paddlepaddle. For installation, please refer to the official website of [paddlepaddle](https://www.paddlepaddle.org.cn/en) and choose according to your own machine. Here is an example of the cpu version. 
```bash -pip install paddlepaddle -i https://mirror.baidu.com/pypi/simple +pip install paddlepaddle==0.0.0 -f https://www.paddlepaddle.org.cn/whl/linux/cpu-mkl/develop.html ``` There are two quick installation methods for PaddleSpeech, one is pip installation, and the other is source code compilation (recommended). diff --git a/docs/source/install.md b/docs/source/install.md index 6a9ff3bc8..f789b37d2 100644 --- a/docs/source/install.md +++ b/docs/source/install.md @@ -58,7 +58,7 @@ pip install pytest-runner -i https://pypi.tuna.tsinghua.edu.cn/simple ``` Then you can use the following commands: ```bash -pip install paddlepaddle -i https://mirror.baidu.com/pypi/simple +pip install paddlepaddle==0.0.0 -f https://www.paddlepaddle.org.cn/whl/linux/cpu-mkl/develop.html pip install paddlespeech -i https://pypi.tuna.tsinghua.edu.cn/simple ``` > If you encounter problem with downloading **nltk_data** while using paddlespeech, it maybe due to your poor network, we suggest you download the [nltk_data](https://paddlespeech.bj.bcebos.com/Parakeet/tools/nltk_data.tar.gz) provided by us, and extract it to your `${HOME}`. @@ -117,9 +117,9 @@ conda install -y -c gcc_linux-64=8.4.0 gxx_linux-64=8.4.0 ``` (Hip: Do not use the last script if you want to install by **Hard** way): ### Install PaddlePaddle -You can choose the `PaddlePaddle` version based on your system. For example, for CUDA 10.2, CuDNN7.5 install paddlepaddle-gpu 2.3.1: +You can choose the `PaddlePaddle` version based on your system. For example, for CUDA 10.2, CuDNN7.5 install paddlepaddle-gpu develop: ```bash -python3 -m pip install paddlepaddle-gpu==2.3.1 -i https://mirror.baidu.com/pypi/simple +python3 -m pip install paddlepaddle-gpu==0.0.0.post102 -f https://www.paddlepaddle.org.cn/whl/linux/gpu/develop.html ``` ### Install PaddleSpeech You can install `paddlespeech` by the following command,then you can use the `ready-made` examples in `paddlespeech` : @@ -180,9 +180,9 @@ Some users may fail to install `kaldiio` due to the default download source, you ```bash pip install pytest-runner -i https://pypi.tuna.tsinghua.edu.cn/simple ``` -Make sure you have GPU and the paddlepaddle version is right. For example, for CUDA 10.2, CuDNN7.5 install paddle 2.3.1: +Make sure you have GPU and the paddlepaddle version is right. 
For example, for CUDA 10.2, CuDNN7.5 install paddle develop: ```bash -python3 -m pip install paddlepaddle-gpu==2.3.1 -i https://mirror.baidu.com/pypi/simple +python3 -m pip install paddlepaddle-gpu==0.0.0.post102 -f https://www.paddlepaddle.org.cn/whl/linux/gpu/develop.html ``` ### Install PaddleSpeech in Developing Mode ```bash From e3672427650f451faae87dd6e226ad9fc6c9793e Mon Sep 17 00:00:00 2001 From: tianhao zhang <15600919271@163.com> Date: Mon, 10 Oct 2022 06:11:22 +0000 Subject: [PATCH 092/113] update dependency of paddle --- README.md | 2 +- README_cn.md | 4 ++-- demos/speech_server/README.md | 2 +- demos/speech_server/README_cn.md | 2 +- demos/streaming_asr_server/README.md | 2 +- demos/streaming_asr_server/README_cn.md | 2 +- demos/streaming_tts_server/README.md | 2 +- demos/streaming_tts_server/README_cn.md | 2 +- docker/ubuntu16-gpu/Dockerfile | 2 +- docs/source/install.md | 14 ++++++++++--- docs/source/install_cn.md | 20 +++++++++++++------ .../server/tests/asr/online/README.md | 4 ++-- .../server/tests/asr/online/README_cn.md | 2 +- 13 files changed, 38 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index d3eccdc92..c80a31fde 100644 --- a/README.md +++ b/README.md @@ -195,7 +195,7 @@ We strongly recommend our users to install PaddleSpeech in **Linux** with *pytho PaddleSpeech depends on paddlepaddle. For installation, please refer to the official website of [paddlepaddle](https://www.paddlepaddle.org.cn/en) and choose according to your own machine. Here is an example of the cpu version. ```bash -pip install paddlepaddle==0.0.0 -f https://www.paddlepaddle.org.cn/whl/linux/cpu-mkl/develop.html +pip install paddlepaddle==2.4.0rc0 -i https://mirror.baidu.com/pypi/simple ``` There are two quick installation methods for PaddleSpeech, one is pip installation, and the other is source code compilation (recommended). diff --git a/README_cn.md b/README_cn.md index 725f7eda1..49f42dae5 100644 --- a/README_cn.md +++ b/README_cn.md @@ -215,14 +215,14 @@ ### 相关依赖 + gcc >= 4.8.5 -+ paddlepaddle >= 2.3.1 ++ paddlepaddle >= 2.4rc + python >= 3.7 + linux(推荐), mac, windows PaddleSpeech 依赖于 paddlepaddle,安装可以参考[ paddlepaddle 官网](https://www.paddlepaddle.org.cn/),根据自己机器的情况进行选择。这里给出 cpu 版本示例,其它版本大家可以根据自己机器的情况进行安装。 ```shell -pip install paddlepaddle -i https://mirror.baidu.com/pypi/simple +pip install paddlepaddle==2.4.0rc0 -i https://mirror.baidu.com/pypi/simple ``` PaddleSpeech 快速安装方式有两种,一种是 pip 安装,一种是源码编译(推荐)。 diff --git a/demos/speech_server/README.md b/demos/speech_server/README.md index e400f7e74..7e7d4b2c5 100644 --- a/demos/speech_server/README.md +++ b/demos/speech_server/README.md @@ -13,7 +13,7 @@ For service interface definition, please check: ### 1. Installation see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). -It is recommended to use **paddlepaddle 2.3.1** or above. +It is recommended to use **paddlepaddle 2.4rc** or above. You can choose one way from easy, meduim and hard to install paddlespeech. diff --git a/demos/speech_server/README_cn.md b/demos/speech_server/README_cn.md index 628468c83..594928281 100644 --- a/demos/speech_server/README_cn.md +++ b/demos/speech_server/README_cn.md @@ -14,7 +14,7 @@ ### 1. 安装 请看 [安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). 
-推荐使用 **paddlepaddle 2.3.1** 或以上版本。 +推荐使用 **paddlepaddle 2.4rc** 或以上版本。 你可以从简单,中等,困难 几种方式中选择一种方式安装 PaddleSpeech。 diff --git a/demos/streaming_asr_server/README.md b/demos/streaming_asr_server/README.md index a97486757..5eef82866 100644 --- a/demos/streaming_asr_server/README.md +++ b/demos/streaming_asr_server/README.md @@ -14,7 +14,7 @@ Streaming ASR server only support `websocket` protocol, and doesn't support `htt ### 1. Installation see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). -It is recommended to use **paddlepaddle 2.3.1** or above. +It is recommended to use **paddlepaddle 2.4rc** or above. You can choose one way from easy, meduim and hard to install paddlespeech. diff --git a/demos/streaming_asr_server/README_cn.md b/demos/streaming_asr_server/README_cn.md index 267367729..1902a2fa9 100644 --- a/demos/streaming_asr_server/README_cn.md +++ b/demos/streaming_asr_server/README_cn.md @@ -14,7 +14,7 @@ ### 1. 安装 安装 PaddleSpeech 的详细过程请看 [安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md)。 -推荐使用 **paddlepaddle 2.3.1** 或以上版本。 +推荐使用 **paddlepaddle 2.4rc** 或以上版本。 你可以从简单,中等,困难 几种方式中选择一种方式安装 PaddleSpeech。 diff --git a/demos/streaming_tts_server/README.md b/demos/streaming_tts_server/README.md index 15448a46f..ca5d6f1f8 100644 --- a/demos/streaming_tts_server/README.md +++ b/demos/streaming_tts_server/README.md @@ -13,7 +13,7 @@ For service interface definition, please check: ### 1. Installation see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). -It is recommended to use **paddlepaddle 2.3.1** or above. +It is recommended to use **paddlepaddle 2.4rc** or above. You can choose one way from easy, meduim and hard to install paddlespeech. diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md index b99155bca..125f37033 100644 --- a/demos/streaming_tts_server/README_cn.md +++ b/demos/streaming_tts_server/README_cn.md @@ -12,7 +12,7 @@ ### 1. 安装 请看 [安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). 
-推荐使用 **paddlepaddle 2.3.1** 或以上版本。 +推荐使用 **paddlepaddle 2.4rc** 或以上版本。 你可以从简单,中等,困难 几种方式中选择一种方式安装 PaddleSpeech。 diff --git a/docker/ubuntu16-gpu/Dockerfile b/docker/ubuntu16-gpu/Dockerfile index f275471ee..a8c11e37b 100644 --- a/docker/ubuntu16-gpu/Dockerfile +++ b/docker/ubuntu16-gpu/Dockerfile @@ -62,7 +62,7 @@ RUN mkdir -p ~/.pip && echo "[global]" > ~/.pip/pip.conf && \ echo "index-url=https://mirror.baidu.com/pypi/simple" >> ~/.pip/pip.conf && \ echo "trusted-host=mirror.baidu.com" >> ~/.pip/pip.conf && \ python3 -m pip install --upgrade pip && \ - pip install paddlepaddle-gpu==2.3.1.post112 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html && \ + pip install paddlepaddle-gpu==2.4.0rc0.post112 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html && \ rm -rf ~/.cache/pip RUN git clone https://github.com/PaddlePaddle/PaddleSpeech.git && cd PaddleSpeech && \ diff --git a/docs/source/install.md b/docs/source/install.md index f789b37d2..187bd4ea0 100644 --- a/docs/source/install.md +++ b/docs/source/install.md @@ -58,7 +58,7 @@ pip install pytest-runner -i https://pypi.tuna.tsinghua.edu.cn/simple ``` Then you can use the following commands: ```bash -pip install paddlepaddle==0.0.0 -f https://www.paddlepaddle.org.cn/whl/linux/cpu-mkl/develop.html +pip install paddlepaddle==2.4.0rc0 -i https://mirror.baidu.com/pypi/simple pip install paddlespeech -i https://pypi.tuna.tsinghua.edu.cn/simple ``` > If you encounter problem with downloading **nltk_data** while using paddlespeech, it maybe due to your poor network, we suggest you download the [nltk_data](https://paddlespeech.bj.bcebos.com/Parakeet/tools/nltk_data.tar.gz) provided by us, and extract it to your `${HOME}`. @@ -117,7 +117,11 @@ conda install -y -c gcc_linux-64=8.4.0 gxx_linux-64=8.4.0 ``` (Hip: Do not use the last script if you want to install by **Hard** way): ### Install PaddlePaddle -You can choose the `PaddlePaddle` version based on your system. For example, for CUDA 10.2, CuDNN7.5 install paddlepaddle-gpu develop: +You can choose the `PaddlePaddle` version based on your system. For example, for CUDA 10.2, CuDNN7.5 install paddlepaddle-gpu 2.4rc: +```bash +python3 -m pip install paddlepaddle-gpu==2.4.0rc0 -i https://mirror.baidu.com/pypi/simple +``` +You can also install the develop version of paddlepaddle. For example, for CUDA 10.2, CuDNN7.5 install paddlepaddle-gpu develop: ```bash python3 -m pip install paddlepaddle-gpu==0.0.0.post102 -f https://www.paddlepaddle.org.cn/whl/linux/gpu/develop.html ``` @@ -180,7 +184,11 @@ Some users may fail to install `kaldiio` due to the default download source, you ```bash pip install pytest-runner -i https://pypi.tuna.tsinghua.edu.cn/simple ``` -Make sure you have GPU and the paddlepaddle version is right. For example, for CUDA 10.2, CuDNN7.5 install paddle develop: +Make sure you have GPU and the paddlepaddle version is right. For example, for CUDA 10.2, CuDNN7.5 install paddle 2.4rc: +```bash +python3 -m pip install paddlepaddle-gpu==2.4.0rc0 -i https://mirror.baidu.com/pypi/simple +``` +You can also install the develop version of paddlepaddle. 
For example, for CUDA 10.2, CuDNN7.5 install paddlepaddle-gpu develop: ```bash python3 -m pip install paddlepaddle-gpu==0.0.0.post102 -f https://www.paddlepaddle.org.cn/whl/linux/gpu/develop.html ``` diff --git a/docs/source/install_cn.md b/docs/source/install_cn.md index 9f49ebad6..9936a214a 100644 --- a/docs/source/install_cn.md +++ b/docs/source/install_cn.md @@ -55,8 +55,8 @@ pip install pytest-runner -i https://pypi.tuna.tsinghua.edu.cn/simple ``` 然后你可以使用如下命令: ```bash -pip install paddlepaddle -i https://mirror.baidu.com/pypi/simple -pip install paddlespeech -i https://pypi.tuna.tsinghua.edu.cn/simple +pip install paddlepaddle==2.4.0rc0 -i https://mirror.baidu.com/pypi/simple +pip install paddlespeech -i https://pypi.tuna.tsinghua.edu.cn/simple ``` > 如果您在使用 paddlespeech 的过程中遇到关于下载 **nltk_data** 的问题,可能是您的网络不佳,我们建议您下载我们提供的 [nltk_data](https://paddlespeech.bj.bcebos.com/Parakeet/tools/nltk_data.tar.gz) 并解压缩到您的 `${HOME}` 目录下。 @@ -111,9 +111,13 @@ conda install -y -c gcc_linux-64=8.4.0 gxx_linux-64=8.4.0 ``` (提示: 如果你想使用**困难**方式完成安装,请不要使用最后一条命令) ### 安装 PaddlePaddle -你可以根据系统配置选择 PaddlePaddle 版本,例如系统使用 CUDA 10.2, CuDNN7.5 ,你可以安装 paddlepaddle-gpu 2.3.1: +你可以根据系统配置选择 PaddlePaddle 版本,例如系统使用 CUDA 10.2, CuDNN7.5 ,你可以安装 paddlepaddle-gpu 2.4rc: ```bash -python3 -m pip install paddlepaddle-gpu==2.3.1 -i https://mirror.baidu.com/pypi/simple +python3 -m pip install paddlepaddle-gpu==2.4.0rc0 -i https://mirror.baidu.com/pypi/simple +``` +你也可以安装 develop 版本的PaddlePaddle. 例如系统使用 CUDA 10.2, CuDNN7.5 ,你可以安装 paddlepaddle-gpu develop: +```bash +python3 -m pip install paddlepaddle-gpu==0.0.0.post102 -f https://www.paddlepaddle.org.cn/whl/linux/gpu/develop.html ``` ### 安装 PaddleSpeech 最后安装 `paddlespeech`,这样你就可以使用 `paddlespeech` 中已有的 examples: @@ -168,9 +172,13 @@ conda activate tools/venv conda install -y -c conda-forge sox libsndfile swig bzip2 libflac bc ``` ### 安装 PaddlePaddle -请确认你系统是否有 GPU,并且使用了正确版本的 paddlepaddle。例如系统使用 CUDA 10.2, CuDNN7.5 ,你可以安装 paddlepaddle-gpu 2.3.1: +请确认你系统是否有 GPU,并且使用了正确版本的 paddlepaddle。例如系统使用 CUDA 10.2, CuDNN7.5 ,你可以安装 paddlepaddle-gpu 2.4rc: +```bash +python3 -m pip install paddlepaddle-gpu==2.4.0rc0 -i https://mirror.baidu.com/pypi/simple +``` +你也可以安装 develop 版本的PaddlePaddle. 例如系统使用 CUDA 10.2, CuDNN7.5 ,你可以安装 paddlepaddle-gpu develop: ```bash -python3 -m pip install paddlepaddle-gpu==2.3.1 -i https://mirror.baidu.com/pypi/simple +python3 -m pip install paddlepaddle-gpu==0.0.0.post102 -f https://www.paddlepaddle.org.cn/whl/linux/gpu/develop.html ``` ### 用开发者模式安装 PaddleSpeech 部分用户系统由于默认源的问题,安装中会出现 kaldiio 安转出错的问题,建议首先安装 pytest-runner: diff --git a/paddlespeech/server/tests/asr/online/README.md b/paddlespeech/server/tests/asr/online/README.md index e1e4d9506..1d7fa8824 100644 --- a/paddlespeech/server/tests/asr/online/README.md +++ b/paddlespeech/server/tests/asr/online/README.md @@ -11,8 +11,8 @@ This document introduces a client for streaming asr service: microphone ### 1. Install Refer [Install](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). - **paddlepaddle 2.2.1** 或以上版本。 -It is recommended to use **paddlepaddle 2.2.1** or above. + **paddlepaddle 2.4rc** 或以上版本。 +It is recommended to use **paddlepaddle 2.4rc** or above. You can choose one way from meduim and hard to install paddlespeech. 
diff --git a/paddlespeech/server/tests/asr/online/README_cn.md b/paddlespeech/server/tests/asr/online/README_cn.md index 46dff250e..403216369 100644 --- a/paddlespeech/server/tests/asr/online/README_cn.md +++ b/paddlespeech/server/tests/asr/online/README_cn.md @@ -10,7 +10,7 @@ ### 1. 安装 请看 [安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). -推荐使用 **paddlepaddle 2.2.1** 或以上版本。 +推荐使用 **paddlepaddle 2.4rc** 或以上版本。 你可以从 medium,hard 三中方式中选择一种方式安装 PaddleSpeech。 From 6e429f051316628f99ed5e68ccaa91f6d1a32cc0 Mon Sep 17 00:00:00 2001 From: tianhao zhang <15600919271@163.com> Date: Mon, 10 Oct 2022 11:42:44 +0000 Subject: [PATCH 093/113] support wav2vec2ASR on librispeech --- examples/librispeech/asr3/README.md | 191 +++ examples/librispeech/asr3/RESULTS.md | 8 + examples/librispeech/asr3/cmd.sh | 89 ++ .../librispeech/asr3/conf/preprocess.yaml | 4 + .../librispeech/asr3/conf/tuning/decode.yaml | 11 + .../librispeech/asr3/conf/wav2vec2ASR.yaml | 120 ++ examples/librispeech/asr3/local/test.sh | 84 ++ examples/librispeech/asr3/local/test_wav.sh | 58 + examples/librispeech/asr3/local/train.sh | 55 + examples/librispeech/asr3/path.sh | 15 + examples/librispeech/asr3/run.sh | 48 + examples/librispeech/asr3/utils | 1 + .../s2t/exps/wav2vec2/bin/__init__.py | 13 + paddlespeech/s2t/exps/wav2vec2/bin/test.py | 66 + .../s2t/exps/wav2vec2/bin/test_wav.py | 118 ++ paddlespeech/s2t/exps/wav2vec2/bin/train.py | 54 + paddlespeech/s2t/exps/wav2vec2/model.py | 435 +++++++ paddlespeech/s2t/models/wav2vec2/__init__.py | 0 .../s2t/models/wav2vec2/modules/VanillaNN.py | 45 + .../models/wav2vec2/modules/activations.py | 175 +++ .../s2t/models/wav2vec2/modules/containers.py | 131 ++ .../s2t/models/wav2vec2/modules/linear.py | 73 ++ .../wav2vec2/modules/modeling_outputs.py | 1129 ++++++++++++++++ .../wav2vec2/modules/modeling_wav2vec2.py | 1131 +++++++++++++++++ .../wav2vec2/processing/signal_processing.py | 242 ++++ .../processing/speech_augmentation.py | 727 +++++++++++ .../s2t/models/wav2vec2/wav2vec2_ASR.py | 247 ++++ 27 files changed, 5270 insertions(+) create mode 100644 examples/librispeech/asr3/README.md create mode 100644 examples/librispeech/asr3/RESULTS.md create mode 100644 examples/librispeech/asr3/cmd.sh create mode 100644 examples/librispeech/asr3/conf/preprocess.yaml create mode 100644 examples/librispeech/asr3/conf/tuning/decode.yaml create mode 100644 examples/librispeech/asr3/conf/wav2vec2ASR.yaml create mode 100644 examples/librispeech/asr3/local/test.sh create mode 100644 examples/librispeech/asr3/local/test_wav.sh create mode 100644 examples/librispeech/asr3/local/train.sh create mode 100644 examples/librispeech/asr3/path.sh create mode 100644 examples/librispeech/asr3/run.sh create mode 120000 examples/librispeech/asr3/utils create mode 100644 paddlespeech/s2t/exps/wav2vec2/bin/__init__.py create mode 100644 paddlespeech/s2t/exps/wav2vec2/bin/test.py create mode 100644 paddlespeech/s2t/exps/wav2vec2/bin/test_wav.py create mode 100644 paddlespeech/s2t/exps/wav2vec2/bin/train.py create mode 100644 paddlespeech/s2t/exps/wav2vec2/model.py create mode 100644 paddlespeech/s2t/models/wav2vec2/__init__.py create mode 100644 paddlespeech/s2t/models/wav2vec2/modules/VanillaNN.py create mode 100644 paddlespeech/s2t/models/wav2vec2/modules/activations.py create mode 100644 paddlespeech/s2t/models/wav2vec2/modules/containers.py create mode 100644 paddlespeech/s2t/models/wav2vec2/modules/linear.py create mode 100644 paddlespeech/s2t/models/wav2vec2/modules/modeling_outputs.py 
create mode 100644 paddlespeech/s2t/models/wav2vec2/modules/modeling_wav2vec2.py create mode 100644 paddlespeech/s2t/models/wav2vec2/processing/signal_processing.py create mode 100644 paddlespeech/s2t/models/wav2vec2/processing/speech_augmentation.py create mode 100644 paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py diff --git a/examples/librispeech/asr3/README.md b/examples/librispeech/asr3/README.md new file mode 100644 index 000000000..bd96af86f --- /dev/null +++ b/examples/librispeech/asr3/README.md @@ -0,0 +1,191 @@ +# Wav2vec2ASR with Librispeech +This example contains code used to finetune [wav2vec2.0](https://https://arxiv.org/pdf/2006.11477.pdf) model with [Librispeech dataset](http://www.openslr.org/resources/12) +## Overview +All the scripts you need are in `run.sh`. There are several stages in `run.sh`, and each stage has its function. +| Stage | Function | +|:---- |:----------------------------------------------------------- | +| 0 | Process data. It includes:
(1) Download the dataset
(2) Calculate the CMVN of the train dataset
(3) Get the vocabulary file
(4) Get the manifest files of the train, development and test dataset
(5) Download the pretrained wav2vec2 model | +| 1 | Train the model | +| 2 | Get the final model by averaging the top-k models, set k = 1 means to choose the best model | +| 3 | Test the final model performance | +| 4 | Infer the single audio file | + + +You can choose to run a range of stages by setting `stage` and `stop_stage `. + +For example, if you want to execute the code in stage 2 and stage 3, you can run this script: +```bash +bash run.sh --stage 2 --stop_stage 3 +``` +Or you can set `stage` equal to `stop-stage` to only run one stage. +For example, if you only want to run `stage 0`, you can use the script below: +```bash +bash run.sh --stage 0 --stop_stage 0 +``` +The document below will describe the scripts in `run.sh` in detail. +## The Environment Variables +The path.sh contains the environment variables. +```bash +. ./path.sh +. ./cmd.sh +``` +This script needs to be run first. And another script is also needed: +```bash +source ${MAIN_ROOT}/utils/parse_options.sh +``` +It will support the way of using `--variable value` in the shell scripts. +## The Local Variables +Some local variables are set in `run.sh`. +`gpus` denotes the GPU number you want to use. If you set `gpus=`, it means you only use CPU. +`stage` denotes the number of stages you want to start from in the experiments. +`stop stage` denotes the number of the stage you want to end at in the experiments. +`conf_path` denotes the config path of the model. +`avg_num` denotes the number K of top-K models you want to average to get the final model. +`audio file` denotes the file path of the single file you want to infer in stage 5 +`ckpt` denotes the checkpoint prefix of the model, e.g. "wav2vec2ASR" + +You can set the local variables (except `ckpt`) when you use `run.sh` + +For example, you can set the `gpus` and `avg_num` when you use the command line: +```bash +bash run.sh --gpus 0,1 --avg_num 20 +``` +## Stage 0: Data Processing +To use this example, you need to process data firstly and you can use stage 0 in `run.sh` to do this. The code is shown below: +```bash + if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + bash ./local/data.sh || exit -1 + fi +``` +Stage 0 is for processing the data. + +If you only want to process the data. You can run +```bash +bash run.sh --stage 0 --stop_stage 0 +``` +You can also just run these scripts in your command line. +```bash +. ./path.sh +. ./cmd.sh +bash ./local/data.sh +``` +After processing the data, the `data` directory will look like this: +```bash +data/ +|-- dev.meta +|-- lang_char +| `-- bpe_unigram_5000.model +| `-- bpe_unigram_5000.vocab +| `-- vocab.txt +|-- manifest.dev +|-- manifest.dev.raw +|-- manifest.test +|-- manifest.test.raw +|-- manifest.train +|-- manifest.train.raw +|-- mean_std.json +|-- test.meta +`-- train.meta +``` +## Stage 1: Model Training +If you want to train the model. you can use stage 1 in `run.sh`. The code is shown below. +```bash +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `exp` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} + fi +``` +If you want to train the model, you can use the script below to execute stage 0 and stage 1: +```bash +bash run.sh --stage 0 --stop_stage 1 +``` +or you can run these scripts in the command line (only use CPU). +```bash +. ./path.sh +. 
+## Stage 2: Top-k Models Averaging
+After training the model, we need to get the final model for testing and inference. A checkpoint is saved in every epoch, so we can either choose the best checkpoint based on the validation loss, or sort the checkpoints and average the parameters of the top-k models to get the final model. We can use stage 2 to do this, and the code is shown below. Note: we train wav2vec2ASR for only one epoch, so `avg_num` is set to 1.
+```bash
+ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+     # avg n best model
+     avg.sh best exp/${ckpt}/checkpoints ${avg_num}
+ fi
+```
+The `avg.sh` is in `../../../utils/`, which is defined in `path.sh`.
+If you want to get the final model, you can use the script below to execute stage 0, stage 1, and stage 2:
+```bash
+bash run.sh --stage 0 --stop_stage 2
+```
+or you can run these scripts in the command line (only use CPU):
+
+```bash
+. ./path.sh
+. ./cmd.sh
+bash ./local/data.sh
+CUDA_VISIBLE_DEVICES= ./local/train.sh conf/wav2vec2ASR.yaml wav2vec2ASR
+avg.sh best exp/wav2vec2ASR/checkpoints 1
+```
+## Stage 3: Model Testing
+The test stage is to evaluate the model performance. The code of the test stage is shown below:
+```bash
+ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+     # test ckpt avg_n
+     CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+ fi
+```
+If you want to train a model and test it, you can use the script below to execute stage 0, stage 1, stage 2, and stage 3:
+```bash
+bash run.sh --stage 0 --stop_stage 3
+```
+or you can run these scripts in the command line (only use CPU):
+```bash
+. ./path.sh
+. ./cmd.sh
+bash ./local/data.sh
+CUDA_VISIBLE_DEVICES= ./local/train.sh conf/wav2vec2ASR.yaml wav2vec2ASR
+avg.sh best exp/wav2vec2ASR/checkpoints 1
+CUDA_VISIBLE_DEVICES= ./local/test.sh conf/wav2vec2ASR.yaml conf/tuning/decode.yaml exp/wav2vec2ASR/checkpoints/avg_1
+```
+## Pretrained Model
+You can get the pretrained wav2vec2ASR model from [this page](../../../docs/source/released_model.md).
+
+Use the `tar` command to unpack the model, and then you can use the script below to test the model.
+
+For example:
+```bash
+wget https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr3/wav2vec2ASR-large-960h-librispeech_ckpt_1.3.0.model.tar.gz
+tar xzvf wav2vec2ASR-large-960h-librispeech_ckpt_1.3.0.model.tar.gz
+source path.sh
+# If you have processed the data and obtained the manifest files, you can skip the following 2 steps
+bash local/data.sh --stage -1 --stop_stage -1
+bash local/data.sh --stage 2 --stop_stage 2
+CUDA_VISIBLE_DEVICES= ./local/test.sh conf/wav2vec2ASR.yaml conf/tuning/decode.yaml exp/wav2vec2ASR/checkpoints/avg_1
+```
+The performance of the released models is shown [here](./RESULTS.md).
+
+
+## Stage 4: Single Audio File Inference
+In some situations, you want to use the trained model to run inference on a single audio file. You can use stage 4. The code is shown below:
+```bash
+ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+     # test a single .wav file
+     CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
+ fi
+```
+You can train the model by yourself using `bash run.sh --stage 0 --stop_stage 3`, or you can download the pretrained model through the script below:
+```bash
+wget https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr3/wav2vec2ASR-large-960h-librispeech_ckpt_1.3.0.model.tar.gz
+tar xzvf wav2vec2ASR-large-960h-librispeech_ckpt_1.3.0.model.tar.gz
+```
+You can download the audio demo:
+```bash
+wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/
+```
+You need to prepare an audio file or use the audio demo above; please confirm that the sample rate of the audio is 16 kHz. You can get the result of the audio demo by running the script below:
+```bash
+CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/wav2vec2ASR.yaml conf/tuning/decode.yaml exp/wav2vec2ASR/checkpoints/avg_1 data/demo_002_en.wav
+```
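+If you use your own recording instead of the demo file, you can check its sample rate first, for example with `soxi` from the sox package (assuming sox is installed):
+```bash
+# Print the sample rate of the input wav; it should be 16000 (Hz).
+soxi -r data/demo_002_en.wav
+```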
diff --git a/examples/librispeech/asr3/RESULTS.md b/examples/librispeech/asr3/RESULTS.md
new file mode 100644
index 000000000..1c5626d9e
--- /dev/null
+++ b/examples/librispeech/asr3/RESULTS.md
@@ -0,0 +1,8 @@
+# LibriSpeech
+
+## Wav2vec2ASR
+train: Epoch 1, 1*V100-32G, batch size: 10
+
+| Model | Params | Config | Augmentation | Test set | Decode method | WER |
+| --- | --- | --- | --- | --- | --- | --- |
+| wav2vec2ASR | 302.86 M | conf/wav2vec2ASR.yaml | spec_aug | test-clean | greedy search | 0.018887 |
diff --git a/examples/librispeech/asr3/cmd.sh b/examples/librispeech/asr3/cmd.sh
new file mode 100644
index 000000000..7b70ef5e0
--- /dev/null
+++ b/examples/librispeech/asr3/cmd.sh
@@ -0,0 +1,89 @@
+# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
+# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
+# e.g.
+# run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
+#
+# Options:
+#   --time