From 4e7106d9e2a3eb9ee5ab870dcae3a3c59eac338e Mon Sep 17 00:00:00 2001 From: 0x45f Date: Wed, 27 Jul 2022 09:32:11 +0000 Subject: [PATCH 001/124] Support dy2st --- paddlespeech/s2t/exps/u2/model.py | 165 +++++++++++++++++- paddlespeech/s2t/models/u2/u2.py | 42 ++++- .../engine/asr/online/python/asr_engine.py | 17 +- 3 files changed, 210 insertions(+), 14 deletions(-) diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index cdad3b8f7..b41f320b4 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -471,6 +471,165 @@ class U2Tester(U2Trainer): infer_model, input_spec = self.load_inferspec() assert isinstance(input_spec, list), type(input_spec) infer_model.eval() - static_model = paddle.jit.to_static(infer_model, input_spec=input_spec) - logger.info(f"Export code: {static_model.forward.code}") - paddle.jit.save(static_model, self.args.export_path) + # static_model = paddle.jit.to_static(infer_model, input_spec=input_spec) + # logger.info(f"Export code: {static_model.forward.code}") + # paddle.jit.save(static_model, self.args.export_path) + + # # to check outputs + # def flatten(out): + # if isinstance(out, paddle.Tensor): + # return [out] + + # flatten_out = [] + # for var in out: + # if isinstance(var, (list, tuple)): + # flatten_out.extend(flatten(var)) + # else: + # flatten_out.append(var) + # return flatten_out + + + # ######################### infer_model.forward_attention_decoder ######################## + # a = paddle.full(shape=[10, 8], fill_value=10, dtype='int64') + # b = paddle.full(shape=[10], fill_value=8, dtype='int64') + # # c = paddle.rand(shape=[1, 20, 512], dtype='float32') + # c = paddle.full(shape=[1, 20, 512], fill_value=1, dtype='float32') + + # out1 = infer_model.forward_attention_decoder(a, b, c) + # print(out1) + + # input_spec = [paddle.static.InputSpec(shape=[None, None], dtype='int64'), + # paddle.static.InputSpec(shape=[None], dtype='int64'), + # paddle.static.InputSpec(shape=[1, None, 512], dtype='float32')] + # static_model = paddle.jit.to_static(infer_model.forward_attention_decoder, input_spec=input_spec) + # paddle.jit.save(static_model, self.args.export_path) + # static_model = paddle.jit.load(self.args.export_path) + # out2 = static_model(a, b, c) + # # print(out2) + + # out1 = flatten(out1) + # out2 = flatten(out2) + # for i in range(len(out1)): + # print(np.equal(out1[i].numpy(), out2[i].numpy()).all()) + + + + + + + # ######################### infer_model.forward_encoder_chunk ######################## + # xs = paddle.rand(shape=[1, 67, 80], dtype='float32') + # offset = paddle.to_tensor([80], dtype='int32') + # required_cache_size = -16 + # att_cache = paddle.randn(shape=[12, 8, 80, 128], dtype='float32') + # cnn_cache = paddle.randn(shape=[12, 1, 512, 14], dtype='float32') + # # out1 = infer_model.forward_encoder_chunk(xs, offset, required_cache_size, att_cache, cnn_cache) + # # print(out1) + # zero_out1 = infer_model.forward_encoder_chunk(xs, offset, required_cache_size, att_cache=paddle.zeros([0, 0, 0, 0]), cnn_cache=paddle.zeros([0, 0, 0, 0])) + # # print(zero_out1) + + # input_spec = [ + # paddle.static.InputSpec(shape=[1, None, 80], dtype='float32'), + # paddle.static.InputSpec(shape=[1], dtype='int32'), + # -16, + # paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32'), + # paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32')] + # static_model = paddle.jit.to_static(infer_model.forward_encoder_chunk, input_spec=input_spec) + # 
paddle.jit.save(static_model, self.args.export_path) + # static_model = paddle.jit.load(self.args.export_path) + # # out2 = static_model(xs, offset, att_cache, cnn_cache) + # # print(out2) + # zero_out2 = static_model(xs, offset, paddle.zeros([0, 0, 0, 0]), paddle.zeros([0, 0, 0, 0])) + + # # out1 = flatten(out1) + # # out2 = flatten(out2) + # # for i in range(len(out1)): + # # print(np.equal(out1[i].numpy(), out2[i].numpy()).all()) + + # zero_out1 = flatten(zero_out1) + # zero_out2 = flatten(zero_out2) + # for i in range(len(zero_out1)): + # print(np.equal(zero_out1[i].numpy(), zero_out2[i].numpy()).all()) + + + + + + + + # ######################### infer_model.forward_encoder_chunk zero Tensor online ######################## + # xs1 = paddle.rand(shape=[1, 67, 80], dtype='float32') + # offset = paddle.to_tensor([0], dtype='int32') + # required_cache_size = -16 + # att_cache = paddle.zeros([0, 0, 0, 0]) + # cnn_cache=paddle.zeros([0, 0, 0, 0]) + + # xs, att_cache, cnn_cache = infer_model.forward_encoder_chunk(xs1, offset, required_cache_size, att_cache, cnn_cache) + # xs2 = paddle.rand(shape=[1, 67, 80], dtype='float32') + # offset = paddle.to_tensor([16], dtype='int32') + # out1 = infer_model.forward_encoder_chunk(xs2, offset, required_cache_size, att_cache, cnn_cache) + # # print(out1) + + # input_spec = [ + # paddle.static.InputSpec(shape=[1, None, 80], dtype='float32'), + # paddle.static.InputSpec(shape=[1], dtype='int32'), + # -16, + # paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32'), + # paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32')] + # static_model = paddle.jit.to_static(infer_model.forward_encoder_chunk, input_spec=input_spec) + # paddle.jit.save(static_model, self.args.export_path) + # static_model = paddle.jit.load(self.args.export_path) + + # offset = paddle.to_tensor([0], dtype='int32') + # att_cache = paddle.zeros([0, 0, 0, 0]) + # cnn_cache=paddle.zeros([0, 0, 0, 0]) + # xs, att_cache, cnn_cache = static_model(xs1, offset, att_cache, cnn_cache) + # xs = paddle.rand(shape=[1, 67, 80], dtype='float32') + # offset = paddle.to_tensor([16], dtype='int32') + # out2 = static_model(xs2, offset, att_cache, cnn_cache) + # # print(out2) + + # out1 = flatten(out1) + # out2 = flatten(out2) + # for i in range(len(out1)): + # print(np.equal(out1[i].numpy(), out2[i].numpy()).all()) + + + + + + + + ###################### save/load combine ######################## + paddle.jit.save(infer_model, '/workspace/conformer/PaddleSpeech-conformer/conformer/conformer', combine_params=True) + + + # xs1 = paddle.rand(shape=[1, 67, 80], dtype='float32') + # offset = paddle.to_tensor([0], dtype='int32') + # required_cache_size = -16 + # att_cache = paddle.zeros([0, 0, 0, 0]) + # cnn_cache=paddle.zeros([0, 0, 0, 0]) + + # xs, att_cache, cnn_cache = infer_model.forward_encoder_chunk(xs1, offset, required_cache_size, att_cache, cnn_cache) + # xs2 = paddle.rand(shape=[1, 67, 80], dtype='float32') + # offset = paddle.to_tensor([16], dtype='int32') + # out1 = infer_model.forward_encoder_chunk(xs2, offset, required_cache_size, att_cache, cnn_cache) + # # print(out1) + + + # from paddle.jit.layer import Layer + # layer = Layer() + # layer.load('/workspace/conformer/PaddleSpeech-conformer/conformer/conformer', paddle.CUDAPlace(0)) + + # offset = paddle.to_tensor([0], dtype='int32') + # att_cache = paddle.zeros([0, 0, 0, 0]) + # cnn_cache=paddle.zeros([0, 0, 0, 0]) + # xs, att_cache, cnn_cache = layer.forward_encoder_chunk(xs1, offset, att_cache, cnn_cache) + # 
offset = paddle.to_tensor([16], dtype='int32') + # out2 = layer.forward_encoder_chunk(xs2, offset, att_cache, cnn_cache) + # # print(out2) + + # out1 = flatten(out1) + # out2 = flatten(out2) + # for i in range(len(out1)): + # print(np.equal(out1[i].numpy(), out2[i].numpy()).all()) \ No newline at end of file diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 76f698e64..9148c7372 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -59,6 +59,20 @@ __all__ = ["U2Model", "U2InferModel"] logger = Log(__name__).getlog() +# input_spec1 = [paddle.static.InputSpec(shape=[None, None], dtype='int64'), +# paddle.static.InputSpec(shape=[None], dtype='int64'), +# paddle.static.InputSpec(shape=[1, None, 512], dtype='float32')] + +# input_spec2 = [ +# paddle.static.InputSpec(shape=[1, None, 80], dtype='float32'), +# paddle.static.InputSpec(shape=[1], dtype='int32'), +# -16, +# paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32'), +# paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32')] + +# input_spec3 = [paddle.static.InputSpec(shape=[1, 1, 1], dtype='int64'), +# paddle.static.InputSpec(shape=[1], dtype='int64')] + class U2BaseModel(ASRInterface, nn.Layer): """CTC-Attention hybrid Encoder-Decoder model""" @@ -599,7 +613,12 @@ class U2BaseModel(ASRInterface, nn.Layer): """ return self.eos - @jit.to_static + @jit.to_static(input_spec=[ + paddle.static.InputSpec(shape=[1, None, 80], dtype='float32'), + paddle.static.InputSpec(shape=[1], dtype='int32'), + -16, + paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32'), + paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32')]) def forward_encoder_chunk( self, xs: paddle.Tensor, @@ -655,7 +674,10 @@ class U2BaseModel(ASRInterface, nn.Layer): """ return self.ctc.log_softmax(xs) - @jit.to_static + @jit.to_static(input_spec=[ + paddle.static.InputSpec(shape=[None, None], dtype='int64'), + paddle.static.InputSpec(shape=[None], dtype='int64'), + paddle.static.InputSpec(shape=[1, None, 512], dtype='float32')]) def forward_attention_decoder( self, hyps: paddle.Tensor, @@ -918,6 +940,9 @@ class U2InferModel(U2Model): def __init__(self, configs: dict): super().__init__(configs) + @jit.to_static(input_spec=[ + paddle.static.InputSpec(shape=[1, 1, 1], dtype='int64'), + paddle.static.InputSpec(shape=[1], dtype='int64')]) def forward(self, feats, feats_lengths, @@ -933,9 +958,10 @@ class U2InferModel(U2Model): Returns: List[List[int]]: best path result """ - return self.ctc_greedy_search( - feats, - feats_lengths, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks, - simulate_streaming=simulate_streaming) + # return self.ctc_greedy_search( + # feats, + # feats_lengths, + # decoding_chunk_size=decoding_chunk_size, + # num_decoding_left_chunks=num_decoding_left_chunks, + # simulate_streaming=simulate_streaming) + return feats, feats_lengths diff --git a/paddlespeech/server/engine/asr/online/python/asr_engine.py b/paddlespeech/server/engine/asr/online/python/asr_engine.py index 4df38f09d..cd50f157a 100644 --- a/paddlespeech/server/engine/asr/online/python/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/python/asr_engine.py @@ -80,6 +80,10 @@ class PaddleASRConnectionHanddler: self.init_decoder() self.reset() + from paddle.jit.layer import Layer + self.jit_layer = Layer() + self.jit_layer.load('/workspace/conformer/PaddleSpeech-conformer/conformer/conformer', paddle.CUDAPlace(1)) 
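+        # NOTE: a loaded paddle.jit.layer.Layer exposes every function that was
+        # jit-saved (with combine_params=True, as in the export code above) as a
+        # method of the same name. A sketch of how the streaming loop below
+        # drives it (offset must be an int32 tensor, not a Python int;
+        # required_cache_size was fixed at export time, so it is not passed):
+        #   y, att_cache, cnn_cache = self.jit_layer.forward_encoder_chunk(
+        #       chunk_xs, paddle.to_tensor([offset], dtype='int32'),
+        #       att_cache, cnn_cache)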
+ def init_decoder(self): if "deepspeech2" in self.model_type: assert self.continuous_decoding is False, "ds2 model not support endpoint" @@ -474,9 +478,16 @@ class PaddleASRConnectionHanddler: # cur chunk chunk_xs = self.cached_feat[:, cur:end, :] # forward chunk - (y, self.att_cache, self.cnn_cache) = self.model.encoder.forward_chunk( - chunk_xs, self.offset, required_cache_size, - self.att_cache, self.cnn_cache) + # (y, self.att_cache, self.cnn_cache) = self.model.encoder.forward_chunk( + # chunk_xs, self.offset, required_cache_size, + # self.att_cache, self.cnn_cache) + + (y, self.att_cache, self.cnn_cache) = self.jit_layer.forward_encoder_chunk( + chunk_xs, + paddle.to_tensor([self.offset], dtype='int32'), + self.att_cache, + self.cnn_cache) + outputs.append(y) # update the global offset, in decoding frame unit From e5a6c243f1f53ea3d3d28a957010db98cdcd6db4 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 1 Aug 2022 08:03:04 +0000 Subject: [PATCH 002/124] fix jit save for conformer --- paddlespeech/s2t/exps/u2/model.py | 205 ++++++------------------------ paddlespeech/s2t/models/u2/u2.py | 62 ++++----- 2 files changed, 62 insertions(+), 205 deletions(-) diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index b41f320b4..141e83bce 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -25,8 +25,6 @@ import paddle from paddle import distributed as dist from paddlespeech.s2t.frontend.featurizer import TextFeaturizer -from paddlespeech.s2t.io.dataloader import BatchDataLoader -from paddlespeech.s2t.io.dataloader import StreamDataLoader from paddlespeech.s2t.io.dataloader import DataLoaderFactory from paddlespeech.s2t.models.u2 import U2Model from paddlespeech.s2t.training.optimizer import OptimizerFactory @@ -109,7 +107,8 @@ class U2Trainer(Trainer): def valid(self): self.model.eval() if not self.use_streamdata: - logger.info(f"Valid Total Examples: {len(self.valid_loader.dataset)}") + logger.info( + f"Valid Total Examples: {len(self.valid_loader.dataset)}") valid_losses = defaultdict(list) num_seen_utts = 1 total_loss = 0.0 @@ -136,7 +135,8 @@ class U2Trainer(Trainer): msg += "epoch: {}, ".format(self.epoch) msg += "step: {}, ".format(self.iteration) if not self.use_streamdata: - msg += "batch: {}/{}, ".format(i + 1, len(self.valid_loader)) + msg += "batch: {}/{}, ".format(i + 1, + len(self.valid_loader)) msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in valid_dump.items()) logger.info(msg) @@ -157,7 +157,8 @@ class U2Trainer(Trainer): self.before_train() if not self.use_streamdata: - logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") + logger.info( + f"Train Total Examples: {len(self.train_loader.dataset)}") while self.epoch < self.config.n_epoch: with Timer("Epoch-Train Time Cost: {}"): self.model.train() @@ -225,14 +226,18 @@ class U2Trainer(Trainer): config = self.config.clone() self.use_streamdata = config.get("use_stream_data", False) if self.train: - self.train_loader = DataLoaderFactory.get_dataloader('train', config, self.args) - self.valid_loader = DataLoaderFactory.get_dataloader('valid', config, self.args) + self.train_loader = DataLoaderFactory.get_dataloader( + 'train', config, self.args) + self.valid_loader = DataLoaderFactory.get_dataloader( + 'valid', config, self.args) logger.info("Setup train/valid Dataloader!") else: decode_batch_size = config.get('decode', dict()).get( 'decode_batch_size', 1) - self.test_loader = DataLoaderFactory.get_dataloader('test', config, self.args) - 
self.align_loader = DataLoaderFactory.get_dataloader('align', config, self.args) + self.test_loader = DataLoaderFactory.get_dataloader('test', config, + self.args) + self.align_loader = DataLoaderFactory.get_dataloader( + 'align', config, self.args) logger.info("Setup test/align Dataloader!") def setup_model(self): @@ -470,166 +475,30 @@ class U2Tester(U2Trainer): def export(self): infer_model, input_spec = self.load_inferspec() assert isinstance(input_spec, list), type(input_spec) + del input_spec infer_model.eval() - # static_model = paddle.jit.to_static(infer_model, input_spec=input_spec) - # logger.info(f"Export code: {static_model.forward.code}") - # paddle.jit.save(static_model, self.args.export_path) - - # # to check outputs - # def flatten(out): - # if isinstance(out, paddle.Tensor): - # return [out] - - # flatten_out = [] - # for var in out: - # if isinstance(var, (list, tuple)): - # flatten_out.extend(flatten(var)) - # else: - # flatten_out.append(var) - # return flatten_out - - - # ######################### infer_model.forward_attention_decoder ######################## - # a = paddle.full(shape=[10, 8], fill_value=10, dtype='int64') - # b = paddle.full(shape=[10], fill_value=8, dtype='int64') - # # c = paddle.rand(shape=[1, 20, 512], dtype='float32') - # c = paddle.full(shape=[1, 20, 512], fill_value=1, dtype='float32') - - # out1 = infer_model.forward_attention_decoder(a, b, c) - # print(out1) - - # input_spec = [paddle.static.InputSpec(shape=[None, None], dtype='int64'), - # paddle.static.InputSpec(shape=[None], dtype='int64'), - # paddle.static.InputSpec(shape=[1, None, 512], dtype='float32')] - # static_model = paddle.jit.to_static(infer_model.forward_attention_decoder, input_spec=input_spec) - # paddle.jit.save(static_model, self.args.export_path) - # static_model = paddle.jit.load(self.args.export_path) - # out2 = static_model(a, b, c) - # # print(out2) - - # out1 = flatten(out1) - # out2 = flatten(out2) - # for i in range(len(out1)): - # print(np.equal(out1[i].numpy(), out2[i].numpy()).all()) - - - - - - - # ######################### infer_model.forward_encoder_chunk ######################## - # xs = paddle.rand(shape=[1, 67, 80], dtype='float32') - # offset = paddle.to_tensor([80], dtype='int32') - # required_cache_size = -16 - # att_cache = paddle.randn(shape=[12, 8, 80, 128], dtype='float32') - # cnn_cache = paddle.randn(shape=[12, 1, 512, 14], dtype='float32') - # # out1 = infer_model.forward_encoder_chunk(xs, offset, required_cache_size, att_cache, cnn_cache) - # # print(out1) - # zero_out1 = infer_model.forward_encoder_chunk(xs, offset, required_cache_size, att_cache=paddle.zeros([0, 0, 0, 0]), cnn_cache=paddle.zeros([0, 0, 0, 0])) - # # print(zero_out1) - - # input_spec = [ - # paddle.static.InputSpec(shape=[1, None, 80], dtype='float32'), - # paddle.static.InputSpec(shape=[1], dtype='int32'), - # -16, - # paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32'), - # paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32')] - # static_model = paddle.jit.to_static(infer_model.forward_encoder_chunk, input_spec=input_spec) - # paddle.jit.save(static_model, self.args.export_path) - # static_model = paddle.jit.load(self.args.export_path) - # # out2 = static_model(xs, offset, att_cache, cnn_cache) - # # print(out2) - # zero_out2 = static_model(xs, offset, paddle.zeros([0, 0, 0, 0]), paddle.zeros([0, 0, 0, 0])) - - # # out1 = flatten(out1) - # # out2 = flatten(out2) - # # for i in range(len(out1)): - # # print(np.equal(out1[i].numpy(), 
out2[i].numpy()).all()) - - # zero_out1 = flatten(zero_out1) - # zero_out2 = flatten(zero_out2) - # for i in range(len(zero_out1)): - # print(np.equal(zero_out1[i].numpy(), zero_out2[i].numpy()).all()) - - - - - - - - # ######################### infer_model.forward_encoder_chunk zero Tensor online ######################## - # xs1 = paddle.rand(shape=[1, 67, 80], dtype='float32') - # offset = paddle.to_tensor([0], dtype='int32') - # required_cache_size = -16 - # att_cache = paddle.zeros([0, 0, 0, 0]) - # cnn_cache=paddle.zeros([0, 0, 0, 0]) - - # xs, att_cache, cnn_cache = infer_model.forward_encoder_chunk(xs1, offset, required_cache_size, att_cache, cnn_cache) - # xs2 = paddle.rand(shape=[1, 67, 80], dtype='float32') - # offset = paddle.to_tensor([16], dtype='int32') - # out1 = infer_model.forward_encoder_chunk(xs2, offset, required_cache_size, att_cache, cnn_cache) - # # print(out1) - - # input_spec = [ - # paddle.static.InputSpec(shape=[1, None, 80], dtype='float32'), - # paddle.static.InputSpec(shape=[1], dtype='int32'), - # -16, - # paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32'), - # paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32')] - # static_model = paddle.jit.to_static(infer_model.forward_encoder_chunk, input_spec=input_spec) - # paddle.jit.save(static_model, self.args.export_path) - # static_model = paddle.jit.load(self.args.export_path) - - # offset = paddle.to_tensor([0], dtype='int32') - # att_cache = paddle.zeros([0, 0, 0, 0]) - # cnn_cache=paddle.zeros([0, 0, 0, 0]) - # xs, att_cache, cnn_cache = static_model(xs1, offset, att_cache, cnn_cache) - # xs = paddle.rand(shape=[1, 67, 80], dtype='float32') - # offset = paddle.to_tensor([16], dtype='int32') - # out2 = static_model(xs2, offset, att_cache, cnn_cache) - # # print(out2) - - # out1 = flatten(out1) - # out2 = flatten(out2) - # for i in range(len(out1)): - # print(np.equal(out1[i].numpy(), out2[i].numpy()).all()) - - - - - - - - ###################### save/load combine ######################## - paddle.jit.save(infer_model, '/workspace/conformer/PaddleSpeech-conformer/conformer/conformer', combine_params=True) + ######################### infer_model.forward_encoder_chunk zero Tensor online ######################## + input_spec = [ + paddle.static.InputSpec(shape=[1, None, 80], dtype='float32'), + paddle.static.InputSpec(shape=[1], dtype='int32'), -1, + paddle.static.InputSpec( + shape=[None, None, None, None], + dtype='float32'), paddle.static.InputSpec( + shape=[None, None, None, None], dtype='float32') + ] + infer_model.forward_encoder_chunk = paddle.jit.to_static( + infer_model.forward_encoder_chunk, input_spec=input_spec) + # paddle.jit.save(static_model, self.args.export_path, combine_params=True) - # xs1 = paddle.rand(shape=[1, 67, 80], dtype='float32') - # offset = paddle.to_tensor([0], dtype='int32') - # required_cache_size = -16 - # att_cache = paddle.zeros([0, 0, 0, 0]) - # cnn_cache=paddle.zeros([0, 0, 0, 0]) + ######################### infer_model.forward_attention_decoder ######################## + input_spec = [ + paddle.static.InputSpec(shape=[None, None], dtype='int64'), + paddle.static.InputSpec(shape=[None], dtype='int64'), + paddle.static.InputSpec(shape=[1, None, 512], dtype='float32') + ] + infer_model.forward_attention_decoder = paddle.jit.to_static( + infer_model.forward_attention_decoder, input_spec=input_spec) + # paddle.jit.save(static_model, self.args.export_path, combine_params=True) - # xs, att_cache, cnn_cache = 
infer_model.forward_encoder_chunk(xs1, offset, required_cache_size, att_cache, cnn_cache) - # xs2 = paddle.rand(shape=[1, 67, 80], dtype='float32') - # offset = paddle.to_tensor([16], dtype='int32') - # out1 = infer_model.forward_encoder_chunk(xs2, offset, required_cache_size, att_cache, cnn_cache) - # # print(out1) - - - # from paddle.jit.layer import Layer - # layer = Layer() - # layer.load('/workspace/conformer/PaddleSpeech-conformer/conformer/conformer', paddle.CUDAPlace(0)) - - # offset = paddle.to_tensor([0], dtype='int32') - # att_cache = paddle.zeros([0, 0, 0, 0]) - # cnn_cache=paddle.zeros([0, 0, 0, 0]) - # xs, att_cache, cnn_cache = layer.forward_encoder_chunk(xs1, offset, att_cache, cnn_cache) - # offset = paddle.to_tensor([16], dtype='int32') - # out2 = layer.forward_encoder_chunk(xs2, offset, att_cache, cnn_cache) - # # print(out2) - - # out1 = flatten(out1) - # out2 = flatten(out2) - # for i in range(len(out1)): - # print(np.equal(out1[i].numpy(), out2[i].numpy()).all()) \ No newline at end of file + paddle.jit.save(infer_model, './export.jit', combine_params=True) diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 9148c7372..432162aae 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -29,6 +29,9 @@ import paddle from paddle import jit from paddle import nn +from paddlespeech.audio.utils.tensor_utils import add_sos_eos +from paddlespeech.audio.utils.tensor_utils import pad_sequence +from paddlespeech.audio.utils.tensor_utils import th_accuracy from paddlespeech.s2t.decoders.scorers.ctc import CTCPrefixScorer from paddlespeech.s2t.frontend.utility import IGNORE_ID from paddlespeech.s2t.frontend.utility import load_cmvn @@ -48,9 +51,6 @@ from paddlespeech.s2t.utils import checkpoint from paddlespeech.s2t.utils import layer_tools from paddlespeech.s2t.utils.ctc_utils import remove_duplicates_and_blank from paddlespeech.s2t.utils.log import Log -from paddlespeech.audio.utils.tensor_utils import add_sos_eos -from paddlespeech.audio.utils.tensor_utils import pad_sequence -from paddlespeech.audio.utils.tensor_utils import th_accuracy from paddlespeech.s2t.utils.utility import log_add from paddlespeech.s2t.utils.utility import UpdateConfig @@ -59,20 +59,6 @@ __all__ = ["U2Model", "U2InferModel"] logger = Log(__name__).getlog() -# input_spec1 = [paddle.static.InputSpec(shape=[None, None], dtype='int64'), -# paddle.static.InputSpec(shape=[None], dtype='int64'), -# paddle.static.InputSpec(shape=[1, None, 512], dtype='float32')] - -# input_spec2 = [ -# paddle.static.InputSpec(shape=[1, None, 80], dtype='float32'), -# paddle.static.InputSpec(shape=[1], dtype='int32'), -# -16, -# paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32'), -# paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32')] - -# input_spec3 = [paddle.static.InputSpec(shape=[1, 1, 1], dtype='int64'), -# paddle.static.InputSpec(shape=[1], dtype='int64')] - class U2BaseModel(ASRInterface, nn.Layer): """CTC-Attention hybrid Encoder-Decoder model""" @@ -588,44 +574,44 @@ class U2BaseModel(ASRInterface, nn.Layer): best_index = i return hyps[best_index][0] - #@jit.to_static + @jit.to_static(property=True) def subsampling_rate(self) -> int: """ Export interface for c++ call, return subsampling_rate of the model """ return self.encoder.embed.subsampling_rate - #@jit.to_static + @jit.to_static(property=True) def right_context(self) -> int: """ Export interface for c++ call, return right_context of the model """ return 
self.encoder.embed.right_context - #@jit.to_static + @jit.to_static(property=True) def sos_symbol(self) -> int: """ Export interface for c++ call, return sos symbol id of the model """ return self.sos - #@jit.to_static + @jit.to_static(property=True) def eos_symbol(self) -> int: """ Export interface for c++ call, return eos symbol id of the model """ return self.eos - @jit.to_static(input_spec=[ - paddle.static.InputSpec(shape=[1, None, 80], dtype='float32'), - paddle.static.InputSpec(shape=[1], dtype='int32'), - -16, - paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32'), - paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32')]) + # @jit.to_static(input_spec=[ + # paddle.static.InputSpec(shape=[1, None, 80], dtype='float32'), + # paddle.static.InputSpec(shape=[1], dtype='int32'), + # -1, + # paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32'), + # paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32')]) def forward_encoder_chunk( self, xs: paddle.Tensor, offset: int, required_cache_size: int, - att_cache: paddle.Tensor = paddle.zeros([0, 0, 0, 0]), - cnn_cache: paddle.Tensor = paddle.zeros([0, 0, 0, 0]), + att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), + cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: """ Export interface for c++ call, give input chunk xs, and return output from time 0 to current chunk. @@ -660,8 +646,8 @@ class U2BaseModel(ASRInterface, nn.Layer): paddle.Tensor: new conformer cnn cache required for next chunk, with same shape as the original cnn_cache. """ - return self.encoder.forward_chunk( - xs, offset, required_cache_size, att_cache, cnn_cache) + return self.encoder.forward_chunk(xs, offset, required_cache_size, + att_cache, cnn_cache) # @jit.to_static def ctc_activation(self, xs: paddle.Tensor) -> paddle.Tensor: @@ -674,10 +660,10 @@ class U2BaseModel(ASRInterface, nn.Layer): """ return self.ctc.log_softmax(xs) - @jit.to_static(input_spec=[ - paddle.static.InputSpec(shape=[None, None], dtype='int64'), - paddle.static.InputSpec(shape=[None], dtype='int64'), - paddle.static.InputSpec(shape=[1, None, 512], dtype='float32')]) + # @jit.to_static(input_spec=[ + # paddle.static.InputSpec(shape=[None, None], dtype='int64'), + # paddle.static.InputSpec(shape=[None], dtype='int64'), + # paddle.static.InputSpec(shape=[1, None, 512], dtype='float32')]) def forward_attention_decoder( self, hyps: paddle.Tensor, @@ -941,8 +927,9 @@ class U2InferModel(U2Model): super().__init__(configs) @jit.to_static(input_spec=[ - paddle.static.InputSpec(shape=[1, 1, 1], dtype='int64'), - paddle.static.InputSpec(shape=[1], dtype='int64')]) + paddle.static.InputSpec(shape=[1, 1, 1], dtype='int64'), + paddle.static.InputSpec(shape=[1], dtype='int64') + ]) def forward(self, feats, feats_lengths, @@ -958,6 +945,7 @@ class U2InferModel(U2Model): Returns: List[List[int]]: best path result """ + # dummy code for dy2st # return self.ctc_greedy_search( # feats, # feats_lengths, From d3572be0bb37cd2265691bbfe73c6c550d33f162 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 1 Aug 2022 08:06:25 +0000 Subject: [PATCH 003/124] add ws export.sh --- examples/wenetspeech/asr1/local/export.sh | 28 +++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100755 examples/wenetspeech/asr1/local/export.sh diff --git a/examples/wenetspeech/asr1/local/export.sh b/examples/wenetspeech/asr1/local/export.sh new file mode 100755 index 000000000..6b646b469 --- 
/dev/null +++ b/examples/wenetspeech/asr1/local/export.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +if [ $# != 3 ];then + echo "usage: $0 config_path ckpt_prefix jit_model_path" + exit -1 +fi + +ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +echo "using $ngpu gpus..." + +config_path=$1 +ckpt_path_prefix=$2 +jit_model_export_path=$3 + +python3 -u ${BIN_DIR}/export.py \ +--ngpu ${ngpu} \ +--config ${config_path} \ +--checkpoint_path ${ckpt_path_prefix} \ +--export_path ${jit_model_export_path} + + +if [ $? -ne 0 ]; then + echo "Failed in export!" + exit 1 +fi + + +exit 0 From 6149daa22142d7be2f252b9590b2728a5ec72a10 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 3 Aug 2022 08:38:43 +0000 Subject: [PATCH 004/124] export ctc_activation --- paddlespeech/s2t/exps/u2/model.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index 141e83bce..fdccdf159 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -478,7 +478,8 @@ class U2Tester(U2Trainer): del input_spec infer_model.eval() - ######################### infer_model.forward_encoder_chunk zero Tensor online ######################## + ######################### infer_model.forward_encoder_chunk zero Tensor online ############ + # TODO: 80(feature dim) be configable input_spec = [ paddle.static.InputSpec(shape=[1, None, 80], dtype='float32'), paddle.static.InputSpec(shape=[1], dtype='int32'), -1, @@ -492,6 +493,7 @@ class U2Tester(U2Trainer): # paddle.jit.save(static_model, self.args.export_path, combine_params=True) ######################### infer_model.forward_attention_decoder ######################## + # TODO: 512(encoder_output) be configable. 1 for B input_spec = [ paddle.static.InputSpec(shape=[None, None], dtype='int64'), paddle.static.InputSpec(shape=[None], dtype='int64'), @@ -501,4 +503,12 @@ class U2Tester(U2Trainer): infer_model.forward_attention_decoder, input_spec=input_spec) # paddle.jit.save(static_model, self.args.export_path, combine_params=True) + ######################### infer_model.ctc_activation ######################## + # TODO: 512(encoder_output) be configable + input_spec = [ + paddle.static.InputSpec(shape=[1, None, 512], dtype='float32') + ] + infer_model.ctc_activation = paddle.jit.to_static( + infer_model.ctc_activation, input_spec=input_spec) + paddle.jit.save(infer_model, './export.jit', combine_params=True) From 05bc25883333d80a7ee1a5ec1314a1b81f57a81c Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 3 Aug 2022 09:17:23 +0000 Subject: [PATCH 005/124] update docstring --- paddlespeech/s2t/models/u2/u2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index ca83ca170..e4c667e00 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -654,7 +654,7 @@ class U2BaseModel(ASRInterface, nn.Layer): Args: xs (paddle.Tensor): encoder output, (B, T, D) Returns: - paddle.Tensor: activation before ctc + paddle.Tensor: activation before ctc. 
(B, Tmax, odim)
         """
         return self.ctc.log_softmax(xs)

From c1fbfe928ec386eefa805c9215a369fc83b9b9fc Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Thu, 4 Aug 2022 03:22:14 +0000
Subject: [PATCH 006/124] add test

---
 paddlespeech/s2t/exps/u2/model.py | 49 +++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py
index fdccdf159..5ce5f50bf 100644
--- a/paddlespeech/s2t/exps/u2/model.py
+++ b/paddlespeech/s2t/exps/u2/model.py
@@ -512,3 +512,52 @@ class U2Tester(U2Trainer):
             infer_model.ctc_activation, input_spec=input_spec)
 
         paddle.jit.save(infer_model, './export.jit', combine_params=True)
+
+        def flatten(out):
+            if isinstance(out, paddle.Tensor):
+                return [out]
+
+            flatten_out = []
+            for var in out:
+                if isinstance(var, (list, tuple)):
+                    flatten_out.extend(flatten(var))
+                else:
+                    flatten_out.append(var)
+            return flatten_out
+
+        xs1 = paddle.rand(shape=[1, 67, 80], dtype='float32')
+        offset = paddle.to_tensor([0], dtype='int32')
+        required_cache_size = -16
+        att_cache = paddle.zeros([0, 0, 0, 0])
+        cnn_cache = paddle.zeros([0, 0, 0, 0])
+
+        # xs, att_cache, cnn_cache = infer_model.forward_encoder_chunk(xs1, offset, required_cache_size, att_cache, cnn_cache)
+        # xs2 = paddle.rand(shape=[1, 67, 80], dtype='float32')
+        # offset = paddle.to_tensor([16], dtype='int32')
+        # out1 = infer_model.forward_encoder_chunk(xs2, offset, required_cache_size, att_cache, cnn_cache)
+        # print(out1)
+
+        xs, att_cache, cnn_cache = infer_model.forward_encoder_chunk(
+            xs1, offset, att_cache, cnn_cache)
+        xs2 = paddle.rand(shape=[1, 67, 80], dtype='float32')
+        offset = paddle.to_tensor([16], dtype='int32')
+        out1 = infer_model.forward_encoder_chunk(xs2, offset, att_cache,
+                                                 cnn_cache)
+        print(out1)
+
+        # from paddle.jit.layer import Layer
+        # layer = Layer()
+        # layer.load('./export.jit', paddle.CPUPlace())
+
+        # offset = paddle.to_tensor([0], dtype='int32')
+        # att_cache = paddle.zeros([0, 0, 0, 0])
+        # cnn_cache=paddle.zeros([0, 0, 0, 0])
+        # xs, att_cache, cnn_cache = layer.forward_encoder_chunk(xs1, offset, att_cache, cnn_cache)
+        # offset = paddle.to_tensor([16], dtype='int32')
+        # out2 = layer.forward_encoder_chunk(xs2, offset, att_cache, cnn_cache)
+        # # print(out2)
+
+        # out1 = flatten(out1)
+        # out2 = flatten(out2)
+        # for i in range(len(out1)):
+        #     print(np.equal(out1[i].numpy(), out2[i].numpy()).all())

From d638325c46e7092fcdb48ee7605c9c79f498bb1f Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Fri, 9 Sep 2022 15:09:29 +0000
Subject: [PATCH 007/124] do not jit-save forward; use slicing for the
 zeros([0,0,0,0]) cache tensor

---
 paddlespeech/s2t/exps/u2/model.py   | 51 +++++++++++++-------------
 paddlespeech/s2t/models/u2/u2.py    |  4 ---
 paddlespeech/s2t/modules/encoder.py |  5 +++--
 3 files changed, 23 insertions(+), 37 deletions(-)

diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py
index 5ce5f50bf..66b95f63c 100644
--- a/paddlespeech/s2t/exps/u2/model.py
+++ b/paddlespeech/s2t/exps/u2/model.py
@@ -482,10 +482,12 @@ class U2Tester(U2Trainer):
         # TODO: 80(feature dim) be configable
         input_spec = [
             paddle.static.InputSpec(shape=[1, None, 80], dtype='float32'),
-            paddle.static.InputSpec(shape=[1], dtype='int32'), -1,
+            paddle.static.InputSpec(shape=[1], dtype='int32'),
+            -1,
             paddle.static.InputSpec(
                 shape=[None, None, None, None],
-                dtype='float32'), paddle.static.InputSpec(
+                dtype='float32'),
+            paddle.static.InputSpec(
                 shape=[None, None, None, None], dtype='float32')
         ]
         infer_model.forward_encoder_chunk =
paddle.jit.to_static( @@ -511,7 +513,7 @@ class U2Tester(U2Trainer): infer_model.ctc_activation = paddle.jit.to_static( infer_model.ctc_activation, input_spec=input_spec) - paddle.jit.save(infer_model, './export.jit', combine_params=True) + paddle.jit.save(infer_model, './export.jit', combine_params=True, skip_forward=True) def flatten(out): if isinstance(out, paddle.Tensor): @@ -531,33 +533,20 @@ class U2Tester(U2Trainer): att_cache = paddle.zeros([0, 0, 0, 0]) cnn_cache = paddle.zeros([0, 0, 0, 0]) - # xs, att_cache, cnn_cache = infer_model.forward_encoder_chunk(xs1, offset, required_cache_size, att_cache, cnn_cache) - # xs2 = paddle.rand(shape=[1, 67, 80], dtype='float32') - # offset = paddle.to_tensor([16], dtype='int32') - # out1 = infer_model.forward_encoder_chunk(xs2, offset, required_cache_size, att_cache, cnn_cache) - # print(out1) - - xs, att_cache, cnn_cache = infer_model.forward_encoder_chunk( - xs1, offset, att_cache, cnn_cache) + xs, att_cache, cnn_cache = infer_model.forward_encoder_chunk(xs1, offset, required_cache_size, att_cache, cnn_cache) xs2 = paddle.rand(shape=[1, 67, 80], dtype='float32') offset = paddle.to_tensor([16], dtype='int32') - out1 = infer_model.forward_encoder_chunk(xs2, offset, att_cache, - cnn_cache) - print(out1) - - # from paddle.jit.layer import Layer - # layer = Layer() - # layer.load('./export.jit', paddle.CPUPlace()) - - # offset = paddle.to_tensor([0], dtype='int32') - # att_cache = paddle.zeros([0, 0, 0, 0]) - # cnn_cache=paddle.zeros([0, 0, 0, 0]) - # xs, att_cache, cnn_cache = layer.forward_encoder_chunk(xs1, offset, att_cache, cnn_cache) - # offset = paddle.to_tensor([16], dtype='int32') - # out2 = layer.forward_encoder_chunk(xs2, offset, att_cache, cnn_cache) - # # print(out2) - - # out1 = flatten(out1) - # out2 = flatten(out2) - # for i in range(len(out1)): - # print(np.equal(out1[i].numpy(), out2[i].numpy()).all()) + out1 = infer_model.forward_encoder_chunk(xs2, offset, required_cache_size, att_cache, cnn_cache) + print('py encoder', out1) + + from paddle.jit.layer import Layer + layer = Layer() + layer.load('./export.jit', paddle.CPUPlace()) + + xs1 = paddle.full([1, 7, 80], 0.1, dtype='float32') + offset = paddle.to_tensor([0], dtype='int32') + att_cache = paddle.zeros([0, 0, 0, 0]) + cnn_cache=paddle.zeros([0, 0, 0, 0]) + func = getattr(layer, 'forward_encoder_chunk') + xs, att_cache, cnn_cache = func(xs1, offset, att_cache, cnn_cache) + print('py static encoder', xs) diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index e4c667e00..a1daccf18 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -924,10 +924,6 @@ class U2InferModel(U2Model): def __init__(self, configs: dict): super().__init__(configs) - @jit.to_static(input_spec=[ - paddle.static.InputSpec(shape=[1, 1, 1], dtype='int64'), - paddle.static.InputSpec(shape=[1], dtype='int64') - ]) def forward(self, feats, feats_lengths, diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py index bff2d69bb..a7919bca4 100644 --- a/paddlespeech/s2t/modules/encoder.py +++ b/paddlespeech/s2t/modules/encoder.py @@ -251,10 +251,11 @@ class BaseEncoder(nn.Layer): for i, layer in enumerate(self.encoders): # att_cache[i:i+1] = (1, head, cache_t1, d_k*2) # cnn_cache[i:i+1] = (1, B=1, hidden-dim, cache_t2) + # zeros([0,0,0,0]) support [i:i+1] slice xs, _, new_att_cache, new_cnn_cache = layer( xs, att_mask, pos_emb, - att_cache=att_cache[i:i+1] if elayers > 0 else att_cache, - cnn_cache=cnn_cache[i:i+1] if 
paddle.shape(cnn_cache)[0] > 0 else cnn_cache,
+                att_cache=att_cache[i:i+1],
+                cnn_cache=cnn_cache[i:i+1], )
             # new_att_cache = (1, head, attention_key_size, d_k*2)
             # new_cnn_cache = (B=1, hidden-dim, cache_t2)

From a7c6c54e75575ffddcae18ae353c858006653cb9 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Fri, 9 Sep 2022 15:20:28 +0000
Subject: [PATCH 008/124] fix: go back to the dygraph encoder.forward_chunk in
 the online engine

---
 .../server/engine/asr/online/python/asr_engine.py | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/paddlespeech/server/engine/asr/online/python/asr_engine.py b/paddlespeech/server/engine/asr/online/python/asr_engine.py
index cd50f157a..e3cbd38f3 100644
--- a/paddlespeech/server/engine/asr/online/python/asr_engine.py
+++ b/paddlespeech/server/engine/asr/online/python/asr_engine.py
@@ -80,10 +80,7 @@ class PaddleASRConnectionHanddler:
         self.init_decoder()
         self.reset()
 
-        from paddle.jit.layer import Layer
-        self.jit_layer = Layer()
-        self.jit_layer.load('/workspace/conformer/PaddleSpeech-conformer/conformer/conformer', paddle.CUDAPlace(1))
 
     def init_decoder(self):
         if "deepspeech2" in self.model_type:
@@ -478,16 +475,10 @@ class PaddleASRConnectionHanddler:
             # cur chunk
             chunk_xs = self.cached_feat[:, cur:end, :]
             # forward chunk
-            # (y, self.att_cache, self.cnn_cache) = self.model.encoder.forward_chunk(
-            #     chunk_xs, self.offset, required_cache_size,
-            #     self.att_cache, self.cnn_cache)
-
-            (y, self.att_cache, self.cnn_cache) = self.jit_layer.forward_encoder_chunk(
-                chunk_xs,
-                paddle.to_tensor([self.offset], dtype='int32'),
-                self.att_cache,
-                self.cnn_cache)
-
+            (y, self.att_cache, self.cnn_cache) = self.model.encoder.forward_chunk(
+                chunk_xs, self.offset, required_cache_size,
+                self.att_cache, self.cnn_cache)
 
             outputs.append(y)

From 63aeb747b0be474140fc4b9f6808403b05d1cf84 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Fri, 9 Sep 2022 15:29:55 +0000
Subject: [PATCH 009/124] add more comments

---
 paddlespeech/s2t/exps/u2/model.py   | 10 +++++-----
 paddlespeech/s2t/modules/encoder.py |  7 ++++++-
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py
index 66b95f63c..1d813761d 100644
--- a/paddlespeech/s2t/exps/u2/model.py
+++ b/paddlespeech/s2t/exps/u2/model.py
@@ -492,10 +492,9 @@ class U2Tester(U2Trainer):
         ]
         infer_model.forward_encoder_chunk = paddle.jit.to_static(
             infer_model.forward_encoder_chunk, input_spec=input_spec)
-        # paddle.jit.save(static_model, self.args.export_path, combine_params=True)
 
         ######################### infer_model.forward_attention_decoder ########################
-        # TODO: 512(encoder_output) be configable.
1 for BatchSize input_spec = [ paddle.static.InputSpec(shape=[None, None], dtype='int64'), paddle.static.InputSpec(shape=[None], dtype='int64'), @@ -503,7 +502,6 @@ class U2Tester(U2Trainer): ] infer_model.forward_attention_decoder = paddle.jit.to_static( infer_model.forward_attention_decoder, input_spec=input_spec) - # paddle.jit.save(static_model, self.args.export_path, combine_params=True) ######################### infer_model.ctc_activation ######################## # TODO: 512(encoder_output) be configable @@ -513,8 +511,10 @@ class U2Tester(U2Trainer): infer_model.ctc_activation = paddle.jit.to_static( infer_model.ctc_activation, input_spec=input_spec) - paddle.jit.save(infer_model, './export.jit', combine_params=True, skip_forward=True) + # jit save + paddle.jit.save(infer_model, self.args.export_path, combine_params=True, skip_forward=True) + # test dy2static def flatten(out): if isinstance(out, paddle.Tensor): return [out] @@ -541,7 +541,7 @@ class U2Tester(U2Trainer): from paddle.jit.layer import Layer layer = Layer() - layer.load('./export.jit', paddle.CPUPlace()) + layer.load(self.args.export_path, paddle.CPUPlace()) xs1 = paddle.full([1, 7, 80], 0.1, dtype='float32') offset = paddle.to_tensor([0], dtype='int32') diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py index a7919bca4..230894d50 100644 --- a/paddlespeech/s2t/modules/encoder.py +++ b/paddlespeech/s2t/modules/encoder.py @@ -251,7 +251,12 @@ class BaseEncoder(nn.Layer): for i, layer in enumerate(self.encoders): # att_cache[i:i+1] = (1, head, cache_t1, d_k*2) # cnn_cache[i:i+1] = (1, B=1, hidden-dim, cache_t2) - # zeros([0,0,0,0]) support [i:i+1] slice + + # WARNING: eliminate if-else cond op in graph + # tensor zeros([0,0,0,0]) support [i:i+1] slice, will return zeros([0,0,0,0]) tensor + # raw code as below: + # att_cache=att_cache[i:i+1] if elayers > 0 else att_cache, + # cnn_cache=cnn_cache[i:i+1] if paddle.shape(cnn_cache)[0] > 0 else cnn_cache, xs, _, new_att_cache, new_cnn_cache = layer( xs, att_mask, pos_emb, att_cache=att_cache[i:i+1], From 1c9f238ba09e55b26b3b0c46033436ed27eb9613 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 9 Sep 2022 15:45:26 +0000 Subject: [PATCH 010/124] configurable export --- paddlespeech/s2t/exps/u2/model.py | 37 +++++++++++++++++++------------ 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index 1d813761d..45fbcb404 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -462,31 +462,37 @@ class U2Tester(U2Trainer): infer_model = U2InferModel.from_pretrained(self.test_loader, self.config.clone(), self.args.checkpoint_path) + + batch_size = 1 feat_dim = self.test_loader.feat_dim - input_spec = [ - paddle.static.InputSpec(shape=[1, None, feat_dim], - dtype='float32'), # audio, [B,T,D] - paddle.static.InputSpec(shape=[1], - dtype='int64'), # audio_length, [B] - ] - return infer_model, input_spec + model_size = 512 + num_left_chunks = -1 + + return infer_model, (batch_size, feat_dim, model_size, num_left_chunks) @paddle.no_grad() def export(self): infer_model, input_spec = self.load_inferspec() - assert isinstance(input_spec, list), type(input_spec) - del input_spec infer_model.eval() - ######################### infer_model.forward_encoder_chunk zero Tensor online ############ + assert isinstance(input_spec, list), type(input_spec) + batch_size, feat_dim, model_size, num_left_chunks = input_spec + + + ######################### 
infer_model.forward_encoder_chunk zero tensor online ############ # TODO: 80(feature dim) be configable input_spec = [ - paddle.static.InputSpec(shape=[1, None, 80], dtype='float32'), + # xs, (B, T, D) + paddle.static.InputSpec(shape=[batch_size, None, feat_dim], dtype='float32'), + # offset, int, but need be tensor paddle.static.InputSpec(shape=[1], dtype='int32'), - -1, + # required_cache_size, int + num_left_chunks, + # att_cache paddle.static.InputSpec( shape=[None, None, None, None], dtype='float32'), + # cnn_cache paddle.static.InputSpec( shape=[None, None, None, None], dtype='float32') ] @@ -496,9 +502,12 @@ class U2Tester(U2Trainer): ######################### infer_model.forward_attention_decoder ######################## # TODO: 512(encoder_output) be configable. 1 for BatchSize input_spec = [ + # hyps, (B, U) paddle.static.InputSpec(shape=[None, None], dtype='int64'), + # hyps_lens, (B,) paddle.static.InputSpec(shape=[None], dtype='int64'), - paddle.static.InputSpec(shape=[1, None, 512], dtype='float32') + # encoder_out, (B,T,D) + paddle.static.InputSpec(shape=[batch_size, None, model_size], dtype='float32') ] infer_model.forward_attention_decoder = paddle.jit.to_static( infer_model.forward_attention_decoder, input_spec=input_spec) @@ -529,7 +538,7 @@ class U2Tester(U2Trainer): xs1 = paddle.rand(shape=[1, 67, 80], dtype='float32') offset = paddle.to_tensor([0], dtype='int32') - required_cache_size = -16 + required_cache_size = num_left_chunks att_cache = paddle.zeros([0, 0, 0, 0]) cnn_cache = paddle.zeros([0, 0, 0, 0]) From 3a8869fba496ecfbb153a094feae18ac1ce28fc9 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 9 Sep 2022 15:50:11 +0000 Subject: [PATCH 011/124] rm to_static decarator; configure jit save for ctc_activation --- paddlespeech/s2t/exps/u2/model.py | 4 ++-- paddlespeech/s2t/models/u2/u2.py | 12 ++---------- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index 45fbcb404..dae618db6 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -513,9 +513,9 @@ class U2Tester(U2Trainer): infer_model.forward_attention_decoder, input_spec=input_spec) ######################### infer_model.ctc_activation ######################## - # TODO: 512(encoder_output) be configable input_spec = [ - paddle.static.InputSpec(shape=[1, None, 512], dtype='float32') + # encoder_out, (B,T,D) + paddle.static.InputSpec(shape=[batch_size, None, model_size], dtype='float32') ] infer_model.ctc_activation = paddle.jit.to_static( infer_model.ctc_activation, input_spec=input_spec) diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index a1daccf18..149170ed6 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -599,12 +599,7 @@ class U2BaseModel(ASRInterface, nn.Layer): """ return self.eos - # @jit.to_static(input_spec=[ - # paddle.static.InputSpec(shape=[1, None, 80], dtype='float32'), - # paddle.static.InputSpec(shape=[1], dtype='int32'), - # -1, - # paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32'), - # paddle.static.InputSpec(shape=[None, None, None, None], dtype='float32')]) + # @jit.to_static def forward_encoder_chunk( self, xs: paddle.Tensor, @@ -658,10 +653,7 @@ class U2BaseModel(ASRInterface, nn.Layer): """ return self.ctc.log_softmax(xs) - # @jit.to_static(input_spec=[ - # paddle.static.InputSpec(shape=[None, None], dtype='int64'), - # paddle.static.InputSpec(shape=[None], dtype='int64'), - 
# paddle.static.InputSpec(shape=[1, None, 512], dtype='float32')]) + # @jit.to_static def forward_attention_decoder( self, hyps: paddle.Tensor, From 67709155e9f17e03579c7360882e2e92b65ad7c1 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 13 Sep 2022 08:29:21 +0000 Subject: [PATCH 012/124] add chunk conformer config from release model --- .../asr1/conf/chunk_conformer.yaml | 99 +++++++++++++++++++ .../wenetspeech/asr1/conf/preprocess.yaml | 2 +- .../asr1/conf/tuning/chunk_decode.yaml | 11 +++ 3 files changed, 111 insertions(+), 1 deletion(-) create mode 100644 examples/wenetspeech/asr1/conf/chunk_conformer.yaml create mode 100644 examples/wenetspeech/asr1/conf/tuning/chunk_decode.yaml diff --git a/examples/wenetspeech/asr1/conf/chunk_conformer.yaml b/examples/wenetspeech/asr1/conf/chunk_conformer.yaml new file mode 100644 index 000000000..69fa223a1 --- /dev/null +++ b/examples/wenetspeech/asr1/conf/chunk_conformer.yaml @@ -0,0 +1,99 @@ +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 512 # dimension of attention + attention_heads: 8 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + use_cnn_module: True + cnn_module_kernel: 15 + activation_type: swish + pos_enc_layer_type: rel_pos + selfattention_layer_type: rel_selfattn + causal: true + use_dynamic_chunk: true + cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster + use_dynamic_left_chunk: false +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 8 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 + +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false + init_type: 'kaiming_uniform' + +# https://yaml.org/type/float.html +########################################### +# Data # +########################################### +train_manifest: data/train_l/data.list +dev_manifest: data/dev/data.list +test_manifest: data/test_meeting/data.list + +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'char' +preprocess_config: conf/preprocess.yaml +spm_model_prefix: '' +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 32 +do_filter: True +maxlen_in: 1200 # if do_filter == False && input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 100 # if do_filter == False && output length > maxlen-out, batchsize is automatically reduced +minlen_in: 10 +minlen_out: 0 +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 + + +########################################### +# Training # +########################################### +n_epoch: 26 +accum_grad: 32 +global_grad_clip: 5.0 +dist_sampler: True +log_interval: 1 
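+# note: gradients are accumulated over accum_grad batches before each
+# optimizer update, so the effective batch size is roughly
+# batch_size * accum_grad (* ngpu when data-parallel), i.e. 32 * 32
+# utterances per update per device with this config.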
+checkpoint: + kbest_n: 50 + latest_n: 5 +optim: adam +optim_conf: + lr: 0.001 + weight_decay: 1.0e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 5000 + lr_decay: 1.0 diff --git a/examples/wenetspeech/asr1/conf/preprocess.yaml b/examples/wenetspeech/asr1/conf/preprocess.yaml index f7f4c58d5..c7ccc522d 100644 --- a/examples/wenetspeech/asr1/conf/preprocess.yaml +++ b/examples/wenetspeech/asr1/conf/preprocess.yaml @@ -5,7 +5,7 @@ process: n_mels: 80 n_shift: 160 win_length: 400 - dither: 0.1 + dither: 1.0 - type: cmvn_json cmvn_path: data/mean_std.json # these three processes are a.k.a. SpecAugument diff --git a/examples/wenetspeech/asr1/conf/tuning/chunk_decode.yaml b/examples/wenetspeech/asr1/conf/tuning/chunk_decode.yaml new file mode 100644 index 000000000..7e8afb7a8 --- /dev/null +++ b/examples/wenetspeech/asr1/conf/tuning/chunk_decode.yaml @@ -0,0 +1,11 @@ +beam_size: 10 +decode_batch_size: 128 +error_rate_type: cer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: 16 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: True # simulate streaming inference. Defaults to False. From 8690a00bd8d66c7d1358a8ac370967ddb4bd1ec5 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 13 Sep 2022 09:54:48 +0000 Subject: [PATCH 013/124] add feature pipeline layer(cmvn, fbank), but to_static and jit.layer output is not equal --- paddlespeech/audio/compliance/kaldi.py | 22 +++---- paddlespeech/s2t/exps/u2/bin/test_wav.py | 3 + paddlespeech/s2t/exps/u2/model.py | 75 ++++++++++++++++-------- paddlespeech/s2t/models/u2/u2.py | 58 ++++++++++++++++++ paddlespeech/s2t/modules/cmvn.py | 10 +++- paddlespeech/s2t/modules/fbank.py | 74 +++++++++++++++++++++++ 6 files changed, 206 insertions(+), 36 deletions(-) create mode 100644 paddlespeech/s2t/modules/fbank.py diff --git a/paddlespeech/audio/compliance/kaldi.py b/paddlespeech/audio/compliance/kaldi.py index 538be0196..beb2d86b9 100644 --- a/paddlespeech/audio/compliance/kaldi.py +++ b/paddlespeech/audio/compliance/kaldi.py @@ -74,16 +74,16 @@ def _feature_window_function( window_size: int, blackman_coeff: float, dtype: int, ) -> Tensor: - if window_type == HANNING: + if window_type == "hann": return get_window('hann', window_size, fftbins=False, dtype=dtype) - elif window_type == HAMMING: + elif window_type == "hamming": return get_window('hamming', window_size, fftbins=False, dtype=dtype) - elif window_type == POVEY: + elif window_type == "povey": return get_window( 'hann', window_size, fftbins=False, dtype=dtype).pow(0.85) - elif window_type == RECTANGULAR: + elif window_type == "rect": return paddle.ones([window_size], dtype=dtype) - elif window_type == BLACKMAN: + elif window_type == "blackman": a = 2 * math.pi / (window_size - 1) window_function = paddle.arange(window_size, dtype=dtype) return (blackman_coeff - 0.5 * paddle.cos(a * window_function) + @@ -216,7 +216,7 @@ def spectrogram(waveform: Tensor, sr: int=16000, snip_edges: bool=True, subtract_mean: bool=False, - window_type: str=POVEY) -> Tensor: + window_type: str="povey") -> Tensor: """Compute and return a spectrogram from a waveform. The output is identical to Kaldi's. 
Args: @@ -236,7 +236,7 @@ def spectrogram(waveform: Tensor, snip_edges (bool, optional): Drop samples in the end of waveform that cann't fit a singal frame when it is set True. Otherwise performs reflect padding to the end of waveform. Defaults to True. subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False. - window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY. + window_type (str, optional): Choose type of window for FFT computation. Defaults to "povey". Returns: Tensor: A spectrogram tensor with shape `(m, padded_window_size // 2 + 1)` where m is the number of frames @@ -418,11 +418,11 @@ def fbank(waveform: Tensor, vtln_high: float=-500.0, vtln_low: float=100.0, vtln_warp: float=1.0, - window_type: str=POVEY) -> Tensor: + window_type: str="povey") -> Tensor: """Compute and return filter banks from a waveform. The output is identical to Kaldi's. Args: - waveform (Tensor): A waveform tensor with shape `(C, T)`. + waveform (Tensor): A waveform tensor with shape `(C, T)`. `C` is in the range [0,1]. blackman_coeff (float, optional): Coefficient for Blackman window.. Defaults to 0.42. channel (int, optional): Select the channel of waveform. Defaults to -1. dither (float, optional): Dithering constant . Defaults to 0.0. @@ -448,7 +448,7 @@ def fbank(waveform: Tensor, vtln_high (float, optional): High inflection point in piecewise linear VTLN warping function. Defaults to -500.0. vtln_low (float, optional): Low inflection point in piecewise linear VTLN warping function. Defaults to 100.0. vtln_warp (float, optional): Vtln warp factor. Defaults to 1.0. - window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY. + window_type (str, optional): Choose type of window for FFT computation. Defaults to "povey". Returns: Tensor: A filter banks tensor with shape `(m, n_mels)`. @@ -537,7 +537,7 @@ def mfcc(waveform: Tensor, vtln_high: float=-500.0, vtln_low: float=100.0, vtln_warp: float=1.0, - window_type: str=POVEY) -> Tensor: + window_type: str="povey") -> Tensor: """Compute and return mel frequency cepstral coefficients from a waveform. The output is identical to Kaldi's. 
diff --git a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py
index 887ec7a6d..c04e3ae47 100644
--- a/paddlespeech/s2t/exps/u2/bin/test_wav.py
+++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py
@@ -18,6 +18,7 @@ from pathlib import Path
 
 import paddle
 import soundfile
+import numpy as np
 from yacs.config import CfgNode
 
 from paddlespeech.audio.transform.transformation import Transformation
@@ -77,6 +78,8 @@ class U2Infer():
             feat = self.preprocessing(audio, **self.preprocess_args)
             logger.info(f"feat shape: {feat.shape}")
 
+            np.savetxt("feat.transform.txt", feat)
+
             ilen = paddle.to_tensor(feat.shape[0])
             xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(axis=0)
             decode_config = self.config.decode
diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py
index dae618db6..ee4df9cb9 100644
--- a/paddlespeech/s2t/exps/u2/model.py
+++ b/paddlespeech/s2t/exps/u2/model.py
@@ -474,13 +474,20 @@ class U2Tester(U2Trainer):
     def export(self):
         infer_model, input_spec = self.load_inferspec()
         infer_model.eval()
+        paddle.set_device('cpu')
 
-        assert isinstance(input_spec, list), type(input_spec)
+        assert isinstance(input_spec, (list, tuple)), type(input_spec)
         batch_size, feat_dim, model_size, num_left_chunks = input_spec
 
-        ######################### infer_model.forward_encoder_chunk zero tensor online ############
-        # TODO: 80(feature dim) be configable
+        ######################## infer_model.forward_feature ############
+        input_spec = [
+            # (T,), int16
+            paddle.static.InputSpec(shape=[None], dtype='int16'),
+        ]
+        infer_model.forward_feature = paddle.jit.to_static(infer_model.forward_feature, input_spec=input_spec)
+
+        ######################### infer_model.forward_encoder_chunk ############
         input_spec = [
             # xs, (B, T, D)
             paddle.static.InputSpec(shape=[batch_size, None, feat_dim], dtype='float32'),
@@ -499,8 +506,16 @@ class U2Tester(U2Trainer):
         infer_model.forward_encoder_chunk = paddle.jit.to_static(
             infer_model.forward_encoder_chunk, input_spec=input_spec)
 
+        ######################### infer_model.ctc_activation ########################
+        input_spec = [
+            # encoder_out, (B,T,D)
+            paddle.static.InputSpec(shape=[batch_size, None, model_size], dtype='float32')
+        ]
+        infer_model.ctc_activation = paddle.jit.to_static(
+            infer_model.ctc_activation, input_spec=input_spec)
+
+
         ######################### infer_model.forward_attention_decoder ########################
-        # TODO: 512(encoder_output) be configable. 1 for BatchSize
         input_spec = [
             # hyps, (B, U)
             paddle.static.InputSpec(shape=[None, None], dtype='int64'),
@@ -512,17 +527,11 @@ class U2Tester(U2Trainer):
         infer_model.forward_attention_decoder = paddle.jit.to_static(
             infer_model.forward_attention_decoder, input_spec=input_spec)
 
-        ######################### infer_model.ctc_activation ########################
-        input_spec = [
-            # encoder_out, (B,T,D)
-            paddle.static.InputSpec(shape=[batch_size, None, model_size], dtype='float32')
-        ]
-        infer_model.ctc_activation = paddle.jit.to_static(
-            infer_model.ctc_activation, input_spec=input_spec)
-
         # jit save
+        logger.info(f"export save: {self.args.export_path}")
         paddle.jit.save(infer_model, self.args.export_path, combine_params=True, skip_forward=True)
 
+
         # test dy2static
         def flatten(out):
             if isinstance(out, paddle.Tensor):
                 return [out]
@@ -536,26 +545,44 @@ class U2Tester(U2Trainer):
                     flatten_out.append(var)
             return flatten_out
 
-        xs1 = paddle.rand(shape=[1, 67, 80], dtype='float32')
+        # forward_encoder_chunk dygraph
+        xs1 = paddle.full([1, 67, 80], 0.1, dtype='float32')
         offset = paddle.to_tensor([0], dtype='int32')
         required_cache_size = num_left_chunks
         att_cache = paddle.zeros([0, 0, 0, 0])
         cnn_cache = paddle.zeros([0, 0, 0, 0])
-
-        xs, att_cache, cnn_cache = infer_model.forward_encoder_chunk(xs1, offset, required_cache_size, att_cache, cnn_cache)
-        xs2 = paddle.rand(shape=[1, 67, 80], dtype='float32')
-        offset = paddle.to_tensor([16], dtype='int32')
-        out1 = infer_model.forward_encoder_chunk(xs2, offset, required_cache_size, att_cache, cnn_cache)
-        print('py encoder', out1)
-
+        xs_d, att_cache_d, cnn_cache_d = infer_model.forward_encoder_chunk(xs1, offset, required_cache_size, att_cache, cnn_cache)
+
+        import soundfile
+        audio, sample_rate = soundfile.read(
+            './zh.wav', dtype="int16", always_2d=True)
+        audio = audio[:, 0]
+        logger.info(f"audio shape: {audio.shape}")
+        audio = paddle.to_tensor(audio, paddle.int16)
+        feat_d = infer_model.forward_feature(audio)
+        logger.info(f"{feat_d}")
+        np.savetxt("feat.tostatic.txt", feat_d)
+
+
+        # load static model
         from paddle.jit.layer import Layer
         layer = Layer()
         layer.load(self.args.export_path, paddle.CPUPlace())
 
+        # forward_encoder_chunk static
-        xs1 = paddle.full([1, 7, 80], 0.1, dtype='float32')
+        xs1 = paddle.full([1, 67, 80], 0.1, dtype='float32')
         offset = paddle.to_tensor([0], dtype='int32')
         att_cache = paddle.zeros([0, 0, 0, 0])
-        cnn_cache=paddle.zeros([0, 0, 0, 0])
+        cnn_cache = paddle.zeros([0, 0, 0, 0])
         func = getattr(layer, 'forward_encoder_chunk')
-        xs, att_cache, cnn_cache = func(xs1, offset, att_cache, cnn_cache)
-        print('py static encoder', xs)
+        xs_s, att_cache_s, cnn_cache_s = func(xs1, offset, att_cache, cnn_cache)
+        np.testing.assert_allclose(xs_d, xs_s, atol=1e-5)
+        np.testing.assert_allclose(att_cache_d, att_cache_s, atol=1e-4)
+        np.testing.assert_allclose(cnn_cache_d, cnn_cache_s, atol=1e-4)
+        # logger.info(f"forward_encoder_chunk output: {xs_s}")
+
+        # forward_feature static
+        func = getattr(layer, 'forward_feature')
+        feat_s = func(audio)[0]
+        logger.info(f"{feat_s}")
+        np.testing.assert_allclose(feat_d, feat_s, atol=1e-5)
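The exported functions above are meant to be driven chunk by chunk at inference time. A minimal sketch of the calling convention, under stated assumptions: the export path and the `feature_chunks` iterable are illustrative, `required_cache_size` is baked into the graph at export so the static function takes only four arguments, and jit.layer functions return sequences (hence the `[0]`):

    import paddle
    from paddle.jit.layer import Layer

    layer = Layer()
    layer.load('exp/conformer/export', paddle.CPUPlace())  # illustrative path

    offset = paddle.to_tensor([0], dtype='int32')
    att_cache = paddle.zeros([0, 0, 0, 0])  # empty caches on the first chunk
    cnn_cache = paddle.zeros([0, 0, 0, 0])
    for chunk in feature_chunks:            # each chunk: (1, T, 80) fbank features
        xs, att_cache, cnn_cache = layer.forward_encoder_chunk(
            chunk, offset, att_cache, cnn_cache)
        probs = layer.ctc_activation(xs)[0]  # per-frame CTC posteriors
        offset += xs.shape[1]                # advance by the emitted encoder frames
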
diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py
index 149170ed6..d7b8630a3 100644
--- a/paddlespeech/s2t/models/u2/u2.py
+++ b/paddlespeech/s2t/models/u2/u2.py
@@ -916,6 +916,50 @@ class U2InferModel(U2Model):
 
     def __init__(self, configs: dict):
         super().__init__(configs)
+        from paddlespeech.s2t.modules.fbank import KaldiFbank
+        import yaml
+        import json
+        import numpy as np
+
+        input_dim = configs['input_dim']
+        process = configs['preprocess_config']
+        with open(process, encoding="utf-8") as f:
+            conf = yaml.safe_load(f)
+            assert isinstance(conf, dict), type(conf)
+
+        for idx, process in enumerate(conf['process']):
+            assert isinstance(process, dict), type(process)
+            opts = dict(process)
+            process_type = opts.pop("type")
+
+            if process_type == 'fbank_kaldi':
+                opts.update({'n_mels': input_dim})
+                opts['dither'] = 0.0
+                self.fbank = KaldiFbank(
+                    **opts
+                )
+                logger.info(f"{self.__class__.__name__} export: {self.fbank}")
+            if process_type == 'cmvn_json':
+                # align with paddlespeech.audio.transform.cmvn:GlobalCMVN
+                std_floor = 1.0e-20
+
+                cmvn = opts['cmvn_path']
+                if isinstance(cmvn, dict):
+                    cmvn_stats = cmvn
+                else:
+                    with open(cmvn) as f:
+                        cmvn_stats = json.load(f)
+                count = cmvn_stats['frame_num']
+                mean = np.array(cmvn_stats['mean_stat']) / count
+                square_sums = np.array(cmvn_stats['var_stat'])
+                var = square_sums / count - mean**2
+                std = np.maximum(np.sqrt(var), std_floor)
+                istd = 1.0 / std
+                self.global_cmvn = GlobalCMVN(
+                    paddle.to_tensor(mean, dtype=paddle.float),
+                    paddle.to_tensor(istd, dtype=paddle.float))
+                logger.info(f"{self.__class__.__name__} export: {self.global_cmvn}")
+
     def forward(self,
                 feats,
                 feats_lengths,
@@ -939,3 +983,17 @@ class U2InferModel(U2Model):
             #     num_decoding_left_chunks=num_decoding_left_chunks,
             #     simulate_streaming=simulate_streaming)
         return feats, feats_lengths
+
+    def forward_feature(self, x):
+        """feature pipeline.
+
+        Args:
+            x (paddle.Tensor): waveform (T,).
+
+        Return:
+            feat (paddle.Tensor): feature (T, D)
+        """
+        x = paddle.cast(x, paddle.float32)
+        feat = self.fbank(x)
+        feat = self.global_cmvn(feat)
+        return feat
\ No newline at end of file
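With this change the whole front end (int16 waveform -> float32 -> fbank -> global CMVN) lives inside the model graph. A sketch of the dygraph path, assuming `infer_model` is an already constructed `U2InferModel` (the wav path is illustrative):

    import paddle
    import soundfile

    audio, sample_rate = soundfile.read('zh.wav', dtype='int16', always_2d=True)
    wav = paddle.to_tensor(audio[:, 0], dtype=paddle.int16)  # mono, (T,)
    feat = infer_model.forward_feature(wav)                  # (T', n_mels), fbank + CMVN
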
diff --git a/paddlespeech/s2t/modules/cmvn.py b/paddlespeech/s2t/modules/cmvn.py
index 67f71b667..53c508f1a 100644
--- a/paddlespeech/s2t/modules/cmvn.py
+++ b/paddlespeech/s2t/modules/cmvn.py
@@ -40,6 +40,14 @@ class GlobalCMVN(nn.Layer):
         self.register_buffer("mean", mean)
         self.register_buffer("istd", istd)
 
+    def __repr__(self):
+        return (
+            "{name}(mean={mean}, istd={istd}, norm_var={norm_var})".format(
+                name=self.__class__.__name__,
+                mean=self.mean,
+                istd=self.istd,
+                norm_var=self.norm_var))
+
     def forward(self, x: paddle.Tensor):
         """
         Args:
@@ -50,4 +58,4 @@ class GlobalCMVN(nn.Layer):
         x = x - self.mean
         if self.norm_var:
             x = x * self.istd
-        return x
+        return x
\ No newline at end of file
diff --git a/paddlespeech/s2t/modules/fbank.py b/paddlespeech/s2t/modules/fbank.py
new file mode 100644
index 000000000..4ec620a79
--- /dev/null
+++ b/paddlespeech/s2t/modules/fbank.py
@@ -0,0 +1,74 @@
+
+
+
+import paddle
+from paddle import nn
+
+from paddlespeech.audio.compliance import kaldi
+
+from paddlespeech.s2t.utils.log import Log
+
+logger = Log(__name__).getlog()
+
+__all__ = ['KaldiFbank']
+
+class KaldiFbank(nn.Layer):
+    def __init__(self,
+                 fs=16000,
+                 n_mels=80,
+                 n_shift=160,  # unit:sample, 10ms
+                 win_length=400,  # unit:sample, 25ms
+                 energy_floor=0.0,
+                 dither=0.0):
+        """
+        Args:
+            fs (int): sample rate of the audio
+            n_mels (int): number of mel filter banks
+            n_shift (int): number of points in a frame shift
+            win_length (int): number of points in a frame windows
+            energy_floor (float): Floor on energy in Spectrogram computation (absolute)
+            dither (float): Dithering constant. Default 0.0
+        """
+        super().__init__()
+        self.fs = fs
+        self.n_mels = n_mels
+        num_point_ms = fs / 1000
+        self.n_frame_length = win_length / num_point_ms
+        self.n_frame_shift = n_shift / num_point_ms
+        self.energy_floor = energy_floor
+        self.dither = dither
+
+    def __repr__(self):
+        return (
+            "{name}(fs={fs}, n_mels={n_mels}, "
+            "n_frame_shift={n_frame_shift}, n_frame_length={n_frame_length}, "
+            "dither={dither})".format(
+                name=self.__class__.__name__,
+                fs=self.fs,
+                n_mels=self.n_mels,
+                n_frame_shift=self.n_frame_shift,
+                n_frame_length=self.n_frame_length,
+                dither=self.dither, ))
+
+    def forward(self, x: paddle.Tensor):
+        """
+        Args:
+            x (paddle.Tensor): shape (Ti).
+                Does not support [Time, Channel] or batched input.
+
+        Returns:
+            paddle.Tensor: (T, D)
+        """
+        assert x.ndim == 1
+
+        feat = kaldi.fbank(
+            x.unsqueeze(0),  # append channel dim, (C, Ti)
+            n_mels=self.n_mels,
+            frame_length=self.n_frame_length,
+            frame_shift=self.n_frame_shift,
+            dither=self.dither,
+            energy_floor=self.energy_floor,
+            sr=self.fs)
+
+        assert feat.ndim == 2  # (T,D)
+        return feat
From 0d7d87120b79b71259a2d42c8a33f0e93adf67ee Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Wed, 14 Sep 2022 16:44:12 +0000
Subject: [PATCH 014/124] simplify feature pipeline graph

---
 paddlespeech/audio/compliance/kaldi.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/paddlespeech/audio/compliance/kaldi.py b/paddlespeech/audio/compliance/kaldi.py
index beb2d86b9..24415058c 100644
--- a/paddlespeech/audio/compliance/kaldi.py
+++ b/paddlespeech/audio/compliance/kaldi.py
@@ -357,10 +357,13 @@ def _get_mel_banks(num_bins: int,
         ('Bad values in options: vtln-low {} and vtln-high {}, versus '
          'low-freq {} and high-freq {}'.format(vtln_low, vtln_high, low_freq,
                                                high_freq))
-    bin = paddle.arange(num_bins).unsqueeze(1)
+    bin = paddle.arange(num_bins, dtype=paddle.float32).unsqueeze(1)
+    # left_mel = mel_low_freq + bin * mel_freq_delta  # (num_bins, 1)
+    # center_mel = mel_low_freq + (bin + 1.0) * mel_freq_delta  # (num_bins, 1)
+    # right_mel = mel_low_freq + (bin + 2.0) * mel_freq_delta  # (num_bins, 1)
     left_mel = mel_low_freq + bin * mel_freq_delta  # (num_bins, 1)
-    center_mel = mel_low_freq + (bin + 1.0) * mel_freq_delta  # (num_bins, 1)
-    right_mel = mel_low_freq + (bin + 2.0) * mel_freq_delta  # (num_bins, 1)
+    center_mel = left_mel + mel_freq_delta
+    right_mel = center_mel + mel_freq_delta
 
     if vtln_warp_factor != 1.0:
         left_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq,
@@ -373,7 +376,7 @@ def _get_mel_banks(num_bins: int,
     center_freqs = _inverse_mel_scale(center_mel)  # (num_bins)
 
     # (1, num_fft_bins)
-    mel = _mel_scale(fft_bin_width * paddle.arange(num_fft_bins)).unsqueeze(0)
+    mel = _mel_scale(fft_bin_width * paddle.arange(num_fft_bins, dtype=paddle.float32)).unsqueeze(0)
 
     # (num_bins, num_fft_bins)
     up_slope = (mel - left_mel) / (center_mel - left_mel)
@@ -472,7 +475,8 @@ def fbank(waveform: Tensor,
     # (n_mels, padded_window_size // 2)
     mel_energies, _ = _get_mel_banks(n_mels, padded_window_size, sr, low_freq,
                                      high_freq, vtln_low, vtln_high, vtln_warp)
-    mel_energies = mel_energies.astype(dtype)
+    # mel_energies = mel_energies.astype(dtype)
+    assert mel_energies.dtype == dtype
 
     # (n_mels, padded_window_size // 2 + 1)
     mel_energies = paddle.nn.functional.pad(
From 5e714ecb4a40561c2a2e6a54ff8c4d787cea4ec4 Mon Sep 17 00:00:00 2001
From: TianYuan
Date: Mon, 19 Sep 2022 18:35:08 +0800
Subject: [PATCH 015/124] [doc]update api docs (#2406)

* update api docs, test=doc
---
docs/source/api/paddlespeech.audio.rst | 3 + ...ddlespeech.audio.streamdata.autodecode.rst | 7 + .../paddlespeech.audio.streamdata.cache.rst | 7 + .../paddlespeech.audio.streamdata.compat.rst | 7 + ...espeech.audio.streamdata.extradatasets.rst | 7 + .../paddlespeech.audio.streamdata.filters.rst | 7 + .../paddlespeech.audio.streamdata.gopen.rst | 7 + ...paddlespeech.audio.streamdata.handlers.rst | 7 + .../api/paddlespeech.audio.streamdata.mix.rst | 7 + ...lespeech.audio.streamdata.paddle_utils.rst | 7 + ...paddlespeech.audio.streamdata.pipeline.rst | 7 + .../api/paddlespeech.audio.streamdata.rst | 28 ++ ...ddlespeech.audio.streamdata.shardlists.rst | 7 + ...lespeech.audio.streamdata.tariterators.rst | 7 + .../paddlespeech.audio.streamdata.utils.rst | 7 + .../paddlespeech.audio.streamdata.writer.rst | 7 + docs/source/api/paddlespeech.audio.text.rst | 16 + ...addlespeech.audio.text.text_featurizer.rst | 7 + .../api/paddlespeech.audio.text.utility.rst | 7 + ...addlespeech.audio.transform.add_deltas.rst | 7 + ...peech.audio.transform.channel_selector.rst | 7 + .../api/paddlespeech.audio.transform.cmvn.rst | 7 + ...addlespeech.audio.transform.functional.rst | 7 + .../paddlespeech.audio.transform.perturb.rst | 7 + .../api/paddlespeech.audio.transform.rst | 24 ++ ...dlespeech.audio.transform.spec_augment.rst | 7 + ...ddlespeech.audio.transform.spectrogram.rst | 7 + ...ch.audio.transform.transform_interface.rst | 7 + ...espeech.audio.transform.transformation.rst | 7 + .../api/paddlespeech.audio.transform.wpe.rst | 7 + .../paddlespeech.audio.utils.check_kwargs.rst | 7 + ...addlespeech.audio.utils.dynamic_import.rst | 7 + docs/source/api/paddlespeech.audio.utils.rst | 3 + .../paddlespeech.audio.utils.tensor_utils.rst | 7 + .../paddlespeech.kws.exps.mdtc.collate.rst | 7 + ...paddlespeech.kws.exps.mdtc.compute_det.rst | 7 + ...dlespeech.kws.exps.mdtc.plot_det_curve.rst | 7 + .../source/api/paddlespeech.kws.exps.mdtc.rst | 19 ++ .../api/paddlespeech.kws.exps.mdtc.score.rst | 7 + .../api/paddlespeech.kws.exps.mdtc.train.rst | 7 + docs/source/api/paddlespeech.kws.exps.rst | 15 + docs/source/api/paddlespeech.kws.rst | 1 + .../api/paddlespeech.resource.model_alias.rst | 7 + ...addlespeech.resource.pretrained_models.rst | 7 + .../api/paddlespeech.resource.resource.rst | 7 + docs/source/api/paddlespeech.resource.rst | 17 + docs/source/api/paddlespeech.rst | 2 + docs/source/api/paddlespeech.s2t.rst | 1 - docs/source/api/paddlespeech.server.utils.rst | 1 - docs/source/api/paddlespeech.t2s.datasets.rst | 1 + .../api/paddlespeech.t2s.datasets.sampler.rst | 7 + .../paddlespeech.t2s.exps.ernie_sat.align.rst | 7 + ...dlespeech.t2s.exps.ernie_sat.normalize.rst | 7 + ...lespeech.t2s.exps.ernie_sat.preprocess.rst | 7 + .../api/paddlespeech.t2s.exps.ernie_sat.rst | 21 ++ ...lespeech.t2s.exps.ernie_sat.synthesize.rst | 7 + ...eech.t2s.exps.ernie_sat.synthesize_e2e.rst | 7 + .../paddlespeech.t2s.exps.ernie_sat.train.rst | 7 + .../paddlespeech.t2s.exps.ernie_sat.utils.rst | 7 + .../api/paddlespeech.t2s.exps.fastspeech2.rst | 1 + ...espeech.t2s.exps.fastspeech2.vc2_infer.rst | 7 + docs/source/api/paddlespeech.t2s.exps.rst | 3 + .../paddlespeech.t2s.exps.stream_play_tts.rst | 7 + .../paddlespeech.t2s.exps.vits.normalize.rst | 7 + .../paddlespeech.t2s.exps.vits.preprocess.rst | 7 + .../source/api/paddlespeech.t2s.exps.vits.rst | 20 ++ .../paddlespeech.t2s.exps.vits.synthesize.rst | 7 + ...dlespeech.t2s.exps.vits.synthesize_e2e.rst | 7 + .../api/paddlespeech.t2s.exps.vits.train.rst | 7 + 
...ddlespeech.t2s.exps.vits.voice_cloning.rst | 7 + ...paddlespeech.t2s.frontend.g2pw.dataset.rst | 7 + ...addlespeech.t2s.frontend.g2pw.onnx_api.rst | 7 + .../api/paddlespeech.t2s.frontend.g2pw.rst | 17 + .../paddlespeech.t2s.frontend.g2pw.utils.rst | 7 + ...paddlespeech.t2s.frontend.mix_frontend.rst | 7 + docs/source/api/paddlespeech.t2s.frontend.rst | 2 + ...espeech.t2s.models.ernie_sat.ernie_sat.rst | 7 + ...t2s.models.ernie_sat.ernie_sat_updater.rst | 7 + .../api/paddlespeech.t2s.models.ernie_sat.rst | 3 +- ...h.t2s.models.vits.monotonic_align.core.rst | 7 + ...speech.t2s.models.vits.monotonic_align.rst | 16 + ....t2s.models.vits.monotonic_align.setup.rst | 7 + .../api/paddlespeech.utils.dynamic_import.rst | 7 + docs/source/api/paddlespeech.utils.env.rst | 7 + docs/source/api/paddlespeech.utils.rst | 16 + docs/source/index.rst | 2 + .../t2s/models/ernie_sat/ernie_sat.py | 108 ++++--- .../t2s/models/vits/duration_predictor.py | 39 ++- paddlespeech/t2s/models/vits/flow.py | 111 ++++--- paddlespeech/t2s/models/vits/generator.py | 301 +++++++++++------- .../t2s/models/vits/posterior_encoder.py | 54 ++-- .../t2s/models/vits/residual_coupling.py | 99 ++++-- paddlespeech/t2s/models/vits/text_encoder.py | 69 ++-- paddlespeech/t2s/models/vits/vits.py | 153 ++++++--- .../t2s/models/vits/wavenet/residual_block.py | 24 +- .../t2s/models/vits/wavenet/wavenet.py | 72 +++-- paddlespeech/t2s/models/wavernn/wavernn.py | 20 +- 97 files changed, 1348 insertions(+), 375 deletions(-) create mode 100644 docs/source/api/paddlespeech.audio.streamdata.autodecode.rst create mode 100644 docs/source/api/paddlespeech.audio.streamdata.cache.rst create mode 100644 docs/source/api/paddlespeech.audio.streamdata.compat.rst create mode 100644 docs/source/api/paddlespeech.audio.streamdata.extradatasets.rst create mode 100644 docs/source/api/paddlespeech.audio.streamdata.filters.rst create mode 100644 docs/source/api/paddlespeech.audio.streamdata.gopen.rst create mode 100644 docs/source/api/paddlespeech.audio.streamdata.handlers.rst create mode 100644 docs/source/api/paddlespeech.audio.streamdata.mix.rst create mode 100644 docs/source/api/paddlespeech.audio.streamdata.paddle_utils.rst create mode 100644 docs/source/api/paddlespeech.audio.streamdata.pipeline.rst create mode 100644 docs/source/api/paddlespeech.audio.streamdata.rst create mode 100644 docs/source/api/paddlespeech.audio.streamdata.shardlists.rst create mode 100644 docs/source/api/paddlespeech.audio.streamdata.tariterators.rst create mode 100644 docs/source/api/paddlespeech.audio.streamdata.utils.rst create mode 100644 docs/source/api/paddlespeech.audio.streamdata.writer.rst create mode 100644 docs/source/api/paddlespeech.audio.text.rst create mode 100644 docs/source/api/paddlespeech.audio.text.text_featurizer.rst create mode 100644 docs/source/api/paddlespeech.audio.text.utility.rst create mode 100644 docs/source/api/paddlespeech.audio.transform.add_deltas.rst create mode 100644 docs/source/api/paddlespeech.audio.transform.channel_selector.rst create mode 100644 docs/source/api/paddlespeech.audio.transform.cmvn.rst create mode 100644 docs/source/api/paddlespeech.audio.transform.functional.rst create mode 100644 docs/source/api/paddlespeech.audio.transform.perturb.rst create mode 100644 docs/source/api/paddlespeech.audio.transform.rst create mode 100644 docs/source/api/paddlespeech.audio.transform.spec_augment.rst create mode 100644 docs/source/api/paddlespeech.audio.transform.spectrogram.rst create mode 100644 
docs/source/api/paddlespeech.audio.transform.transform_interface.rst create mode 100644 docs/source/api/paddlespeech.audio.transform.transformation.rst create mode 100644 docs/source/api/paddlespeech.audio.transform.wpe.rst create mode 100644 docs/source/api/paddlespeech.audio.utils.check_kwargs.rst create mode 100644 docs/source/api/paddlespeech.audio.utils.dynamic_import.rst create mode 100644 docs/source/api/paddlespeech.audio.utils.tensor_utils.rst create mode 100644 docs/source/api/paddlespeech.kws.exps.mdtc.collate.rst create mode 100644 docs/source/api/paddlespeech.kws.exps.mdtc.compute_det.rst create mode 100644 docs/source/api/paddlespeech.kws.exps.mdtc.plot_det_curve.rst create mode 100644 docs/source/api/paddlespeech.kws.exps.mdtc.rst create mode 100644 docs/source/api/paddlespeech.kws.exps.mdtc.score.rst create mode 100644 docs/source/api/paddlespeech.kws.exps.mdtc.train.rst create mode 100644 docs/source/api/paddlespeech.kws.exps.rst create mode 100644 docs/source/api/paddlespeech.resource.model_alias.rst create mode 100644 docs/source/api/paddlespeech.resource.pretrained_models.rst create mode 100644 docs/source/api/paddlespeech.resource.resource.rst create mode 100644 docs/source/api/paddlespeech.resource.rst create mode 100644 docs/source/api/paddlespeech.t2s.datasets.sampler.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.ernie_sat.align.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.ernie_sat.normalize.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.ernie_sat.preprocess.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.ernie_sat.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize_e2e.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.ernie_sat.train.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.ernie_sat.utils.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.fastspeech2.vc2_infer.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.stream_play_tts.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.vits.normalize.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.vits.preprocess.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.vits.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.vits.synthesize.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.vits.synthesize_e2e.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.vits.train.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.vits.voice_cloning.rst create mode 100644 docs/source/api/paddlespeech.t2s.frontend.g2pw.dataset.rst create mode 100644 docs/source/api/paddlespeech.t2s.frontend.g2pw.onnx_api.rst create mode 100644 docs/source/api/paddlespeech.t2s.frontend.g2pw.rst create mode 100644 docs/source/api/paddlespeech.t2s.frontend.g2pw.utils.rst create mode 100644 docs/source/api/paddlespeech.t2s.frontend.mix_frontend.rst create mode 100644 docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat.rst create mode 100644 docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat_updater.rst create mode 100644 docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.core.rst create mode 100644 docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.rst create mode 100644 docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.setup.rst create mode 100644 docs/source/api/paddlespeech.utils.dynamic_import.rst create mode 100644 
docs/source/api/paddlespeech.utils.env.rst create mode 100644 docs/source/api/paddlespeech.utils.rst diff --git a/docs/source/api/paddlespeech.audio.rst b/docs/source/api/paddlespeech.audio.rst index 5a3867f96..4ed7e4672 100644 --- a/docs/source/api/paddlespeech.audio.rst +++ b/docs/source/api/paddlespeech.audio.rst @@ -20,4 +20,7 @@ Subpackages paddlespeech.audio.io paddlespeech.audio.metric paddlespeech.audio.sox_effects + paddlespeech.audio.streamdata + paddlespeech.audio.text + paddlespeech.audio.transform paddlespeech.audio.utils diff --git a/docs/source/api/paddlespeech.audio.streamdata.autodecode.rst b/docs/source/api/paddlespeech.audio.streamdata.autodecode.rst new file mode 100644 index 000000000..1e45c1373 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.autodecode.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.autodecode module +=============================================== + +.. automodule:: paddlespeech.audio.streamdata.autodecode + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.cache.rst b/docs/source/api/paddlespeech.audio.streamdata.cache.rst new file mode 100644 index 000000000..393055e54 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.cache.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.cache module +========================================== + +.. automodule:: paddlespeech.audio.streamdata.cache + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.compat.rst b/docs/source/api/paddlespeech.audio.streamdata.compat.rst new file mode 100644 index 000000000..760695b20 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.compat.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.compat module +=========================================== + +.. automodule:: paddlespeech.audio.streamdata.compat + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.extradatasets.rst b/docs/source/api/paddlespeech.audio.streamdata.extradatasets.rst new file mode 100644 index 000000000..74628e963 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.extradatasets.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.extradatasets module +================================================== + +.. automodule:: paddlespeech.audio.streamdata.extradatasets + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.filters.rst b/docs/source/api/paddlespeech.audio.streamdata.filters.rst new file mode 100644 index 000000000..d26104279 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.filters.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.filters module +============================================ + +.. automodule:: paddlespeech.audio.streamdata.filters + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.gopen.rst b/docs/source/api/paddlespeech.audio.streamdata.gopen.rst new file mode 100644 index 000000000..1cccb7763 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.gopen.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.gopen module +========================================== + +.. 
automodule:: paddlespeech.audio.streamdata.gopen + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.handlers.rst b/docs/source/api/paddlespeech.audio.streamdata.handlers.rst new file mode 100644 index 000000000..7a4b3ce8e --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.handlers.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.handlers module +============================================= + +.. automodule:: paddlespeech.audio.streamdata.handlers + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.mix.rst b/docs/source/api/paddlespeech.audio.streamdata.mix.rst new file mode 100644 index 000000000..908b35dd1 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.mix.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.mix module +======================================== + +.. automodule:: paddlespeech.audio.streamdata.mix + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.paddle_utils.rst b/docs/source/api/paddlespeech.audio.streamdata.paddle_utils.rst new file mode 100644 index 000000000..203343004 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.paddle_utils.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.paddle\_utils module +================================================== + +.. automodule:: paddlespeech.audio.streamdata.paddle_utils + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.pipeline.rst b/docs/source/api/paddlespeech.audio.streamdata.pipeline.rst new file mode 100644 index 000000000..ae05fbecc --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.pipeline.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.pipeline module +============================================= + +.. automodule:: paddlespeech.audio.streamdata.pipeline + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.rst b/docs/source/api/paddlespeech.audio.streamdata.rst new file mode 100644 index 000000000..a1f4560a3 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.rst @@ -0,0 +1,28 @@ +paddlespeech.audio.streamdata package +===================================== + +.. automodule:: paddlespeech.audio.streamdata + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.audio.streamdata.autodecode + paddlespeech.audio.streamdata.cache + paddlespeech.audio.streamdata.compat + paddlespeech.audio.streamdata.extradatasets + paddlespeech.audio.streamdata.filters + paddlespeech.audio.streamdata.gopen + paddlespeech.audio.streamdata.handlers + paddlespeech.audio.streamdata.mix + paddlespeech.audio.streamdata.paddle_utils + paddlespeech.audio.streamdata.pipeline + paddlespeech.audio.streamdata.shardlists + paddlespeech.audio.streamdata.tariterators + paddlespeech.audio.streamdata.utils + paddlespeech.audio.streamdata.writer diff --git a/docs/source/api/paddlespeech.audio.streamdata.shardlists.rst b/docs/source/api/paddlespeech.audio.streamdata.shardlists.rst new file mode 100644 index 000000000..ec1fe8236 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.shardlists.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.shardlists module +=============================================== + +.. 
automodule:: paddlespeech.audio.streamdata.shardlists + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.tariterators.rst b/docs/source/api/paddlespeech.audio.streamdata.tariterators.rst new file mode 100644 index 000000000..b003b2d42 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.tariterators.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.tariterators module +================================================= + +.. automodule:: paddlespeech.audio.streamdata.tariterators + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.utils.rst b/docs/source/api/paddlespeech.audio.streamdata.utils.rst new file mode 100644 index 000000000..f248b1131 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.utils.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.utils module +========================================== + +.. automodule:: paddlespeech.audio.streamdata.utils + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.writer.rst b/docs/source/api/paddlespeech.audio.streamdata.writer.rst new file mode 100644 index 000000000..7437241f3 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.writer.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.writer module +=========================================== + +.. automodule:: paddlespeech.audio.streamdata.writer + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.text.rst b/docs/source/api/paddlespeech.audio.text.rst new file mode 100644 index 000000000..a2018050a --- /dev/null +++ b/docs/source/api/paddlespeech.audio.text.rst @@ -0,0 +1,16 @@ +paddlespeech.audio.text package +=============================== + +.. automodule:: paddlespeech.audio.text + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.audio.text.text_featurizer + paddlespeech.audio.text.utility diff --git a/docs/source/api/paddlespeech.audio.text.text_featurizer.rst b/docs/source/api/paddlespeech.audio.text.text_featurizer.rst new file mode 100644 index 000000000..1a8262d08 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.text.text_featurizer.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.text.text\_featurizer module +=============================================== + +.. automodule:: paddlespeech.audio.text.text_featurizer + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.text.utility.rst b/docs/source/api/paddlespeech.audio.text.utility.rst new file mode 100644 index 000000000..90fcb25f6 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.text.utility.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.text.utility module +====================================== + +.. automodule:: paddlespeech.audio.text.utility + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.add_deltas.rst b/docs/source/api/paddlespeech.audio.transform.add_deltas.rst new file mode 100644 index 000000000..b4b596d6e --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.add_deltas.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.add\_deltas module +=============================================== + +.. 
automodule:: paddlespeech.audio.transform.add_deltas + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.channel_selector.rst b/docs/source/api/paddlespeech.audio.transform.channel_selector.rst new file mode 100644 index 000000000..4828b5904 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.channel_selector.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.channel\_selector module +===================================================== + +.. automodule:: paddlespeech.audio.transform.channel_selector + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.cmvn.rst b/docs/source/api/paddlespeech.audio.transform.cmvn.rst new file mode 100644 index 000000000..44655a1e4 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.cmvn.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.cmvn module +======================================== + +.. automodule:: paddlespeech.audio.transform.cmvn + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.functional.rst b/docs/source/api/paddlespeech.audio.transform.functional.rst new file mode 100644 index 000000000..7877d2495 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.functional.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.functional module +============================================== + +.. automodule:: paddlespeech.audio.transform.functional + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.perturb.rst b/docs/source/api/paddlespeech.audio.transform.perturb.rst new file mode 100644 index 000000000..e3615a5d1 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.perturb.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.perturb module +=========================================== + +.. automodule:: paddlespeech.audio.transform.perturb + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.rst b/docs/source/api/paddlespeech.audio.transform.rst new file mode 100644 index 000000000..47a7303b3 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.rst @@ -0,0 +1,24 @@ +paddlespeech.audio.transform package +==================================== + +.. automodule:: paddlespeech.audio.transform + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.audio.transform.add_deltas + paddlespeech.audio.transform.channel_selector + paddlespeech.audio.transform.cmvn + paddlespeech.audio.transform.functional + paddlespeech.audio.transform.perturb + paddlespeech.audio.transform.spec_augment + paddlespeech.audio.transform.spectrogram + paddlespeech.audio.transform.transform_interface + paddlespeech.audio.transform.transformation + paddlespeech.audio.transform.wpe diff --git a/docs/source/api/paddlespeech.audio.transform.spec_augment.rst b/docs/source/api/paddlespeech.audio.transform.spec_augment.rst new file mode 100644 index 000000000..f11a32241 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.spec_augment.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.spec\_augment module +================================================= + +.. 
automodule:: paddlespeech.audio.transform.spec_augment + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.spectrogram.rst b/docs/source/api/paddlespeech.audio.transform.spectrogram.rst new file mode 100644 index 000000000..6be0c32ee --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.spectrogram.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.spectrogram module +=============================================== + +.. automodule:: paddlespeech.audio.transform.spectrogram + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.transform_interface.rst b/docs/source/api/paddlespeech.audio.transform.transform_interface.rst new file mode 100644 index 000000000..ec8b20857 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.transform_interface.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.transform\_interface module +======================================================== + +.. automodule:: paddlespeech.audio.transform.transform_interface + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.transformation.rst b/docs/source/api/paddlespeech.audio.transform.transformation.rst new file mode 100644 index 000000000..94629b9af --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.transformation.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.transformation module +================================================== + +.. automodule:: paddlespeech.audio.transform.transformation + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.wpe.rst b/docs/source/api/paddlespeech.audio.transform.wpe.rst new file mode 100644 index 000000000..85c758114 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.wpe.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.wpe module +======================================= + +.. automodule:: paddlespeech.audio.transform.wpe + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.utils.check_kwargs.rst b/docs/source/api/paddlespeech.audio.utils.check_kwargs.rst new file mode 100644 index 000000000..a18f27e65 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.utils.check_kwargs.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.utils.check\_kwargs module +============================================= + +.. automodule:: paddlespeech.audio.utils.check_kwargs + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.utils.dynamic_import.rst b/docs/source/api/paddlespeech.audio.utils.dynamic_import.rst new file mode 100644 index 000000000..5d060ee15 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.utils.dynamic_import.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.utils.dynamic\_import module +=============================================== + +.. automodule:: paddlespeech.audio.utils.dynamic_import + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.utils.rst b/docs/source/api/paddlespeech.audio.utils.rst index db15927da..217afa8fb 100644 --- a/docs/source/api/paddlespeech.audio.utils.rst +++ b/docs/source/api/paddlespeech.audio.utils.rst @@ -12,8 +12,11 @@ Submodules .. 
toctree:: :maxdepth: 4 + paddlespeech.audio.utils.check_kwargs paddlespeech.audio.utils.download + paddlespeech.audio.utils.dynamic_import paddlespeech.audio.utils.error paddlespeech.audio.utils.log paddlespeech.audio.utils.numeric + paddlespeech.audio.utils.tensor_utils paddlespeech.audio.utils.time diff --git a/docs/source/api/paddlespeech.audio.utils.tensor_utils.rst b/docs/source/api/paddlespeech.audio.utils.tensor_utils.rst new file mode 100644 index 000000000..93a1f70eb --- /dev/null +++ b/docs/source/api/paddlespeech.audio.utils.tensor_utils.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.utils.tensor\_utils module +============================================= + +.. automodule:: paddlespeech.audio.utils.tensor_utils + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.kws.exps.mdtc.collate.rst b/docs/source/api/paddlespeech.kws.exps.mdtc.collate.rst new file mode 100644 index 000000000..b533e8c42 --- /dev/null +++ b/docs/source/api/paddlespeech.kws.exps.mdtc.collate.rst @@ -0,0 +1,7 @@ +paddlespeech.kws.exps.mdtc.collate module +========================================= + +.. automodule:: paddlespeech.kws.exps.mdtc.collate + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.kws.exps.mdtc.compute_det.rst b/docs/source/api/paddlespeech.kws.exps.mdtc.compute_det.rst new file mode 100644 index 000000000..45e094555 --- /dev/null +++ b/docs/source/api/paddlespeech.kws.exps.mdtc.compute_det.rst @@ -0,0 +1,7 @@ +paddlespeech.kws.exps.mdtc.compute\_det module +============================================== + +.. automodule:: paddlespeech.kws.exps.mdtc.compute_det + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.kws.exps.mdtc.plot_det_curve.rst b/docs/source/api/paddlespeech.kws.exps.mdtc.plot_det_curve.rst new file mode 100644 index 000000000..46a149b0b --- /dev/null +++ b/docs/source/api/paddlespeech.kws.exps.mdtc.plot_det_curve.rst @@ -0,0 +1,7 @@ +paddlespeech.kws.exps.mdtc.plot\_det\_curve module +================================================== + +.. automodule:: paddlespeech.kws.exps.mdtc.plot_det_curve + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.kws.exps.mdtc.rst b/docs/source/api/paddlespeech.kws.exps.mdtc.rst new file mode 100644 index 000000000..f6cad64e3 --- /dev/null +++ b/docs/source/api/paddlespeech.kws.exps.mdtc.rst @@ -0,0 +1,19 @@ +paddlespeech.kws.exps.mdtc package +================================== + +.. automodule:: paddlespeech.kws.exps.mdtc + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.kws.exps.mdtc.collate + paddlespeech.kws.exps.mdtc.compute_det + paddlespeech.kws.exps.mdtc.plot_det_curve + paddlespeech.kws.exps.mdtc.score + paddlespeech.kws.exps.mdtc.train diff --git a/docs/source/api/paddlespeech.kws.exps.mdtc.score.rst b/docs/source/api/paddlespeech.kws.exps.mdtc.score.rst new file mode 100644 index 000000000..aa956b4cb --- /dev/null +++ b/docs/source/api/paddlespeech.kws.exps.mdtc.score.rst @@ -0,0 +1,7 @@ +paddlespeech.kws.exps.mdtc.score module +======================================= + +.. 
automodule:: paddlespeech.kws.exps.mdtc.score + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.kws.exps.mdtc.train.rst b/docs/source/api/paddlespeech.kws.exps.mdtc.train.rst new file mode 100644 index 000000000..5e4ca401a --- /dev/null +++ b/docs/source/api/paddlespeech.kws.exps.mdtc.train.rst @@ -0,0 +1,7 @@ +paddlespeech.kws.exps.mdtc.train module +======================================= + +.. automodule:: paddlespeech.kws.exps.mdtc.train + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.kws.exps.rst b/docs/source/api/paddlespeech.kws.exps.rst new file mode 100644 index 000000000..bf10d2c9f --- /dev/null +++ b/docs/source/api/paddlespeech.kws.exps.rst @@ -0,0 +1,15 @@ +paddlespeech.kws.exps package +============================= + +.. automodule:: paddlespeech.kws.exps + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.kws.exps.mdtc diff --git a/docs/source/api/paddlespeech.kws.rst b/docs/source/api/paddlespeech.kws.rst index c2829a42e..d21d094c7 100644 --- a/docs/source/api/paddlespeech.kws.rst +++ b/docs/source/api/paddlespeech.kws.rst @@ -12,4 +12,5 @@ Subpackages .. toctree:: :maxdepth: 4 + paddlespeech.kws.exps paddlespeech.kws.models diff --git a/docs/source/api/paddlespeech.resource.model_alias.rst b/docs/source/api/paddlespeech.resource.model_alias.rst new file mode 100644 index 000000000..b78e643ac --- /dev/null +++ b/docs/source/api/paddlespeech.resource.model_alias.rst @@ -0,0 +1,7 @@ +paddlespeech.resource.model\_alias module +========================================= + +.. automodule:: paddlespeech.resource.model_alias + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.resource.pretrained_models.rst b/docs/source/api/paddlespeech.resource.pretrained_models.rst new file mode 100644 index 000000000..a02061693 --- /dev/null +++ b/docs/source/api/paddlespeech.resource.pretrained_models.rst @@ -0,0 +1,7 @@ +paddlespeech.resource.pretrained\_models module +=============================================== + +.. automodule:: paddlespeech.resource.pretrained_models + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.resource.resource.rst b/docs/source/api/paddlespeech.resource.resource.rst new file mode 100644 index 000000000..8b51eda3c --- /dev/null +++ b/docs/source/api/paddlespeech.resource.resource.rst @@ -0,0 +1,7 @@ +paddlespeech.resource.resource module +===================================== + +.. automodule:: paddlespeech.resource.resource + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.resource.rst b/docs/source/api/paddlespeech.resource.rst new file mode 100644 index 000000000..61fdd5317 --- /dev/null +++ b/docs/source/api/paddlespeech.resource.rst @@ -0,0 +1,17 @@ +paddlespeech.resource package +============================= + +.. automodule:: paddlespeech.resource + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. 
toctree:: + :maxdepth: 4 + + paddlespeech.resource.model_alias + paddlespeech.resource.pretrained_models + paddlespeech.resource.resource diff --git a/docs/source/api/paddlespeech.rst b/docs/source/api/paddlespeech.rst index e7a01bf76..d06cd2c77 100644 --- a/docs/source/api/paddlespeech.rst +++ b/docs/source/api/paddlespeech.rst @@ -16,8 +16,10 @@ Subpackages paddlespeech.cli paddlespeech.cls paddlespeech.kws + paddlespeech.resource paddlespeech.s2t paddlespeech.server paddlespeech.t2s paddlespeech.text + paddlespeech.utils paddlespeech.vector diff --git a/docs/source/api/paddlespeech.s2t.rst b/docs/source/api/paddlespeech.s2t.rst index 4be22cb87..be9ef52f5 100644 --- a/docs/source/api/paddlespeech.s2t.rst +++ b/docs/source/api/paddlespeech.s2t.rst @@ -19,5 +19,4 @@ Subpackages paddlespeech.s2t.models paddlespeech.s2t.modules paddlespeech.s2t.training - paddlespeech.s2t.transform paddlespeech.s2t.utils diff --git a/docs/source/api/paddlespeech.server.utils.rst b/docs/source/api/paddlespeech.server.utils.rst index 9d1166392..b4051aee3 100644 --- a/docs/source/api/paddlespeech.server.utils.rst +++ b/docs/source/api/paddlespeech.server.utils.rst @@ -18,7 +18,6 @@ Submodules paddlespeech.server.utils.config paddlespeech.server.utils.errors paddlespeech.server.utils.exception - paddlespeech.server.utils.log paddlespeech.server.utils.onnx_infer paddlespeech.server.utils.paddle_predictor paddlespeech.server.utils.util diff --git a/docs/source/api/paddlespeech.t2s.datasets.rst b/docs/source/api/paddlespeech.t2s.datasets.rst index b40eb2bf1..dfbdb0b47 100644 --- a/docs/source/api/paddlespeech.t2s.datasets.rst +++ b/docs/source/api/paddlespeech.t2s.datasets.rst @@ -19,4 +19,5 @@ Submodules paddlespeech.t2s.datasets.get_feats paddlespeech.t2s.datasets.ljspeech paddlespeech.t2s.datasets.preprocess_utils + paddlespeech.t2s.datasets.sampler paddlespeech.t2s.datasets.vocoder_batch_fn diff --git a/docs/source/api/paddlespeech.t2s.datasets.sampler.rst b/docs/source/api/paddlespeech.t2s.datasets.sampler.rst new file mode 100644 index 000000000..ed29c28d7 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.datasets.sampler.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.datasets.sampler module +======================================== + +.. automodule:: paddlespeech.t2s.datasets.sampler + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.align.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.align.rst new file mode 100644 index 000000000..a5e07aace --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.align.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.ernie\_sat.align module +============================================= + +.. automodule:: paddlespeech.t2s.exps.ernie_sat.align + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.normalize.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.normalize.rst new file mode 100644 index 000000000..3771311cb --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.normalize.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.ernie\_sat.normalize module +================================================= + +.. 
automodule:: paddlespeech.t2s.exps.ernie_sat.normalize + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.preprocess.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.preprocess.rst new file mode 100644 index 000000000..8d4c24ffe --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.preprocess.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.ernie\_sat.preprocess module +================================================== + +.. automodule:: paddlespeech.t2s.exps.ernie_sat.preprocess + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.rst new file mode 100644 index 000000000..a61158420 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.rst @@ -0,0 +1,21 @@ +paddlespeech.t2s.exps.ernie\_sat package +======================================== + +.. automodule:: paddlespeech.t2s.exps.ernie_sat + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.t2s.exps.ernie_sat.align + paddlespeech.t2s.exps.ernie_sat.normalize + paddlespeech.t2s.exps.ernie_sat.preprocess + paddlespeech.t2s.exps.ernie_sat.synthesize + paddlespeech.t2s.exps.ernie_sat.synthesize_e2e + paddlespeech.t2s.exps.ernie_sat.train + paddlespeech.t2s.exps.ernie_sat.utils diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize.rst new file mode 100644 index 000000000..ecda2a513 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.ernie\_sat.synthesize module +================================================== + +.. automodule:: paddlespeech.t2s.exps.ernie_sat.synthesize + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize_e2e.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize_e2e.rst new file mode 100644 index 000000000..00fc44952 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize_e2e.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.ernie\_sat.synthesize\_e2e module +======================================================= + +.. automodule:: paddlespeech.t2s.exps.ernie_sat.synthesize_e2e + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.train.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.train.rst new file mode 100644 index 000000000..ba9a33344 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.train.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.ernie\_sat.train module +============================================= + +.. automodule:: paddlespeech.t2s.exps.ernie_sat.train + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.utils.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.utils.rst new file mode 100644 index 000000000..a2dd26c38 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.utils.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.ernie\_sat.utils module +============================================= + +.. 
automodule:: paddlespeech.t2s.exps.ernie_sat.utils + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.fastspeech2.rst b/docs/source/api/paddlespeech.t2s.exps.fastspeech2.rst index 3c98aa882..fad1fd87f 100644 --- a/docs/source/api/paddlespeech.t2s.exps.fastspeech2.rst +++ b/docs/source/api/paddlespeech.t2s.exps.fastspeech2.rst @@ -16,3 +16,4 @@ Submodules paddlespeech.t2s.exps.fastspeech2.normalize paddlespeech.t2s.exps.fastspeech2.preprocess paddlespeech.t2s.exps.fastspeech2.train + paddlespeech.t2s.exps.fastspeech2.vc2_infer diff --git a/docs/source/api/paddlespeech.t2s.exps.fastspeech2.vc2_infer.rst b/docs/source/api/paddlespeech.t2s.exps.fastspeech2.vc2_infer.rst new file mode 100644 index 000000000..70a9d6e15 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.fastspeech2.vc2_infer.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.fastspeech2.vc2\_infer module +=================================================== + +.. automodule:: paddlespeech.t2s.exps.fastspeech2.vc2_infer + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.rst b/docs/source/api/paddlespeech.t2s.exps.rst index a688435eb..bee18a972 100644 --- a/docs/source/api/paddlespeech.t2s.exps.rst +++ b/docs/source/api/paddlespeech.t2s.exps.rst @@ -12,11 +12,13 @@ Subpackages .. toctree:: :maxdepth: 4 + paddlespeech.t2s.exps.ernie_sat paddlespeech.t2s.exps.fastspeech2 paddlespeech.t2s.exps.gan_vocoder paddlespeech.t2s.exps.speedyspeech paddlespeech.t2s.exps.tacotron2 paddlespeech.t2s.exps.transformer_tts + paddlespeech.t2s.exps.vits paddlespeech.t2s.exps.waveflow paddlespeech.t2s.exps.wavernn @@ -31,6 +33,7 @@ Submodules paddlespeech.t2s.exps.ort_predict paddlespeech.t2s.exps.ort_predict_e2e paddlespeech.t2s.exps.ort_predict_streaming + paddlespeech.t2s.exps.stream_play_tts paddlespeech.t2s.exps.syn_utils paddlespeech.t2s.exps.synthesize paddlespeech.t2s.exps.synthesize_e2e diff --git a/docs/source/api/paddlespeech.t2s.exps.stream_play_tts.rst b/docs/source/api/paddlespeech.t2s.exps.stream_play_tts.rst new file mode 100644 index 000000000..cb22dde0c --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.stream_play_tts.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.stream\_play\_tts module +============================================== + +.. automodule:: paddlespeech.t2s.exps.stream_play_tts + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.vits.normalize.rst b/docs/source/api/paddlespeech.t2s.exps.vits.normalize.rst new file mode 100644 index 000000000..c5606f998 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.vits.normalize.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.vits.normalize module +=========================================== + +.. automodule:: paddlespeech.t2s.exps.vits.normalize + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.vits.preprocess.rst b/docs/source/api/paddlespeech.t2s.exps.vits.preprocess.rst new file mode 100644 index 000000000..50633c621 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.vits.preprocess.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.vits.preprocess module +============================================ + +.. 
automodule:: paddlespeech.t2s.exps.vits.preprocess + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.vits.rst b/docs/source/api/paddlespeech.t2s.exps.vits.rst new file mode 100644 index 000000000..51a9418d5 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.vits.rst @@ -0,0 +1,20 @@ +paddlespeech.t2s.exps.vits package +================================== + +.. automodule:: paddlespeech.t2s.exps.vits + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.t2s.exps.vits.normalize + paddlespeech.t2s.exps.vits.preprocess + paddlespeech.t2s.exps.vits.synthesize + paddlespeech.t2s.exps.vits.synthesize_e2e + paddlespeech.t2s.exps.vits.train + paddlespeech.t2s.exps.vits.voice_cloning diff --git a/docs/source/api/paddlespeech.t2s.exps.vits.synthesize.rst b/docs/source/api/paddlespeech.t2s.exps.vits.synthesize.rst new file mode 100644 index 000000000..4b22d069a --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.vits.synthesize.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.vits.synthesize module +============================================ + +.. automodule:: paddlespeech.t2s.exps.vits.synthesize + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.vits.synthesize_e2e.rst b/docs/source/api/paddlespeech.t2s.exps.vits.synthesize_e2e.rst new file mode 100644 index 000000000..053ddfc83 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.vits.synthesize_e2e.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.vits.synthesize\_e2e module +================================================= + +.. automodule:: paddlespeech.t2s.exps.vits.synthesize_e2e + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.vits.train.rst b/docs/source/api/paddlespeech.t2s.exps.vits.train.rst new file mode 100644 index 000000000..31bd3a48f --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.vits.train.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.vits.train module +======================================= + +.. automodule:: paddlespeech.t2s.exps.vits.train + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.vits.voice_cloning.rst b/docs/source/api/paddlespeech.t2s.exps.vits.voice_cloning.rst new file mode 100644 index 000000000..d9be0f310 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.vits.voice_cloning.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.vits.voice\_cloning module +================================================ + +.. automodule:: paddlespeech.t2s.exps.vits.voice_cloning + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.frontend.g2pw.dataset.rst b/docs/source/api/paddlespeech.t2s.frontend.g2pw.dataset.rst new file mode 100644 index 000000000..1635ec284 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.frontend.g2pw.dataset.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.frontend.g2pw.dataset module +============================================= + +.. 
automodule:: paddlespeech.t2s.frontend.g2pw.dataset + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.frontend.g2pw.onnx_api.rst b/docs/source/api/paddlespeech.t2s.frontend.g2pw.onnx_api.rst new file mode 100644 index 000000000..b7d549070 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.frontend.g2pw.onnx_api.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.frontend.g2pw.onnx\_api module +=============================================== + +.. automodule:: paddlespeech.t2s.frontend.g2pw.onnx_api + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.frontend.g2pw.rst b/docs/source/api/paddlespeech.t2s.frontend.g2pw.rst new file mode 100644 index 000000000..10a118b76 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.frontend.g2pw.rst @@ -0,0 +1,17 @@ +paddlespeech.t2s.frontend.g2pw package +====================================== + +.. automodule:: paddlespeech.t2s.frontend.g2pw + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.t2s.frontend.g2pw.dataset + paddlespeech.t2s.frontend.g2pw.onnx_api + paddlespeech.t2s.frontend.g2pw.utils diff --git a/docs/source/api/paddlespeech.t2s.frontend.g2pw.utils.rst b/docs/source/api/paddlespeech.t2s.frontend.g2pw.utils.rst new file mode 100644 index 000000000..ce9428037 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.frontend.g2pw.utils.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.frontend.g2pw.utils module +=========================================== + +.. automodule:: paddlespeech.t2s.frontend.g2pw.utils + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.frontend.mix_frontend.rst b/docs/source/api/paddlespeech.t2s.frontend.mix_frontend.rst new file mode 100644 index 000000000..4505dddba --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.frontend.mix_frontend.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.frontend.mix\_frontend module +============================================== + +.. automodule:: paddlespeech.t2s.frontend.mix_frontend + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.frontend.rst b/docs/source/api/paddlespeech.t2s.frontend.rst index 8fbf1e6eb..b61068616 100644 --- a/docs/source/api/paddlespeech.t2s.frontend.rst +++ b/docs/source/api/paddlespeech.t2s.frontend.rst @@ -12,6 +12,7 @@ Subpackages .. toctree:: :maxdepth: 4 + paddlespeech.t2s.frontend.g2pw paddlespeech.t2s.frontend.normalizer paddlespeech.t2s.frontend.zh_normalization @@ -23,6 +24,7 @@ Submodules paddlespeech.t2s.frontend.arpabet paddlespeech.t2s.frontend.generate_lexicon + paddlespeech.t2s.frontend.mix_frontend paddlespeech.t2s.frontend.phonectic paddlespeech.t2s.frontend.punctuation paddlespeech.t2s.frontend.tone_sandhi diff --git a/docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat.rst b/docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat.rst new file mode 100644 index 000000000..fce5a83cc --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.models.ernie\_sat.ernie\_sat module +==================================================== + +.. 
automodule:: paddlespeech.t2s.models.ernie_sat.ernie_sat + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat_updater.rst b/docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat_updater.rst new file mode 100644 index 000000000..8a697d6cf --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat_updater.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.models.ernie\_sat.ernie\_sat\_updater module +============================================================= + +.. automodule:: paddlespeech.t2s.models.ernie_sat.ernie_sat_updater + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.models.ernie_sat.rst b/docs/source/api/paddlespeech.t2s.models.ernie_sat.rst index 680a85dea..aff7489c7 100644 --- a/docs/source/api/paddlespeech.t2s.models.ernie_sat.rst +++ b/docs/source/api/paddlespeech.t2s.models.ernie_sat.rst @@ -12,4 +12,5 @@ Submodules .. toctree:: :maxdepth: 4 - paddlespeech.t2s.models.ernie_sat.mlm + paddlespeech.t2s.models.ernie_sat.ernie_sat + paddlespeech.t2s.models.ernie_sat.ernie_sat_updater diff --git a/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.core.rst b/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.core.rst new file mode 100644 index 000000000..7aaba7952 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.core.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.models.vits.monotonic\_align.core module +========================================================= + +.. automodule:: paddlespeech.t2s.models.vits.monotonic_align.core + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.rst b/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.rst new file mode 100644 index 000000000..25c819a7e --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.rst @@ -0,0 +1,16 @@ +paddlespeech.t2s.models.vits.monotonic\_align package +===================================================== + +.. automodule:: paddlespeech.t2s.models.vits.monotonic_align + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.t2s.models.vits.monotonic_align.core + paddlespeech.t2s.models.vits.monotonic_align.setup diff --git a/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.setup.rst b/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.setup.rst new file mode 100644 index 000000000..a93c3b8bf --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.setup.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.models.vits.monotonic\_align.setup module +========================================================== + +.. automodule:: paddlespeech.t2s.models.vits.monotonic_align.setup + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.utils.dynamic_import.rst b/docs/source/api/paddlespeech.utils.dynamic_import.rst new file mode 100644 index 000000000..daa4e6e78 --- /dev/null +++ b/docs/source/api/paddlespeech.utils.dynamic_import.rst @@ -0,0 +1,7 @@ +paddlespeech.utils.dynamic\_import module +========================================= + +.. 
automodule:: paddlespeech.utils.dynamic_import + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.utils.env.rst b/docs/source/api/paddlespeech.utils.env.rst new file mode 100644 index 000000000..e51278f82 --- /dev/null +++ b/docs/source/api/paddlespeech.utils.env.rst @@ -0,0 +1,7 @@ +paddlespeech.utils.env module +============================= + +.. automodule:: paddlespeech.utils.env + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.utils.rst b/docs/source/api/paddlespeech.utils.rst new file mode 100644 index 000000000..3d47626bb --- /dev/null +++ b/docs/source/api/paddlespeech.utils.rst @@ -0,0 +1,16 @@ +paddlespeech.utils package +========================== + +.. automodule:: paddlespeech.utils + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.utils.dynamic_import + paddlespeech.utils.env diff --git a/docs/source/index.rst b/docs/source/index.rst index 83474c528..8540d3fc6 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -74,8 +74,10 @@ Contents paddlespeech.cli paddlespeech.cls paddlespeech.kws + paddlespeech.resource paddlespeech.s2t paddlespeech.server paddlespeech.t2s paddlespeech.text + paddlespeech.utils paddlespeech.vector diff --git a/paddlespeech/t2s/models/ernie_sat/ernie_sat.py b/paddlespeech/t2s/models/ernie_sat/ernie_sat.py index 08c43dc5f..eb42b33ed 100644 --- a/paddlespeech/t2s/models/ernie_sat/ernie_sat.py +++ b/paddlespeech/t2s/models/ernie_sat/ernie_sat.py @@ -71,31 +71,53 @@ class MLMEncoder(nn.Layer): """Conformer encoder module. Args: - idim (int): Input dimension. - attention_dim (int): Dimension of attention. - attention_heads (int): The number of heads of multi head attention. - linear_units (int): The number of units of position-wise feed forward. - num_blocks (int): The number of decoder blocks. - dropout_rate (float): Dropout rate. - positional_dropout_rate (float): Dropout rate after adding positional encoding. - attention_dropout_rate (float): Dropout rate in attention. - input_layer (Union[str, paddle.nn.Layer]): Input layer type. - normalize_before (bool): Whether to use layer_norm before the first block. - concat_after (bool): Whether to concat attention layer's input and output. + idim (int): + Input dimension. + attention_dim (int): + Dimension of attention. + attention_heads (int): + The number of heads of multi head attention. + linear_units (int): + The number of units of position-wise feed forward. + num_blocks (int): + The number of decoder blocks. + dropout_rate (float): + Dropout rate. + positional_dropout_rate (float): + Dropout rate after adding positional encoding. + attention_dropout_rate (float): + Dropout rate in attention. + input_layer (Union[str, paddle.nn.Layer]): + Input layer type. + normalize_before (bool): + Whether to use layer_norm before the first block. + concat_after (bool): + Whether to concat attention layer's input and output. if True, additional linear will be applied. i.e. x -> x + linear(concat(x, att(x))) if False, no additional linear will be applied. i.e. x -> x + att(x) - positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear". - positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer. - macaron_style (bool): Whether to use macaron style for positionwise layer. - pos_enc_layer_type (str): Encoder positional encoding layer type. - selfattention_layer_type (str): Encoder attention layer type. 
- activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - zero_triu (bool): Whether to zero the upper triangular part of attention matrix. - cnn_module_kernel (int): Kernerl size of convolution module. - padding_idx (int): Padding idx for input_layer=embed. - stochastic_depth_rate (float): Maximum probability to skip the encoder layer. + positionwise_layer_type (str): + "linear", "conv1d", or "conv1d-linear". + positionwise_conv_kernel_size (int): + Kernel size of positionwise conv1d layer. + macaron_style (bool): + Whether to use macaron style for positionwise layer. + pos_enc_layer_type (str): + Encoder positional encoding layer type. + selfattention_layer_type (str): + Encoder attention layer type. + activation_type (str): + Encoder activation function type. + use_cnn_module (bool): + Whether to use convolution module. + zero_triu (bool): + Whether to zero the upper triangular part of attention matrix. + cnn_module_kernel (int): + Kernerl size of convolution module. + padding_idx (int): + Padding idx for input_layer=embed. + stochastic_depth_rate (float): + Maximum probability to skip the encoder layer. """ @@ -320,12 +342,16 @@ class MLMDecoder(MLMEncoder): """Encode input sequence. Args: - xs (paddle.Tensor): Input tensor (#batch, time, idim). - masks (paddle.Tensor): Mask tensor (#batch, time). + xs (paddle.Tensor): + Input tensor (#batch, time, idim). + masks (paddle.Tensor): + Mask tensor (#batch, time). Returns: - paddle.Tensor: Output tensor (#batch, time, attention_dim). - paddle.Tensor: Mask tensor (#batch, time). + paddle.Tensor: + Output tensor (#batch, time, attention_dim). + paddle.Tensor: + Mask tensor (#batch, time). """ xs = self.embed(xs) @@ -392,19 +418,27 @@ class MLM(nn.Layer): use_teacher_forcing: bool=True, ) -> List[paddle.Tensor]: ''' Args: - speech (paddle.Tensor): input speech (1, Tmax, D). - text (paddle.Tensor): input text (1, Tmax2). - masked_pos (paddle.Tensor): masked position of input speech (1, Tmax) - speech_mask (paddle.Tensor): mask of speech (1, 1, Tmax). - text_mask (paddle.Tensor): mask of text (1, 1, Tmax2). - speech_seg_pos (paddle.Tensor): n-th phone of each mel, 0<=n<=Tmax2 (1, Tmax). - text_seg_pos (paddle.Tensor): n-th phone of each phone, 0<=n<=Tmax2 (1, Tmax2). - span_bdy (List[int]): masked mel boundary of input speech (2,) - use_teacher_forcing (bool): whether to use teacher forcing + speech (paddle.Tensor): + input speech (1, Tmax, D). + text (paddle.Tensor): + input text (1, Tmax2). + masked_pos (paddle.Tensor): + masked position of input speech (1, Tmax) + speech_mask (paddle.Tensor): + mask of speech (1, 1, Tmax). + text_mask (paddle.Tensor): + mask of text (1, 1, Tmax2). + speech_seg_pos (paddle.Tensor): + n-th phone of each mel, 0<=n<=Tmax2 (1, Tmax). + text_seg_pos (paddle.Tensor): + n-th phone of each phone, 0<=n<=Tmax2 (1, Tmax2). 
+ span_bdy (List[int]): + masked mel boundary of input speech (2,) + use_teacher_forcing (bool): + whether to use teacher forcing Returns: List[Tensor]: - eg: - [Tensor(shape=[1, 181, 80]), Tensor(shape=[80, 80]), Tensor(shape=[1, 67, 80])] + eg: [Tensor(shape=[1, 181, 80]), Tensor(shape=[80, 80]), Tensor(shape=[1, 67, 80])] ''' z_cache = None diff --git a/paddlespeech/t2s/models/vits/duration_predictor.py b/paddlespeech/t2s/models/vits/duration_predictor.py index 6197d5696..b0bb68d0f 100644 --- a/paddlespeech/t2s/models/vits/duration_predictor.py +++ b/paddlespeech/t2s/models/vits/duration_predictor.py @@ -48,12 +48,18 @@ class StochasticDurationPredictor(nn.Layer): global_channels: int=-1, ): """Initialize StochasticDurationPredictor module. Args: - channels (int): Number of channels. - kernel_size (int): Kernel size. - dropout_rate (float): Dropout rate. - flows (int): Number of flows. - dds_conv_layers (int): Number of conv layers in DDS conv. - global_channels (int): Number of global conditioning channels. + channels (int): + Number of channels. + kernel_size (int): + Kernel size. + dropout_rate (float): + Dropout rate. + flows (int): + Number of flows. + dds_conv_layers (int): + Number of conv layers in DDS conv. + global_channels (int): + Number of global conditioning channels. """ super().__init__() @@ -108,14 +114,21 @@ class StochasticDurationPredictor(nn.Layer): noise_scale: float=1.0, ) -> paddle.Tensor: """Calculate forward propagation. Args: - x (Tensor): Input tensor (B, channels, T_text). - x_mask (Tensor): Mask tensor (B, 1, T_text). - w (Optional[Tensor]): Duration tensor (B, 1, T_text). - g (Optional[Tensor]): Global conditioning tensor (B, channels, 1) - inverse (bool): Whether to inverse the flow. - noise_scale (float): Noise scale value. + x (Tensor): + Input tensor (B, channels, T_text). + x_mask (Tensor): + Mask tensor (B, 1, T_text). + w (Optional[Tensor]): + Duration tensor (B, 1, T_text). + g (Optional[Tensor]): + Global conditioning tensor (B, channels, 1) + inverse (bool): + Whether to inverse the flow. + noise_scale (float): + Noise scale value. Returns: - Tensor: If not inverse, negative log-likelihood (NLL) tensor (B,). + Tensor: + If not inverse, negative log-likelihood (NLL) tensor (B,). If inverse, log-duration tensor (B, 1, T_text). """ # stop gradient diff --git a/paddlespeech/t2s/models/vits/flow.py b/paddlespeech/t2s/models/vits/flow.py index 3c8f89356..7593eb727 100644 --- a/paddlespeech/t2s/models/vits/flow.py +++ b/paddlespeech/t2s/models/vits/flow.py @@ -34,11 +34,15 @@ class FlipFlow(nn.Layer): ) -> Union[paddle.Tensor, Tuple[paddle.Tensor, paddle.Tensor]]: """Calculate forward propagation. Args: - x (Tensor): Input tensor (B, channels, T). - inverse (bool): Whether to inverse the flow. + x (Tensor): + Input tensor (B, channels, T). + inverse (bool): + Whether to inverse the flow. Returns: - Tensor: Flipped tensor (B, channels, T). - Tensor: Log-determinant tensor for NLL (B,) if not inverse. + Tensor: + Flipped tensor (B, channels, T). + Tensor: + Log-determinant tensor for NLL (B,) if not inverse. """ x = paddle.flip(x, [1]) if not inverse: @@ -60,13 +64,19 @@ class LogFlow(nn.Layer): ) -> Union[paddle.Tensor, Tuple[paddle.Tensor, paddle.Tensor]]: """Calculate forward propagation. Args: - x (Tensor): Input tensor (B, channels, T). - x_mask (Tensor): Mask tensor (B, 1, T). - inverse (bool): Whether to inverse the flow. - eps (float): Epsilon for log. + x (Tensor): + Input tensor (B, channels, T). + x_mask (Tensor): + Mask tensor (B, 1, T). 
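The training/inference split documented in the StochasticDurationPredictor hunk above is easy to misread: the same forward method either scores ground-truth durations or samples log-durations, depending on `inverse`. As a rough illustration of that contract (a minimal sketch; the sizes and the channels=192 choice are assumptions for illustration, not values fixed by this patch):

    import paddle
    from paddlespeech.t2s.models.vits.duration_predictor import StochasticDurationPredictor

    # assumed, VITS-typical sizes
    sdp = StochasticDurationPredictor(
        channels=192, kernel_size=3, dropout_rate=0.5, flows=4, dds_conv_layers=3)
    x = paddle.randn([2, 192, 50])             # (B, channels, T_text)
    x_mask = paddle.ones([2, 1, 50])           # (B, 1, T_text)
    w = paddle.abs(paddle.randn([2, 1, 50]))   # ground-truth durations, training mode only
    nll = sdp(x, x_mask, w=w)                  # inverse=False -> NLL tensor of shape (B,)
    log_dur = sdp(x, x_mask, inverse=True, noise_scale=0.8)  # -> log-duration (B, 1, T_text)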
+ inverse (bool): + Whether to inverse the flow. + eps (float): + Epsilon for log. Returns: - Tensor: Output tensor (B, channels, T). - Tensor: Log-determinant tensor for NLL (B,) if not inverse. + Tensor: + Output tensor (B, channels, T). + Tensor: + Log-determinant tensor for NLL (B,) if not inverse. """ if not inverse: y = paddle.log(paddle.clip(x, min=eps)) * x_mask @@ -83,7 +93,8 @@ class ElementwiseAffineFlow(nn.Layer): def __init__(self, channels: int): """Initialize ElementwiseAffineFlow module. Args: - channels (int): Number of channels. + channels (int): + Number of channels. """ super().__init__() self.channels = channels @@ -107,12 +118,17 @@ class ElementwiseAffineFlow(nn.Layer): ) -> Union[paddle.Tensor, Tuple[paddle.Tensor, paddle.Tensor]]: """Calculate forward propagation. Args: - x (Tensor): Input tensor (B, channels, T). - x_mask (Tensor): Mask tensor (B, 1, T). - inverse (bool): Whether to inverse the flow. + x (Tensor): + Input tensor (B, channels, T). + x_mask (Tensor): + Mask tensor (B, 1, T). + inverse (bool): + Whether to inverse the flow. Returns: - Tensor: Output tensor (B, channels, T). - Tensor: Log-determinant tensor for NLL (B,) if not inverse. + Tensor: + Output tensor (B, channels, T). + Tensor: + Log-determinant tensor for NLL (B,) if not inverse. """ if not inverse: y = self.m + paddle.exp(self.logs) * x @@ -157,11 +173,16 @@ class DilatedDepthSeparableConv(nn.Layer): eps: float=1e-5, ): """Initialize DilatedDepthSeparableConv module. Args: - channels (int): Number of channels. - kernel_size (int): Kernel size. - layers (int): Number of layers. - dropout_rate (float): Dropout rate. - eps (float): Epsilon for layer norm. + channels (int): + Number of channels. + kernel_size (int): + Kernel size. + layers (int): + Number of layers. + dropout_rate (float): + Dropout rate. + eps (float): + Epsilon for layer norm. """ super().__init__() @@ -198,11 +219,15 @@ class DilatedDepthSeparableConv(nn.Layer): g: Optional[paddle.Tensor]=None) -> paddle.Tensor: """Calculate forward propagation. Args: - x (Tensor): Input tensor (B, in_channels, T). - x_mask (Tensor): Mask tensor (B, 1, T). - g (Optional[Tensor]): Global conditioning tensor (B, global_channels, 1). + x (Tensor): + Input tensor (B, in_channels, T). + x_mask (Tensor): + Mask tensor (B, 1, T). + g (Optional[Tensor]): + Global conditioning tensor (B, global_channels, 1). Returns: - Tensor: Output tensor (B, channels, T). + Tensor: + Output tensor (B, channels, T). """ if g is not None: x = x + g @@ -225,12 +250,18 @@ class ConvFlow(nn.Layer): tail_bound: float=5.0, ): """Initialize ConvFlow module. Args: - in_channels (int): Number of input channels. - hidden_channels (int): Number of hidden channels. - kernel_size (int): Kernel size. - layers (int): Number of layers. - bins (int): Number of bins. - tail_bound (float): Tail bound value. + in_channels (int): + Number of input channels. + hidden_channels (int): + Number of hidden channels. + kernel_size (int): + Kernel size. + layers (int): + Number of layers. + bins (int): + Number of bins. + tail_bound (float): + Tail bound value. """ super().__init__() self.half_channels = in_channels // 2 @@ -275,13 +306,19 @@ class ConvFlow(nn.Layer): ) -> Union[paddle.Tensor, Tuple[paddle.Tensor, paddle.Tensor]]: """Calculate forward propagation. Args: - x (Tensor): Input tensor (B, channels, T). - x_mask (Tensor): Mask tensor (B, 1, T). - g (Optional[Tensor]): Global conditioning tensor (B, channels, 1). - inverse (bool): Whether to inverse the flow. 
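All of the flow modules reformatted in this file (FlipFlow, LogFlow, ElementwiseAffineFlow, ConvFlow) share the convention these docstrings describe: the forward direction returns the transformed tensor plus a log-determinant for the NLL, while `inverse=True` returns only the recovered tensor. A small sketch with the simplest of them (toy shapes assumed):

    import paddle
    from paddlespeech.t2s.models.vits.flow import FlipFlow

    flow = FlipFlow()
    x = paddle.randn([2, 4, 10])          # (B, channels, T)
    y, logdet = flow(x)                   # forward: transformed tensor + log-determinant
    x_back = flow(y, inverse=True)        # inverse: tensor only, no log-determinant
    # flipping the channel axis twice is the identity, so x is recovered exactly
    assert float(paddle.abs(x_back - x).max()) == 0.0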
+ x (Tensor): + Input tensor (B, channels, T). + x_mask (Tensor): + Mask tensor (B, 1, T). + g (Optional[Tensor]): + Global conditioning tensor (B, channels, 1). + inverse (bool): + Whether to inverse the flow. Returns: - Tensor: Output tensor (B, channels, T). - Tensor: Log-determinant tensor for NLL (B,) if not inverse. + Tensor: + Output tensor (B, channels, T). + Tensor: + Log-determinant tensor for NLL (B,) if not inverse. """ xa, xb = x.split(2, 1) h = self.input_conv(xa) diff --git a/paddlespeech/t2s/models/vits/generator.py b/paddlespeech/t2s/models/vits/generator.py index 359b66258..7ecc51619 100644 --- a/paddlespeech/t2s/models/vits/generator.py +++ b/paddlespeech/t2s/models/vits/generator.py @@ -97,81 +97,104 @@ class VITSGenerator(nn.Layer): stochastic_duration_predictor_dds_conv_layers: int=3, ): """Initialize VITS generator module. Args: - vocabs (int): Input vocabulary size. - aux_channels (int): Number of acoustic feature channels. - hidden_channels (int): Number of hidden channels. - spks (Optional[int]): Number of speakers. If set to > 1, assume that the + vocabs (int): + Input vocabulary size. + aux_channels (int): + Number of acoustic feature channels. + hidden_channels (int): + Number of hidden channels. + spks (Optional[int]): + Number of speakers. If set to > 1, assume that the sids will be provided as the input and use sid embedding layer. - langs (Optional[int]): Number of languages. If set to > 1, assume that the + langs (Optional[int]): + Number of languages. If set to > 1, assume that the lids will be provided as the input and use sid embedding layer. - spk_embed_dim (Optional[int]): Speaker embedding dimension. If set to > 0, + spk_embed_dim (Optional[int]): + Speaker embedding dimension. If set to > 0, assume that spembs will be provided as the input. - global_channels (int): Number of global conditioning channels. - segment_size (int): Segment size for decoder. - text_encoder_attention_heads (int): Number of heads in conformer block - of text encoder. - text_encoder_ffn_expand (int): Expansion ratio of FFN in conformer block - of text encoder. - text_encoder_blocks (int): Number of conformer blocks in text encoder. - text_encoder_positionwise_layer_type (str): Position-wise layer type in - conformer block of text encoder. - text_encoder_positionwise_conv_kernel_size (int): Position-wise convolution - kernel size in conformer block of text encoder. Only used when the - above layer type is conv1d or conv1d-linear. - text_encoder_positional_encoding_layer_type (str): Positional encoding layer - type in conformer block of text encoder. - text_encoder_self_attention_layer_type (str): Self-attention layer type in - conformer block of text encoder. - text_encoder_activation_type (str): Activation function type in conformer - block of text encoder. - text_encoder_normalize_before (bool): Whether to apply layer norm before - self-attention in conformer block of text encoder. - text_encoder_dropout_rate (float): Dropout rate in conformer block of - text encoder. - text_encoder_positional_dropout_rate (float): Dropout rate for positional - encoding in conformer block of text encoder. - text_encoder_attention_dropout_rate (float): Dropout rate for attention in - conformer block of text encoder. - text_encoder_conformer_kernel_size (int): Conformer conv kernel size. It - will be used when only use_conformer_conv_in_text_encoder = True. - use_macaron_style_in_text_encoder (bool): Whether to use macaron style FFN - in conformer block of text encoder. 
- use_conformer_conv_in_text_encoder (bool): Whether to use covolution in - conformer block of text encoder. - decoder_kernel_size (int): Decoder kernel size. - decoder_channels (int): Number of decoder initial channels. - decoder_upsample_scales (List[int]): List of upsampling scales in decoder. - decoder_upsample_kernel_sizes (List[int]): List of kernel size for - upsampling layers in decoder. - decoder_resblock_kernel_sizes (List[int]): List of kernel size for resblocks - in decoder. - decoder_resblock_dilations (List[List[int]]): List of list of dilations for - resblocks in decoder. - use_weight_norm_in_decoder (bool): Whether to apply weight normalization in - decoder. - posterior_encoder_kernel_size (int): Posterior encoder kernel size. - posterior_encoder_layers (int): Number of layers of posterior encoder. - posterior_encoder_stacks (int): Number of stacks of posterior encoder. - posterior_encoder_base_dilation (int): Base dilation of posterior encoder. - posterior_encoder_dropout_rate (float): Dropout rate for posterior encoder. - use_weight_norm_in_posterior_encoder (bool): Whether to apply weight - normalization in posterior encoder. - flow_flows (int): Number of flows in flow. - flow_kernel_size (int): Kernel size in flow. - flow_base_dilation (int): Base dilation in flow. - flow_layers (int): Number of layers in flow. - flow_dropout_rate (float): Dropout rate in flow - use_weight_norm_in_flow (bool): Whether to apply weight normalization in - flow. - use_only_mean_in_flow (bool): Whether to use only mean in flow. - stochastic_duration_predictor_kernel_size (int): Kernel size in stochastic - duration predictor. - stochastic_duration_predictor_dropout_rate (float): Dropout rate in - stochastic duration predictor. - stochastic_duration_predictor_flows (int): Number of flows in stochastic - duration predictor. - stochastic_duration_predictor_dds_conv_layers (int): Number of DDS conv - layers in stochastic duration predictor. + global_channels (int): + Number of global conditioning channels. + segment_size (int): + Segment size for decoder. + text_encoder_attention_heads (int): + Number of heads in conformer block of text encoder. + text_encoder_ffn_expand (int): + Expansion ratio of FFN in conformer block of text encoder. + text_encoder_blocks (int): + Number of conformer blocks in text encoder. + text_encoder_positionwise_layer_type (str): + Position-wise layer type in conformer block of text encoder. + text_encoder_positionwise_conv_kernel_size (int): + Position-wise convolution kernel size in conformer block of text encoder. + Only used when the above layer type is conv1d or conv1d-linear. + text_encoder_positional_encoding_layer_type (str): + Positional encoding layer type in conformer block of text encoder. + text_encoder_self_attention_layer_type (str): + Self-attention layer type in conformer block of text encoder. + text_encoder_activation_type (str): + Activation function type in conformer block of text encoder. + text_encoder_normalize_before (bool): + Whether to apply layer norm before self-attention in conformer block of text encoder. + text_encoder_dropout_rate (float): + Dropout rate in conformer block of text encoder. + text_encoder_positional_dropout_rate (float): + Dropout rate for positional encoding in conformer block of text encoder. + text_encoder_attention_dropout_rate (float): + Dropout rate for attention in conformer block of text encoder. + text_encoder_conformer_kernel_size (int): + Conformer conv kernel size. 
It will only be used when use_conformer_conv_in_text_encoder = True. + use_macaron_style_in_text_encoder (bool): + Whether to use macaron style FFN in conformer block of text encoder. + use_conformer_conv_in_text_encoder (bool): + Whether to use convolution in conformer block of text encoder. + decoder_kernel_size (int): + Decoder kernel size. + decoder_channels (int): + Number of decoder initial channels. + decoder_upsample_scales (List[int]): + List of upsampling scales in decoder. + decoder_upsample_kernel_sizes (List[int]): + List of kernel size for upsampling layers in decoder. + decoder_resblock_kernel_sizes (List[int]): + List of kernel size for resblocks in decoder. + decoder_resblock_dilations (List[List[int]]): + List of list of dilations for resblocks in decoder. + use_weight_norm_in_decoder (bool): + Whether to apply weight normalization in decoder. + posterior_encoder_kernel_size (int): + Posterior encoder kernel size. + posterior_encoder_layers (int): + Number of layers of posterior encoder. + posterior_encoder_stacks (int): + Number of stacks of posterior encoder. + posterior_encoder_base_dilation (int): + Base dilation of posterior encoder. + posterior_encoder_dropout_rate (float): + Dropout rate for posterior encoder. + use_weight_norm_in_posterior_encoder (bool): + Whether to apply weight normalization in posterior encoder. + flow_flows (int): + Number of flows in flow. + flow_kernel_size (int): + Kernel size in flow. + flow_base_dilation (int): + Base dilation in flow. + flow_layers (int): + Number of layers in flow. + flow_dropout_rate (float): + Dropout rate in flow. + use_weight_norm_in_flow (bool): + Whether to apply weight normalization in flow. + use_only_mean_in_flow (bool): + Whether to use only mean in flow. + stochastic_duration_predictor_kernel_size (int): + Kernel size in stochastic duration predictor. + stochastic_duration_predictor_dropout_rate (float): + Dropout rate in stochastic duration predictor. + stochastic_duration_predictor_flows (int): + Number of flows in stochastic duration predictor. + stochastic_duration_predictor_dds_conv_layers (int): + Number of DDS conv layers in stochastic duration predictor. """ super().__init__() self.segment_size = segment_size @@ -272,27 +295,40 @@ class VITSGenerator(nn.Layer): paddle.Tensor, paddle.Tensor, ], ]: """Calculate forward propagation. Args: - text (Tensor): Text index tensor (B, T_text). - text_lengths (Tensor): Text length tensor (B,). - feats (Tensor): Feature tensor (B, aux_channels, T_feats). - feats_lengths (Tensor): Feature length tensor (B,). - sids (Optional[Tensor]): Speaker index tensor (B,) or (B, 1). - spembs (Optional[Tensor]): Speaker embedding tensor (B, spk_embed_dim). - lids (Optional[Tensor]): Language index tensor (B,) or (B, 1). + text (Tensor): + Text index tensor (B, T_text). + text_lengths (Tensor): + Text length tensor (B,). + feats (Tensor): + Feature tensor (B, aux_channels, T_feats). + feats_lengths (Tensor): + Feature length tensor (B,). + sids (Optional[Tensor]): + Speaker index tensor (B,) or (B, 1). + spembs (Optional[Tensor]): + Speaker embedding tensor (B, spk_embed_dim). + lids (Optional[Tensor]): + Language index tensor (B,) or (B, 1). Returns: - Tensor: Waveform tensor (B, 1, segment_size * upsample_factor). - Tensor: Duration negative log-likelihood (NLL) tensor (B,). - Tensor: Monotonic attention weight tensor (B, 1, T_feats, T_text). - Tensor: Segments start index tensor (B,). - Tensor: Text mask tensor (B, 1, T_text). - Tensor: Feature mask tensor (B, 1, T_feats).
- tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]: - - Tensor: Posterior encoder hidden representation (B, H, T_feats). - - Tensor: Flow hidden representation (B, H, T_feats). - - Tensor: Expanded text encoder projected mean (B, H, T_feats). - - Tensor: Expanded text encoder projected scale (B, H, T_feats). - - Tensor: Posterior encoder projected mean (B, H, T_feats). - - Tensor: Posterior encoder projected scale (B, H, T_feats). + Tensor: + Waveform tensor (B, 1, segment_size * upsample_factor). + Tensor: + Duration negative log-likelihood (NLL) tensor (B,). + Tensor: + Monotonic attention weight tensor (B, 1, T_feats, T_text). + Tensor: + Segments start index tensor (B,). + Tensor: + Text mask tensor (B, 1, T_text). + Tensor: + Feature mask tensor (B, 1, T_feats). + tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]: + - Tensor: Posterior encoder hidden representation (B, H, T_feats). + - Tensor: Flow hidden representation (B, H, T_feats). + - Tensor: Expanded text encoder projected mean (B, H, T_feats). + - Tensor: Expanded text encoder projected scale (B, H, T_feats). + - Tensor: Posterior encoder projected mean (B, H, T_feats). + - Tensor: Posterior encoder projected scale (B, H, T_feats). """ # forward text encoder x, m_p, logs_p, x_mask = self.text_encoder(text, text_lengths) @@ -402,24 +438,40 @@ class VITSGenerator(nn.Layer): ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: """Run inference. Args: - text (Tensor): Input text index tensor (B, T_text,). - text_lengths (Tensor): Text length tensor (B,). - feats (Tensor): Feature tensor (B, aux_channels, T_feats,). - feats_lengths (Tensor): Feature length tensor (B,). - sids (Optional[Tensor]): Speaker index tensor (B,) or (B, 1). - spembs (Optional[Tensor]): Speaker embedding tensor (B, spk_embed_dim). - lids (Optional[Tensor]): Language index tensor (B,) or (B, 1). - dur (Optional[Tensor]): Ground-truth duration (B, T_text,). If provided, + text (Tensor): + Input text index tensor (B, T_text,). + text_lengths (Tensor): + Text length tensor (B,). + feats (Tensor): + Feature tensor (B, aux_channels, T_feats,). + feats_lengths (Tensor): + Feature length tensor (B,). + sids (Optional[Tensor]): + Speaker index tensor (B,) or (B, 1). + spembs (Optional[Tensor]): + Speaker embedding tensor (B, spk_embed_dim). + lids (Optional[Tensor]): + Language index tensor (B,) or (B, 1). + dur (Optional[Tensor]): + Ground-truth duration (B, T_text,). If provided, skip the prediction of durations (i.e., teacher forcing). - noise_scale (float): Noise scale parameter for flow. - noise_scale_dur (float): Noise scale parameter for duration predictor. - alpha (float): Alpha parameter to control the speed of generated speech. - max_len (Optional[int]): Maximum length of acoustic feature sequence. - use_teacher_forcing (bool): Whether to use teacher forcing. + noise_scale (float): + Noise scale parameter for flow. + noise_scale_dur (float): + Noise scale parameter for duration predictor. + alpha (float): + Alpha parameter to control the speed of generated speech. + max_len (Optional[int]): + Maximum length of acoustic feature sequence. + use_teacher_forcing (bool): + Whether to use teacher forcing. Returns: - Tensor: Generated waveform tensor (B, T_wav). - Tensor: Monotonic attention weight tensor (B, T_feats, T_text). - Tensor: Duration tensor (B, T_text). + Tensor: + Generated waveform tensor (B, T_wav). + Tensor: + Monotonic attention weight tensor (B, T_feats, T_text). + Tensor: + Duration tensor (B, T_text). 
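Several of the inference knobs documented above interact (prior noise, duration noise, and the speed factor), so a sketch of a typical synthesis call may help. This is illustrative only: `generator` is a hypothetical, already-constructed VITSGenerator, and the noise values follow common VITS defaults rather than anything this patch fixes:

    import paddle
    # `generator` is a hypothetical, already-built VITSGenerator instance
    wav, att_w, dur = generator.inference(
        text=paddle.randint(0, 100, [1, 30]),           # (B, T_text) token ids
        text_lengths=paddle.to_tensor([30], dtype='int64'),
        noise_scale=0.667,       # flow prior noise; lower -> more deterministic audio
        noise_scale_dur=0.8,     # noise for the stochastic duration predictor
        alpha=1.0)               # duration multiplier: >1 slows speech, <1 speeds it up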
""" # encoder x, m_p, logs_p, x_mask = self.text_encoder(text, text_lengths) @@ -533,15 +585,23 @@ class VITSGenerator(nn.Layer): lids: Optional[paddle.Tensor]=None, ) -> paddle.Tensor: """Run voice conversion. Args: - feats (Tensor): Feature tensor (B, aux_channels, T_feats,). - feats_lengths (Tensor): Feature length tensor (B,). - sids_src (Optional[Tensor]): Speaker index tensor of source feature (B,) or (B, 1). - sids_tgt (Optional[Tensor]): Speaker index tensor of target feature (B,) or (B, 1). - spembs_src (Optional[Tensor]): Speaker embedding tensor of source feature (B, spk_embed_dim). - spembs_tgt (Optional[Tensor]): Speaker embedding tensor of target feature (B, spk_embed_dim). - lids (Optional[Tensor]): Language index tensor (B,) or (B, 1). + feats (Tensor): + Feature tensor (B, aux_channels, T_feats,). + feats_lengths (Tensor): + Feature length tensor (B,). + sids_src (Optional[Tensor]): + Speaker index tensor of source feature (B,) or (B, 1). + sids_tgt (Optional[Tensor]): + Speaker index tensor of target feature (B,) or (B, 1). + spembs_src (Optional[Tensor]): + Speaker embedding tensor of source feature (B, spk_embed_dim). + spembs_tgt (Optional[Tensor]): + Speaker embedding tensor of target feature (B, spk_embed_dim). + lids (Optional[Tensor]): + Language index tensor (B,) or (B, 1). Returns: - Tensor: Generated waveform tensor (B, T_wav). + Tensor: + Generated waveform tensor (B, T_wav). """ # encoder g_src = None @@ -602,10 +662,13 @@ class VITSGenerator(nn.Layer): mask: paddle.Tensor) -> paddle.Tensor: """Generate path a.k.a. monotonic attention. Args: - dur (Tensor): Duration tensor (B, 1, T_text). - mask (Tensor): Attention mask tensor (B, 1, T_feats, T_text). + dur (Tensor): + Duration tensor (B, 1, T_text). + mask (Tensor): + Attention mask tensor (B, 1, T_feats, T_text). Returns: - Tensor: Path tensor (B, 1, T_feats, T_text). + Tensor: + Path tensor (B, 1, T_feats, T_text). """ b, _, t_y, t_x = paddle.shape(mask) cum_dur = paddle.cumsum(dur, -1) diff --git a/paddlespeech/t2s/models/vits/posterior_encoder.py b/paddlespeech/t2s/models/vits/posterior_encoder.py index 853237557..5e3d6b9ce 100644 --- a/paddlespeech/t2s/models/vits/posterior_encoder.py +++ b/paddlespeech/t2s/models/vits/posterior_encoder.py @@ -52,17 +52,28 @@ class PosteriorEncoder(nn.Layer): """Initilialize PosteriorEncoder module. Args: - in_channels (int): Number of input channels. - out_channels (int): Number of output channels. - hidden_channels (int): Number of hidden channels. - kernel_size (int): Kernel size in WaveNet. - layers (int): Number of layers of WaveNet. - stacks (int): Number of repeat stacking of WaveNet. - base_dilation (int): Base dilation factor. - global_channels (int): Number of global conditioning channels. - dropout_rate (float): Dropout rate. - bias (bool): Whether to use bias parameters in conv. - use_weight_norm (bool): Whether to apply weight norm. + in_channels (int): + Number of input channels. + out_channels (int): + Number of output channels. + hidden_channels (int): + Number of hidden channels. + kernel_size (int): + Kernel size in WaveNet. + layers (int): + Number of layers of WaveNet. + stacks (int): + Number of repeat stacking of WaveNet. + base_dilation (int): + Base dilation factor. + global_channels (int): + Number of global conditioning channels. + dropout_rate (float): + Dropout rate. + bias (bool): + Whether to use bias parameters in conv. + use_weight_norm (bool): + Whether to apply weight norm. 
""" super().__init__() @@ -99,15 +110,22 @@ class PosteriorEncoder(nn.Layer): """Calculate forward propagation. Args: - x (Tensor): Input tensor (B, in_channels, T_feats). - x_lengths (Tensor): Length tensor (B,). - g (Optional[Tensor]): Global conditioning tensor (B, global_channels, 1). + x (Tensor): + Input tensor (B, in_channels, T_feats). + x_lengths (Tensor): + Length tensor (B,). + g (Optional[Tensor]): + Global conditioning tensor (B, global_channels, 1). Returns: - Tensor: Encoded hidden representation tensor (B, out_channels, T_feats). - Tensor: Projected mean tensor (B, out_channels, T_feats). - Tensor: Projected scale tensor (B, out_channels, T_feats). - Tensor: Mask tensor for input tensor (B, 1, T_feats). + Tensor: + Encoded hidden representation tensor (B, out_channels, T_feats). + Tensor: + Projected mean tensor (B, out_channels, T_feats). + Tensor: + Projected scale tensor (B, out_channels, T_feats). + Tensor: + Mask tensor for input tensor (B, 1, T_feats). """ x_mask = make_non_pad_mask(x_lengths).unsqueeze(1) diff --git a/paddlespeech/t2s/models/vits/residual_coupling.py b/paddlespeech/t2s/models/vits/residual_coupling.py index c18beedd0..afa6d1fa7 100644 --- a/paddlespeech/t2s/models/vits/residual_coupling.py +++ b/paddlespeech/t2s/models/vits/residual_coupling.py @@ -55,18 +55,30 @@ class ResidualAffineCouplingBlock(nn.Layer): """Initilize ResidualAffineCouplingBlock module. Args: - in_channels (int): Number of input channels. - hidden_channels (int): Number of hidden channels. - flows (int): Number of flows. - kernel_size (int): Kernel size for WaveNet. - base_dilation (int): Base dilation factor for WaveNet. - layers (int): Number of layers of WaveNet. - stacks (int): Number of stacks of WaveNet. - global_channels (int): Number of global channels. - dropout_rate (float): Dropout rate. - use_weight_norm (bool): Whether to use weight normalization in WaveNet. - bias (bool): Whether to use bias paramters in WaveNet. - use_only_mean (bool): Whether to estimate only mean. + in_channels (int): + Number of input channels. + hidden_channels (int): + Number of hidden channels. + flows (int): + Number of flows. + kernel_size (int): + Kernel size for WaveNet. + base_dilation (int): + Base dilation factor for WaveNet. + layers (int): + Number of layers of WaveNet. + stacks (int): + Number of stacks of WaveNet. + global_channels (int): + Number of global channels. + dropout_rate (float): + Dropout rate. + use_weight_norm (bool): + Whether to use weight normalization in WaveNet. + bias (bool): + Whether to use bias paramters in WaveNet. + use_only_mean (bool): + Whether to estimate only mean. """ super().__init__() @@ -97,10 +109,14 @@ class ResidualAffineCouplingBlock(nn.Layer): """Calculate forward propagation. Args: - x (Tensor): Input tensor (B, in_channels, T). - x_mask (Tensor): Length tensor (B, 1, T). - g (Optional[Tensor]): Global conditioning tensor (B, global_channels, 1). - inverse (bool): Whether to inverse the flow. + x (Tensor): + Input tensor (B, in_channels, T). + x_mask (Tensor): + Length tensor (B, 1, T). + g (Optional[Tensor]): + Global conditioning tensor (B, global_channels, 1). + inverse (bool): + Whether to inverse the flow. Returns: Tensor: Output tensor (B, in_channels, T). @@ -134,17 +150,28 @@ class ResidualAffineCouplingLayer(nn.Layer): """Initialzie ResidualAffineCouplingLayer module. Args: - in_channels (int): Number of input channels. - hidden_channels (int): Number of hidden channels. - kernel_size (int): Kernel size for WaveNet. 
- base_dilation (int): Base dilation factor for WaveNet. - layers (int): Number of layers of WaveNet. - stacks (int): Number of stacks of WaveNet. - global_channels (int): Number of global channels. - dropout_rate (float): Dropout rate. - use_weight_norm (bool): Whether to use weight normalization in WaveNet. - bias (bool): Whether to use bias paramters in WaveNet. - use_only_mean (bool): Whether to estimate only mean. + in_channels (int): + Number of input channels. + hidden_channels (int): + Number of hidden channels. + kernel_size (int): + Kernel size for WaveNet. + base_dilation (int): + Base dilation factor for WaveNet. + layers (int): + Number of layers of WaveNet. + stacks (int): + Number of stacks of WaveNet. + global_channels (int): + Number of global channels. + dropout_rate (float): + Dropout rate. + use_weight_norm (bool): + Whether to use weight normalization in WaveNet. + bias (bool): + Whether to use bias parameters in WaveNet. + use_only_mean (bool): + Whether to estimate only mean. """ assert in_channels % 2 == 0, "in_channels should be divisible by 2" @@ -211,14 +238,20 @@ class ResidualAffineCouplingLayer(nn.Layer): """Calculate forward propagation. Args: - x (Tensor): Input tensor (B, in_channels, T). - x_lengths (Tensor): Length tensor (B,). - g (Optional[Tensor]): Global conditioning tensor (B, global_channels, 1). - inverse (bool): Whether to inverse the flow. + x (Tensor): + Input tensor (B, in_channels, T). + x_lengths (Tensor): + Length tensor (B,). + g (Optional[Tensor]): + Global conditioning tensor (B, global_channels, 1). + inverse (bool): + Whether to inverse the flow. Returns: - Tensor: Output tensor (B, in_channels, T). - Tensor: Log-determinant tensor for NLL (B,) if not inverse. + Tensor: + Output tensor (B, in_channels, T). + Tensor: + Log-determinant tensor for NLL (B,) if not inverse. """ xa, xb = paddle.split(x, 2, axis=1) diff --git a/paddlespeech/t2s/models/vits/text_encoder.py b/paddlespeech/t2s/models/vits/text_encoder.py index 3afc7831a..799e0c759 100644 --- a/paddlespeech/t2s/models/vits/text_encoder.py +++ b/paddlespeech/t2s/models/vits/text_encoder.py @@ -62,23 +62,40 @@ class TextEncoder(nn.Layer): """Initialize TextEncoder module. Args: - vocabs (int): Vocabulary size. - attention_dim (int): Attention dimension. - attention_heads (int): Number of attention heads. - linear_units (int): Number of linear units of positionwise layers. - blocks (int): Number of encoder blocks. - positionwise_layer_type (str): Positionwise layer type. - positionwise_conv_kernel_size (int): Positionwise layer's kernel size. - positional_encoding_layer_type (str): Positional encoding layer type. - self_attention_layer_type (str): Self-attention layer type. - activation_type (str): Activation function type. - normalize_before (bool): Whether to apply LayerNorm before attention. - use_macaron_style (bool): Whether to use macaron style components. - use_conformer_conv (bool): Whether to use conformer conv layers. - conformer_kernel_size (int): Conformer's conv kernel size. - dropout_rate (float): Dropout rate. - positional_dropout_rate (float): Dropout rate for positional encoding. - attention_dropout_rate (float): Dropout rate for attention. + vocabs (int): + Vocabulary size. + attention_dim (int): + Attention dimension. + attention_heads (int): + Number of attention heads. + linear_units (int): + Number of linear units of positionwise layers. + blocks (int): + Number of encoder blocks. + positionwise_layer_type (str): + Positionwise layer type.
+ positionwise_conv_kernel_size (int): + Positionwise layer's kernel size. + positional_encoding_layer_type (str): + Positional encoding layer type. + self_attention_layer_type (str): + Self-attention layer type. + activation_type (str): + Activation function type. + normalize_before (bool): + Whether to apply LayerNorm before attention. + use_macaron_style (bool): + Whether to use macaron style components. + use_conformer_conv (bool): + Whether to use conformer conv layers. + conformer_kernel_size (int): + Conformer's conv kernel size. + dropout_rate (float): + Dropout rate. + positional_dropout_rate (float): + Dropout rate for positional encoding. + attention_dropout_rate (float): + Dropout rate for attention. """ super().__init__() @@ -121,14 +138,20 @@ class TextEncoder(nn.Layer): """Calculate forward propagation. Args: - x (Tensor): Input index tensor (B, T_text). - x_lengths (Tensor): Length tensor (B,). + x (Tensor): + Input index tensor (B, T_text). + x_lengths (Tensor): + Length tensor (B,). Returns: - Tensor: Encoded hidden representation (B, attention_dim, T_text). - Tensor: Projected mean tensor (B, attention_dim, T_text). - Tensor: Projected scale tensor (B, attention_dim, T_text). - Tensor: Mask tensor for input tensor (B, 1, T_text). + Tensor: + Encoded hidden representation (B, attention_dim, T_text). + Tensor: + Projected mean tensor (B, attention_dim, T_text). + Tensor: + Projected scale tensor (B, attention_dim, T_text). + Tensor: + Mask tensor for input tensor (B, 1, T_text). """ x = self.emb(x) * math.sqrt(self.attention_dim) diff --git a/paddlespeech/t2s/models/vits/vits.py b/paddlespeech/t2s/models/vits/vits.py index 983bf0a36..0ff3a546d 100644 --- a/paddlespeech/t2s/models/vits/vits.py +++ b/paddlespeech/t2s/models/vits/vits.py @@ -156,17 +156,25 @@ class VITS(nn.Layer): init_type: str="xavier_uniform", ): """Initialize VITS module. Args: - idim (int): Input vocabrary size. - odim (int): Acoustic feature dimension. The actual output channels will + idim (int): + Input vocabulary size. + odim (int): + Acoustic feature dimension. The actual output channels will be 1 since VITS is the end-to-end text-to-wave model but for the compatibility odim is used to indicate the acoustic feature dimension. - sampling_rate (int): Sampling rate, not used for the training but it will + sampling_rate (int): + Sampling rate, not used for the training but it will be referred in saving waveform during the inference. - generator_type (str): Generator type. - generator_params (Dict[str, Any]): Parameter dict for generator. - discriminator_type (str): Discriminator type. - discriminator_params (Dict[str, Any]): Parameter dict for discriminator. - cache_generator_outputs (bool): Whether to cache generator outputs. + generator_type (str): + Generator type. + generator_params (Dict[str, Any]): + Parameter dict for generator. + discriminator_type (str): + Discriminator type. + discriminator_params (Dict[str, Any]): + Parameter dict for discriminator. + cache_generator_outputs (bool): + Whether to cache generator outputs. """ assert check_argument_types() super().__init__() @@ -218,14 +226,22 @@ class VITS(nn.Layer): forward_generator: bool=True, ) -> Dict[str, Any]: """Perform generator forward. Args: - text (Tensor): Text index tensor (B, T_text). - text_lengths (Tensor): Text length tensor (B,). - feats (Tensor): Feature tensor (B, T_feats, aux_channels). - feats_lengths (Tensor): Feature length tensor (B,). - sids (Optional[Tensor]): Speaker index tensor (B,) or (B, 1).
- spembs (Optional[Tensor]): Speaker embedding tensor (B, spk_embed_dim). - lids (Optional[Tensor]): Language index tensor (B,) or (B, 1). - forward_generator (bool): Whether to forward generator. + text (Tensor): + Text index tensor (B, T_text). + text_lengths (Tensor): + Text length tensor (B,). + feats (Tensor): + Feature tensor (B, T_feats, aux_channels). + feats_lengths (Tensor): + Feature length tensor (B,). + sids (Optional[Tensor]): + Speaker index tensor (B,) or (B, 1). + spembs (Optional[Tensor]): + Speaker embedding tensor (B, spk_embed_dim). + lids (Optional[Tensor]): + Language index tensor (B,) or (B, 1). + forward_generator (bool): + Whether to forward generator. Returns: """ @@ -259,13 +275,20 @@ class VITS(nn.Layer): lids: Optional[paddle.Tensor]=None, ) -> Dict[str, Any]: """Perform generator forward. Args: - text (Tensor): Text index tensor (B, T_text). - text_lengths (Tensor): Text length tensor (B,). - feats (Tensor): Feature tensor (B, T_feats, aux_channels). - feats_lengths (Tensor): Feature length tensor (B,). - sids (Optional[Tensor]): Speaker index tensor (B,) or (B, 1). - spembs (Optional[Tensor]): Speaker embedding tensor (B, spk_embed_dim). - lids (Optional[Tensor]): Language index tensor (B,) or (B, 1). + text (Tensor): + Text index tensor (B, T_text). + text_lengths (Tensor): + Text length tensor (B,). + feats (Tensor): + Feature tensor (B, T_feats, aux_channels). + feats_lengths (Tensor): + Feature length tensor (B,). + sids (Optional[Tensor]): + Speaker index tensor (B,) or (B, 1). + spembs (Optional[Tensor]): + Speaker embedding tensor (B, spk_embed_dim). + lids (Optional[Tensor]): + Language index tensor (B,) or (B, 1). Returns: """ @@ -304,13 +327,20 @@ class VITS(nn.Layer): lids: Optional[paddle.Tensor]=None, ) -> Dict[str, Any]: """Perform discriminator forward. Args: - text (Tensor): Text index tensor (B, T_text). - text_lengths (Tensor): Text length tensor (B,). - feats (Tensor): Feature tensor (B, T_feats, aux_channels). - feats_lengths (Tensor): Feature length tensor (B,). - sids (Optional[Tensor]): Speaker index tensor (B,) or (B, 1). - spembs (Optional[Tensor]): Speaker embedding tensor (B, spk_embed_dim). - lids (Optional[Tensor]): Language index tensor (B,) or (B, 1). + text (Tensor): + Text index tensor (B, T_text). + text_lengths (Tensor): + Text length tensor (B,). + feats (Tensor): + Feature tensor (B, T_feats, aux_channels). + feats_lengths (Tensor): + Feature length tensor (B,). + sids (Optional[Tensor]): + Speaker index tensor (B,) or (B, 1). + spembs (Optional[Tensor]): + Speaker embedding tensor (B, spk_embed_dim). + lids (Optional[Tensor]): + Language index tensor (B,) or (B, 1). Returns: """ @@ -353,22 +383,36 @@ class VITS(nn.Layer): use_teacher_forcing: bool=False, ) -> Dict[str, paddle.Tensor]: """Run inference. Args: - text (Tensor): Input text index tensor (T_text,). - feats (Tensor): Feature tensor (T_feats, aux_channels). - sids (Tensor): Speaker index tensor (1,). - spembs (Optional[Tensor]): Speaker embedding tensor (spk_embed_dim,). - lids (Tensor): Language index tensor (1,). - durations (Tensor): Ground-truth duration tensor (T_text,). - noise_scale (float): Noise scale value for flow. - noise_scale_dur (float): Noise scale value for duration predictor. - alpha (float): Alpha parameter to control the speed of generated speech. - max_len (Optional[int]): Maximum length. - use_teacher_forcing (bool): Whether to use teacher forcing. + text (Tensor): + Input text index tensor (T_text,). 
+ feats (Tensor): + Feature tensor (T_feats, aux_channels). + sids (Tensor): + Speaker index tensor (1,). + spembs (Optional[Tensor]): + Speaker embedding tensor (spk_embed_dim,). + lids (Tensor): + Language index tensor (1,). + durations (Tensor): + Ground-truth duration tensor (T_text,). + noise_scale (float): + Noise scale value for flow. + noise_scale_dur (float): + Noise scale value for duration predictor. + alpha (float): + Alpha parameter to control the speed of generated speech. + max_len (Optional[int]): + Maximum length. + use_teacher_forcing (bool): + Whether to use teacher forcing. Returns: Dict[str, Tensor]: - * wav (Tensor): Generated waveform tensor (T_wav,). - * att_w (Tensor): Monotonic attention weight tensor (T_feats, T_text). - * duration (Tensor): Predicted duration tensor (T_text,). + * wav (Tensor): + Generated waveform tensor (T_wav,). + * att_w (Tensor): + Monotonic attention weight tensor (T_feats, T_text). + * duration (Tensor): + Predicted duration tensor (T_text,). """ # setup text = text[None] @@ -417,15 +461,22 @@ class VITS(nn.Layer): lids: Optional[paddle.Tensor]=None, ) -> paddle.Tensor: """Run voice conversion. Args: - feats (Tensor): Feature tensor (T_feats, aux_channels). - sids_src (Optional[Tensor]): Speaker index tensor of source feature (1,). - sids_tgt (Optional[Tensor]): Speaker index tensor of target feature (1,). - spembs_src (Optional[Tensor]): Speaker embedding tensor of source feature (spk_embed_dim,). - spembs_tgt (Optional[Tensor]): Speaker embedding tensor of target feature (spk_embed_dim,). - lids (Optional[Tensor]): Language index tensor (1,). + feats (Tensor): + Feature tensor (T_feats, aux_channels). + sids_src (Optional[Tensor]): + Speaker index tensor of source feature (1,). + sids_tgt (Optional[Tensor]): + Speaker index tensor of target feature (1,). + spembs_src (Optional[Tensor]): + Speaker embedding tensor of source feature (spk_embed_dim,). + spembs_tgt (Optional[Tensor]): + Speaker embedding tensor of target feature (spk_embed_dim,). + lids (Optional[Tensor]): + Language index tensor (1,). Returns: Dict[str, Tensor]: - * wav (Tensor): Generated waveform tensor (T_wav,). + * wav (Tensor): + Generated waveform tensor (T_wav,). """ assert feats is not None feats = feats[None].transpose([0, 2, 1]) diff --git a/paddlespeech/t2s/models/vits/wavenet/residual_block.py b/paddlespeech/t2s/models/vits/wavenet/residual_block.py index 197e74975..b5095e168 100644 --- a/paddlespeech/t2s/models/vits/wavenet/residual_block.py +++ b/paddlespeech/t2s/models/vits/wavenet/residual_block.py @@ -39,14 +39,22 @@ class ResidualBlock(nn.Layer): """Initialize ResidualBlock module. Args: - kernel_size (int): Kernel size of dilation convolution layer. - residual_channels (int): Number of channels for residual connection. - skip_channels (int): Number of channels for skip connection. - aux_channels (int): Number of local conditioning channels. - dropout (float): Dropout probability. - dilation (int): Dilation factor. - bias (bool): Whether to add bias parameter in convolution layers. - scale_residual (bool): Whether to scale the residual outputs. + kernel_size (int): + Kernel size of dilation convolution layer. + residual_channels (int): + Number of channels for residual connection. + skip_channels (int): + Number of channels for skip connection. + aux_channels (int): + Number of local conditioning channels. + dropout (float): + Dropout probability. + dilation (int): + Dilation factor. + bias (bool): + Whether to add bias parameter in convolution layers. 
+ scale_residual (bool): + Whether to scale the residual outputs. """ super().__init__() diff --git a/paddlespeech/t2s/models/vits/wavenet/wavenet.py b/paddlespeech/t2s/models/vits/wavenet/wavenet.py index 44693dac6..04422939b 100644 --- a/paddlespeech/t2s/models/vits/wavenet/wavenet.py +++ b/paddlespeech/t2s/models/vits/wavenet/wavenet.py @@ -47,25 +47,42 @@ class WaveNet(nn.Layer): """Initialize WaveNet module. Args: - in_channels (int): Number of input channels. - out_channels (int): Number of output channels. - kernel_size (int): Kernel size of dilated convolution. - layers (int): Number of residual block layers. - stacks (int): Number of stacks i.e., dilation cycles. - base_dilation (int): Base dilation factor. - residual_channels (int): Number of channels in residual conv. - gate_channels (int): Number of channels in gated conv. - skip_channels (int): Number of channels in skip conv. - aux_channels (int): Number of channels for local conditioning feature. - global_channels (int): Number of channels for global conditioning feature. - dropout_rate (float): Dropout rate. 0.0 means no dropout applied. - bias (bool): Whether to use bias parameter in conv layer. - use_weight_norm (bool): Whether to use weight norm. If set to true, it will - be applied to all of the conv layers. - use_first_conv (bool): Whether to use the first conv layers. - use_last_conv (bool): Whether to use the last conv layers. - scale_residual (bool): Whether to scale the residual outputs. - scale_skip_connect (bool): Whether to scale the skip connection outputs. + in_channels (int): + Number of input channels. + out_channels (int): + Number of output channels. + kernel_size (int): + Kernel size of dilated convolution. + layers (int): + Number of residual block layers. + stacks (int): + Number of stacks i.e., dilation cycles. + base_dilation (int): + Base dilation factor. + residual_channels (int): + Number of channels in residual conv. + gate_channels (int): + Number of channels in gated conv. + skip_channels (int): + Number of channels in skip conv. + aux_channels (int): + Number of channels for local conditioning feature. + global_channels (int): + Number of channels for global conditioning feature. + dropout_rate (float): + Dropout rate. 0.0 means no dropout applied. + bias (bool): + Whether to use bias parameter in conv layer. + use_weight_norm (bool): + Whether to use weight norm. If set to true, it will be applied to all of the conv layers. + use_first_conv (bool): + Whether to use the first conv layers. + use_last_conv (bool): + Whether to use the last conv layers. + scale_residual (bool): + Whether to scale the residual outputs. + scale_skip_connect (bool): + Whether to scale the skip connection outputs. """ super().__init__() @@ -128,15 +145,18 @@ class WaveNet(nn.Layer): """Calculate forward propagation. Args: - x (Tensor): Input noise signal (B, 1, T) if use_first_conv else - (B, residual_channels, T). - x_mask (Optional[Tensor]): Mask tensor (B, 1, T). - c (Optional[Tensor]): Local conditioning features (B, aux_channels, T). - g (Optional[Tensor]): Global conditioning features (B, global_channels, 1). + x (Tensor): + Input noise signal (B, 1, T) if use_first_conv else (B, residual_channels, T). + x_mask (Optional[Tensor]): + Mask tensor (B, 1, T). + c (Optional[Tensor]): + Local conditioning features (B, aux_channels, T). + g (Optional[Tensor]): + Global conditioning features (B, global_channels, 1). Returns: - Tensor: Output tensor (B, out_channels, T) if use_last_conv else - (B, residual_channels, T). 
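The WaveNet documented here is the shared backbone of the posterior encoder and coupling layers above; the first/last conv flags decide whether it maps a 1-channel signal to out_channels or stays at residual_channels throughout. A sketch of the documented in/out contract (sizes are assumptions for illustration; other arguments keep their defaults):

    import paddle
    from paddlespeech.t2s.models.vits.wavenet.wavenet import WaveNet

    # with the first/last convs enabled, the module maps 1 channel -> out_channels
    net = WaveNet(in_channels=1, out_channels=2,
                  use_first_conv=True, use_last_conv=True)
    x = paddle.randn([3, 1, 100])                    # (B, 1, T)
    y = net(x, x_mask=paddle.ones([3, 1, 100]))      # -> (3, 2, 100)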
+            Tensor: 
+                Output tensor (B, out_channels, T) if use_last_conv else (B, residual_channels, T).
 
         """
         # encode to hidden representation
diff --git a/paddlespeech/t2s/models/wavernn/wavernn.py b/paddlespeech/t2s/models/wavernn/wavernn.py
index 254edbb2d..44e9f2d8d 100644
--- a/paddlespeech/t2s/models/wavernn/wavernn.py
+++ b/paddlespeech/t2s/models/wavernn/wavernn.py
@@ -69,9 +69,11 @@ class MelResNet(nn.Layer):
     def forward(self, x):
         '''
         Args:
-            x (Tensor): Input tensor (B, in_dims, T).
+            x (Tensor): 
+                Input tensor (B, in_dims, T).
         Returns:
-            Tensor: Output tensor (B, res_out_dims, T).
+            Tensor: 
+                Output tensor (B, res_out_dims, T).
         '''
         x = self.conv_in(x)
@@ -119,10 +121,13 @@ class UpsampleNetwork(nn.Layer):
     def forward(self, m):
         '''
         Args:
-            c (Tensor): Input tensor (B, C_aux, T).
+            m (Tensor): 
+                Input tensor (B, C_aux, T).
         Returns:
-            Tensor: Output tensor (B, (T - 2 * pad) * prob(upsample_scales), C_aux).
-            Tensor: Output tensor (B, (T - 2 * pad) * prob(upsample_scales), res_out_dims).
+            Tensor: 
+                Output tensor (B, (T - 2 * pad) * prod(upsample_scales), C_aux).
+            Tensor: 
+                Output tensor (B, (T - 2 * pad) * prod(upsample_scales), res_out_dims).
         '''
         # aux: [B, C_aux, T]
         # -> [B, res_out_dims, T - 2 * aux_context_window]
@@ -302,7 +307,8 @@ class WaveRNN(nn.Layer):
                 number of samples for crossfading between batches
             mu_law(bool)
         Returns:
-            wav sequence: Output (T' * prod(upsample_scales), out_channels, C_out).
+            wav sequence: 
+                Output (T' * prod(upsample_scales), out_channels, C_out).
         """
         self.eval()
@@ -423,7 +429,7 @@ class WaveRNN(nn.Layer):
             x(Tensor): mel, [1, n_frames, 80]
             pad(int): 
-            side(str, optional): (Default value = 'both')
+            side(str, optional): (Default value = 'both')
         Returns:
             Tensor

From 260752aa2a3284a37c06b88da2fef3b6d0118280 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Mon, 19 Sep 2022 14:10:16 +0000
Subject: [PATCH 016/124] using forward_attention_decoder

---
 paddlespeech/s2t/exps/u2/bin/test_wav.py |  8 +++-----
 paddlespeech/s2t/models/u2/u2.py         | 14 ++++++--------
 2 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py
index c04e3ae47..a55a1eca0 100644
--- a/paddlespeech/s2t/exps/u2/bin/test_wav.py
+++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py
@@ -69,8 +69,7 @@ class U2Infer():
         with paddle.no_grad():
             # read
             audio, sample_rate = soundfile.read(
-                self.audio_file, dtype="int16", always_2d=True)
-
+                self.audio_file, dtype="int16", always_2d=True)
             audio = audio[:, 0]
             logger.info(f"audio shape: {audio.shape}")
 
@@ -78,11 +77,10 @@ class U2Infer():
             feat = self.preprocessing(audio, **self.preprocess_args)
             logger.info(f"feat shape: {feat.shape}")
 
-            np.savetxt("feat.transform.txt", feat)
-
             ilen = paddle.to_tensor(feat.shape[0])
-            xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(axis=0)
+            xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(0)
             decode_config = self.config.decode
+            logger.debug(f"decode cfg: {decode_config}")
             result_transcripts = self.model.decode(
                 xs,
                 ilen,
diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py
index d7b8630a3..b4ec6b033 100644
--- a/paddlespeech/s2t/models/u2/u2.py
+++ b/paddlespeech/s2t/models/u2/u2.py
@@ -545,17 +545,11 @@ class U2BaseModel(ASRInterface, nn.Layer):
             [len(hyp[0]) for hyp in hyps], place=device,
             dtype=paddle.long)  # (beam_size,)
         hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id)
+        logger.debug(f"hyps pad: {hyps_pad} {self.sos} {self.eos} {self.ignore_id}")
        hyps_lens = hyps_lens + 1  # Add <sos> at beginning
-        encoder_out = encoder_out.repeat(beam_size, 1, 1)
-        encoder_mask = paddle.ones(
-            (beam_size, 1, encoder_out.shape[1]), dtype=paddle.bool)
-        decoder_out, _ = self.decoder(
-            encoder_out, encoder_mask, hyps_pad,
-            hyps_lens)  # (beam_size, max_hyps_len, vocab_size)
 
         # ctc score in ln domain
-        decoder_out = paddle.nn.functional.log_softmax(decoder_out, axis=-1)
-        decoder_out = decoder_out.numpy()
+        decoder_out = self.forward_attention_decoder(hyps_pad, hyps_lens, encoder_out)
 
         # Only use decoder score for rescoring
         best_score = -float('inf')
@@ -567,11 +561,15 @@ class U2BaseModel(ASRInterface, nn.Layer):
                 score += decoder_out[i][j][w]
             # last decoder output token is `eos`, for the last decoder input token.
             score += decoder_out[i][len(hyp[0])][self.eos]
+            logger.debug(f"hyp {i} len {len(hyp[0])} l2r rescore_score: {score} ctc_score: {hyp[1]}")
+
             # add ctc score (which in ln domain)
             score += hyp[1] * ctc_weight
             if score > best_score:
                 best_score = score
                 best_index = i
+
+        logger.debug(f"result: {hyps[best_index]}")
         return hyps[best_index][0]
 
     @jit.to_static(property=True)

From 4d5cfd400386bcd5be8729f8b3e1dfc5bae8365c Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Tue, 20 Sep 2022 03:23:50 +0000
Subject: [PATCH 017/124] export param from config

---
 paddlespeech/s2t/exps/u2/model.py | 24 ++++--------------------
 1 file changed, 4 insertions(+), 20 deletions(-)

diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py
index ee4df9cb9..2b70f117b 100644
--- a/paddlespeech/s2t/exps/u2/model.py
+++ b/paddlespeech/s2t/exps/u2/model.py
@@ -462,13 +462,13 @@ class U2Tester(U2Trainer):
         infer_model = U2InferModel.from_pretrained(self.test_loader,
                                                    self.config.clone(),
                                                    self.args.checkpoint_path)
-
         batch_size = 1
         feat_dim = self.test_loader.feat_dim
-        model_size = 512
+        model_size = self.config.encoder_conf.output_size
         num_left_chunks = -1
+        logger.info(f"U2 Export Model Params: batch_size {batch_size}, feat_dim {feat_dim}, model_size {model_size}, num_left_chunks {num_left_chunks}")
 
-        return infer_model, (batch_size, feat_dim, model_size, num_left_chunks)
+        return infer_model, (batch_size, feat_dim, model_size, num_left_chunks) 
 
     @paddle.no_grad()
     def export(self):
@@ -553,20 +553,10 @@ class U2Tester(U2Trainer):
         cnn_cache = paddle.zeros([0, 0, 0, 0])
         xs_d, att_cache_d, cnn_cache_d = infer_model.forward_encoder_chunk(xs1, offset, required_cache_size, att_cache, cnn_cache)
 
-        import soundfile
-        audio, sample_rate = soundfile.read(
-            './zh.wav', dtype="int16", always_2d=True)
-        audio = audio[:, 0]
-        logger.info(f"audio shape: {audio.shape}")
-        audio = paddle.to_tensor(audio, paddle.int16)
-        feat_d = infer_model.forward_feature(audio)
-        logger.info(f"{feat_d}")
-        np.savetxt("feat.tostatic.txt", feat_d)
-
-
         # load static model
         from paddle.jit.layer import Layer
         layer = Layer()
+        logger.info(f"load export model: {self.args.export_path}")
         layer.load(self.args.export_path, paddle.CPUPlace())
 
         # forward_encoder_chunk static
@@ -580,9 +570,3 @@
         np.testing.assert_allclose(att_cache_d, att_cache_s, atol=1e-4)
         np.testing.assert_allclose(cnn_cache_d, cnn_cache_s, atol=1e-4)
         # logger.info(f"forward_encoder_chunk output: {xs_s}")
-
-        # forward_feature static
-        func = getattr(layer, 'forward_feature')
-        feat_s = func(audio)[0]
-        logger.info(f"{feat_s}")
-        np.testing.assert_allclose(feat_d, feat_s, atol=1e-5)

From 549d477592fbba8533c9e6a3e573918bdf9ca82a Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Tue, 20 Sep 2022 03:27:33 +0000
Subject: [PATCH 018/124] fix code style

---
paddlespeech/s2t/exps/u2/bin/test_wav.py | 1 - 1 file changed, 1 deletion(-) diff --git a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py index a55a1eca0..e01d0e401 100644 --- a/paddlespeech/s2t/exps/u2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py @@ -18,7 +18,6 @@ from pathlib import Path import paddle import soundfile -import numpy as np from yacs.config import CfgNode from paddlespeech.audio.transform.transformation import Transformation From 53d6baff0be0e2e1d64c6b6b5772d064c24c2bf3 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 20 Sep 2022 03:33:35 +0000 Subject: [PATCH 019/124] format --- paddlespeech/audio/compliance/kaldi.py | 3 +- paddlespeech/s2t/exps/u2/bin/test_wav.py | 2 +- paddlespeech/s2t/exps/u2/model.py | 37 +++++++++++-------- paddlespeech/s2t/models/u2/u2.py | 19 ++++++---- paddlespeech/s2t/modules/cmvn.py | 13 +++---- paddlespeech/s2t/modules/encoder.py | 9 +++-- paddlespeech/s2t/modules/fbank.py | 12 +++--- .../engine/asr/online/python/asr_engine.py | 1 - 8 files changed, 52 insertions(+), 44 deletions(-) diff --git a/paddlespeech/audio/compliance/kaldi.py b/paddlespeech/audio/compliance/kaldi.py index 24415058c..eb92ec1f2 100644 --- a/paddlespeech/audio/compliance/kaldi.py +++ b/paddlespeech/audio/compliance/kaldi.py @@ -376,7 +376,8 @@ def _get_mel_banks(num_bins: int, center_freqs = _inverse_mel_scale(center_mel) # (num_bins) # (1, num_fft_bins) - mel = _mel_scale(fft_bin_width * paddle.arange(num_fft_bins, dtype=paddle.float32)).unsqueeze(0) + mel = _mel_scale(fft_bin_width * paddle.arange( + num_fft_bins, dtype=paddle.float32)).unsqueeze(0) # (num_bins, num_fft_bins) up_slope = (mel - left_mel) / (center_mel - left_mel) diff --git a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py index e01d0e401..ccf44d6b4 100644 --- a/paddlespeech/s2t/exps/u2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py @@ -68,7 +68,7 @@ class U2Infer(): with paddle.no_grad(): # read audio, sample_rate = soundfile.read( - self.audio_file, dtype="int16", always_2d=True) + self.audio_file, dtype="int16", always_2d=True) audio = audio[:, 0] logger.info(f"audio shape: {audio.shape}") diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index 2b70f117b..68354ff68 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -462,11 +462,13 @@ class U2Tester(U2Trainer): infer_model = U2InferModel.from_pretrained(self.test_loader, self.config.clone(), self.args.checkpoint_path) - batch_size = 1 + batch_size = 1 feat_dim = self.test_loader.feat_dim model_size = self.config.encoder_conf.output_size num_left_chunks = -1 - logger.info(f"U2 Export Model Params: batch_size {batch_size}, feat_dim {feat_dim}, model_size {model_size}, num_left_chunks {num_left_chunks}") + logger.info( + f"U2 Export Model Params: batch_size {batch_size}, feat_dim {feat_dim}, model_size {model_size}, num_left_chunks {num_left_chunks}" + ) return infer_model, (batch_size, feat_dim, model_size, num_left_chunks) @@ -479,29 +481,29 @@ class U2Tester(U2Trainer): assert isinstance(input_spec, (list, tuple)), type(input_spec) batch_size, feat_dim, model_size, num_left_chunks = input_spec - ######################## infer_model.forward_encoder_chunk ############ input_spec = [ # (T,), int16 paddle.static.InputSpec(shape=[None], dtype='int16'), ] - infer_model.forward_feature = paddle.jit.to_static(infer_model.forward_feature, input_spec=input_spec) + 
infer_model.forward_feature = paddle.jit.to_static( + infer_model.forward_feature, input_spec=input_spec) ######################### infer_model.forward_encoder_chunk ############ input_spec = [ # xs, (B, T, D) - paddle.static.InputSpec(shape=[batch_size, None, feat_dim], dtype='float32'), + paddle.static.InputSpec( + shape=[batch_size, None, feat_dim], dtype='float32'), # offset, int, but need be tensor - paddle.static.InputSpec(shape=[1], dtype='int32'), + paddle.static.InputSpec(shape=[1], dtype='int32'), # required_cache_size, int num_left_chunks, # att_cache paddle.static.InputSpec( - shape=[None, None, None, None], - dtype='float32'), + shape=[None, None, None, None], dtype='float32'), # cnn_cache paddle.static.InputSpec( - shape=[None, None, None, None], dtype='float32') + shape=[None, None, None, None], dtype='float32') ] infer_model.forward_encoder_chunk = paddle.jit.to_static( infer_model.forward_encoder_chunk, input_spec=input_spec) @@ -509,12 +511,12 @@ class U2Tester(U2Trainer): ######################### infer_model.ctc_activation ######################## input_spec = [ # encoder_out, (B,T,D) - paddle.static.InputSpec(shape=[batch_size, None, model_size], dtype='float32') + paddle.static.InputSpec( + shape=[batch_size, None, model_size], dtype='float32') ] infer_model.ctc_activation = paddle.jit.to_static( infer_model.ctc_activation, input_spec=input_spec) - ######################### infer_model.forward_attention_decoder ######################## input_spec = [ # hyps, (B, U) @@ -522,15 +524,19 @@ class U2Tester(U2Trainer): # hyps_lens, (B,) paddle.static.InputSpec(shape=[None], dtype='int64'), # encoder_out, (B,T,D) - paddle.static.InputSpec(shape=[batch_size, None, model_size], dtype='float32') + paddle.static.InputSpec( + shape=[batch_size, None, model_size], dtype='float32') ] infer_model.forward_attention_decoder = paddle.jit.to_static( infer_model.forward_attention_decoder, input_spec=input_spec) # jit save logger.info(f"export save: {self.args.export_path}") - paddle.jit.save(infer_model, self.args.export_path, combine_params=True, skip_forward=True) - + paddle.jit.save( + infer_model, + self.args.export_path, + combine_params=True, + skip_forward=True) # test dy2static def flatten(out): @@ -551,7 +557,8 @@ class U2Tester(U2Trainer): required_cache_size = num_left_chunks att_cache = paddle.zeros([0, 0, 0, 0]) cnn_cache = paddle.zeros([0, 0, 0, 0]) - xs_d, att_cache_d, cnn_cache_d = infer_model.forward_encoder_chunk(xs1, offset, required_cache_size, att_cache, cnn_cache) + xs_d, att_cache_d, cnn_cache_d = infer_model.forward_encoder_chunk( + xs1, offset, required_cache_size, att_cache, cnn_cache) # load static model from paddle.jit.layer import Layer diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 135045aaa..32d0940d9 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -545,11 +545,13 @@ class U2BaseModel(ASRInterface, nn.Layer): [len(hyp[0]) for hyp in hyps], place=device, dtype=paddle.long) # (beam_size,) hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - logger.debug(f"hyps pad: {hyps_pad} {self.sos} {self.eos} {self.ignore_id}") + logger.debug( + f"hyps pad: {hyps_pad} {self.sos} {self.eos} {self.ignore_id}") hyps_lens = hyps_lens + 1 # Add at begining # ctc score in ln domain - decoder_out = self.forward_attention_decoder(hyps_pad, hyps_lens, encoder_out) + decoder_out = self.forward_attention_decoder(hyps_pad, hyps_lens, + encoder_out) # Only use decoder score for 
rescoring
         best_score = -float('inf')
@@ -561,7 +563,9 @@ class U2BaseModel(ASRInterface, nn.Layer):
                 score += decoder_out[i][j][w]
             # last decoder output token is `eos`, for the last decoder input token.
             score += decoder_out[i][len(hyp[0])][self.eos]
-            logger.debug(f"hyp {i} len {len(hyp[0])} l2r rescore_score: {score} ctc_score: {hyp[1]}")
+            logger.debug(
+                f"hyp {i} len {len(hyp[0])} l2r rescore_score: {score} ctc_score: {hyp[1]}"
+            )
 
             # add ctc score (which in ln domain)
             score += hyp[1] * ctc_weight
@@ -933,9 +937,7 @@ class U2InferModel(U2Model):
             if process_type == 'fbank_kaldi':
                 opts.update({'n_mels': input_dim})
                 opts['dither'] = 0.0
-                self.fbank = KaldiFbank(
-                    **opts
-                )
+                self.fbank = KaldiFbank(**opts)
                 logger.info(f"{self.__class__.__name__} export: {self.fbank}")
             if process_type == 'cmvn_json':
                 # align with paddlespeech.audio.transform.cmvn:GlobalCMVN
@@ -956,7 +958,8 @@
                 self.global_cmvn = GlobalCMVN(
                     paddle.to_tensor(mean, dtype=paddle.float),
                     paddle.to_tensor(istd, dtype=paddle.float))
-                logger.info(f"{self.__class__.__name__} export: {self.global_cmvn}")
+                logger.info(
+                    f"{self.__class__.__name__} export: {self.global_cmvn}")
 
     def forward(self,
                 feats,
@@ -994,4 +997,4 @@ class U2InferModel(U2Model):
         x = paddle.cast(x, paddle.float32)
         feat = self.fbank(x)
         feat = self.global_cmvn(feat)
-        return feat
\ No newline at end of file
+        return feat
diff --git a/paddlespeech/s2t/modules/cmvn.py b/paddlespeech/s2t/modules/cmvn.py
index 53c508f1a..6a8c1660c 100644
--- a/paddlespeech/s2t/modules/cmvn.py
+++ b/paddlespeech/s2t/modules/cmvn.py
@@ -41,12 +41,11 @@ class GlobalCMVN(nn.Layer):
         self.register_buffer("istd", istd)
 
     def __repr__(self):
-        return (
-            "{name}(mean={mean}, istd={istd}, norm_var={norm_var})".format(
-                name=self.__class__.__name__,
-                mean=self.mean,
-                istd=self.istd,
-                norm_var=self.norm_var))
+        return ("{name}(mean={mean}, istd={istd}, norm_var={norm_var})".format(
+            name=self.__class__.__name__,
+            mean=self.mean,
+            istd=self.istd,
+            norm_var=self.norm_var))
 
     def forward(self, x: paddle.Tensor):
         """
@@ -58,4 +57,4 @@ class GlobalCMVN(nn.Layer):
         x = x - self.mean
         if self.norm_var:
             x = x * self.istd
-        return x
\ No newline at end of file
+        return x
diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py
index 458921b5a..87b83ef55 100644
--- a/paddlespeech/s2t/modules/encoder.py
+++ b/paddlespeech/s2t/modules/encoder.py
@@ -256,10 +256,11 @@ class BaseEncoder(nn.Layer):
             # att_cache=att_cache[i:i+1] if elayers > 0 else att_cache,
             # cnn_cache=cnn_cache[i:i+1] if paddle.shape(cnn_cache)[0] > 0 else cnn_cache,
             xs, _, new_att_cache, new_cnn_cache = layer(
-                xs, att_mask, pos_emb,
-                att_cache=att_cache[i:i+1],
-                cnn_cache=cnn_cache[i:i+1],
-            )
+                xs,
+                att_mask,
+                pos_emb,
+                att_cache=att_cache[i:i + 1],
+                cnn_cache=cnn_cache[i:i + 1], )
             # new_att_cache = (1, head, attention_key_size, d_k*2)
             # new_cnn_cache = (B=1, hidden-dim, cache_t2)
             r_att_cache.append(new_att_cache[:, :, next_cache_start:, :])
diff --git a/paddlespeech/s2t/modules/fbank.py b/paddlespeech/s2t/modules/fbank.py
index 4ec620a79..8d76a4727 100644
--- a/paddlespeech/s2t/modules/fbank.py
+++ b/paddlespeech/s2t/modules/fbank.py
@@ -1,19 +1,17 @@
-
-
-
 import paddle
 from paddle import nn
 
 from paddlespeech.audio.compliance import kaldi
-
 from paddlespeech.s2t.utils.log import Log
 
 logger = Log(__name__).getlog()
 
 __all__ = ['KaldiFbank']
 
+
 class KaldiFbank(nn.Layer):
-    def __init__(self,
+    def __init__(
+            self,
             fs=16000,
             n_mels=80,
            n_shift=160,  # unit:sample, 10ms
@@ -62,7 +60,7 @@ class KaldiFbank(nn.Layer):
         assert x.ndim == 1
 
         feat = kaldi.fbank(
-            x.unsqueeze(0), # append channel dim, (C, Ti)
+            x.unsqueeze(0),  # append channel dim, (C, Ti)
             n_mels=self.n_mels,
             frame_length=self.n_frame_length,
             frame_shift=self.n_frame_shift,
@@ -70,5 +68,5 @@ class KaldiFbank(nn.Layer):
             energy_floor=self.energy_floor,
             sr=self.fs)
 
-        assert feat.ndim == 2 # (T,D)
+        assert feat.ndim == 2  # (T,D)
         return feat
diff --git a/paddlespeech/server/engine/asr/online/python/asr_engine.py b/paddlespeech/server/engine/asr/online/python/asr_engine.py
index 1dc970891..5782d7035 100644
--- a/paddlespeech/server/engine/asr/online/python/asr_engine.py
+++ b/paddlespeech/server/engine/asr/online/python/asr_engine.py
@@ -80,7 +80,6 @@ class PaddleASRConnectionHanddler:
         self.init_decoder()
         self.reset()
 
-
     def init_decoder(self):
         if "deepspeech2" in self.model_type:
             assert self.continuous_decoding is False, "ds2 model not support endpoint"

From 1a56a6e42bccedee0285d8a22205d802878bab92 Mon Sep 17 00:00:00 2001
From: tianhao zhang <15600919271@163.com>
Date: Tue, 20 Sep 2022 03:42:07 +0000
Subject: [PATCH 020/124] add bitransformer decoder, test=asr

---
 paddlespeech/audio/utils/tensor_utils.py |  41 ++++--
 paddlespeech/s2t/exps/u2/bin/test_wav.py |   3 +-
 paddlespeech/s2t/exps/u2/model.py        |   9 +-
 paddlespeech/s2t/models/u2/u2.py         | 152 ++++++++++++++++++++---
 paddlespeech/s2t/modules/decoder.py      | 128 ++++++++++++++++++-
 5 files changed, 302 insertions(+), 31 deletions(-)

diff --git a/paddlespeech/audio/utils/tensor_utils.py b/paddlespeech/audio/utils/tensor_utils.py
index 16f60810e..ac86757b5 100644
--- a/paddlespeech/audio/utils/tensor_utils.py
+++ b/paddlespeech/audio/utils/tensor_utils.py
@@ -31,7 +31,6 @@ def has_tensor(val):
             return True
     elif isinstance(val, dict):
         for k, v in val.items():
-            print(k)
            if has_tensor(v):
                return True
    else:
@@ -143,14 +142,15 @@ def add_sos_eos(ys_pad: paddle.Tensor, sos: int, eos: int,
                 [ 7,  8,  9, 11, -1, -1]])
    """
    # TODO(Hui Zhang): switch to the commented code below once supported,
-    #_sos = paddle.to_tensor(
-    #    [sos], dtype=paddle.long, stop_gradient=True, place=ys_pad.place)
-    #_eos = paddle.to_tensor(
-    #    [eos], dtype=paddle.long, stop_gradient=True, place=ys_pad.place)
-    #ys = [y[y != ignore_id] for y in ys_pad]  # parse padded ys
-    #ys_in = [paddle.cat([_sos, y], dim=0) for y in ys]
-    #ys_out = [paddle.cat([y, _eos], dim=0) for y in ys]
-    #return pad_sequence(ys_in, padding_value=eos), pad_sequence(ys_out, padding_value=ignore_id)
+    # _sos = paddle.to_tensor(
+    #     [sos], dtype=ys_pad.dtype, stop_gradient=True, place=ys_pad.place)
+    # _eos = paddle.to_tensor(
+    #     [eos], dtype=ys_pad.dtype, stop_gradient=True, place=ys_pad.place)
+    # ys = [y[y != ignore_id] for y in ys_pad]  # parse padded ys
+    # ys_in = [paddle.concat([_sos, y], axis=0) for y in ys]
+    # ys_out = [paddle.concat([y, _eos], axis=0) for y in ys]
+    # return pad_sequence(ys_in, padding_value=eos).transpose([1,0]), pad_sequence(ys_out, padding_value=ignore_id).transpose([1,0])
+
     B = ys_pad.shape[0]
     _sos = paddle.ones([B, 1], dtype=ys_pad.dtype) * sos
     _eos = paddle.ones([B, 1], dtype=ys_pad.dtype) * eos
@@ -190,3 +190,26 @@ def th_accuracy(pad_outputs: paddle.Tensor,
     # denominator = paddle.sum(mask)
     denominator = paddle.sum(mask.type_as(pad_targets))
     return float(numerator) / float(denominator)
+
+
+def reverse_pad_list(ys_pad: paddle.Tensor,
+                     ys_lens: paddle.Tensor,
+                     pad_value: float=-1.0) -> paddle.Tensor:
+    """Reverse padding for the list of tensors.
+    Args:
+        ys_pad (tensor): The padded tensor (B, Tokenmax).
+        ys_lens (tensor): The lens of token seqs (B)
+        pad_value (int): Value for padding.
+    Returns:
+        Tensor: Reversed padded tensor (B, Tokenmax).
+    Examples:
+        >>> x
+        tensor([[1, 2, 3, 4], [5, 6, 7, 0], [8, 9, 0, 0]])
+        >>> reverse_pad_list(x, [4, 3, 2], 0.0)
+        tensor([[4, 3, 2, 1],
+                [7, 6, 5, 0],
+                [9, 8, 0, 0]])
+    """
+    r_ys_pad = pad_sequence([(paddle.flip(y.int()[:i], [0]))
+                             for y, i in zip(ys_pad, ys_lens)], True, pad_value)
+    return r_ys_pad
diff --git a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py
index 887ec7a6d..51b72209d 100644
--- a/paddlespeech/s2t/exps/u2/bin/test_wav.py
+++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py
@@ -89,7 +89,8 @@ class U2Infer():
                 ctc_weight=decode_config.ctc_weight,
                 decoding_chunk_size=decode_config.decoding_chunk_size,
                 num_decoding_left_chunks=decode_config.num_decoding_left_chunks,
-                simulate_streaming=decode_config.simulate_streaming)
+                simulate_streaming=decode_config.simulate_streaming,
+                reverse_weight=self.config.model_conf.reverse_weight)
             rsl = result_transcripts[0][0]
             utt = Path(self.audio_file).name
             logger.info(f"hyp: {utt} {result_transcripts[0][0]}")
diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py
index db60083b0..a7ccba485 100644
--- a/paddlespeech/s2t/exps/u2/model.py
+++ b/paddlespeech/s2t/exps/u2/model.py
@@ -250,10 +250,12 @@ class U2Trainer(Trainer):
             model_conf.output_dim = self.train_loader.vocab_size
         else:
             model_conf.input_dim = self.test_loader.feat_dim
-            model_conf.output_dim = self.test_loader.vocab_size
+            model_conf.output_dim = 5538
 
         model = U2Model.from_config(model_conf)
-
+        # params = model.state_dict()
+        # paddle.save(params, 'for_torch/test.pdparams')
+        # exit()
         if self.parallel:
             model = paddle.DataParallel(model)
 
@@ -350,7 +352,8 @@ class U2Tester(U2Trainer):
                 ctc_weight=decode_config.ctc_weight,
                 decoding_chunk_size=decode_config.decoding_chunk_size,
                 num_decoding_left_chunks=decode_config.num_decoding_left_chunks,
-                simulate_streaming=decode_config.simulate_streaming)
+                simulate_streaming=decode_config.simulate_streaming,
+                reverse_weight=self.config.model_conf.reverse_weight)
             decode_time = time.time() - start_time
 
             for utt, target, result, rec_tids in zip(
diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py
index 813e1e529..84c0e5b5e 100644
--- a/paddlespeech/s2t/models/u2/u2.py
+++ b/paddlespeech/s2t/models/u2/u2.py
@@ -31,6 +31,7 @@ from paddle import nn
 
 from paddlespeech.audio.utils.tensor_utils import add_sos_eos
 from paddlespeech.audio.utils.tensor_utils import pad_sequence
+from paddlespeech.audio.utils.tensor_utils import reverse_pad_list
 from paddlespeech.audio.utils.tensor_utils import th_accuracy
 from paddlespeech.s2t.decoders.scorers.ctc import CTCPrefixScorer
 from paddlespeech.s2t.frontend.utility import IGNORE_ID
@@ -38,6 +39,7 @@ from paddlespeech.s2t.frontend.utility import load_cmvn
 from paddlespeech.s2t.models.asr_interface import ASRInterface
 from paddlespeech.s2t.modules.cmvn import GlobalCMVN
 from paddlespeech.s2t.modules.ctc import CTCDecoderBase
+from paddlespeech.s2t.modules.decoder import BiTransformerDecoder
 from paddlespeech.s2t.modules.decoder import TransformerDecoder
 from paddlespeech.s2t.modules.encoder import ConformerEncoder
 from paddlespeech.s2t.modules.encoder import TransformerEncoder
@@ -69,6 +71,7 @@ class U2BaseModel(ASRInterface, nn.Layer):
                  ctc: CTCDecoderBase,
                  ctc_weight: float=0.5,
                  ignore_id: int=IGNORE_ID,
+                 reverse_weight: float=0.0,
                  lsm_weight: float=0.0,
                  length_normalized_loss: bool=False,
                 **kwargs):
@@ -82,6 +85,7 @@ class U2BaseModel(ASRInterface, nn.Layer):
         self.vocab_size = vocab_size
         self.ignore_id = ignore_id
         self.ctc_weight = ctc_weight
+        self.reverse_weight = reverse_weight
 
         self.encoder = encoder
         self.decoder = decoder
@@ -171,12 +175,21 @@ class U2BaseModel(ASRInterface, nn.Layer):
                                             self.ignore_id)
         ys_in_lens = ys_pad_lens + 1
 
+        r_ys_pad = reverse_pad_list(ys_pad, ys_pad_lens, float(self.ignore_id))
+        r_ys_in_pad, r_ys_out_pad = add_sos_eos(r_ys_pad, self.sos, self.eos,
+                                                self.ignore_id)
         # 1. Forward decoder
-        decoder_out, _ = self.decoder(encoder_out, encoder_mask, ys_in_pad,
-                                      ys_in_lens)
+        decoder_out, r_decoder_out, _ = self.decoder(
+            encoder_out, encoder_mask, ys_in_pad, ys_in_lens, r_ys_in_pad,
+            self.reverse_weight)
 
         # 2. Compute attention loss
         loss_att = self.criterion_att(decoder_out, ys_out_pad)
+        r_loss_att = paddle.to_tensor(0.0)
+        if self.reverse_weight > 0.0:
+            r_loss_att = self.criterion_att(r_decoder_out, r_ys_out_pad)
+        loss_att = loss_att * (1 - self.reverse_weight
+                               ) + r_loss_att * self.reverse_weight
         acc_att = th_accuracy(
             decoder_out.view(-1, self.vocab_size),
             ys_out_pad,
@@ -359,6 +372,7 @@ class U2BaseModel(ASRInterface, nn.Layer):
         # Let's assume B = batch_size
         # encoder_out: (B, maxlen, encoder_dim)
         # encoder_mask: (B, 1, Tmax)
+
         encoder_out, encoder_mask = self._forward_encoder(
             speech, speech_lengths, decoding_chunk_size,
             num_decoding_left_chunks, simulate_streaming)
@@ -500,7 +514,8 @@ class U2BaseModel(ASRInterface, nn.Layer):
            decoding_chunk_size: int=-1,
            num_decoding_left_chunks: int=-1,
            ctc_weight: float=0.0,
-            simulate_streaming: bool=False, ) -> List[int]:
+            simulate_streaming: bool=False,
+            reverse_weight: float=0.0, ) -> List[int]:
        """ Apply attention rescoring decoding, CTC prefix beam search
            is applied first to get nbest, then we rescore the nbest on the
            attention decoder with the corresponding encoder out
@@ -520,6 +535,9 @@ class U2BaseModel(ASRInterface, nn.Layer):
        """
        assert speech.shape[0] == speech_lengths.shape[0]
        assert decoding_chunk_size != 0
+        if reverse_weight > 0.0:
+            # decoder should be a bitransformer decoder if reverse_weight > 0.0
+            assert hasattr(self.decoder, 'right_decoder')
        device = speech.place
        batch_size = speech.shape[0]
        # For attention rescoring we only support batch_size=1
@@ -541,6 +559,7 @@ class U2BaseModel(ASRInterface, nn.Layer):
                hyp_content, place=device, dtype=paddle.long)
            hyp_list.append(hyp_content)
        hyps_pad = pad_sequence(hyp_list, True, self.ignore_id)
+        ori_hyps_pad = hyps_pad
        hyps_lens = paddle.to_tensor(
            [len(hyp[0]) for hyp in hyps], place=device,
            dtype=paddle.long)  # (beam_size,)
@@ -550,13 +569,24 @@ class U2BaseModel(ASRInterface, nn.Layer):
        encoder_out = encoder_out.repeat(beam_size, 1, 1)
        encoder_mask = paddle.ones(
            (beam_size, 1, encoder_out.shape[1]), dtype=paddle.bool)
-        decoder_out, _ = self.decoder(
-            encoder_out, encoder_mask, hyps_pad,
-            hyps_lens)  # (beam_size, max_hyps_len, vocab_size)
+
+        # used for right to left decoder
+        r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens - 1,
+                                      self.ignore_id)
+        r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos,
+                                    self.ignore_id)
+        decoder_out, r_decoder_out, _ = self.decoder(
+            encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad,
+            reverse_weight)  # (beam_size, max_hyps_len, vocab_size)
        # ctc score in ln domain
        decoder_out = paddle.nn.functional.log_softmax(decoder_out, axis=-1)
        decoder_out = decoder_out.numpy()
+        # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a
+        # conventional transformer decoder.
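+        # Editor's note (illustrative, not part of the original patch): the
+        # final rescoring score computed below combines both decoding
+        # directions with the CTC prefix score, roughly
+        #   score = (1 - reverse_weight) * l2r_score
+        #           + reverse_weight * r2l_score
+        #           + ctc_weight * ctc_score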
+        r_decoder_out = paddle.nn.functional.log_softmax(r_decoder_out, axis=-1)
+        r_decoder_out = r_decoder_out.numpy()
+
         # Only use decoder score for rescoring
         best_score = -float('inf')
         best_index = 0
@@ -567,6 +597,12 @@
                 score += decoder_out[i][j][w]
             # last decoder output token is `eos`, for the last decoder input token.
             score += decoder_out[i][len(hyp[0])][self.eos]
+            if reverse_weight > 0:
+                r_score = 0.0
+                for j, w in enumerate(hyp[0]):
+                    r_score += r_decoder_out[i][len(hyp[0]) - j - 1][w]
+                r_score += r_decoder_out[i][len(hyp[0])][self.eos]
+                score = score * (1 - reverse_weight) + r_score * reverse_weight
             # add ctc score (which in ln domain)
             score += hyp[1] * ctc_weight
             if score > best_score:
@@ -653,12 +689,24 @@
         """
         return self.ctc.log_softmax(xs)
 
+    @jit.to_static
+    def is_bidirectional_decoder(self) -> bool:
+        """
+        Returns:
+            bool: True if the decoder has a right-to-left branch.
+        """
+        if hasattr(self.decoder, 'right_decoder'):
+            return True
+        else:
+            return False
+
     @jit.to_static
     def forward_attention_decoder(
             self,
             hyps: paddle.Tensor,
             hyps_lens: paddle.Tensor,
-            encoder_out: paddle.Tensor, ) -> paddle.Tensor:
+            encoder_out: paddle.Tensor,
+            reverse_weight: float=0, ) -> Tuple[paddle.Tensor, paddle.Tensor]:
         """ Export interface for c++ call, forward decoder with multiple
             hypothesis from ctc prefix beam search and one encoder output
         Args:
@@ -676,11 +724,75 @@
         # (B, 1, T)
         encoder_mask = paddle.ones(
             [num_hyps, 1, encoder_out.shape[1]], dtype=paddle.bool)
+
+        # input for right to left decoder
+        # this hyps_lens has counted the <sos> token, we need to minus it.
+        r_hyps_lens = hyps_lens - 1
+        # this hyps has included the <sos> token, so it should be
+        # converted back to the original hyps.
+        r_hyps = hyps[:, 1:]  # (num_hyps, max_hyps_len - 1)
+
+        # Equal to:
+        # >>> r_hyps = reverse_pad_list(r_hyps, r_hyps_lens, float(self.ignore_id))
+        # >>> r_hyps, _ = add_sos_eos(r_hyps, self.sos, self.eos, self.ignore_id)
+        max_len = paddle.max(r_hyps_lens)
+        index_range = paddle.arange(0, max_len, 1)
+        seq_len_expand = r_hyps_lens.unsqueeze(1)
+        seq_mask = seq_len_expand > index_range  # (beam, max_len)
+
+        index = (seq_len_expand - 1) - index_range  # (beam, max_len)
+        # >>> index
+        # >>> tensor([[ 2, 1, 0],
+        # >>>         [ 2, 1, 0],
+        # >>>         [ 0, -1, -2]])
+        index = index * seq_mask
+
+        # >>> index
+        # >>> tensor([[2, 1, 0],
+        # >>>         [2, 1, 0],
+        # >>>         [0, 0, 0]])
+        def paddle_gather(x, dim, index):
+            index_shape = index.shape
+            index_flatten = index.flatten()
+            if dim < 0:
+                dim = len(x.shape) + dim
+            nd_index = []
+            for k in range(len(x.shape)):
+                if k == dim:
+                    nd_index.append(index_flatten)
+                else:
+                    reshape_shape = [1] * len(x.shape)
+                    reshape_shape[k] = x.shape[k]
+                    x_arange = paddle.arange(x.shape[k], dtype=index.dtype)
+                    x_arange = x_arange.reshape(reshape_shape)
+                    dim_index = paddle.expand(x_arange, index_shape).flatten()
+                    nd_index.append(dim_index)
+            ind2 = paddle.transpose(paddle.stack(nd_index),
+                                    [1, 0]).astype("int64")
+            paddle_out = paddle.gather_nd(x, ind2).reshape(index_shape)
+            return paddle_out
+
+        r_hyps = paddle_gather(r_hyps, 1, index)
+        # >>> r_hyps
+        # >>> tensor([[3, 2, 1],
+        # >>>         [4, 8, 9],
+        # >>>         [2, 2, 2]])
+        r_hyps = paddle.where(seq_mask, r_hyps, self.eos)
+        # >>> r_hyps
+        # >>> tensor([[3, 2, 1],
+        # >>>         [4, 8, 9],
+        # >>>         [2, eos, eos]])
+        r_hyps = paddle.concat([hyps[:, 0:1], r_hyps], axis=1)
+        # >>> r_hyps
+        # >>> tensor([[sos, 3, 2, 1],
+        # >>>         [sos, 4, 8, 9],
+        # >>>         [sos, 2, eos, eos]])
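+        # Editor's note (illustrative check, not part of the original patch):
+        # paddle_gather above emulates torch.gather(x, dim, index), e.g.
+        #   x = paddle.to_tensor([[1, 2], [3, 4]])
+        #   idx = paddle.to_tensor([[1, 0], [0, 0]])
+        #   paddle_gather(x, 1, idx)  # -> [[2, 1], [3, 3]]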
-        decoder_out, _ = self.decoder(encoder_out, encoder_mask, hyps,
-                                      hyps_lens)
+        decoder_out, r_decoder_out, _ = self.decoder(
+            encoder_out, encoder_mask, hyps, hyps_lens, r_hyps,
+            reverse_weight)
         decoder_out = paddle.nn.functional.log_softmax(decoder_out, axis=-1)
-        return decoder_out
+        r_decoder_out = paddle.nn.functional.log_softmax(r_decoder_out, axis=-1)
+        return decoder_out, r_decoder_out
 
     @paddle.no_grad()
     def decode(self,
@@ -692,7 +804,8 @@ class U2BaseModel(ASRInterface, nn.Layer):
               ctc_weight: float=0.0,
               decoding_chunk_size: int=-1,
               num_decoding_left_chunks: int=-1,
-              simulate_streaming: bool=False):
+              simulate_streaming: bool=False,
+              reverse_weight: float=0.0):
         """u2 decoding.
 
         Args:
@@ -801,7 +914,6 @@ class U2Model(U2DecodeModel):
         with DefaultInitializerContext(init_type):
             vocab_size, encoder, decoder, ctc = U2Model._init_from_config(
                 configs)
-
         super().__init__(
             vocab_size=vocab_size,
             encoder=encoder,
@@ -851,10 +963,20 @@ class U2Model(U2DecodeModel):
             raise ValueError(f"not support encoder type:{encoder_type}")
 
         # decoder
-        decoder = TransformerDecoder(vocab_size,
-                                     encoder.output_size(),
-                                     **configs['decoder_conf'])
-
+        decoder_type = configs.get('decoder', 'transformer')
+        logger.debug(f"U2 Decoder type: {decoder_type}")
+        if decoder_type == 'transformer':
+            decoder = TransformerDecoder(vocab_size,
+                                         encoder.output_size(),
+                                         **configs['decoder_conf'])
+        elif decoder_type == 'bitransformer':
+            assert 0.0 < configs['model_conf']['reverse_weight'] < 1.0
+            assert configs['decoder_conf']['r_num_blocks'] > 0
+            decoder = BiTransformerDecoder(vocab_size,
+                                           encoder.output_size(),
+                                           **configs['decoder_conf'])
+        else:
+            raise ValueError(f"not support decoder type:{decoder_type}")
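+        # Editor's note (illustrative config sketch; the keys are taken from
+        # this patch, the values are assumed):
+        #   decoder: bitransformer
+        #   decoder_conf:
+        #     r_num_blocks: 3
+        #   model_conf:
+        #     reverse_weight: 0.3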
         # ctc decoder and ctc loss
         model_conf = configs.get('model_conf', dict())
         dropout_rate = model_conf.get('ctc_dropout_rate', 0.0)
diff --git a/paddlespeech/s2t/modules/decoder.py b/paddlespeech/s2t/modules/decoder.py
index ccc8482d5..2052a19e1 100644
--- a/paddlespeech/s2t/modules/decoder.py
+++ b/paddlespeech/s2t/modules/decoder.py
@@ -35,7 +35,6 @@ from paddlespeech.s2t.modules.mask import make_xs_mask
 from paddlespeech.s2t.modules.mask import subsequent_mask
 from paddlespeech.s2t.modules.positionwise_feed_forward import PositionwiseFeedForward
 from paddlespeech.s2t.utils.log import Log
-
 logger = Log(__name__).getlog()
 
 __all__ = ["TransformerDecoder"]
@@ -116,13 +115,19 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer):
             memory: paddle.Tensor,
             memory_mask: paddle.Tensor,
             ys_in_pad: paddle.Tensor,
-            ys_in_lens: paddle.Tensor, ) -> Tuple[paddle.Tensor, paddle.Tensor]:
+            ys_in_lens: paddle.Tensor,
+            r_ys_in_pad: paddle.Tensor=paddle.empty([0]),
+            reverse_weight: float=0.0
+    ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
         """Forward decoder.
         Args:
             memory: encoded memory, float32 (batch, maxlen_in, feat)
             memory_mask: encoder memory mask, (batch, 1, maxlen_in)
             ys_in_pad: padded input token ids, int64 (batch, maxlen_out)
             ys_in_lens: input lengths of this batch (batch)
+            r_ys_in_pad: not used in transformer decoder, in order to unify api
+                with bidirectional decoder
+            reverse_weight: not used in transformer decoder, in order to unify
+                api with bidirectional decoder
         Returns:
             (tuple): tuple containing:
                 x: decoded token score before softmax (batch, maxlen_out, vocab_size)
@@ -151,7 +156,7 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer):
         # TODO(Hui Zhang): reduce_sum not support bool type
         # olens = tgt_mask.sum(1)
         olens = tgt_mask.astype(paddle.int).sum(1)
-        return x, olens
+        return x, paddle.to_tensor(0.0), olens
 
     def forward_one_step(
         self,
@@ -251,3 +256,120 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer):
         state_list = [[states[i][b] for i in range(n_layers)]
                       for b in range(n_batch)]
         return logp, state_list
+
+
+class BiTransformerDecoder(BatchScorerInterface, nn.Layer):
+    """Bidirectional Transformer decoder module, combining a left-to-right
+    and a right-to-left decoder.
+    Args:
+        vocab_size: output dim
+        encoder_output_size: dimension of attention
+        attention_heads: the number of heads of multi head attention
+        linear_units: the hidden units number of position-wise feedforward
+        num_blocks: the number of decoder blocks
+        r_num_blocks: the number of right to left decoder blocks
+        dropout_rate: dropout rate
+        self_attention_dropout_rate: dropout rate for attention
+        input_layer: input layer type
+        use_output_layer: whether to use output layer
+        pos_enc_class: PositionalEncoding or ScaledPositionalEncoding
+        normalize_before:
+            True: use layer_norm before each sub-block of a layer.
+            False: use layer_norm after each sub-block of a layer.
+        concat_after: whether to concat attention layer's input and output
+            True: x -> x + linear(concat(x, att(x)))
+            False: x -> x + att(x)
+    """
+
+    def __init__(self,
+                 vocab_size: int,
+                 encoder_output_size: int,
+                 attention_heads: int=4,
+                 linear_units: int=2048,
+                 num_blocks: int=6,
+                 r_num_blocks: int=0,
+                 dropout_rate: float=0.1,
+                 positional_dropout_rate: float=0.1,
+                 self_attention_dropout_rate: float=0.0,
+                 src_attention_dropout_rate: float=0.0,
+                 input_layer: str="embed",
+                 use_output_layer: bool=True,
+                 normalize_before: bool=True,
+                 concat_after: bool=False,
+                 max_len: int=5000):
+
+        assert check_argument_types()
+
+        nn.Layer.__init__(self)
+        self.left_decoder = TransformerDecoder(
+            vocab_size, encoder_output_size, attention_heads, linear_units,
+            num_blocks, dropout_rate, positional_dropout_rate,
+            self_attention_dropout_rate, src_attention_dropout_rate,
+            input_layer, use_output_layer, normalize_before, concat_after,
+            max_len)
+
+        self.right_decoder = TransformerDecoder(
+            vocab_size, encoder_output_size, attention_heads, linear_units,
+            r_num_blocks, dropout_rate, positional_dropout_rate,
+            self_attention_dropout_rate, src_attention_dropout_rate,
+            input_layer, use_output_layer, normalize_before, concat_after,
+            max_len)
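+
+    # Editor's note (illustrative usage; the tensor names, `criterion`, and
+    # the 0.3 weight are assumed, not from this patch):
+    #   l_x, r_x, olens = decoder(memory, memory_mask, ys_in_pad, ys_in_lens,
+    #                             r_ys_in_pad, reverse_weight=0.3)
+    #   loss = 0.7 * criterion(l_x, ys_out_pad) + 0.3 * criterion(r_x, r_ys_out_pad)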
+
+    def forward(
+            self,
+            memory: paddle.Tensor,
+            memory_mask: paddle.Tensor,
+            ys_in_pad: paddle.Tensor,
+            ys_in_lens: paddle.Tensor,
+            r_ys_in_pad: paddle.Tensor,
+            reverse_weight: float=0.0,
+    ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
+        """Forward decoder.
+        Args:
+            memory: encoded memory, float32 (batch, maxlen_in, feat)
+            memory_mask: encoder memory mask, (batch, 1, maxlen_in)
+            ys_in_pad: padded input token ids, int64 (batch, maxlen_out)
+            ys_in_lens: input lengths of this batch (batch)
+            r_ys_in_pad: padded input token ids, int64 (batch, maxlen_out),
+                used for right to left decoder
+            reverse_weight: used for right to left decoder
+        Returns:
+            (tuple): tuple containing:
+                x: decoded token score before softmax (batch, maxlen_out,
+                    vocab_size) if use_output_layer is True,
+                r_x: decoded token score (right to left decoder)
+                    before softmax (batch, maxlen_out, vocab_size)
+                    if use_output_layer is True,
+                olens: (batch, )
+        """
+        l_x, _, olens = self.left_decoder(memory, memory_mask, ys_in_pad,
+                                          ys_in_lens)
+        r_x = paddle.to_tensor(0.0)
+        if reverse_weight > 0.0:
+            r_x, _, olens = self.right_decoder(memory, memory_mask, r_ys_in_pad,
+                                               ys_in_lens)
+        return l_x, r_x, olens
+
+    def forward_one_step(
+            self,
+            memory: paddle.Tensor,
+            memory_mask: paddle.Tensor,
+            tgt: paddle.Tensor,
+            tgt_mask: paddle.Tensor,
+            cache: Optional[List[paddle.Tensor]]=None,
+    ) -> Tuple[paddle.Tensor, List[paddle.Tensor]]:
+        """Forward one step.
+            This is only used for decoding.
+        Args:
+            memory: encoded memory, float32 (batch, maxlen_in, feat)
+            memory_mask: encoded memory mask, (batch, 1, maxlen_in)
+            tgt: input token ids, int64 (batch, maxlen_out)
+            tgt_mask: input token mask, (batch, maxlen_out), dtype=paddle.bool
+            cache: cached output list of (batch, max_time_out-1, size)
+        Returns:
+            y, cache: NN output value and cache per `self.decoders`.
+            `y.shape` is (batch, maxlen_out, token)
+        """
+        return self.left_decoder.forward_one_step(memory, memory_mask, tgt,
+                                                  tgt_mask, cache)

From ecbf324286c55125e5fd2712c16bedc22f1e51c9 Mon Sep 17 00:00:00 2001
From: tianhao zhang <15600919271@163.com>
Date: Tue, 20 Sep 2022 05:28:02 +0000
Subject: [PATCH 021/124] support bitransformer decoder, test=asr

---
 paddlespeech/server/engine/asr/online/python/asr_engine.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/paddlespeech/server/engine/asr/online/python/asr_engine.py b/paddlespeech/server/engine/asr/online/python/asr_engine.py
index 87d88ee60..4c7c4b37a 100644
--- a/paddlespeech/server/engine/asr/online/python/asr_engine.py
+++ b/paddlespeech/server/engine/asr/online/python/asr_engine.py
@@ -613,7 +613,8 @@ class PaddleASRConnectionHanddler:
         encoder_out = self.encoder_out.repeat(beam_size, 1, 1)
         encoder_mask = paddle.ones(
             (beam_size, 1, encoder_out.shape[1]), dtype=paddle.bool)
-        decoder_out, _ = self.model.decoder(
+
+        decoder_out, _, _ = self.model.decoder(
             encoder_out, encoder_mask, hyps_pad,
             hyps_lens)  # (beam_size, max_hyps_len, vocab_size)
         # ctc score in ln domain

From 5cdc79ddf214f5f06d224db75b1a2b89279ea704 Mon Sep 17 00:00:00 2001
From: TianYuan
Date: Tue, 20 Sep 2022 14:34:41 +0800
Subject: [PATCH 022/124] [doc] add finetune demos in readthedocs (#2411)

* add finetune demos, test=doc
---
 docs/requirements.txt                         |   3 +-
 ...lespeech.cls.exps.panns.deploy.predict.rst |   7 -
 .../paddlespeech.cls.exps.panns.deploy.rst    |   1 -
 ...ddlespeech.cls.exps.panns.export_model.rst |   7 -
 .../paddlespeech.cls.exps.panns.predict.rst   |   7 -
 .../api/paddlespeech.cls.exps.panns.rst       |   3 -
 .../api/paddlespeech.cls.exps.panns.train.rst |   7 -
 ...dlespeech.kws.exps.mdtc.plot_det_curve.rst |   7 -
 .../source/api/paddlespeech.kws.exps.mdtc.rst |   1 -
 .../paddlespeech.s2t.decoders.ctcdecoder.rst  |   1 -
....decoders.ctcdecoder.scorer_deprecated.rst | 7 - .../paddlespeech.s2t.decoders.recog_bin.rst | 7 - docs/source/api/paddlespeech.s2t.decoders.rst | 1 - ...addlespeech.s2t.decoders.scorers.ngram.rst | 7 - .../api/paddlespeech.s2t.decoders.scorers.rst | 1 - ...s2t.exps.deepspeech2.bin.deploy.client.rst | 7 - ...s2t.exps.deepspeech2.bin.deploy.record.rst | 7 - ...speech.s2t.exps.deepspeech2.bin.deploy.rst | 3 - ...h.s2t.exps.deepspeech2.bin.deploy.send.rst | 7 - docs/source/api/paddlespeech.s2t.exps.u2.rst | 1 - .../api/paddlespeech.s2t.exps.u2.trainer.rst | 7 - ...ddlespeech.s2t.exps.u2_kaldi.bin.recog.rst | 7 - .../paddlespeech.s2t.exps.u2_kaldi.bin.rst | 1 - .../paddlespeech.s2t.training.extensions.rst | 2 - ...peech.s2t.training.extensions.snapshot.rst | 7 - ...ech.s2t.training.extensions.visualizer.rst | 7 - .../paddlespeech.s2t.training.updaters.rst | 1 - ...lespeech.s2t.training.updaters.trainer.rst | 7 - .../paddlespeech.s2t.transform.add_deltas.rst | 7 - ...espeech.s2t.transform.channel_selector.rst | 7 - .../api/paddlespeech.s2t.transform.cmvn.rst | 7 - .../paddlespeech.s2t.transform.functional.rst | 7 - .../paddlespeech.s2t.transform.perturb.rst | 7 - .../source/api/paddlespeech.s2t.transform.rst | 24 - ...addlespeech.s2t.transform.spec_augment.rst | 7 - ...paddlespeech.s2t.transform.spectrogram.rst | 7 - ...eech.s2t.transform.transform_interface.rst | 7 - ...dlespeech.s2t.transform.transformation.rst | 7 - .../api/paddlespeech.s2t.transform.wpe.rst | 7 - ...ch.server.engine.acs.python.acs_engine.rst | 7 - .../paddlespeech.server.engine.acs.python.rst | 1 - .../api/paddlespeech.server.utils.log.rst | 7 - docs/source/api/paddlespeech.t2s.exps.rst | 2 +- .../paddlespeech.t2s.exps.stream_play_tts.rst | 7 - .../paddlespeech.t2s.models.ernie_sat.mlm.rst | 7 - ...h.t2s.models.vits.monotonic_align.core.rst | 7 - ...speech.t2s.models.vits.monotonic_align.rst | 16 - ....t2s.models.vits.monotonic_align.setup.rst | 7 - .../api/paddlespeech.t2s.models.vits.rst | 1 - docs/source/tts/demo.rst | 472 +++++++++++------- docs/source/tts/demo_2.rst | 56 +-- 51 files changed, 336 insertions(+), 479 deletions(-) delete mode 100644 docs/source/api/paddlespeech.cls.exps.panns.deploy.predict.rst delete mode 100644 docs/source/api/paddlespeech.cls.exps.panns.export_model.rst delete mode 100644 docs/source/api/paddlespeech.cls.exps.panns.predict.rst delete mode 100644 docs/source/api/paddlespeech.cls.exps.panns.train.rst delete mode 100644 docs/source/api/paddlespeech.kws.exps.mdtc.plot_det_curve.rst delete mode 100644 docs/source/api/paddlespeech.s2t.decoders.ctcdecoder.scorer_deprecated.rst delete mode 100644 docs/source/api/paddlespeech.s2t.decoders.recog_bin.rst delete mode 100644 docs/source/api/paddlespeech.s2t.decoders.scorers.ngram.rst delete mode 100644 docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.client.rst delete mode 100644 docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.record.rst delete mode 100644 docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.send.rst delete mode 100644 docs/source/api/paddlespeech.s2t.exps.u2.trainer.rst delete mode 100644 docs/source/api/paddlespeech.s2t.exps.u2_kaldi.bin.recog.rst delete mode 100644 docs/source/api/paddlespeech.s2t.training.extensions.snapshot.rst delete mode 100644 docs/source/api/paddlespeech.s2t.training.extensions.visualizer.rst delete mode 100644 docs/source/api/paddlespeech.s2t.training.updaters.trainer.rst delete mode 100644 docs/source/api/paddlespeech.s2t.transform.add_deltas.rst delete mode 100644 
docs/source/api/paddlespeech.s2t.transform.channel_selector.rst delete mode 100644 docs/source/api/paddlespeech.s2t.transform.cmvn.rst delete mode 100644 docs/source/api/paddlespeech.s2t.transform.functional.rst delete mode 100644 docs/source/api/paddlespeech.s2t.transform.perturb.rst delete mode 100644 docs/source/api/paddlespeech.s2t.transform.rst delete mode 100644 docs/source/api/paddlespeech.s2t.transform.spec_augment.rst delete mode 100644 docs/source/api/paddlespeech.s2t.transform.spectrogram.rst delete mode 100644 docs/source/api/paddlespeech.s2t.transform.transform_interface.rst delete mode 100644 docs/source/api/paddlespeech.s2t.transform.transformation.rst delete mode 100644 docs/source/api/paddlespeech.s2t.transform.wpe.rst delete mode 100644 docs/source/api/paddlespeech.server.engine.acs.python.acs_engine.rst delete mode 100644 docs/source/api/paddlespeech.server.utils.log.rst delete mode 100644 docs/source/api/paddlespeech.t2s.exps.stream_play_tts.rst delete mode 100644 docs/source/api/paddlespeech.t2s.models.ernie_sat.mlm.rst delete mode 100644 docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.core.rst delete mode 100644 docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.rst delete mode 100644 docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.setup.rst diff --git a/docs/requirements.txt b/docs/requirements.txt index 3fb82367f..fd7a481ba 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -20,6 +20,7 @@ onnxruntime==1.10.0 opencc paddlenlp paddlepaddle>=2.2.2 +paddlespeech_ctcdecoders paddlespeech_feat pandas pathos == 0.2.8 @@ -27,8 +28,8 @@ pattern_singleton Pillow>=9.0.0 praatio==5.0.0 prettytable -pypinyin<=0.44.0 pypinyin-dict +pypinyin<=0.44.0 python-dateutil pyworld==0.2.12 recommonmark>=0.5.0 diff --git a/docs/source/api/paddlespeech.cls.exps.panns.deploy.predict.rst b/docs/source/api/paddlespeech.cls.exps.panns.deploy.predict.rst deleted file mode 100644 index d4f92a2ea..000000000 --- a/docs/source/api/paddlespeech.cls.exps.panns.deploy.predict.rst +++ /dev/null @@ -1,7 +0,0 @@ -paddlespeech.cls.exps.panns.deploy.predict module -================================================= - -.. automodule:: paddlespeech.cls.exps.panns.deploy.predict - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/paddlespeech.cls.exps.panns.deploy.rst b/docs/source/api/paddlespeech.cls.exps.panns.deploy.rst index 4415c9330..369862ccf 100644 --- a/docs/source/api/paddlespeech.cls.exps.panns.deploy.rst +++ b/docs/source/api/paddlespeech.cls.exps.panns.deploy.rst @@ -12,4 +12,3 @@ Submodules .. toctree:: :maxdepth: 4 - paddlespeech.cls.exps.panns.deploy.predict diff --git a/docs/source/api/paddlespeech.cls.exps.panns.export_model.rst b/docs/source/api/paddlespeech.cls.exps.panns.export_model.rst deleted file mode 100644 index 6c39c2bc8..000000000 --- a/docs/source/api/paddlespeech.cls.exps.panns.export_model.rst +++ /dev/null @@ -1,7 +0,0 @@ -paddlespeech.cls.exps.panns.export\_model module -================================================ - -.. automodule:: paddlespeech.cls.exps.panns.export_model - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/paddlespeech.cls.exps.panns.predict.rst b/docs/source/api/paddlespeech.cls.exps.panns.predict.rst deleted file mode 100644 index 88cd40338..000000000 --- a/docs/source/api/paddlespeech.cls.exps.panns.predict.rst +++ /dev/null @@ -1,7 +0,0 @@ -paddlespeech.cls.exps.panns.predict module -========================================== - -.. 
automodule:: paddlespeech.cls.exps.panns.predict - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/paddlespeech.cls.exps.panns.rst b/docs/source/api/paddlespeech.cls.exps.panns.rst index 6147b245e..72f30ba61 100644 --- a/docs/source/api/paddlespeech.cls.exps.panns.rst +++ b/docs/source/api/paddlespeech.cls.exps.panns.rst @@ -20,6 +20,3 @@ Submodules .. toctree:: :maxdepth: 4 - paddlespeech.cls.exps.panns.export_model - paddlespeech.cls.exps.panns.predict - paddlespeech.cls.exps.panns.train diff --git a/docs/source/api/paddlespeech.cls.exps.panns.train.rst b/docs/source/api/paddlespeech.cls.exps.panns.train.rst deleted file mode 100644 index a89b7eecc..000000000 --- a/docs/source/api/paddlespeech.cls.exps.panns.train.rst +++ /dev/null @@ -1,7 +0,0 @@ -paddlespeech.cls.exps.panns.train module -======================================== - -.. automodule:: paddlespeech.cls.exps.panns.train - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/paddlespeech.kws.exps.mdtc.plot_det_curve.rst b/docs/source/api/paddlespeech.kws.exps.mdtc.plot_det_curve.rst deleted file mode 100644 index 46a149b0b..000000000 --- a/docs/source/api/paddlespeech.kws.exps.mdtc.plot_det_curve.rst +++ /dev/null @@ -1,7 +0,0 @@ -paddlespeech.kws.exps.mdtc.plot\_det\_curve module -================================================== - -.. automodule:: paddlespeech.kws.exps.mdtc.plot_det_curve - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/paddlespeech.kws.exps.mdtc.rst b/docs/source/api/paddlespeech.kws.exps.mdtc.rst index f6cad64e3..33d4a55cd 100644 --- a/docs/source/api/paddlespeech.kws.exps.mdtc.rst +++ b/docs/source/api/paddlespeech.kws.exps.mdtc.rst @@ -14,6 +14,5 @@ Submodules paddlespeech.kws.exps.mdtc.collate paddlespeech.kws.exps.mdtc.compute_det - paddlespeech.kws.exps.mdtc.plot_det_curve paddlespeech.kws.exps.mdtc.score paddlespeech.kws.exps.mdtc.train diff --git a/docs/source/api/paddlespeech.s2t.decoders.ctcdecoder.rst b/docs/source/api/paddlespeech.s2t.decoders.ctcdecoder.rst index 8093619b1..dfcd274ca 100644 --- a/docs/source/api/paddlespeech.s2t.decoders.ctcdecoder.rst +++ b/docs/source/api/paddlespeech.s2t.decoders.ctcdecoder.rst @@ -13,5 +13,4 @@ Submodules :maxdepth: 4 paddlespeech.s2t.decoders.ctcdecoder.decoders_deprecated - paddlespeech.s2t.decoders.ctcdecoder.scorer_deprecated paddlespeech.s2t.decoders.ctcdecoder.swig_wrapper diff --git a/docs/source/api/paddlespeech.s2t.decoders.ctcdecoder.scorer_deprecated.rst b/docs/source/api/paddlespeech.s2t.decoders.ctcdecoder.scorer_deprecated.rst deleted file mode 100644 index 1079d6721..000000000 --- a/docs/source/api/paddlespeech.s2t.decoders.ctcdecoder.scorer_deprecated.rst +++ /dev/null @@ -1,7 +0,0 @@ -paddlespeech.s2t.decoders.ctcdecoder.scorer\_deprecated module -============================================================== - -.. automodule:: paddlespeech.s2t.decoders.ctcdecoder.scorer_deprecated - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/paddlespeech.s2t.decoders.recog_bin.rst b/docs/source/api/paddlespeech.s2t.decoders.recog_bin.rst deleted file mode 100644 index 4952e2e6a..000000000 --- a/docs/source/api/paddlespeech.s2t.decoders.recog_bin.rst +++ /dev/null @@ -1,7 +0,0 @@ -paddlespeech.s2t.decoders.recog\_bin module -=========================================== - -.. 
automodule:: paddlespeech.s2t.decoders.recog_bin - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/paddlespeech.s2t.decoders.rst b/docs/source/api/paddlespeech.s2t.decoders.rst index e4eabedfd..53e0d9c49 100644 --- a/docs/source/api/paddlespeech.s2t.decoders.rst +++ b/docs/source/api/paddlespeech.s2t.decoders.rst @@ -23,5 +23,4 @@ Submodules :maxdepth: 4 paddlespeech.s2t.decoders.recog - paddlespeech.s2t.decoders.recog_bin paddlespeech.s2t.decoders.utils diff --git a/docs/source/api/paddlespeech.s2t.decoders.scorers.ngram.rst b/docs/source/api/paddlespeech.s2t.decoders.scorers.ngram.rst deleted file mode 100644 index f38a61099..000000000 --- a/docs/source/api/paddlespeech.s2t.decoders.scorers.ngram.rst +++ /dev/null @@ -1,7 +0,0 @@ -paddlespeech.s2t.decoders.scorers.ngram module -============================================== - -.. automodule:: paddlespeech.s2t.decoders.scorers.ngram - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/paddlespeech.s2t.decoders.scorers.rst b/docs/source/api/paddlespeech.s2t.decoders.scorers.rst index 83808c49b..ca834f6b5 100644 --- a/docs/source/api/paddlespeech.s2t.decoders.scorers.rst +++ b/docs/source/api/paddlespeech.s2t.decoders.scorers.rst @@ -15,5 +15,4 @@ Submodules paddlespeech.s2t.decoders.scorers.ctc paddlespeech.s2t.decoders.scorers.ctc_prefix_score paddlespeech.s2t.decoders.scorers.length_bonus - paddlespeech.s2t.decoders.scorers.ngram paddlespeech.s2t.decoders.scorers.scorer_interface diff --git a/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.client.rst b/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.client.rst deleted file mode 100644 index a73a56853..000000000 --- a/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.client.rst +++ /dev/null @@ -1,7 +0,0 @@ -paddlespeech.s2t.exps.deepspeech2.bin.deploy.client module -========================================================== - -.. automodule:: paddlespeech.s2t.exps.deepspeech2.bin.deploy.client - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.record.rst b/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.record.rst deleted file mode 100644 index bc1078485..000000000 --- a/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.record.rst +++ /dev/null @@ -1,7 +0,0 @@ -paddlespeech.s2t.exps.deepspeech2.bin.deploy.record module -========================================================== - -.. automodule:: paddlespeech.s2t.exps.deepspeech2.bin.deploy.record - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.rst b/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.rst index d1f966fc1..28de0f7fb 100644 --- a/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.rst +++ b/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.rst @@ -12,8 +12,5 @@ Submodules .. 
toctree:: :maxdepth: 4 - paddlespeech.s2t.exps.deepspeech2.bin.deploy.client - paddlespeech.s2t.exps.deepspeech2.bin.deploy.record paddlespeech.s2t.exps.deepspeech2.bin.deploy.runtime - paddlespeech.s2t.exps.deepspeech2.bin.deploy.send paddlespeech.s2t.exps.deepspeech2.bin.deploy.server diff --git a/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.send.rst b/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.send.rst deleted file mode 100644 index ba1ae0a62..000000000 --- a/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.send.rst +++ /dev/null @@ -1,7 +0,0 @@ -paddlespeech.s2t.exps.deepspeech2.bin.deploy.send module -======================================================== - -.. automodule:: paddlespeech.s2t.exps.deepspeech2.bin.deploy.send - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/paddlespeech.s2t.exps.u2.rst b/docs/source/api/paddlespeech.s2t.exps.u2.rst index e0ebb7fc9..bf5656701 100644 --- a/docs/source/api/paddlespeech.s2t.exps.u2.rst +++ b/docs/source/api/paddlespeech.s2t.exps.u2.rst @@ -21,4 +21,3 @@ Submodules :maxdepth: 4 paddlespeech.s2t.exps.u2.model - paddlespeech.s2t.exps.u2.trainer diff --git a/docs/source/api/paddlespeech.s2t.exps.u2.trainer.rst b/docs/source/api/paddlespeech.s2t.exps.u2.trainer.rst deleted file mode 100644 index 0cd28945a..000000000 --- a/docs/source/api/paddlespeech.s2t.exps.u2.trainer.rst +++ /dev/null @@ -1,7 +0,0 @@ -paddlespeech.s2t.exps.u2.trainer module -======================================= - -.. automodule:: paddlespeech.s2t.exps.u2.trainer - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/paddlespeech.s2t.exps.u2_kaldi.bin.recog.rst b/docs/source/api/paddlespeech.s2t.exps.u2_kaldi.bin.recog.rst deleted file mode 100644 index bc749c8f8..000000000 --- a/docs/source/api/paddlespeech.s2t.exps.u2_kaldi.bin.recog.rst +++ /dev/null @@ -1,7 +0,0 @@ -paddlespeech.s2t.exps.u2\_kaldi.bin.recog module -================================================ - -.. automodule:: paddlespeech.s2t.exps.u2_kaldi.bin.recog - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/paddlespeech.s2t.exps.u2_kaldi.bin.rst b/docs/source/api/paddlespeech.s2t.exps.u2_kaldi.bin.rst index ff1a6efee..087b87677 100644 --- a/docs/source/api/paddlespeech.s2t.exps.u2_kaldi.bin.rst +++ b/docs/source/api/paddlespeech.s2t.exps.u2_kaldi.bin.rst @@ -12,6 +12,5 @@ Submodules .. 
    :maxdepth: 4
 
-   paddlespeech.s2t.exps.u2_kaldi.bin.recog
    paddlespeech.s2t.exps.u2_kaldi.bin.test
    paddlespeech.s2t.exps.u2_kaldi.bin.train
diff --git a/docs/source/api/paddlespeech.s2t.training.extensions.rst b/docs/source/api/paddlespeech.s2t.training.extensions.rst
index f31b8427e..13530a8d2 100644
--- a/docs/source/api/paddlespeech.s2t.training.extensions.rst
+++ b/docs/source/api/paddlespeech.s2t.training.extensions.rst
@@ -15,5 +15,3 @@ Submodules
    paddlespeech.s2t.training.extensions.evaluator
    paddlespeech.s2t.training.extensions.extension
    paddlespeech.s2t.training.extensions.plot
-   paddlespeech.s2t.training.extensions.snapshot
-   paddlespeech.s2t.training.extensions.visualizer
diff --git a/docs/source/api/paddlespeech.s2t.training.extensions.snapshot.rst b/docs/source/api/paddlespeech.s2t.training.extensions.snapshot.rst
deleted file mode 100644
index e0ca21a73..000000000
--- a/docs/source/api/paddlespeech.s2t.training.extensions.snapshot.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.s2t.training.extensions.snapshot module
-====================================================
-
-.. automodule:: paddlespeech.s2t.training.extensions.snapshot
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.s2t.training.extensions.visualizer.rst b/docs/source/api/paddlespeech.s2t.training.extensions.visualizer.rst
deleted file mode 100644
index 22ae11f11..000000000
--- a/docs/source/api/paddlespeech.s2t.training.extensions.visualizer.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.s2t.training.extensions.visualizer module
-======================================================
-
-.. automodule:: paddlespeech.s2t.training.extensions.visualizer
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.s2t.training.updaters.rst b/docs/source/api/paddlespeech.s2t.training.updaters.rst
index a06170168..b38704a0d 100644
--- a/docs/source/api/paddlespeech.s2t.training.updaters.rst
+++ b/docs/source/api/paddlespeech.s2t.training.updaters.rst
@@ -13,5 +13,4 @@ Submodules
    :maxdepth: 4
 
    paddlespeech.s2t.training.updaters.standard_updater
-   paddlespeech.s2t.training.updaters.trainer
    paddlespeech.s2t.training.updaters.updater
diff --git a/docs/source/api/paddlespeech.s2t.training.updaters.trainer.rst b/docs/source/api/paddlespeech.s2t.training.updaters.trainer.rst
deleted file mode 100644
index 6981a8f05..000000000
--- a/docs/source/api/paddlespeech.s2t.training.updaters.trainer.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.s2t.training.updaters.trainer module
-=================================================
-
-.. automodule:: paddlespeech.s2t.training.updaters.trainer
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.s2t.transform.add_deltas.rst b/docs/source/api/paddlespeech.s2t.transform.add_deltas.rst
deleted file mode 100644
index 5007fd9d8..000000000
--- a/docs/source/api/paddlespeech.s2t.transform.add_deltas.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.s2t.transform.add\_deltas module
-=============================================
-
-.. automodule:: paddlespeech.s2t.transform.add_deltas
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.s2t.transform.channel_selector.rst b/docs/source/api/paddlespeech.s2t.transform.channel_selector.rst
deleted file mode 100644
index e08dd253e..000000000
--- a/docs/source/api/paddlespeech.s2t.transform.channel_selector.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.s2t.transform.channel\_selector module
-===================================================
-
-.. automodule:: paddlespeech.s2t.transform.channel_selector
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.s2t.transform.cmvn.rst b/docs/source/api/paddlespeech.s2t.transform.cmvn.rst
deleted file mode 100644
index 8348e3d4b..000000000
--- a/docs/source/api/paddlespeech.s2t.transform.cmvn.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.s2t.transform.cmvn module
-======================================
-
-.. automodule:: paddlespeech.s2t.transform.cmvn
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.s2t.transform.functional.rst b/docs/source/api/paddlespeech.s2t.transform.functional.rst
deleted file mode 100644
index eb2b54a67..000000000
--- a/docs/source/api/paddlespeech.s2t.transform.functional.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.s2t.transform.functional module
-============================================
-
-.. automodule:: paddlespeech.s2t.transform.functional
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.s2t.transform.perturb.rst b/docs/source/api/paddlespeech.s2t.transform.perturb.rst
deleted file mode 100644
index 0be28ab7e..000000000
--- a/docs/source/api/paddlespeech.s2t.transform.perturb.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.s2t.transform.perturb module
-=========================================
-
-.. automodule:: paddlespeech.s2t.transform.perturb
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.s2t.transform.rst b/docs/source/api/paddlespeech.s2t.transform.rst
deleted file mode 100644
index 5016ff4f1..000000000
--- a/docs/source/api/paddlespeech.s2t.transform.rst
+++ /dev/null
@@ -1,24 +0,0 @@
-paddlespeech.s2t.transform package
-==================================
-
-.. automodule:: paddlespeech.s2t.transform
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-Submodules
-----------
-
-.. toctree::
-   :maxdepth: 4
-
-   paddlespeech.s2t.transform.add_deltas
-   paddlespeech.s2t.transform.channel_selector
-   paddlespeech.s2t.transform.cmvn
-   paddlespeech.s2t.transform.functional
-   paddlespeech.s2t.transform.perturb
-   paddlespeech.s2t.transform.spec_augment
-   paddlespeech.s2t.transform.spectrogram
-   paddlespeech.s2t.transform.transform_interface
-   paddlespeech.s2t.transform.transformation
-   paddlespeech.s2t.transform.wpe
diff --git a/docs/source/api/paddlespeech.s2t.transform.spec_augment.rst b/docs/source/api/paddlespeech.s2t.transform.spec_augment.rst
deleted file mode 100644
index 00fd3ea12..000000000
--- a/docs/source/api/paddlespeech.s2t.transform.spec_augment.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.s2t.transform.spec\_augment module
-===============================================
-
-.. automodule:: paddlespeech.s2t.transform.spec_augment
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.s2t.transform.spectrogram.rst b/docs/source/api/paddlespeech.s2t.transform.spectrogram.rst
deleted file mode 100644
index 33c499a7a..000000000
--- a/docs/source/api/paddlespeech.s2t.transform.spectrogram.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.s2t.transform.spectrogram module
-=============================================
-
-.. automodule:: paddlespeech.s2t.transform.spectrogram
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.s2t.transform.transform_interface.rst b/docs/source/api/paddlespeech.s2t.transform.transform_interface.rst
deleted file mode 100644
index 009b06589..000000000
--- a/docs/source/api/paddlespeech.s2t.transform.transform_interface.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.s2t.transform.transform\_interface module
-======================================================
-
-.. automodule:: paddlespeech.s2t.transform.transform_interface
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.s2t.transform.transformation.rst b/docs/source/api/paddlespeech.s2t.transform.transformation.rst
deleted file mode 100644
index a03e731a5..000000000
--- a/docs/source/api/paddlespeech.s2t.transform.transformation.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.s2t.transform.transformation module
-================================================
-
-.. automodule:: paddlespeech.s2t.transform.transformation
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.s2t.transform.wpe.rst b/docs/source/api/paddlespeech.s2t.transform.wpe.rst
deleted file mode 100644
index c4831f7f9..000000000
--- a/docs/source/api/paddlespeech.s2t.transform.wpe.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.s2t.transform.wpe module
-=====================================
-
-.. automodule:: paddlespeech.s2t.transform.wpe
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.server.engine.acs.python.acs_engine.rst b/docs/source/api/paddlespeech.server.engine.acs.python.acs_engine.rst
deleted file mode 100644
index 9b61633e0..000000000
--- a/docs/source/api/paddlespeech.server.engine.acs.python.acs_engine.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.server.engine.acs.python.acs\_engine module
-========================================================
-
-.. automodule:: paddlespeech.server.engine.acs.python.acs_engine
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.server.engine.acs.python.rst b/docs/source/api/paddlespeech.server.engine.acs.python.rst
index 3c06ba080..7e5582bd0 100644
--- a/docs/source/api/paddlespeech.server.engine.acs.python.rst
+++ b/docs/source/api/paddlespeech.server.engine.acs.python.rst
@@ -12,4 +12,3 @@ Submodules
 .. toctree::
    :maxdepth: 4
 
-   paddlespeech.server.engine.acs.python.acs_engine
diff --git a/docs/source/api/paddlespeech.server.utils.log.rst b/docs/source/api/paddlespeech.server.utils.log.rst
deleted file mode 100644
index 453b4a61f..000000000
--- a/docs/source/api/paddlespeech.server.utils.log.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.server.utils.log module
-====================================
-
-.. automodule:: paddlespeech.server.utils.log
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.t2s.exps.rst b/docs/source/api/paddlespeech.t2s.exps.rst
index bee18a972..643f97b4c 100644
--- a/docs/source/api/paddlespeech.t2s.exps.rst
+++ b/docs/source/api/paddlespeech.t2s.exps.rst
@@ -30,10 +30,10 @@ Submodules
    paddlespeech.t2s.exps.inference
    paddlespeech.t2s.exps.inference_streaming
+   paddlespeech.t2s.models.vits.monotonic_align
    paddlespeech.t2s.exps.ort_predict
    paddlespeech.t2s.exps.ort_predict_e2e
    paddlespeech.t2s.exps.ort_predict_streaming
-   paddlespeech.t2s.exps.stream_play_tts
    paddlespeech.t2s.exps.syn_utils
    paddlespeech.t2s.exps.synthesize
    paddlespeech.t2s.exps.synthesize_e2e
diff --git a/docs/source/api/paddlespeech.t2s.exps.stream_play_tts.rst b/docs/source/api/paddlespeech.t2s.exps.stream_play_tts.rst
deleted file mode 100644
index cb22dde0c..000000000
--- a/docs/source/api/paddlespeech.t2s.exps.stream_play_tts.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.t2s.exps.stream\_play\_tts module
-==============================================
-
-.. automodule:: paddlespeech.t2s.exps.stream_play_tts
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.t2s.models.ernie_sat.mlm.rst b/docs/source/api/paddlespeech.t2s.models.ernie_sat.mlm.rst
deleted file mode 100644
index f0e8fd11a..000000000
--- a/docs/source/api/paddlespeech.t2s.models.ernie_sat.mlm.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.t2s.models.ernie\_sat.mlm module
-=============================================
-
-.. automodule:: paddlespeech.t2s.models.ernie_sat.mlm
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.core.rst b/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.core.rst
deleted file mode 100644
index 7aaba7952..000000000
--- a/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.core.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.t2s.models.vits.monotonic\_align.core module
-=========================================================
-
-.. automodule:: paddlespeech.t2s.models.vits.monotonic_align.core
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.rst b/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.rst
deleted file mode 100644
index 25c819a7e..000000000
--- a/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.rst
+++ /dev/null
@@ -1,16 +0,0 @@
-paddlespeech.t2s.models.vits.monotonic\_align package
-=====================================================
-
-.. automodule:: paddlespeech.t2s.models.vits.monotonic_align
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-Submodules
-----------
-
-.. toctree::
-   :maxdepth: 4
-
-   paddlespeech.t2s.models.vits.monotonic_align.core
-   paddlespeech.t2s.models.vits.monotonic_align.setup
diff --git a/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.setup.rst b/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.setup.rst
deleted file mode 100644
index a93c3b8bf..000000000
--- a/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.setup.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.t2s.models.vits.monotonic\_align.setup module
-==========================================================
-
-.. automodule:: paddlespeech.t2s.models.vits.monotonic_align.setup
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.t2s.models.vits.rst b/docs/source/api/paddlespeech.t2s.models.vits.rst
index 3146094b0..205496f0f 100644
--- a/docs/source/api/paddlespeech.t2s.models.vits.rst
+++ b/docs/source/api/paddlespeech.t2s.models.vits.rst
@@ -12,7 +12,6 @@ Subpackages
 .. toctree::
    :maxdepth: 4
 
-   paddlespeech.t2s.models.vits.monotonic_align
    paddlespeech.t2s.models.vits.wavenet
 
 Submodules
diff --git a/docs/source/tts/demo.rst b/docs/source/tts/demo.rst
index ca2fd98e4..1ae687f85 100644
--- a/docs/source/tts/demo.rst
+++ b/docs/source/tts/demo.rst
@@ -42,7 +42,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder.
 Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition
-