From 1a56a6e42bccedee0285d8a22205d802878bab92 Mon Sep 17 00:00:00 2001 From: tianhao zhang <15600919271@163.com> Date: Tue, 20 Sep 2022 03:42:07 +0000 Subject: [PATCH 01/35] add bitransformer decoder, test=asr --- paddlespeech/audio/utils/tensor_utils.py | 41 ++++-- paddlespeech/s2t/exps/u2/bin/test_wav.py | 3 +- paddlespeech/s2t/exps/u2/model.py | 9 +- paddlespeech/s2t/models/u2/u2.py | 152 ++++++++++++++++++++--- paddlespeech/s2t/modules/decoder.py | 128 ++++++++++++++++++- 5 files changed, 302 insertions(+), 31 deletions(-) diff --git a/paddlespeech/audio/utils/tensor_utils.py b/paddlespeech/audio/utils/tensor_utils.py index 16f60810..ac86757b 100644 --- a/paddlespeech/audio/utils/tensor_utils.py +++ b/paddlespeech/audio/utils/tensor_utils.py @@ -31,7 +31,6 @@ def has_tensor(val): return True elif isinstance(val, dict): for k, v in val.items(): - print(k) if has_tensor(v): return True else: @@ -143,14 +142,15 @@ def add_sos_eos(ys_pad: paddle.Tensor, sos: int, eos: int, [ 7, 8, 9, 11, -1, -1]]) """ # TODO(Hui Zhang): using comment code, - #_sos = paddle.to_tensor( - # [sos], dtype=paddle.long, stop_gradient=True, place=ys_pad.place) - #_eos = paddle.to_tensor( - # [eos], dtype=paddle.long, stop_gradient=True, place=ys_pad.place) - #ys = [y[y != ignore_id] for y in ys_pad] # parse padded ys - #ys_in = [paddle.cat([_sos, y], dim=0) for y in ys] - #ys_out = [paddle.cat([y, _eos], dim=0) for y in ys] - #return pad_sequence(ys_in, padding_value=eos), pad_sequence(ys_out, padding_value=ignore_id) + # _sos = paddle.to_tensor( + # [sos], dtype=ys_pad.dtype, stop_gradient=True, place=ys_pad.place) + # _eos = paddle.to_tensor( + # [eos], dtype=ys_pad.dtype, stop_gradient=True, place=ys_pad.place) + # ys = [y[y != ignore_id] for y in ys_pad] # parse padded ys + # ys_in = [paddle.concat([_sos, y], axis=0) for y in ys] + # ys_out = [paddle.concat([y, _eos], axis=0) for y in ys] + # return pad_sequence(ys_in, padding_value=eos).transpose([1,0]), pad_sequence(ys_out, 
padding_value=ignore_id).transpose([1,0]) + B = ys_pad.shape[0] _sos = paddle.ones([B, 1], dtype=ys_pad.dtype) * sos _eos = paddle.ones([B, 1], dtype=ys_pad.dtype) * eos @@ -190,3 +190,26 @@ def th_accuracy(pad_outputs: paddle.Tensor, # denominator = paddle.sum(mask) denominator = paddle.sum(mask.type_as(pad_targets)) return float(numerator) / float(denominator) + + +def reverse_pad_list(ys_pad: paddle.Tensor, + ys_lens: paddle.Tensor, + pad_value: float=-1.0) -> paddle.Tensor: + """Reverse padding for the list of tensors. + Args: + ys_pad (tensor): The padded tensor (B, Tokenmax). + ys_lens (tensor): The lens of token seqs (B) + pad_value (int): Value for padding. + Returns: + Tensor: Padded tensor (B, Tokenmax). + Examples: + >>> x + tensor([[1, 2, 3, 4], [5, 6, 7, 0], [8, 9, 0, 0]]) + >>> pad_list(x, 0) + tensor([[4, 3, 2, 1], + [7, 6, 5, 0], + [9, 8, 0, 0]]) + """ + r_ys_pad = pad_sequence([(paddle.flip(y.int()[:i], [0])) + for y, i in zip(ys_pad, ys_lens)], True, pad_value) + return r_ys_pad diff --git a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py index 887ec7a6..51b72209 100644 --- a/paddlespeech/s2t/exps/u2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py @@ -89,7 +89,8 @@ class U2Infer(): ctc_weight=decode_config.ctc_weight, decoding_chunk_size=decode_config.decoding_chunk_size, num_decoding_left_chunks=decode_config.num_decoding_left_chunks, - simulate_streaming=decode_config.simulate_streaming) + simulate_streaming=decode_config.simulate_streaming, + reverse_weight=self.config.model_conf.reverse_weight) rsl = result_transcripts[0][0] utt = Path(self.audio_file).name logger.info(f"hyp: {utt} {result_transcripts[0][0]}") diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index db60083b..a7ccba48 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -250,10 +250,12 @@ class U2Trainer(Trainer): model_conf.output_dim = 
self.train_loader.vocab_size else: model_conf.input_dim = self.test_loader.feat_dim - model_conf.output_dim = self.test_loader.vocab_size + model_conf.output_dim = self.test_loader.vocab_size model = U2Model.from_config(model_conf) - + # params = model.state_dict() + # paddle.save(params, 'for_torch/test.pdparams') + # exit() if self.parallel: model = paddle.DataParallel(model) @@ -350,7 +352,8 @@ class U2Tester(U2Trainer): ctc_weight=decode_config.ctc_weight, decoding_chunk_size=decode_config.decoding_chunk_size, num_decoding_left_chunks=decode_config.num_decoding_left_chunks, - simulate_streaming=decode_config.simulate_streaming) + simulate_streaming=decode_config.simulate_streaming, + reverse_weight=self.config.model_conf.reverse_weight) decode_time = time.time() - start_time for utt, target, result, rec_tids in zip( diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 813e1e52..84c0e5b5 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -31,6 +31,7 @@ from paddle import nn from paddlespeech.audio.utils.tensor_utils import add_sos_eos from paddlespeech.audio.utils.tensor_utils import pad_sequence +from paddlespeech.audio.utils.tensor_utils import reverse_pad_list from paddlespeech.audio.utils.tensor_utils import th_accuracy from paddlespeech.s2t.decoders.scorers.ctc import CTCPrefixScorer from paddlespeech.s2t.frontend.utility import IGNORE_ID @@ -38,6 +39,7 @@ from paddlespeech.s2t.frontend.utility import load_cmvn from paddlespeech.s2t.models.asr_interface import ASRInterface from paddlespeech.s2t.modules.cmvn import GlobalCMVN from paddlespeech.s2t.modules.ctc import CTCDecoderBase +from paddlespeech.s2t.modules.decoder import BiTransformerDecoder from paddlespeech.s2t.modules.decoder import TransformerDecoder from paddlespeech.s2t.modules.encoder import ConformerEncoder from paddlespeech.s2t.modules.encoder import TransformerEncoder @@ -69,6 +71,7 @@ class U2BaseModel(ASRInterface, nn.Layer): ctc:
CTCDecoderBase, ctc_weight: float=0.5, ignore_id: int=IGNORE_ID, + reverse_weight: float=0.0, lsm_weight: float=0.0, length_normalized_loss: bool=False, **kwargs): @@ -82,6 +85,7 @@ class U2BaseModel(ASRInterface, nn.Layer): self.vocab_size = vocab_size self.ignore_id = ignore_id self.ctc_weight = ctc_weight + self.reverse_weight = reverse_weight self.encoder = encoder self.decoder = decoder @@ -171,12 +175,21 @@ class U2BaseModel(ASRInterface, nn.Layer): self.ignore_id) ys_in_lens = ys_pad_lens + 1 + r_ys_pad = reverse_pad_list(ys_pad, ys_pad_lens, float(self.ignore_id)) + r_ys_in_pad, r_ys_out_pad = add_sos_eos(r_ys_pad, self.sos, self.eos, + self.ignore_id) # 1. Forward decoder - decoder_out, _ = self.decoder(encoder_out, encoder_mask, ys_in_pad, - ys_in_lens) + decoder_out, r_decoder_out, _ = self.decoder( + encoder_out, encoder_mask, ys_in_pad, ys_in_lens, r_ys_in_pad, + self.reverse_weight) # 2. Compute attention loss loss_att = self.criterion_att(decoder_out, ys_out_pad) + r_loss_att = paddle.to_tensor(0.0) + if self.reverse_weight > 0.0: + r_loss_att = self.criterion_att(r_decoder_out, r_ys_out_pad) + loss_att = loss_att * (1 - self.reverse_weight + ) + r_loss_att * self.reverse_weight acc_att = th_accuracy( decoder_out.view(-1, self.vocab_size), ys_out_pad, @@ -359,6 +372,7 @@ class U2BaseModel(ASRInterface, nn.Layer): # Let's assume B = batch_size # encoder_out: (B, maxlen, encoder_dim) # encoder_mask: (B, 1, Tmax) + encoder_out, encoder_mask = self._forward_encoder( speech, speech_lengths, decoding_chunk_size, num_decoding_left_chunks, simulate_streaming) @@ -500,7 +514,8 @@ class U2BaseModel(ASRInterface, nn.Layer): decoding_chunk_size: int=-1, num_decoding_left_chunks: int=-1, ctc_weight: float=0.0, - simulate_streaming: bool=False, ) -> List[int]: + simulate_streaming: bool=False, + reverse_weight: float=0.0, ) -> List[int]: """ Apply attention rescoring decoding, CTC prefix beam search is applied first to get nbest, then we resoring the nbest on 
attention decoder with corresponding encoder out @@ -520,6 +535,9 @@ class U2BaseModel(ASRInterface, nn.Layer): """ assert speech.shape[0] == speech_lengths.shape[0] assert decoding_chunk_size != 0 + if reverse_weight > 0.0: + # decoder should be a bitransformer decoder if reverse_weight > 0.0 + assert hasattr(self.decoder, 'right_decoder') device = speech.place batch_size = speech.shape[0] # For attention rescoring we only support batch_size=1 @@ -541,6 +559,7 @@ class U2BaseModel(ASRInterface, nn.Layer): hyp_content, place=device, dtype=paddle.long) hyp_list.append(hyp_content) hyps_pad = pad_sequence(hyp_list, True, self.ignore_id) + ori_hyps_pad = hyps_pad hyps_lens = paddle.to_tensor( [len(hyp[0]) for hyp in hyps], place=device, dtype=paddle.long) # (beam_size,) @@ -550,13 +569,24 @@ class U2BaseModel(ASRInterface, nn.Layer): encoder_out = encoder_out.repeat(beam_size, 1, 1) encoder_mask = paddle.ones( (beam_size, 1, encoder_out.shape[1]), dtype=paddle.bool) - decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, - hyps_lens) # (beam_size, max_hyps_len, vocab_size) + + # used for right to left decoder + r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens - 1, + self.ignore_id) + r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, + self.ignore_id) + decoder_out, r_decoder_out, _ = self.decoder( + encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, + reverse_weight) # (beam_size, max_hyps_len, vocab_size) # ctc score in ln domain decoder_out = paddle.nn.functional.log_softmax(decoder_out, axis=-1) decoder_out = decoder_out.numpy() + # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a + # conventional transformer decoder. 
+ r_decoder_out = paddle.nn.functional.log_softmax(r_decoder_out, axis=-1) + r_decoder_out = r_decoder_out.numpy() + # Only use decoder score for rescoring best_score = -float('inf') best_index = 0 @@ -567,6 +597,12 @@ class U2BaseModel(ASRInterface, nn.Layer): score += decoder_out[i][j][w] # last decoder output token is `eos`, for laste decoder input token. score += decoder_out[i][len(hyp[0])][self.eos] + if reverse_weight > 0: + r_score = 0.0 + for j, w in enumerate(hyp[0]): + r_score += r_decoder_out[i][len(hyp[0]) - j - 1][w] + r_score += r_decoder_out[i][len(hyp[0])][self.eos] + score = score * (1 - reverse_weight) + r_score * reverse_weight # add ctc score (which in ln domain) score += hyp[1] * ctc_weight if score > best_score: @@ -653,12 +689,24 @@ class U2BaseModel(ASRInterface, nn.Layer): """ return self.ctc.log_softmax(xs) + @jit.to_static + def is_bidirectional_decoder(self) -> bool: + """ + Returns: + torch.Tensor: decoder output + """ + if hasattr(self.decoder, 'right_decoder'): + return True + else: + return False + @jit.to_static def forward_attention_decoder( self, hyps: paddle.Tensor, hyps_lens: paddle.Tensor, - encoder_out: paddle.Tensor, ) -> paddle.Tensor: + encoder_out: paddle.Tensor, + reverse_weight: float=0, ) -> paddle.Tensor: """ Export interface for c++ call, forward decoder with multiple hypothesis from ctc prefix beam search and one encoder output Args: @@ -676,11 +724,75 @@ class U2BaseModel(ASRInterface, nn.Layer): # (B, 1, T) encoder_mask = paddle.ones( [num_hyps, 1, encoder_out.shape[1]], dtype=paddle.bool) + + # input for right to left decoder + # this hyps_lens has count token, we need minus it. + r_hyps_lens = hyps_lens - 1 + # this hyps has included token, so it should be + # convert the original hyps. 
+ r_hyps = hyps[:, 1:] # (num_hyps, max_hyps_len, vocab_size) + + # Equal to: + # >>> r_hyps = reverse_pad_list(r_hyps, r_hyps_lens, float(self.ignore_id)) + # >>> r_hyps, _ = add_sos_eos(r_hyps, self.sos, self.eos, self.ignore_id) + max_len = paddle.max(r_hyps_lens) + index_range = paddle.arange(0, max_len, 1) + seq_len_expand = r_hyps_lens.unsqueeze(1) + seq_mask = seq_len_expand > index_range # (beam, max_len) + + index = (seq_len_expand - 1) - index_range # (beam, max_len) + # >>> index + # >>> tensor([[ 2, 1, 0], + # >>> [ 2, 1, 0], + # >>> [ 0, -1, -2]]) + index = index * seq_mask + + # >>> index + # >>> tensor([[2, 1, 0], + # >>> [2, 1, 0], + # >>> [0, 0, 0]]) + def paddle_gather(x, dim, index): + index_shape = index.shape + index_flatten = index.flatten() + if dim < 0: + dim = len(x.shape) + dim + nd_index = [] + for k in range(len(x.shape)): + if k == dim: + nd_index.append(index_flatten) + else: + reshape_shape = [1] * len(x.shape) + reshape_shape[k] = x.shape[k] + x_arange = paddle.arange(x.shape[k], dtype=index.dtype) + x_arange = x_arange.reshape(reshape_shape) + dim_index = paddle.expand(x_arange, index_shape).flatten() + nd_index.append(dim_index) + ind2 = paddle.transpose(paddle.stack(nd_index), + [1, 0]).astype("int64") + paddle_out = paddle.gather_nd(x, ind2).reshape(index_shape) + return paddle_out + + r_hyps = paddle_gather(r_hyps, 1, index) + # >>> r_hyps + # >>> tensor([[3, 2, 1], + # >>> [4, 8, 9], + # >>> [2, 2, 2]]) + r_hyps = paddle.where(seq_mask, r_hyps, self.eos) + # >>> r_hyps + # >>> tensor([[3, 2, 1], + # >>> [4, 8, 9], + # >>> [2, eos, eos]]) + r_hyps = paddle.concat([hyps[:, 0:1], r_hyps], axis=1) + # >>> r_hyps + # >>> tensor([[sos, 3, 2, 1], + # >>> [sos, 4, 8, 9], + # >>> [sos, 2, eos, eos]]) decoder_out, _ = self.decoder(encoder_out, encoder_mask, hyps, - hyps_lens) + hyps_lens, r_hyps, reverse_weight) decoder_out = paddle.nn.functional.log_softmax(decoder_out, axis=-1) - return decoder_out + r_decoder_out = 
paddle.nn.functional.log_softmax(r_decoder_out, axis=-1) + return decoder_out, r_decoder_out @paddle.no_grad() def decode(self, @@ -692,7 +804,8 @@ class U2BaseModel(ASRInterface, nn.Layer): ctc_weight: float=0.0, decoding_chunk_size: int=-1, num_decoding_left_chunks: int=-1, - simulate_streaming: bool=False): + simulate_streaming: bool=False, + reverse_weight: float=0.0): """u2 decoding. Args: @@ -801,7 +914,6 @@ class U2Model(U2DecodeModel): with DefaultInitializerContext(init_type): vocab_size, encoder, decoder, ctc = U2Model._init_from_config( configs) - super().__init__( vocab_size=vocab_size, encoder=encoder, @@ -851,10 +963,20 @@ class U2Model(U2DecodeModel): raise ValueError(f"not support encoder type:{encoder_type}") # decoder - decoder = TransformerDecoder(vocab_size, - encoder.output_size(), - **configs['decoder_conf']) - + decoder_type = configs.get('decoder', 'transformer') + logger.debug(f"U2 Decoder type: {decoder_type}") + if decoder_type == 'transformer': + decoder = TransformerDecoder(vocab_size, + encoder.output_size(), + **configs['decoder_conf']) + elif decoder_type == 'bitransformer': + assert 0.0 < configs['model_conf']['reverse_weight'] < 1.0 + assert configs['decoder_conf']['r_num_blocks'] > 0 + decoder = BiTransformerDecoder(vocab_size, + encoder.output_size(), + **configs['decoder_conf']) + else: + raise ValueError(f"not support decoder type:{decoder_type}") # ctc decoder and ctc loss model_conf = configs.get('model_conf', dict()) dropout_rate = model_conf.get('ctc_dropout_rate', 0.0) diff --git a/paddlespeech/s2t/modules/decoder.py b/paddlespeech/s2t/modules/decoder.py index ccc8482d..2052a19e 100644 --- a/paddlespeech/s2t/modules/decoder.py +++ b/paddlespeech/s2t/modules/decoder.py @@ -35,7 +35,6 @@ from paddlespeech.s2t.modules.mask import make_xs_mask from paddlespeech.s2t.modules.mask import subsequent_mask from paddlespeech.s2t.modules.positionwise_feed_forward import PositionwiseFeedForward from paddlespeech.s2t.utils.log import Log
- logger = Log(__name__).getlog() __all__ = ["TransformerDecoder"] @@ -116,13 +115,19 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer): memory: paddle.Tensor, memory_mask: paddle.Tensor, ys_in_pad: paddle.Tensor, - ys_in_lens: paddle.Tensor, ) -> Tuple[paddle.Tensor, paddle.Tensor]: + ys_in_lens: paddle.Tensor, + r_ys_in_pad: paddle.Tensor=paddle.empty([0]), + reverse_weight: float=0.0) -> Tuple[paddle.Tensor, paddle.Tensor]: """Forward decoder. Args: memory: encoded memory, float32 (batch, maxlen_in, feat) memory_mask: encoder memory mask, (batch, 1, maxlen_in) ys_in_pad: padded input token ids, int64 (batch, maxlen_out) ys_in_lens: input lengths of this batch (batch) + r_ys_in_pad: not used in transformer decoder, in order to unify api + with bidirectional decoder + reverse_weight: not used in transformer decoder, in order to unify + api with bidirectional decode Returns: (tuple): tuple containing: x: decoded token score before softmax (batch, maxlen_out, vocab_size) @@ -151,7 +156,7 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer): # TODO(Hui Zhang): reduce_sum not support bool type # olens = tgt_mask.sum(1) olens = tgt_mask.astype(paddle.int).sum(1) - return x, olens + return x, paddle.to_tensor(0.0), olens def forward_one_step( self, @@ -251,3 +256,120 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer): state_list = [[states[i][b] for i in range(n_layers)] for b in range(n_batch)] return logp, state_list + + +class BiTransformerDecoder(BatchScorerInterface, nn.Layer): + """Base class of Transfomer decoder module. 
+ Args: + vocab_size: output dim + encoder_output_size: dimension of attention + attention_heads: the number of heads of multi head attention + linear_units: the hidden units number of position-wise feedforward + num_blocks: the number of decoder blocks + r_num_blocks: the number of right to left decoder blocks + dropout_rate: dropout rate + self_attention_dropout_rate: dropout rate for attention + input_layer: input layer type + use_output_layer: whether to use output layer + pos_enc_class: PositionalEncoding or ScaledPositionalEncoding + normalize_before: + True: use layer_norm before each sub-block of a layer. + False: use layer_norm after each sub-block of a layer. + concat_after: whether to concat attention layer's input and output + True: x -> x + linear(concat(x, att(x))) + False: x -> x + att(x) + """ + + def __init__(self, + vocab_size: int, + encoder_output_size: int, + attention_heads: int=4, + linear_units: int=2048, + num_blocks: int=6, + r_num_blocks: int=0, + dropout_rate: float=0.1, + positional_dropout_rate: float=0.1, + self_attention_dropout_rate: float=0.0, + src_attention_dropout_rate: float=0.0, + input_layer: str="embed", + use_output_layer: bool=True, + normalize_before: bool=True, + concat_after: bool=False, + max_len: int=5000): + + assert check_argument_types() + + nn.Layer.__init__(self) + self.left_decoder = TransformerDecoder( + vocab_size, encoder_output_size, attention_heads, linear_units, + num_blocks, dropout_rate, positional_dropout_rate, + self_attention_dropout_rate, src_attention_dropout_rate, + input_layer, use_output_layer, normalize_before, concat_after, + max_len) + + self.right_decoder = TransformerDecoder( + vocab_size, encoder_output_size, attention_heads, linear_units, + r_num_blocks, dropout_rate, positional_dropout_rate, + self_attention_dropout_rate, src_attention_dropout_rate, + input_layer, use_output_layer, normalize_before, concat_after, + max_len) + + def forward( + self, + memory: paddle.Tensor, + memory_mask: 
paddle.Tensor, + ys_in_pad: paddle.Tensor, + ys_in_lens: paddle.Tensor, + r_ys_in_pad: paddle.Tensor, + reverse_weight: float=0.0, + ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: + """Forward decoder. + Args: + memory: encoded memory, float32 (batch, maxlen_in, feat) + memory_mask: encoder memory mask, (batch, 1, maxlen_in) + ys_in_pad: padded input token ids, int64 (batch, maxlen_out) + ys_in_lens: input lengths of this batch (batch) + r_ys_in_pad: padded input token ids, int64 (batch, maxlen_out), + used for right to left decoder + reverse_weight: used for right to left decoder + Returns: + (tuple): tuple containing: + x: decoded token score before softmax (batch, maxlen_out, + vocab_size) if use_output_layer is True, + r_x: x: decoded token score (right to left decoder) + before softmax (batch, maxlen_out, vocab_size) + if use_output_layer is True, + olens: (batch, ) + """ + l_x, _, olens = self.left_decoder(memory, memory_mask, ys_in_pad, + ys_in_lens) + r_x = paddle.to_tensor(0.0) + if reverse_weight > 0.0: + r_x, _, olens = self.right_decoder(memory, memory_mask, r_ys_in_pad, + ys_in_lens) + return l_x, r_x, olens + + def forward_one_step( + self, + memory: paddle.Tensor, + memory_mask: paddle.Tensor, + tgt: paddle.Tensor, + tgt_mask: paddle.Tensor, + cache: Optional[List[paddle.Tensor]]=None, + ) -> Tuple[paddle.Tensor, List[paddle.Tensor]]: + """Forward one step. + This is only used for decoding. + Args: + memory: encoded memory, float32 (batch, maxlen_in, feat) + memory_mask: encoded memory mask, (batch, 1, maxlen_in) + tgt: input token ids, int64 (batch, maxlen_out) + tgt_mask: input token mask, (batch, maxlen_out) + dtype=torch.uint8 in PyTorch 1.2- + dtype=torch.bool in PyTorch 1.2+ (include 1.2) + cache: cached output list of (batch, max_time_out-1, size) + Returns: + y, cache: NN output value and cache per `self.decoders`. 
+ y.shape` is (batch, maxlen_out, token) + """ + return self.left_decoder.forward_one_step(memory, memory_mask, tgt, + tgt_mask, cache) From ecbf324286c55125e5fd2712c16bedc22f1e51c9 Mon Sep 17 00:00:00 2001 From: tianhao zhang <15600919271@163.com> Date: Tue, 20 Sep 2022 05:28:02 +0000 Subject: [PATCH 02/35] support bitransformer decoder, test=asr --- paddlespeech/server/engine/asr/online/python/asr_engine.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddlespeech/server/engine/asr/online/python/asr_engine.py b/paddlespeech/server/engine/asr/online/python/asr_engine.py index 87d88ee6..4c7c4b37 100644 --- a/paddlespeech/server/engine/asr/online/python/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/python/asr_engine.py @@ -613,7 +613,8 @@ class PaddleASRConnectionHanddler: encoder_out = self.encoder_out.repeat(beam_size, 1, 1) encoder_mask = paddle.ones( (beam_size, 1, encoder_out.shape[1]), dtype=paddle.bool) - decoder_out, _ = self.model.decoder( + + decoder_out, _, _ = self.model.decoder( encoder_out, encoder_mask, hyps_pad, hyps_lens) # (beam_size, max_hyps_len, vocab_size) # ctc score in ln domain From 5cdc79ddf214f5f06d224db75b1a2b89279ea704 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Tue, 20 Sep 2022 14:34:41 +0800 Subject: [PATCH 03/35] [doc]add finetune demos in readthe docs (#2411) * add finetune demos, test=doc --- docs/requirements.txt | 3 +- ...lespeech.cls.exps.panns.deploy.predict.rst | 7 - .../paddlespeech.cls.exps.panns.deploy.rst | 1 - ...ddlespeech.cls.exps.panns.export_model.rst | 7 - .../paddlespeech.cls.exps.panns.predict.rst | 7 - .../api/paddlespeech.cls.exps.panns.rst | 3 - .../api/paddlespeech.cls.exps.panns.train.rst | 7 - ...dlespeech.kws.exps.mdtc.plot_det_curve.rst | 7 - .../source/api/paddlespeech.kws.exps.mdtc.rst | 1 - .../paddlespeech.s2t.decoders.ctcdecoder.rst | 1 - ....decoders.ctcdecoder.scorer_deprecated.rst | 7 - .../paddlespeech.s2t.decoders.recog_bin.rst | 7 - 
docs/source/api/paddlespeech.s2t.decoders.rst | 1 - ...addlespeech.s2t.decoders.scorers.ngram.rst | 7 - .../api/paddlespeech.s2t.decoders.scorers.rst | 1 - ...s2t.exps.deepspeech2.bin.deploy.client.rst | 7 - ...s2t.exps.deepspeech2.bin.deploy.record.rst | 7 - ...speech.s2t.exps.deepspeech2.bin.deploy.rst | 3 - ...h.s2t.exps.deepspeech2.bin.deploy.send.rst | 7 - docs/source/api/paddlespeech.s2t.exps.u2.rst | 1 - .../api/paddlespeech.s2t.exps.u2.trainer.rst | 7 - ...ddlespeech.s2t.exps.u2_kaldi.bin.recog.rst | 7 - .../paddlespeech.s2t.exps.u2_kaldi.bin.rst | 1 - .../paddlespeech.s2t.training.extensions.rst | 2 - ...peech.s2t.training.extensions.snapshot.rst | 7 - ...ech.s2t.training.extensions.visualizer.rst | 7 - .../paddlespeech.s2t.training.updaters.rst | 1 - ...lespeech.s2t.training.updaters.trainer.rst | 7 - .../paddlespeech.s2t.transform.add_deltas.rst | 7 - ...espeech.s2t.transform.channel_selector.rst | 7 - .../api/paddlespeech.s2t.transform.cmvn.rst | 7 - .../paddlespeech.s2t.transform.functional.rst | 7 - .../paddlespeech.s2t.transform.perturb.rst | 7 - .../source/api/paddlespeech.s2t.transform.rst | 24 - ...addlespeech.s2t.transform.spec_augment.rst | 7 - ...paddlespeech.s2t.transform.spectrogram.rst | 7 - ...eech.s2t.transform.transform_interface.rst | 7 - ...dlespeech.s2t.transform.transformation.rst | 7 - .../api/paddlespeech.s2t.transform.wpe.rst | 7 - ...ch.server.engine.acs.python.acs_engine.rst | 7 - .../paddlespeech.server.engine.acs.python.rst | 1 - .../api/paddlespeech.server.utils.log.rst | 7 - docs/source/api/paddlespeech.t2s.exps.rst | 2 +- .../paddlespeech.t2s.exps.stream_play_tts.rst | 7 - .../paddlespeech.t2s.models.ernie_sat.mlm.rst | 7 - ...h.t2s.models.vits.monotonic_align.core.rst | 7 - ...speech.t2s.models.vits.monotonic_align.rst | 16 - ....t2s.models.vits.monotonic_align.setup.rst | 7 - .../api/paddlespeech.t2s.models.vits.rst | 1 - docs/source/tts/demo.rst | 472 +++++++++++------- docs/source/tts/demo_2.rst | 56 +-- 51 files 
changed, 336 insertions(+), 479 deletions(-) delete mode 100644 docs/source/api/paddlespeech.cls.exps.panns.deploy.predict.rst delete mode 100644 docs/source/api/paddlespeech.cls.exps.panns.export_model.rst delete mode 100644 docs/source/api/paddlespeech.cls.exps.panns.predict.rst delete mode 100644 docs/source/api/paddlespeech.cls.exps.panns.train.rst delete mode 100644 docs/source/api/paddlespeech.kws.exps.mdtc.plot_det_curve.rst delete mode 100644 docs/source/api/paddlespeech.s2t.decoders.ctcdecoder.scorer_deprecated.rst delete mode 100644 docs/source/api/paddlespeech.s2t.decoders.recog_bin.rst delete mode 100644 docs/source/api/paddlespeech.s2t.decoders.scorers.ngram.rst delete mode 100644 docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.client.rst delete mode 100644 docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.record.rst delete mode 100644 docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.send.rst delete mode 100644 docs/source/api/paddlespeech.s2t.exps.u2.trainer.rst delete mode 100644 docs/source/api/paddlespeech.s2t.exps.u2_kaldi.bin.recog.rst delete mode 100644 docs/source/api/paddlespeech.s2t.training.extensions.snapshot.rst delete mode 100644 docs/source/api/paddlespeech.s2t.training.extensions.visualizer.rst delete mode 100644 docs/source/api/paddlespeech.s2t.training.updaters.trainer.rst delete mode 100644 docs/source/api/paddlespeech.s2t.transform.add_deltas.rst delete mode 100644 docs/source/api/paddlespeech.s2t.transform.channel_selector.rst delete mode 100644 docs/source/api/paddlespeech.s2t.transform.cmvn.rst delete mode 100644 docs/source/api/paddlespeech.s2t.transform.functional.rst delete mode 100644 docs/source/api/paddlespeech.s2t.transform.perturb.rst delete mode 100644 docs/source/api/paddlespeech.s2t.transform.rst delete mode 100644 docs/source/api/paddlespeech.s2t.transform.spec_augment.rst delete mode 100644 docs/source/api/paddlespeech.s2t.transform.spectrogram.rst delete mode 100644 
docs/source/api/paddlespeech.s2t.transform.transform_interface.rst delete mode 100644 docs/source/api/paddlespeech.s2t.transform.transformation.rst delete mode 100644 docs/source/api/paddlespeech.s2t.transform.wpe.rst delete mode 100644 docs/source/api/paddlespeech.server.engine.acs.python.acs_engine.rst delete mode 100644 docs/source/api/paddlespeech.server.utils.log.rst delete mode 100644 docs/source/api/paddlespeech.t2s.exps.stream_play_tts.rst delete mode 100644 docs/source/api/paddlespeech.t2s.models.ernie_sat.mlm.rst delete mode 100644 docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.core.rst delete mode 100644 docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.rst delete mode 100644 docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.setup.rst diff --git a/docs/requirements.txt b/docs/requirements.txt index 3fb82367..fd7a481b 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -20,6 +20,7 @@ onnxruntime==1.10.0 opencc paddlenlp paddlepaddle>=2.2.2 +paddlespeech_ctcdecoders paddlespeech_feat pandas pathos == 0.2.8 @@ -27,8 +28,8 @@ pattern_singleton Pillow>=9.0.0 praatio==5.0.0 prettytable -pypinyin<=0.44.0 pypinyin-dict +pypinyin<=0.44.0 python-dateutil pyworld==0.2.12 recommonmark>=0.5.0 diff --git a/docs/source/api/paddlespeech.cls.exps.panns.deploy.predict.rst b/docs/source/api/paddlespeech.cls.exps.panns.deploy.predict.rst deleted file mode 100644 index d4f92a2e..00000000 --- a/docs/source/api/paddlespeech.cls.exps.panns.deploy.predict.rst +++ /dev/null @@ -1,7 +0,0 @@ -paddlespeech.cls.exps.panns.deploy.predict module -================================================= - -.. 
automodule:: paddlespeech.cls.exps.panns.deploy.predict - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/paddlespeech.cls.exps.panns.deploy.rst b/docs/source/api/paddlespeech.cls.exps.panns.deploy.rst index 4415c933..369862cc 100644 --- a/docs/source/api/paddlespeech.cls.exps.panns.deploy.rst +++ b/docs/source/api/paddlespeech.cls.exps.panns.deploy.rst @@ -12,4 +12,3 @@ Submodules .. toctree:: :maxdepth: 4 - paddlespeech.cls.exps.panns.deploy.predict diff --git a/docs/source/api/paddlespeech.cls.exps.panns.export_model.rst b/docs/source/api/paddlespeech.cls.exps.panns.export_model.rst deleted file mode 100644 index 6c39c2bc..00000000 --- a/docs/source/api/paddlespeech.cls.exps.panns.export_model.rst +++ /dev/null @@ -1,7 +0,0 @@ -paddlespeech.cls.exps.panns.export\_model module -================================================ - -.. automodule:: paddlespeech.cls.exps.panns.export_model - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/paddlespeech.cls.exps.panns.predict.rst b/docs/source/api/paddlespeech.cls.exps.panns.predict.rst deleted file mode 100644 index 88cd4033..00000000 --- a/docs/source/api/paddlespeech.cls.exps.panns.predict.rst +++ /dev/null @@ -1,7 +0,0 @@ -paddlespeech.cls.exps.panns.predict module -========================================== - -.. automodule:: paddlespeech.cls.exps.panns.predict - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/paddlespeech.cls.exps.panns.rst b/docs/source/api/paddlespeech.cls.exps.panns.rst index 6147b245..72f30ba6 100644 --- a/docs/source/api/paddlespeech.cls.exps.panns.rst +++ b/docs/source/api/paddlespeech.cls.exps.panns.rst @@ -20,6 +20,3 @@ Submodules .. 
toctree:: :maxdepth: 4 - paddlespeech.cls.exps.panns.export_model - paddlespeech.cls.exps.panns.predict - paddlespeech.cls.exps.panns.train diff --git a/docs/source/api/paddlespeech.cls.exps.panns.train.rst b/docs/source/api/paddlespeech.cls.exps.panns.train.rst deleted file mode 100644 index a89b7eec..00000000 --- a/docs/source/api/paddlespeech.cls.exps.panns.train.rst +++ /dev/null @@ -1,7 +0,0 @@ -paddlespeech.cls.exps.panns.train module -======================================== - -.. automodule:: paddlespeech.cls.exps.panns.train - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/paddlespeech.kws.exps.mdtc.plot_det_curve.rst b/docs/source/api/paddlespeech.kws.exps.mdtc.plot_det_curve.rst deleted file mode 100644 index 46a149b0..00000000 --- a/docs/source/api/paddlespeech.kws.exps.mdtc.plot_det_curve.rst +++ /dev/null @@ -1,7 +0,0 @@ -paddlespeech.kws.exps.mdtc.plot\_det\_curve module -================================================== - -.. automodule:: paddlespeech.kws.exps.mdtc.plot_det_curve - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/paddlespeech.kws.exps.mdtc.rst b/docs/source/api/paddlespeech.kws.exps.mdtc.rst index f6cad64e..33d4a55c 100644 --- a/docs/source/api/paddlespeech.kws.exps.mdtc.rst +++ b/docs/source/api/paddlespeech.kws.exps.mdtc.rst @@ -14,6 +14,5 @@ Submodules paddlespeech.kws.exps.mdtc.collate paddlespeech.kws.exps.mdtc.compute_det - paddlespeech.kws.exps.mdtc.plot_det_curve paddlespeech.kws.exps.mdtc.score paddlespeech.kws.exps.mdtc.train diff --git a/docs/source/api/paddlespeech.s2t.decoders.ctcdecoder.rst b/docs/source/api/paddlespeech.s2t.decoders.ctcdecoder.rst index 8093619b..dfcd274c 100644 --- a/docs/source/api/paddlespeech.s2t.decoders.ctcdecoder.rst +++ b/docs/source/api/paddlespeech.s2t.decoders.ctcdecoder.rst @@ -13,5 +13,4 @@ Submodules :maxdepth: 4 paddlespeech.s2t.decoders.ctcdecoder.decoders_deprecated - paddlespeech.s2t.decoders.ctcdecoder.scorer_deprecated 
paddlespeech.s2t.decoders.ctcdecoder.swig_wrapper diff --git a/docs/source/api/paddlespeech.s2t.decoders.ctcdecoder.scorer_deprecated.rst b/docs/source/api/paddlespeech.s2t.decoders.ctcdecoder.scorer_deprecated.rst deleted file mode 100644 index 1079d672..00000000 --- a/docs/source/api/paddlespeech.s2t.decoders.ctcdecoder.scorer_deprecated.rst +++ /dev/null @@ -1,7 +0,0 @@ -paddlespeech.s2t.decoders.ctcdecoder.scorer\_deprecated module -============================================================== - -.. automodule:: paddlespeech.s2t.decoders.ctcdecoder.scorer_deprecated - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/paddlespeech.s2t.decoders.recog_bin.rst b/docs/source/api/paddlespeech.s2t.decoders.recog_bin.rst deleted file mode 100644 index 4952e2e6..00000000 --- a/docs/source/api/paddlespeech.s2t.decoders.recog_bin.rst +++ /dev/null @@ -1,7 +0,0 @@ -paddlespeech.s2t.decoders.recog\_bin module -=========================================== - -.. automodule:: paddlespeech.s2t.decoders.recog_bin - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/paddlespeech.s2t.decoders.rst b/docs/source/api/paddlespeech.s2t.decoders.rst index e4eabedf..53e0d9c4 100644 --- a/docs/source/api/paddlespeech.s2t.decoders.rst +++ b/docs/source/api/paddlespeech.s2t.decoders.rst @@ -23,5 +23,4 @@ Submodules :maxdepth: 4 paddlespeech.s2t.decoders.recog - paddlespeech.s2t.decoders.recog_bin paddlespeech.s2t.decoders.utils diff --git a/docs/source/api/paddlespeech.s2t.decoders.scorers.ngram.rst b/docs/source/api/paddlespeech.s2t.decoders.scorers.ngram.rst deleted file mode 100644 index f38a6109..00000000 --- a/docs/source/api/paddlespeech.s2t.decoders.scorers.ngram.rst +++ /dev/null @@ -1,7 +0,0 @@ -paddlespeech.s2t.decoders.scorers.ngram module -============================================== - -.. 
automodule:: paddlespeech.s2t.decoders.scorers.ngram - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/paddlespeech.s2t.decoders.scorers.rst b/docs/source/api/paddlespeech.s2t.decoders.scorers.rst index 83808c49..ca834f6b 100644 --- a/docs/source/api/paddlespeech.s2t.decoders.scorers.rst +++ b/docs/source/api/paddlespeech.s2t.decoders.scorers.rst @@ -15,5 +15,4 @@ Submodules paddlespeech.s2t.decoders.scorers.ctc paddlespeech.s2t.decoders.scorers.ctc_prefix_score paddlespeech.s2t.decoders.scorers.length_bonus - paddlespeech.s2t.decoders.scorers.ngram paddlespeech.s2t.decoders.scorers.scorer_interface diff --git a/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.client.rst b/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.client.rst deleted file mode 100644 index a73a5685..00000000 --- a/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.client.rst +++ /dev/null @@ -1,7 +0,0 @@ -paddlespeech.s2t.exps.deepspeech2.bin.deploy.client module -========================================================== - -.. automodule:: paddlespeech.s2t.exps.deepspeech2.bin.deploy.client - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.record.rst b/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.record.rst deleted file mode 100644 index bc107848..00000000 --- a/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.record.rst +++ /dev/null @@ -1,7 +0,0 @@ -paddlespeech.s2t.exps.deepspeech2.bin.deploy.record module -========================================================== - -.. 
automodule:: paddlespeech.s2t.exps.deepspeech2.bin.deploy.record - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.rst b/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.rst index d1f966fc..28de0f7f 100644 --- a/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.rst +++ b/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.rst @@ -12,8 +12,5 @@ Submodules .. toctree:: :maxdepth: 4 - paddlespeech.s2t.exps.deepspeech2.bin.deploy.client - paddlespeech.s2t.exps.deepspeech2.bin.deploy.record paddlespeech.s2t.exps.deepspeech2.bin.deploy.runtime - paddlespeech.s2t.exps.deepspeech2.bin.deploy.send paddlespeech.s2t.exps.deepspeech2.bin.deploy.server diff --git a/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.send.rst b/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.send.rst deleted file mode 100644 index ba1ae0a6..00000000 --- a/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.send.rst +++ /dev/null @@ -1,7 +0,0 @@ -paddlespeech.s2t.exps.deepspeech2.bin.deploy.send module -======================================================== - -.. automodule:: paddlespeech.s2t.exps.deepspeech2.bin.deploy.send - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/paddlespeech.s2t.exps.u2.rst b/docs/source/api/paddlespeech.s2t.exps.u2.rst index e0ebb7fc..bf565670 100644 --- a/docs/source/api/paddlespeech.s2t.exps.u2.rst +++ b/docs/source/api/paddlespeech.s2t.exps.u2.rst @@ -21,4 +21,3 @@ Submodules :maxdepth: 4 paddlespeech.s2t.exps.u2.model - paddlespeech.s2t.exps.u2.trainer diff --git a/docs/source/api/paddlespeech.s2t.exps.u2.trainer.rst b/docs/source/api/paddlespeech.s2t.exps.u2.trainer.rst deleted file mode 100644 index 0cd28945..00000000 --- a/docs/source/api/paddlespeech.s2t.exps.u2.trainer.rst +++ /dev/null @@ -1,7 +0,0 @@ -paddlespeech.s2t.exps.u2.trainer module -======================================= - -.. 
automodule:: paddlespeech.s2t.exps.u2.trainer - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/paddlespeech.s2t.exps.u2_kaldi.bin.recog.rst b/docs/source/api/paddlespeech.s2t.exps.u2_kaldi.bin.recog.rst deleted file mode 100644 index bc749c8f..00000000 --- a/docs/source/api/paddlespeech.s2t.exps.u2_kaldi.bin.recog.rst +++ /dev/null @@ -1,7 +0,0 @@ -paddlespeech.s2t.exps.u2\_kaldi.bin.recog module -================================================ - -.. automodule:: paddlespeech.s2t.exps.u2_kaldi.bin.recog - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/paddlespeech.s2t.exps.u2_kaldi.bin.rst b/docs/source/api/paddlespeech.s2t.exps.u2_kaldi.bin.rst index ff1a6efe..087b8767 100644 --- a/docs/source/api/paddlespeech.s2t.exps.u2_kaldi.bin.rst +++ b/docs/source/api/paddlespeech.s2t.exps.u2_kaldi.bin.rst @@ -12,6 +12,5 @@ Submodules .. toctree:: :maxdepth: 4 - paddlespeech.s2t.exps.u2_kaldi.bin.recog paddlespeech.s2t.exps.u2_kaldi.bin.test paddlespeech.s2t.exps.u2_kaldi.bin.train diff --git a/docs/source/api/paddlespeech.s2t.training.extensions.rst b/docs/source/api/paddlespeech.s2t.training.extensions.rst index f31b8427..13530a8d 100644 --- a/docs/source/api/paddlespeech.s2t.training.extensions.rst +++ b/docs/source/api/paddlespeech.s2t.training.extensions.rst @@ -15,5 +15,3 @@ Submodules paddlespeech.s2t.training.extensions.evaluator paddlespeech.s2t.training.extensions.extension paddlespeech.s2t.training.extensions.plot - paddlespeech.s2t.training.extensions.snapshot - paddlespeech.s2t.training.extensions.visualizer diff --git a/docs/source/api/paddlespeech.s2t.training.extensions.snapshot.rst b/docs/source/api/paddlespeech.s2t.training.extensions.snapshot.rst deleted file mode 100644 index e0ca21a7..00000000 --- a/docs/source/api/paddlespeech.s2t.training.extensions.snapshot.rst +++ /dev/null @@ -1,7 +0,0 @@ -paddlespeech.s2t.training.extensions.snapshot module 
-==================================================== - -.. automodule:: paddlespeech.s2t.training.extensions.snapshot - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/paddlespeech.s2t.training.extensions.visualizer.rst b/docs/source/api/paddlespeech.s2t.training.extensions.visualizer.rst deleted file mode 100644 index 22ae11f1..00000000 --- a/docs/source/api/paddlespeech.s2t.training.extensions.visualizer.rst +++ /dev/null @@ -1,7 +0,0 @@ -paddlespeech.s2t.training.extensions.visualizer module -====================================================== - -.. automodule:: paddlespeech.s2t.training.extensions.visualizer - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/paddlespeech.s2t.training.updaters.rst b/docs/source/api/paddlespeech.s2t.training.updaters.rst index a0617016..b38704a0 100644 --- a/docs/source/api/paddlespeech.s2t.training.updaters.rst +++ b/docs/source/api/paddlespeech.s2t.training.updaters.rst @@ -13,5 +13,4 @@ Submodules :maxdepth: 4 paddlespeech.s2t.training.updaters.standard_updater - paddlespeech.s2t.training.updaters.trainer paddlespeech.s2t.training.updaters.updater diff --git a/docs/source/api/paddlespeech.s2t.training.updaters.trainer.rst b/docs/source/api/paddlespeech.s2t.training.updaters.trainer.rst deleted file mode 100644 index 6981a8f0..00000000 --- a/docs/source/api/paddlespeech.s2t.training.updaters.trainer.rst +++ /dev/null @@ -1,7 +0,0 @@ -paddlespeech.s2t.training.updaters.trainer module -================================================= - -.. 
automodule:: paddlespeech.s2t.training.updaters.trainer - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/paddlespeech.s2t.transform.add_deltas.rst b/docs/source/api/paddlespeech.s2t.transform.add_deltas.rst deleted file mode 100644 index 5007fd9d..00000000 --- a/docs/source/api/paddlespeech.s2t.transform.add_deltas.rst +++ /dev/null @@ -1,7 +0,0 @@ -paddlespeech.s2t.transform.add\_deltas module -============================================= - -.. automodule:: paddlespeech.s2t.transform.add_deltas - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/paddlespeech.s2t.transform.channel_selector.rst b/docs/source/api/paddlespeech.s2t.transform.channel_selector.rst deleted file mode 100644 index e08dd253..00000000 --- a/docs/source/api/paddlespeech.s2t.transform.channel_selector.rst +++ /dev/null @@ -1,7 +0,0 @@ -paddlespeech.s2t.transform.channel\_selector module -=================================================== - -.. automodule:: paddlespeech.s2t.transform.channel_selector - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/paddlespeech.s2t.transform.cmvn.rst b/docs/source/api/paddlespeech.s2t.transform.cmvn.rst deleted file mode 100644 index 8348e3d4..00000000 --- a/docs/source/api/paddlespeech.s2t.transform.cmvn.rst +++ /dev/null @@ -1,7 +0,0 @@ -paddlespeech.s2t.transform.cmvn module -====================================== - -.. automodule:: paddlespeech.s2t.transform.cmvn - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/paddlespeech.s2t.transform.functional.rst b/docs/source/api/paddlespeech.s2t.transform.functional.rst deleted file mode 100644 index eb2b54a6..00000000 --- a/docs/source/api/paddlespeech.s2t.transform.functional.rst +++ /dev/null @@ -1,7 +0,0 @@ -paddlespeech.s2t.transform.functional module -============================================ - -.. 
automodule:: paddlespeech.s2t.transform.functional - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/paddlespeech.s2t.transform.perturb.rst b/docs/source/api/paddlespeech.s2t.transform.perturb.rst deleted file mode 100644 index 0be28ab7..00000000 --- a/docs/source/api/paddlespeech.s2t.transform.perturb.rst +++ /dev/null @@ -1,7 +0,0 @@ -paddlespeech.s2t.transform.perturb module -========================================= - -.. automodule:: paddlespeech.s2t.transform.perturb - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/paddlespeech.s2t.transform.rst b/docs/source/api/paddlespeech.s2t.transform.rst deleted file mode 100644 index 5016ff4f..00000000 --- a/docs/source/api/paddlespeech.s2t.transform.rst +++ /dev/null @@ -1,24 +0,0 @@ -paddlespeech.s2t.transform package -================================== - -.. automodule:: paddlespeech.s2t.transform - :members: - :undoc-members: - :show-inheritance: - -Submodules ----------- - -.. toctree:: - :maxdepth: 4 - - paddlespeech.s2t.transform.add_deltas - paddlespeech.s2t.transform.channel_selector - paddlespeech.s2t.transform.cmvn - paddlespeech.s2t.transform.functional - paddlespeech.s2t.transform.perturb - paddlespeech.s2t.transform.spec_augment - paddlespeech.s2t.transform.spectrogram - paddlespeech.s2t.transform.transform_interface - paddlespeech.s2t.transform.transformation - paddlespeech.s2t.transform.wpe diff --git a/docs/source/api/paddlespeech.s2t.transform.spec_augment.rst b/docs/source/api/paddlespeech.s2t.transform.spec_augment.rst deleted file mode 100644 index 00fd3ea1..00000000 --- a/docs/source/api/paddlespeech.s2t.transform.spec_augment.rst +++ /dev/null @@ -1,7 +0,0 @@ -paddlespeech.s2t.transform.spec\_augment module -=============================================== - -.. 
automodule:: paddlespeech.s2t.transform.spec_augment - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/paddlespeech.s2t.transform.spectrogram.rst b/docs/source/api/paddlespeech.s2t.transform.spectrogram.rst deleted file mode 100644 index 33c499a7..00000000 --- a/docs/source/api/paddlespeech.s2t.transform.spectrogram.rst +++ /dev/null @@ -1,7 +0,0 @@ -paddlespeech.s2t.transform.spectrogram module -============================================= - -.. automodule:: paddlespeech.s2t.transform.spectrogram - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/paddlespeech.s2t.transform.transform_interface.rst b/docs/source/api/paddlespeech.s2t.transform.transform_interface.rst deleted file mode 100644 index 009b0658..00000000 --- a/docs/source/api/paddlespeech.s2t.transform.transform_interface.rst +++ /dev/null @@ -1,7 +0,0 @@ -paddlespeech.s2t.transform.transform\_interface module -====================================================== - -.. automodule:: paddlespeech.s2t.transform.transform_interface - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/paddlespeech.s2t.transform.transformation.rst b/docs/source/api/paddlespeech.s2t.transform.transformation.rst deleted file mode 100644 index a03e731a..00000000 --- a/docs/source/api/paddlespeech.s2t.transform.transformation.rst +++ /dev/null @@ -1,7 +0,0 @@ -paddlespeech.s2t.transform.transformation module -================================================ - -.. automodule:: paddlespeech.s2t.transform.transformation - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/paddlespeech.s2t.transform.wpe.rst b/docs/source/api/paddlespeech.s2t.transform.wpe.rst deleted file mode 100644 index c4831f7f..00000000 --- a/docs/source/api/paddlespeech.s2t.transform.wpe.rst +++ /dev/null @@ -1,7 +0,0 @@ -paddlespeech.s2t.transform.wpe module -===================================== - -.. 
automodule:: paddlespeech.s2t.transform.wpe - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/paddlespeech.server.engine.acs.python.acs_engine.rst b/docs/source/api/paddlespeech.server.engine.acs.python.acs_engine.rst deleted file mode 100644 index 9b61633e..00000000 --- a/docs/source/api/paddlespeech.server.engine.acs.python.acs_engine.rst +++ /dev/null @@ -1,7 +0,0 @@ -paddlespeech.server.engine.acs.python.acs\_engine module -======================================================== - -.. automodule:: paddlespeech.server.engine.acs.python.acs_engine - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/paddlespeech.server.engine.acs.python.rst b/docs/source/api/paddlespeech.server.engine.acs.python.rst index 3c06ba08..7e5582bd 100644 --- a/docs/source/api/paddlespeech.server.engine.acs.python.rst +++ b/docs/source/api/paddlespeech.server.engine.acs.python.rst @@ -12,4 +12,3 @@ Submodules .. toctree:: :maxdepth: 4 - paddlespeech.server.engine.acs.python.acs_engine diff --git a/docs/source/api/paddlespeech.server.utils.log.rst b/docs/source/api/paddlespeech.server.utils.log.rst deleted file mode 100644 index 453b4a61..00000000 --- a/docs/source/api/paddlespeech.server.utils.log.rst +++ /dev/null @@ -1,7 +0,0 @@ -paddlespeech.server.utils.log module -==================================== - -.. 
automodule:: paddlespeech.server.utils.log - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.rst b/docs/source/api/paddlespeech.t2s.exps.rst index bee18a97..643f97b4 100644 --- a/docs/source/api/paddlespeech.t2s.exps.rst +++ b/docs/source/api/paddlespeech.t2s.exps.rst @@ -30,10 +30,9 @@ Submodules paddlespeech.t2s.exps.inference paddlespeech.t2s.exps.inference_streaming paddlespeech.t2s.exps.ort_predict paddlespeech.t2s.exps.ort_predict_e2e paddlespeech.t2s.exps.ort_predict_streaming - paddlespeech.t2s.exps.stream_play_tts paddlespeech.t2s.exps.syn_utils paddlespeech.t2s.exps.synthesize paddlespeech.t2s.exps.synthesize_e2e diff --git a/docs/source/api/paddlespeech.t2s.exps.stream_play_tts.rst b/docs/source/api/paddlespeech.t2s.exps.stream_play_tts.rst deleted file mode 100644 index cb22dde0..00000000 --- a/docs/source/api/paddlespeech.t2s.exps.stream_play_tts.rst +++ /dev/null @@ -1,7 +0,0 @@ -paddlespeech.t2s.exps.stream\_play\_tts module -============================================== - -.. automodule:: paddlespeech.t2s.exps.stream_play_tts - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.models.ernie_sat.mlm.rst b/docs/source/api/paddlespeech.t2s.models.ernie_sat.mlm.rst deleted file mode 100644 index f0e8fd11..00000000 --- a/docs/source/api/paddlespeech.t2s.models.ernie_sat.mlm.rst +++ /dev/null @@ -1,7 +0,0 @@ -paddlespeech.t2s.models.ernie\_sat.mlm module -============================================= - -..
automodule:: paddlespeech.t2s.models.ernie_sat.mlm - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.core.rst b/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.core.rst deleted file mode 100644 index 7aaba795..00000000 --- a/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.core.rst +++ /dev/null @@ -1,7 +0,0 @@ -paddlespeech.t2s.models.vits.monotonic\_align.core module -========================================================= - -.. automodule:: paddlespeech.t2s.models.vits.monotonic_align.core - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.rst b/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.rst deleted file mode 100644 index 25c819a7..00000000 --- a/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.rst +++ /dev/null @@ -1,16 +0,0 @@ -paddlespeech.t2s.models.vits.monotonic\_align package -===================================================== - -.. automodule:: paddlespeech.t2s.models.vits.monotonic_align - :members: - :undoc-members: - :show-inheritance: - -Submodules ----------- - -.. toctree:: - :maxdepth: 4 - - paddlespeech.t2s.models.vits.monotonic_align.core - paddlespeech.t2s.models.vits.monotonic_align.setup diff --git a/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.setup.rst b/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.setup.rst deleted file mode 100644 index a93c3b8b..00000000 --- a/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.setup.rst +++ /dev/null @@ -1,7 +0,0 @@ -paddlespeech.t2s.models.vits.monotonic\_align.setup module -========================================================== - -.. 
automodule:: paddlespeech.t2s.models.vits.monotonic_align.setup - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.models.vits.rst b/docs/source/api/paddlespeech.t2s.models.vits.rst index 3146094b..205496f0 100644 --- a/docs/source/api/paddlespeech.t2s.models.vits.rst +++ b/docs/source/api/paddlespeech.t2s.models.vits.rst @@ -12,7 +12,6 @@ Subpackages .. toctree:: :maxdepth: 4 - paddlespeech.t2s.models.vits.monotonic_align paddlespeech.t2s.models.vits.wavenet Submodules diff --git a/docs/source/tts/demo.rst b/docs/source/tts/demo.rst index ca2fd98e..1ae687f8 100644 --- a/docs/source/tts/demo.rst +++ b/docs/source/tts/demo.rst @@ -42,7 +42,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder. Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition -