From 1cdd41bd03488b38c6082c766bf819b6bc94f61c Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Tue, 24 May 2022 09:46:12 +0000 Subject: [PATCH 01/40] fix pad_sequence, test=asr --- paddlespeech/s2t/utils/tensor_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddlespeech/s2t/utils/tensor_utils.py b/paddlespeech/s2t/utils/tensor_utils.py index 0dbaa0b6b..e105253c2 100644 --- a/paddlespeech/s2t/utils/tensor_utils.py +++ b/paddlespeech/s2t/utils/tensor_utils.py @@ -82,7 +82,7 @@ def pad_sequence(sequences: List[paddle.Tensor], max_size = sequences[0].size() # (TODO Hui Zhang): slice not supprot `end==start` # trailing_dims = max_size[1:] - trailing_dims = max_size[1:] if max_size.ndim >= 2 else () + trailing_dims = tuple(max_size[1:].numpy().tolist()) if sequences[0].ndim >= 2 else () max_len = max([s.shape[0] for s in sequences]) if batch_first: out_dims = (len(sequences), max_len) + trailing_dims @@ -99,7 +99,7 @@ def pad_sequence(sequences: List[paddle.Tensor], if batch_first: # TODO (Hui Zhang): set_value op not supprot `end==start` # TODO (Hui Zhang): set_value op not support int16 - # TODO (Hui Zhang): set_varbase 2 rank not support [0,0,...] + # TODO (Hui Zhang): set_varbase 2 rank not support [0,0,...] # out_tensor[i, :length, ...] 
= tensor if length != 0: out_tensor[i, :length] = tensor @@ -145,7 +145,7 @@ def add_sos_eos(ys_pad: paddle.Tensor, sos: int, eos: int, [ 4, 5, 6, 11, -1, -1], [ 7, 8, 9, 11, -1, -1]]) """ - # TODO(Hui Zhang): using comment code, + # TODO(Hui Zhang): using comment code, #_sos = paddle.to_tensor( # [sos], dtype=paddle.long, stop_gradient=True, place=ys_pad.place) #_eos = paddle.to_tensor( From e1888f9ae6d239b8c28f9739f7fd2a0120caac9e Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Tue, 24 May 2022 12:37:42 +0000 Subject: [PATCH 02/40] remove size,test=asr --- paddlespeech/s2t/__init__.py | 19 ------------- .../s2t/decoders/beam_search/beam_search.py | 10 +++---- paddlespeech/s2t/decoders/scorers/ctc.py | 4 +-- .../s2t/decoders/scorers/ctc_prefix_score.py | 27 +++++++++---------- paddlespeech/s2t/models/u2/u2.py | 2 +- paddlespeech/s2t/modules/decoder.py | 2 +- paddlespeech/s2t/modules/embedding.py | 4 +-- paddlespeech/s2t/utils/tensor_utils.py | 6 ++--- 8 files changed, 27 insertions(+), 47 deletions(-) diff --git a/paddlespeech/s2t/__init__.py b/paddlespeech/s2t/__init__.py index 2365071f3..7ec9e1aba 100644 --- a/paddlespeech/s2t/__init__.py +++ b/paddlespeech/s2t/__init__.py @@ -189,25 +189,6 @@ if not hasattr(paddle.Tensor, 'contiguous'): paddle.static.Variable.contiguous = contiguous -def size(xs: paddle.Tensor, *args: int) -> paddle.Tensor: - nargs = len(args) - assert (nargs <= 1) - s = paddle.shape(xs) - if nargs == 1: - return s[args[0]] - else: - return s - - -#`to_static` do not process `size` property, maybe some `paddle` api dependent on it. -logger.debug( - "override size of paddle.Tensor " - "(`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!" 
-) -paddle.Tensor.size = size -paddle.static.Variable.size = size - - def view(xs: paddle.Tensor, *args: int) -> paddle.Tensor: return xs.reshape(args) diff --git a/paddlespeech/s2t/decoders/beam_search/beam_search.py b/paddlespeech/s2t/decoders/beam_search/beam_search.py index f331cb1c9..5029e1577 100644 --- a/paddlespeech/s2t/decoders/beam_search/beam_search.py +++ b/paddlespeech/s2t/decoders/beam_search/beam_search.py @@ -194,7 +194,7 @@ class BeamSearch(paddle.nn.Layer): Args: hyp (Hypothesis): Hypothesis with prefix tokens to score - ids (paddle.Tensor): 1D tensor of new partial tokens to score, + ids (paddle.Tensor): 1D tensor of new partial tokens to score, len(ids) < n_vocab x (paddle.Tensor): Corresponding input feature, (T, D) @@ -224,14 +224,14 @@ class BeamSearch(paddle.nn.Layer): ids (paddle.Tensor): The partial token ids(Global) to compute topk. Returns: - Tuple[paddle.Tensor, paddle.Tensor]: + Tuple[paddle.Tensor, paddle.Tensor]: The topk full token ids and partial token ids. Their shapes are `(self.beam_size,)`. i.e. (global ids, global relative local ids). 
""" # no pre beam performed, `ids` equal to `weighted_scores` - if weighted_scores.size(0) == ids.size(0): + if weighted_scores.shape[0] == ids.shape[0]: top_ids = weighted_scores.topk( self.beam_size)[1] # index in n_vocab return top_ids, top_ids @@ -374,8 +374,8 @@ class BeamSearch(paddle.nn.Layer): elif maxlenratio < 0: maxlen = -1 * int(maxlenratio) else: - maxlen = max(1, int(maxlenratio * x.size(0))) - minlen = int(minlenratio * x.size(0)) + maxlen = max(1, int(maxlenratio * x.shape[0])) + minlen = int(minlenratio * x.shape[0]) logger.info("decoder input length: " + str(x.shape[0])) logger.info("max output length: " + str(maxlen)) logger.info("min output length: " + str(minlen)) diff --git a/paddlespeech/s2t/decoders/scorers/ctc.py b/paddlespeech/s2t/decoders/scorers/ctc.py index 81d8b0783..6f1d8c007 100644 --- a/paddlespeech/s2t/decoders/scorers/ctc.py +++ b/paddlespeech/s2t/decoders/scorers/ctc.py @@ -69,7 +69,7 @@ class CTCPrefixScorer(BatchPartialScorerInterface): return sc[i], st[i] else: # for CTCPrefixScorePD (need new_id > 0) r, log_psi, f_min, f_max, scoring_idmap = state - s = log_psi[i, new_id].expand(log_psi.size(1)) + s = log_psi[i, new_id].expand(log_psi.shape[1]) if scoring_idmap is not None: return r[:, :, i, scoring_idmap[i, new_id]], s, f_min, f_max else: @@ -107,7 +107,7 @@ class CTCPrefixScorer(BatchPartialScorerInterface): """ logp = self.ctc.log_softmax(x.unsqueeze(0)) # assuming batch_size = 1 - xlen = paddle.to_tensor([logp.size(1)]) + xlen = paddle.to_tensor([logp.shape[1]]) self.impl = CTCPrefixScorePD(logp, xlen, 0, self.eos) return None diff --git a/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py b/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py index 78b8fe36c..0e63a52a8 100644 --- a/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py +++ b/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py @@ -33,9 +33,9 @@ class CTCPrefixScorePD(): self.logzero = -10000000000.0 self.blank = blank self.eos = eos - self.batch = 
x.size(0) - self.input_length = x.size(1) - self.odim = x.size(2) + self.batch = x.shape[0] + self.input_length = x.shape[1] + self.odim = x.shape[2] self.dtype = x.dtype # Pad the rest of posteriors in the batch @@ -76,8 +76,7 @@ class CTCPrefixScorePD(): last_ids = [yi[-1] for yi in y] # last output label ids n_bh = len(last_ids) # batch * hyps n_hyps = n_bh // self.batch # assuming each utterance has the same # of hyps - self.scoring_num = scoring_ids.size( - -1) if scoring_ids is not None else 0 + self.scoring_num = scoring_ids.shape[-1] if scoring_ids is not None else 0 # prepare state info if state is None: r_prev = paddle.full( @@ -153,7 +152,7 @@ class CTCPrefixScorePD(): # compute forward probabilities log(r_t^n(h)) and log(r_t^b(h)) for t in range(start, end): - rp = r[t - 1] # (2 x BW x O') + rp = r[t - 1] # (2 x BW x O') rr = paddle.stack([rp[0], log_phi[t - 1], rp[0], rp[1]]).view( 2, 2, n_bh, snum) # (2,2,BW,O') r[t] = paddle.logsumexp(rr, 1) + x_[:, t] @@ -227,7 +226,7 @@ class CTCPrefixScorePD(): if self.x.shape[1] < x.shape[1]: # self.x (2,T,B,O); x (B,T,O) # Pad the rest of posteriors in the batch # TODO(takaaki-hori): need a better way without for-loops - xlens = [x.size(1)] + xlens = [x.shape[1]] for i, l in enumerate(xlens): if l < self.input_length: x[i, l:, :] = self.logzero @@ -237,7 +236,7 @@ class CTCPrefixScorePD(): xb = xn[:, :, self.blank].unsqueeze(2).expand(-1, -1, self.odim) self.x = paddle.stack([xn, xb]) # (2, T, B, O) self.x[:, :tmp_x.shape[1], :, :] = tmp_x - self.input_length = x.size(1) + self.input_length = x.shape[1] self.end_frames = paddle.to_tensor(xlens) - 1 def extend_state(self, state): @@ -318,16 +317,16 @@ class CTCPrefixScore(): r[0, 0] = xs[0] r[0, 1] = self.logzero else: - # Although the code does not exactly follow Algorithm 2, - # we don't have to change it because we can assume - # r_t(h)=0 for t < |h| in CTC forward computation + # Although the code does not exactly follow Algorithm 2, + # we don't have to 
change it because we can assume + # r_t(h)=0 for t < |h| in CTC forward computation # (Note: we assume here that index t starts with 0). # The purpose of this difference is to reduce the number of for-loops. # https://github.com/espnet/espnet/pull/3655 - # where we start to accumulate r_t(h) from t=|h| - # and iterate r_t(h) = (r_{t-1}(h) + ...) to T-1, + # where we start to accumulate r_t(h) from t=|h| + # and iterate r_t(h) = (r_{t-1}(h) + ...) to T-1, # avoiding accumulating zeros for t=1~|h|-1. - # Thus, we need to set r_{|h|-1}(h) = 0, + # Thus, we need to set r_{|h|-1}(h) = 0, # i.e., r[output_length-1] = logzero, for initialization. # This is just for reducing the computation. r[output_length - 1] = self.logzero diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 530840d0f..e3f46b15a 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -775,7 +775,7 @@ class U2DecodeModel(U2BaseModel): """ self.eval() x = paddle.to_tensor(x).unsqueeze(0) - ilen = x.size(1) + ilen = x.shape[1] enc_output, _ = self._forward_encoder(x, ilen) return enc_output.squeeze(0) diff --git a/paddlespeech/s2t/modules/decoder.py b/paddlespeech/s2t/modules/decoder.py index 42ac119b4..ce78059c0 100644 --- a/paddlespeech/s2t/modules/decoder.py +++ b/paddlespeech/s2t/modules/decoder.py @@ -242,7 +242,7 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer): ] # batch decoding - ys_mask = subsequent_mask(ys.size(-1)).unsqueeze(0) # (B,L,L) + ys_mask = subsequent_mask(ys.shape[-1]).unsqueeze(0) # (B,L,L) xs_mask = make_xs_mask(xs).unsqueeze(1) # (B,1,T) logp, states = self.forward_one_step( xs, xs_mask, ys, ys_mask, cache=batch_state) diff --git a/paddlespeech/s2t/modules/embedding.py b/paddlespeech/s2t/modules/embedding.py index 596f61b78..cc1fdffe2 100644 --- a/paddlespeech/s2t/modules/embedding.py +++ b/paddlespeech/s2t/modules/embedding.py @@ -115,7 +115,7 @@ class PositionalEncoding(nn.Layer, 
PositionalEncodingInterface): assert offset + x.shape[ 1] < self.max_len, "offset: {} + x.shape[1]: {} is larger than the max_len: {}".format( offset, x.shape[1], self.max_len) - #TODO(Hui Zhang): using T = x.size(1), __getitem__ not support Tensor + #TODO(Hui Zhang): using T = x.shape[1], __getitem__ not support Tensor pos_emb = self.pe[:, offset:offset + T] x = x * self.xscale + pos_emb return self.dropout(x), self.dropout(pos_emb) @@ -165,6 +165,6 @@ class RelPositionalEncoding(PositionalEncoding): 1] < self.max_len, "offset: {} + x.shape[1]: {} is larger than the max_len: {}".format( offset, x.shape[1], self.max_len) x = x * self.xscale - #TODO(Hui Zhang): using x.size(1), __getitem__ not support Tensor + #TODO(Hui Zhang): using x.shape[1], __getitem__ not support Tensor pos_emb = self.pe[:, offset:offset + x.shape[1]] return self.dropout(x), self.dropout(pos_emb) diff --git a/paddlespeech/s2t/utils/tensor_utils.py b/paddlespeech/s2t/utils/tensor_utils.py index e105253c2..ca8689569 100644 --- a/paddlespeech/s2t/utils/tensor_utils.py +++ b/paddlespeech/s2t/utils/tensor_utils.py @@ -58,8 +58,8 @@ def pad_sequence(sequences: List[paddle.Tensor], >>> a = paddle.ones(25, 300) >>> b = paddle.ones(22, 300) >>> c = paddle.ones(15, 300) - >>> pad_sequence([a, b, c]).size() - paddle.Tensor([25, 3, 300]) + >>> pad_sequence([a, b, c]).shape + [25, 3, 300] Note: This function returns a Tensor of size ``T x B x *`` or ``B x T x *`` @@ -79,7 +79,7 @@ def pad_sequence(sequences: List[paddle.Tensor], # assuming trailing dimensions and type of all the Tensors # in sequences are same and fetching those from sequences[0] - max_size = sequences[0].size() + max_size = sequences[0].shape # (TODO Hui Zhang): slice not supprot `end==start` # trailing_dims = max_size[1:] trailing_dims = tuple(max_size[1:].numpy().tolist()) if sequences[0].ndim >= 2 else () From 4c09927f61668952ee263cd178798b0ea5634760 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Tue, 24 May 2022 13:34:01 +0000 
Subject: [PATCH 03/40] fix --- paddlespeech/s2t/__init__.py | 2 +- paddlespeech/s2t/models/lm/transformer.py | 4 ++-- paddlespeech/s2t/modules/encoder.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/paddlespeech/s2t/__init__.py b/paddlespeech/s2t/__init__.py index 7ec9e1aba..a2fce3057 100644 --- a/paddlespeech/s2t/__init__.py +++ b/paddlespeech/s2t/__init__.py @@ -200,7 +200,7 @@ if not hasattr(paddle.Tensor, 'view'): def view_as(xs: paddle.Tensor, ys: paddle.Tensor) -> paddle.Tensor: - return xs.reshape(ys.size()) + return xs.reshape(ys.shape) if not hasattr(paddle.Tensor, 'view_as'): diff --git a/paddlespeech/s2t/models/lm/transformer.py b/paddlespeech/s2t/models/lm/transformer.py index 85bd7c232..bb281168f 100644 --- a/paddlespeech/s2t/models/lm/transformer.py +++ b/paddlespeech/s2t/models/lm/transformer.py @@ -90,7 +90,7 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface): def _target_mask(self, ys_in_pad): ys_mask = ys_in_pad != 0 - m = subsequent_mask(ys_mask.size(-1)).unsqueeze(0) + m = subsequent_mask(ys_mask.shape[-1])).unsqueeze(0) return ys_mask.unsqueeze(-2) & m def forward(self, x: paddle.Tensor, t: paddle.Tensor @@ -112,7 +112,7 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface): in perplexity: p(t)^{-n} = exp(-log p(t) / n) """ - batch_size = x.size(0) + batch_size = x.shape[0] xm = x != 0 xlen = xm.sum(axis=1) if self.embed_drop is not None: diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py index 669a12d65..7298c61f2 100644 --- a/paddlespeech/s2t/modules/encoder.py +++ b/paddlespeech/s2t/modules/encoder.py @@ -218,7 +218,7 @@ class BaseEncoder(nn.Layer): assert xs.shape[0] == 1 # batch size must be one # tmp_masks is just for interface compatibility # TODO(Hui Zhang): stride_slice not support bool tensor - # tmp_masks = paddle.ones([1, xs.size(1)], dtype=paddle.bool) + # tmp_masks = paddle.ones([1, xs.shape[1]], dtype=paddle.bool) tmp_masks = 
paddle.ones([1, xs.shape[1]], dtype=paddle.int32) tmp_masks = tmp_masks.unsqueeze(1) #[B=1, C=1, T] From b23bde8ec5ff4ed3990f151246dfbb8c9dccf385 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Wed, 25 May 2022 03:30:48 +0000 Subject: [PATCH 04/40] tensor.shape => paddle.shape(tensor) --- paddlespeech/s2t/__init__.py | 2 +- paddlespeech/s2t/decoders/beam_search/beam_search.py | 10 +++++----- paddlespeech/s2t/decoders/scorers/ctc.py | 4 ++-- .../s2t/decoders/scorers/ctc_prefix_score.py | 12 ++++++------ paddlespeech/s2t/models/lm/transformer.py | 6 +++--- paddlespeech/s2t/models/u2/u2.py | 2 +- paddlespeech/s2t/modules/decoder.py | 2 +- paddlespeech/s2t/modules/embedding.py | 4 ++-- paddlespeech/s2t/modules/encoder.py | 2 +- paddlespeech/s2t/utils/tensor_utils.py | 4 ++-- 10 files changed, 24 insertions(+), 24 deletions(-) diff --git a/paddlespeech/s2t/__init__.py b/paddlespeech/s2t/__init__.py index a2fce3057..2da68435c 100644 --- a/paddlespeech/s2t/__init__.py +++ b/paddlespeech/s2t/__init__.py @@ -200,7 +200,7 @@ if not hasattr(paddle.Tensor, 'view'): def view_as(xs: paddle.Tensor, ys: paddle.Tensor) -> paddle.Tensor: - return xs.reshape(ys.shape) + return xs.reshape(paddle.shape(ys)) if not hasattr(paddle.Tensor, 'view_as'): diff --git a/paddlespeech/s2t/decoders/beam_search/beam_search.py b/paddlespeech/s2t/decoders/beam_search/beam_search.py index 5029e1577..f6a2b4b0a 100644 --- a/paddlespeech/s2t/decoders/beam_search/beam_search.py +++ b/paddlespeech/s2t/decoders/beam_search/beam_search.py @@ -231,7 +231,7 @@ class BeamSearch(paddle.nn.Layer): """ # no pre beam performed, `ids` equal to `weighted_scores` - if weighted_scores.shape[0] == ids.shape[0]: + if paddle.shape(weighted_scores)[0] == paddle.shape(ids)[0]: top_ids = weighted_scores.topk( self.beam_size)[1] # index in n_vocab return top_ids, top_ids @@ -370,13 +370,13 @@ class BeamSearch(paddle.nn.Layer): """ # set length bounds if maxlenratio == 0: - maxlen = x.shape[0] + maxlen = paddle.shape(x)[0] 
elif maxlenratio < 0: maxlen = -1 * int(maxlenratio) else: - maxlen = max(1, int(maxlenratio * x.shape[0])) - minlen = int(minlenratio * x.shape[0]) - logger.info("decoder input length: " + str(x.shape[0])) + maxlen = max(1, int(maxlenratio * paddle.shape(x)[0])) + minlen = int(minlenratio * paddle.shape(x)[0]) + logger.info("decoder input length: " + str(paddle.shape(x)[0])) logger.info("max output length: " + str(maxlen)) logger.info("min output length: " + str(minlen)) diff --git a/paddlespeech/s2t/decoders/scorers/ctc.py b/paddlespeech/s2t/decoders/scorers/ctc.py index 6f1d8c007..3c1d4cf80 100644 --- a/paddlespeech/s2t/decoders/scorers/ctc.py +++ b/paddlespeech/s2t/decoders/scorers/ctc.py @@ -69,7 +69,7 @@ class CTCPrefixScorer(BatchPartialScorerInterface): return sc[i], st[i] else: # for CTCPrefixScorePD (need new_id > 0) r, log_psi, f_min, f_max, scoring_idmap = state - s = log_psi[i, new_id].expand(log_psi.shape[1]) + s = log_psi[i, new_id].expand(paddle.shape(log_psi)[1]) if scoring_idmap is not None: return r[:, :, i, scoring_idmap[i, new_id]], s, f_min, f_max else: @@ -107,7 +107,7 @@ class CTCPrefixScorer(BatchPartialScorerInterface): """ logp = self.ctc.log_softmax(x.unsqueeze(0)) # assuming batch_size = 1 - xlen = paddle.to_tensor([logp.shape[1]]) + xlen = paddle.to_tensor([paddle.shape(logp)[1]]) self.impl = CTCPrefixScorePD(logp, xlen, 0, self.eos) return None diff --git a/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py b/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py index 0e63a52a8..d8ca5ccde 100644 --- a/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py +++ b/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py @@ -33,9 +33,9 @@ class CTCPrefixScorePD(): self.logzero = -10000000000.0 self.blank = blank self.eos = eos - self.batch = x.shape[0] - self.input_length = x.shape[1] - self.odim = x.shape[2] + self.batch = paddle.shape(x)[0] + self.input_length = paddle.shape(x)[1] + self.odim = paddle.shape(x)[2] self.dtype = x.dtype # 
Pad the rest of posteriors in the batch @@ -76,7 +76,7 @@ class CTCPrefixScorePD(): last_ids = [yi[-1] for yi in y] # last output label ids n_bh = len(last_ids) # batch * hyps n_hyps = n_bh // self.batch # assuming each utterance has the same # of hyps - self.scoring_num = scoring_ids.shape[-1] if scoring_ids is not None else 0 + self.scoring_num = paddle.shape(scoring_ids)[-1] if scoring_ids is not None else 0 # prepare state info if state is None: r_prev = paddle.full( @@ -226,7 +226,7 @@ class CTCPrefixScorePD(): if self.x.shape[1] < x.shape[1]: # self.x (2,T,B,O); x (B,T,O) # Pad the rest of posteriors in the batch # TODO(takaaki-hori): need a better way without for-loops - xlens = [x.shape[1]] + xlens = [paddle.shape(x)[1]] for i, l in enumerate(xlens): if l < self.input_length: x[i, l:, :] = self.logzero @@ -236,7 +236,7 @@ class CTCPrefixScorePD(): xb = xn[:, :, self.blank].unsqueeze(2).expand(-1, -1, self.odim) self.x = paddle.stack([xn, xb]) # (2, T, B, O) self.x[:, :tmp_x.shape[1], :, :] = tmp_x - self.input_length = x.shape[1] + self.input_length = paddle.shape(x)[1] self.end_frames = paddle.to_tensor(xlens) - 1 def extend_state(self, state): diff --git a/paddlespeech/s2t/models/lm/transformer.py b/paddlespeech/s2t/models/lm/transformer.py index bb281168f..d14f99563 100644 --- a/paddlespeech/s2t/models/lm/transformer.py +++ b/paddlespeech/s2t/models/lm/transformer.py @@ -90,7 +90,7 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface): def _target_mask(self, ys_in_pad): ys_mask = ys_in_pad != 0 - m = subsequent_mask(ys_mask.shape[-1])).unsqueeze(0) + m = subsequent_mask(paddle.shape(ys_mask)[-1])).unsqueeze(0) return ys_mask.unsqueeze(-2) & m def forward(self, x: paddle.Tensor, t: paddle.Tensor @@ -112,7 +112,7 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface): in perplexity: p(t)^{-n} = exp(-log p(t) / n) """ - batch_size = x.shape[0] + batch_size = paddle.shape(x)[0] xm = x != 0 xlen = xm.sum(axis=1) if self.embed_drop 
is not None: @@ -122,7 +122,7 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface): h, _ = self.encoder(emb, xlen) y = self.decoder(h) loss = F.cross_entropy( - y.view(-1, y.shape[-1]), t.view(-1), reduction="none") + y.view(-1, paddle.shape(y)[-1]), t.view(-1), reduction="none") mask = xm.to(loss.dtype) logp = loss * mask.view(-1) nll = logp.view(batch_size, -1).sum(-1) diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index e3f46b15a..d5471369f 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -775,7 +775,7 @@ class U2DecodeModel(U2BaseModel): """ self.eval() x = paddle.to_tensor(x).unsqueeze(0) - ilen = x.shape[1] + ilen = paddle.shape(x)[1] enc_output, _ = self._forward_encoder(x, ilen) return enc_output.squeeze(0) diff --git a/paddlespeech/s2t/modules/decoder.py b/paddlespeech/s2t/modules/decoder.py index ce78059c0..ccc8482d5 100644 --- a/paddlespeech/s2t/modules/decoder.py +++ b/paddlespeech/s2t/modules/decoder.py @@ -242,7 +242,7 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer): ] # batch decoding - ys_mask = subsequent_mask(ys.shape[-1]).unsqueeze(0) # (B,L,L) + ys_mask = subsequent_mask(paddle.shape(ys)[-1]).unsqueeze(0) # (B,L,L) xs_mask = make_xs_mask(xs).unsqueeze(1) # (B,1,T) logp, states = self.forward_one_step( xs, xs_mask, ys, ys_mask, cache=batch_state) diff --git a/paddlespeech/s2t/modules/embedding.py b/paddlespeech/s2t/modules/embedding.py index cc1fdffe2..51e558eb8 100644 --- a/paddlespeech/s2t/modules/embedding.py +++ b/paddlespeech/s2t/modules/embedding.py @@ -115,7 +115,7 @@ class PositionalEncoding(nn.Layer, PositionalEncodingInterface): assert offset + x.shape[ 1] < self.max_len, "offset: {} + x.shape[1]: {} is larger than the max_len: {}".format( offset, x.shape[1], self.max_len) - #TODO(Hui Zhang): using T = x.shape[1], __getitem__ not support Tensor + #TODO(Hui Zhang): using T = paddle.shape(x)[1], __getitem__ not support Tensor pos_emb = 
self.pe[:, offset:offset + T] x = x * self.xscale + pos_emb return self.dropout(x), self.dropout(pos_emb) @@ -165,6 +165,6 @@ class RelPositionalEncoding(PositionalEncoding): 1] < self.max_len, "offset: {} + x.shape[1]: {} is larger than the max_len: {}".format( offset, x.shape[1], self.max_len) x = x * self.xscale - #TODO(Hui Zhang): using x.shape[1], __getitem__ not support Tensor + #TODO(Hui Zhang): using paddle.shape(x)[1], __getitem__ not support Tensor pos_emb = self.pe[:, offset:offset + x.shape[1]] return self.dropout(x), self.dropout(pos_emb) diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py index 7298c61f2..4d31acf1a 100644 --- a/paddlespeech/s2t/modules/encoder.py +++ b/paddlespeech/s2t/modules/encoder.py @@ -218,7 +218,7 @@ class BaseEncoder(nn.Layer): assert xs.shape[0] == 1 # batch size must be one # tmp_masks is just for interface compatibility # TODO(Hui Zhang): stride_slice not support bool tensor - # tmp_masks = paddle.ones([1, xs.shape[1]], dtype=paddle.bool) + # tmp_masks = paddle.ones([1, paddle.shape(xs)[1]], dtype=paddle.bool) tmp_masks = paddle.ones([1, xs.shape[1]], dtype=paddle.int32) tmp_masks = tmp_masks.unsqueeze(1) #[B=1, C=1, T] diff --git a/paddlespeech/s2t/utils/tensor_utils.py b/paddlespeech/s2t/utils/tensor_utils.py index ca8689569..bc557b130 100644 --- a/paddlespeech/s2t/utils/tensor_utils.py +++ b/paddlespeech/s2t/utils/tensor_utils.py @@ -59,7 +59,7 @@ def pad_sequence(sequences: List[paddle.Tensor], >>> b = paddle.ones(22, 300) >>> c = paddle.ones(15, 300) >>> pad_sequence([a, b, c]).shape - [25, 3, 300] + paddle.Tensor([25, 3, 300]) Note: This function returns a Tensor of size ``T x B x *`` or ``B x T x *`` @@ -79,7 +79,7 @@ def pad_sequence(sequences: List[paddle.Tensor], # assuming trailing dimensions and type of all the Tensors # in sequences are same and fetching those from sequences[0] - max_size = sequences[0].shape + max_size = paddle.shape(sequences[0]) # (TODO Hui Zhang): slice 
not supprot `end==start` # trailing_dims = max_size[1:] trailing_dims = tuple(max_size[1:].numpy().tolist()) if sequences[0].ndim >= 2 else () From 6f7917b7f2b489b8341aeda2c8ff318975b84f78 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 25 May 2022 09:25:17 +0000 Subject: [PATCH 05/40] fix streaming asr --- .../conf/ws_conformer_application.yaml | 2 +- ...plication.yaml => ws_ds2_application.yaml} | 0 .../server/engine/asr/online/asr_engine.py | 53 ++++--------------- 3 files changed, 12 insertions(+), 43 deletions(-) rename demos/streaming_asr_server/conf/{ws_application.yaml => ws_ds2_application.yaml} (100%) diff --git a/demos/streaming_asr_server/conf/ws_conformer_application.yaml b/demos/streaming_asr_server/conf/ws_conformer_application.yaml index 2affde073..6a10741bd 100644 --- a/demos/streaming_asr_server/conf/ws_conformer_application.yaml +++ b/demos/streaming_asr_server/conf/ws_conformer_application.yaml @@ -4,7 +4,7 @@ # SERVER SETTING # ################################################################################# host: 0.0.0.0 -port: 8090 +port: 8091 # The task format in the engin_list is: _ # task choices = ['asr_online'] diff --git a/demos/streaming_asr_server/conf/ws_application.yaml b/demos/streaming_asr_server/conf/ws_ds2_application.yaml similarity index 100% rename from demos/streaming_asr_server/conf/ws_application.yaml rename to demos/streaming_asr_server/conf/ws_ds2_application.yaml diff --git a/paddlespeech/server/engine/asr/online/asr_engine.py b/paddlespeech/server/engine/asr/online/asr_engine.py index 70bfcfb66..d7bd458f8 100644 --- a/paddlespeech/server/engine/asr/online/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/asr_engine.py @@ -53,7 +53,7 @@ class PaddleASRConnectionHanddler: logger.info( "create an paddle asr connection handler to process the websocket connection" ) - self.config = asr_engine.config + self.config = asr_engine.config # server config self.model_config = asr_engine.executor.config self.asr_engine = 
asr_engine @@ -249,10 +249,13 @@ class PaddleASRConnectionHanddler: def reset(self): if "deepspeech2" in self.model_type: # for deepspeech2 - self.chunk_state_h_box = copy.deepcopy( - self.asr_engine.executor.chunk_state_h_box) - self.chunk_state_c_box = copy.deepcopy( - self.asr_engine.executor.chunk_state_c_box) + # init state + self.chunk_state_h_box = np.zeros( + (self.model_config .num_rnn_layers, 1, self.model_config.rnn_layer_size), + dtype=float32) + self.chunk_state_c_box = np.zeros( + (self.model_config.num_rnn_layers, 1, self.model_config.rnn_layer_size), + dtype=float32) self.decoder.reset_decoder(batch_size=1) self.device = None @@ -803,36 +806,6 @@ class ASRServerExecutor(ASRExecutor): model_file=self.am_model, params_file=self.am_params, predictor_conf=self.am_predictor_conf) - - # decoder - logger.info("ASR engine start to create the ctc decoder instance") - self.decoder = CTCDecoder( - odim=self.config.output_dim, # is in vocab - enc_n_units=self.config.rnn_layer_size * 2, - blank_id=self.config.blank_id, - dropout_rate=0.0, - reduction=True, # sum - batch_average=True, # sum / batch_size - grad_norm_type=self.config.get('ctc_grad_norm_type', None)) - - # init decoder - logger.info("ASR engine start to init the ctc decoder") - cfg = self.config.decode - decode_batch_size = 1 # for online - self.decoder.init_decoder( - decode_batch_size, self.text_feature.vocab_list, - cfg.decoding_method, cfg.lang_model_path, cfg.alpha, cfg.beta, - cfg.beam_size, cfg.cutoff_prob, cfg.cutoff_top_n, - cfg.num_proc_bsearch) - - # init state box - self.chunk_state_h_box = np.zeros( - (self.config.num_rnn_layers, 1, self.config.rnn_layer_size), - dtype=float32) - self.chunk_state_c_box = np.zeros( - (self.config.num_rnn_layers, 1, self.config.rnn_layer_size), - dtype=float32) - elif "conformer" in model_type or "transformer" in model_type: model_name = model_type[:model_type.rindex( '_')] # model_type: {model_name}_{dataset} @@ -847,15 +820,11 @@ class 
ASRServerExecutor(ASRExecutor): model_dict = paddle.load(self.am_model) self.model.set_state_dict(model_dict) logger.info("create the transformer like model success") - - # update the ctc decoding - self.searcher = CTCPrefixBeamSearch(self.config.decode) - self.transformer_decode_reset() else: raise ValueError(f"Not support: {model_type}") return True - + class ASREngine(BaseEngine): """ASR server resource @@ -881,8 +850,8 @@ class ASREngine(BaseEngine): self.executor = ASRServerExecutor() try: - default_dev = paddle.get_device() - paddle.set_device(self.config.get("device", default_dev)) + self.device = self.config.get("device", paddle.get_device()) + paddle.set_device(self.device) except BaseException as e: logger.error( f"Set device failed, please check if device '{self.device}' is already used and the parameter 'device' in the yaml file" From f9f014d159e28efa788f4d241794420716d369ad Mon Sep 17 00:00:00 2001 From: TianYuan Date: Wed, 25 May 2022 10:39:28 +0000 Subject: [PATCH 06/40] add VITS readme, test=tts --- examples/aishell3/tts3/README.md | 30 ++-- examples/aishell3/voc1/README.md | 2 +- examples/aishell3/voc5/README.md | 21 +-- examples/csmsc/tts0/README.md | 30 ++-- examples/csmsc/tts2/README.md | 30 ++-- examples/csmsc/tts3/README.md | 31 ++-- examples/csmsc/tts3/README_cn.md | 30 ++-- examples/csmsc/vits/README.md | 146 ++++++++++++++++++ examples/csmsc/voc1/README.md | 2 +- examples/csmsc/voc3/README.md | 2 +- examples/csmsc/voc4/README.md | 2 +- examples/csmsc/voc5/README.md | 2 +- examples/csmsc/voc6/README.md | 2 +- examples/ljspeech/tts0/README.md | 30 ++-- examples/ljspeech/tts1/README.md | 2 +- examples/ljspeech/tts3/README.md | 30 ++-- examples/ljspeech/voc1/README.md | 2 +- examples/ljspeech/voc5/README.md | 21 +-- examples/vctk/tts3/README.md | 30 ++-- examples/vctk/voc1/README.md | 2 +- examples/vctk/voc5/README.md | 21 +-- .../t2s/exps/gan_vocoder/hifigan/train.py | 3 +- .../gan_vocoder/multi_band_melgan/train.py | 2 +- 
.../gan_vocoder/parallelwave_gan/train.py | 2 +- .../exps/gan_vocoder/style_melgan/train.py | 3 +- .../t2s/exps/transformer_tts/train.py | 2 +- paddlespeech/t2s/exps/vits/train.py | 5 +- paddlespeech/t2s/exps/wavernn/train.py | 3 +- 28 files changed, 285 insertions(+), 203 deletions(-) diff --git a/examples/aishell3/tts3/README.md b/examples/aishell3/tts3/README.md index d02ad1b63..93ce62c96 100644 --- a/examples/aishell3/tts3/README.md +++ b/examples/aishell3/tts3/README.md @@ -120,12 +120,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ``` ```text usage: synthesize.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--ngpu NGPU] [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] @@ -134,11 +134,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. 
+ Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -150,10 +149,10 @@ optional arguments: speaker id map file. --voice-cloning VOICE_CLONING whether training voice cloning model. - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -169,12 +168,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp ``` ```text usage: synthesize_e2e.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--lang LANG] [--inference_dir INFERENCE_DIR] [--ngpu NGPU] @@ -184,11 +183,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am 
{speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -199,10 +197,10 @@ optional arguments: --speaker_dict SPEAKER_DICT speaker id map file. --spk_id SPK_ID spk id for multi speaker acoustic model - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -215,9 +213,9 @@ optional arguments: output dir. ``` 1. `--am` is acoustic model type with the format {model_name}_{dataset} -2. `--am_config`, `--am_checkpoint`, `--am_stat`, `--phones_dict` `--speaker_dict` are arguments for acoustic model, which correspond to the 5 files in the fastspeech2 pretrained model. +2. `--am_config`, `--am_ckpt`, `--am_stat`, `--phones_dict` `--speaker_dict` are arguments for acoustic model, which correspond to the 5 files in the fastspeech2 pretrained model. 3. `--voc` is vocoder type with the format {model_name}_{dataset} -4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. +4. 
`--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. 5. `--lang` is the model language, which can be `zh` or `en`. 6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 7. `--text` is the text file, which contains sentences to synthesize. diff --git a/examples/aishell3/voc1/README.md b/examples/aishell3/voc1/README.md index eb30e7c40..503f8a19d 100644 --- a/examples/aishell3/voc1/README.md +++ b/examples/aishell3/voc1/README.md @@ -75,7 +75,7 @@ Train a ParallelWaveGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG ParallelWaveGAN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA diff --git a/examples/aishell3/voc5/README.md b/examples/aishell3/voc5/README.md index c957c4a3a..f8f28f409 100644 --- a/examples/aishell3/voc5/README.md +++ b/examples/aishell3/voc5/README.md @@ -67,15 +67,13 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER] - [--run-benchmark RUN_BENCHMARK] - [--profiler_options PROFILER_OPTIONS] + [--ngpu NGPU] -Train a ParallelWaveGAN model. +Train a HiFiGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG HiFiGAN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA @@ -83,19 +81,6 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu == 0, use cpu. - -benchmark: - arguments related to benchmark. - - --batch-size BATCH_SIZE - batch size. - --max-iter MAX_ITER train max steps. 
- --run-benchmark RUN_BENCHMARK - runing benchmark or not, if True, use the --batch-size - and --max-iter. - --profiler_options PROFILER_OPTIONS - The option of profiler, which should be in format - "key1=value1;key2=value2;key3=value3". ``` 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. diff --git a/examples/csmsc/tts0/README.md b/examples/csmsc/tts0/README.md index 01376bd61..a337c7d45 100644 --- a/examples/csmsc/tts0/README.md +++ b/examples/csmsc/tts0/README.md @@ -103,12 +103,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ``` ```text usage: synthesize.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--ngpu NGPU] [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] @@ -117,11 +117,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc} + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3} Choose acoustic model type of tts task. 
--am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -133,10 +132,10 @@ optional arguments: speaker id map file. --voice-cloning VOICE_CLONING whether training voice cloning model. - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -152,12 +151,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp ``` ```text usage: synthesize_e2e.py [-h] - [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--lang LANG] [--inference_dir INFERENCE_DIR] [--ngpu NGPU] @@ -167,11 +166,10 @@ Synthesize 
with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc} + --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -182,10 +180,10 @@ optional arguments: --speaker_dict SPEAKER_DICT speaker id map file. --spk_id SPK_ID spk id for multi speaker acoustic model - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -198,9 +196,9 @@ optional arguments: output dir. ``` 1. `--am` is acoustic model type with the format {model_name}_{dataset} -2. `--am_config`, `--am_checkpoint`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the Tacotron2 pretrained model. +2. `--am_config`, `--am_ckpt`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the Tacotron2 pretrained model. 3. `--voc` is vocoder type with the format {model_name}_{dataset} -4. 
`--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. +4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. 5. `--lang` is the model language, which can be `zh` or `en`. 6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 7. `--text` is the text file, which contains sentences to synthesize. diff --git a/examples/csmsc/tts2/README.md b/examples/csmsc/tts2/README.md index 081d85848..553a370c9 100644 --- a/examples/csmsc/tts2/README.md +++ b/examples/csmsc/tts2/README.md @@ -109,12 +109,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ``` ```text usage: synthesize.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--ngpu NGPU] [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] @@ -123,11 +123,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am 
{speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -139,10 +138,10 @@ optional arguments: speaker id map file. --voice-cloning VOICE_CLONING whether training voice cloning model. - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. 
@@ -158,12 +157,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp ``` ```text usage: synthesize_e2e.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--lang LANG] [--inference_dir INFERENCE_DIR] [--ngpu NGPU] @@ -173,11 +172,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -188,10 +186,10 @@ optional arguments: --speaker_dict SPEAKER_DICT speaker id map file. 
--spk_id SPK_ID spk id for multi speaker acoustic model - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -204,9 +202,9 @@ optional arguments: output dir. ``` 1. `--am` is acoustic model type with the format {model_name}_{dataset} -2. `--am_config`, `--am_checkpoint`, `--am_stat`, `--phones_dict` and `--tones_dict` are arguments for acoustic model, which correspond to the 5 files in the speedyspeech pretrained model. +2. `--am_config`, `--am_ckpt`, `--am_stat`, `--phones_dict` and `--tones_dict` are arguments for acoustic model, which correspond to the 5 files in the speedyspeech pretrained model. 3. `--voc` is vocoder type with the format {model_name}_{dataset} -4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. +4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. 5. `--lang` is the model language, which can be `zh` or `en`. 6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 7. `--text` is the text file, which contains sentences to synthesize. 
diff --git a/examples/csmsc/tts3/README.md b/examples/csmsc/tts3/README.md index c734199b4..be18de7d6 100644 --- a/examples/csmsc/tts3/README.md +++ b/examples/csmsc/tts3/README.md @@ -111,12 +111,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ``` ```text usage: synthesize.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--ngpu NGPU] [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] @@ -125,11 +125,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -141,10 +140,10 @@ optional arguments: speaker id map file. 
--voice-cloning VOICE_CLONING whether training voice cloning model. - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -160,12 +159,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp ``` ```text usage: synthesize_e2e.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--lang LANG] [--inference_dir INFERENCE_DIR] [--ngpu NGPU] @@ -175,11 +174,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech} Choose acoustic model type of tts task. 
--am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -190,10 +188,10 @@ optional arguments: --speaker_dict SPEAKER_DICT speaker id map file. --spk_id SPK_ID spk id for multi speaker acoustic model - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -204,11 +202,12 @@ optional arguments: --text TEXT text to synthesize, a 'utt_id sentence' pair per line. --output_dir OUTPUT_DIR output dir. + ``` 1. `--am` is acoustic model type with the format {model_name}_{dataset} -2. `--am_config`, `--am_checkpoint`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the fastspeech2 pretrained model. +2. `--am_config`, `--am_ckpt`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the fastspeech2 pretrained model. 3. `--voc` is vocoder type with the format {model_name}_{dataset} -4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. +4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. 5. `--lang` is the model language, which can be `zh` or `en`. 6. 
`--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 7. `--text` is the text file, which contains sentences to synthesize. diff --git a/examples/csmsc/tts3/README_cn.md b/examples/csmsc/tts3/README_cn.md index 25931ecb1..a88615134 100644 --- a/examples/csmsc/tts3/README_cn.md +++ b/examples/csmsc/tts3/README_cn.md @@ -117,12 +117,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ``` ```text usage: synthesize.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--ngpu NGPU] [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] @@ -131,11 +131,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. 
--am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -147,10 +146,10 @@ optional arguments: speaker id map file. --voice-cloning VOICE_CLONING whether training voice cloning model. - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -167,12 +166,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp ``` ```text usage: synthesize_e2e.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--lang LANG] [--inference_dir INFERENCE_DIR] [--ngpu NGPU] @@ -182,11 +181,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am 
{speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -197,10 +195,10 @@ optional arguments: --speaker_dict SPEAKER_DICT speaker id map file. --spk_id SPK_ID spk id for multi speaker acoustic model - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -213,9 +211,9 @@ optional arguments: output dir. ``` 1. `--am` 声学模型格式是否符合 {model_name}_{dataset} -2. `--am_config`, `--am_checkpoint`, `--am_stat` 和 `--phones_dict` 是声学模型的参数,对应于 fastspeech2 预训练模型中的 4 个文件。 +2. `--am_config`, `--am_ckpt`, `--am_stat` 和 `--phones_dict` 是声学模型的参数,对应于 fastspeech2 预训练模型中的 4 个文件。 3. `--voc` 声码器(vocoder)格式是否符合 {model_name}_{dataset} -4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` 是声码器的参数,对应于 parallel wavegan 预训练模型中的 3 个文件。 +4. `--voc_config`, `--voc_ckpt`, `--voc_stat` 是声码器的参数,对应于 parallel wavegan 预训练模型中的 3 个文件。 5. `--lang` 对应模型的语言可以是 `zh` 或 `en` 。 6. `--test_metadata` 应为 `dump` 文件夹中 `test` 下的规范化元数据文件、 7. 
`--text` 是文本文件,其中包含要合成的句子。 diff --git a/examples/csmsc/vits/README.md b/examples/csmsc/vits/README.md index e69de29bb..0c16840a0 100644 --- a/examples/csmsc/vits/README.md +++ b/examples/csmsc/vits/README.md @@ -0,0 +1,146 @@ +# VITS with CSMSC +This example contains code used to train a [VITS](https://arxiv.org/abs/2106.06103) model with [Chinese Standard Mandarin Speech Corpus](https://www.data-baker.com/open_source.html). + +## Dataset +### Download and Extract +Download CSMSC from its [Official Website](https://test.data-baker.com/data/index/source). + +### Get MFA Result and Extract +We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get phonemes for VITS; the durations of MFA are not needed here. +You can download the alignment result from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. + +## Get Started +Assume the path to the dataset is `~/datasets/BZNSYP`. +Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`. +Run the command below to +1. **source path**. +2. preprocess the dataset. +3. train the model. +4. synthesize wavs. + - synthesize waveform from `metadata.jsonl`. + - synthesize waveform from a text file. + +```bash +./run.sh +``` +You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage, for example, running the following command will only preprocess the dataset. +```bash +./run.sh --stage 0 --stop-stage 0 +``` +### Data Preprocessing +```bash +./local/preprocess.sh ${conf_path} +``` +When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below.
+ +```text +dump +├── dev +│   ├── norm +│   └── raw +├── phone_id_map.txt +├── speaker_id_map.txt +├── test +│   ├── norm +│   └── raw +└── train + ├── feats_stats.npy + ├── norm + └── raw +``` +The dataset is split into 3 parts, namely `train`, `dev`, and `test`, each of which contains a `norm` and `raw` subfolder. The raw folder contains the wave and linear spectrogram of each utterance, while the norm folder contains normalized ones. The statistics used to normalize features are computed from the training set and are stored in `dump/train/feats_stats.npy`. + +Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains phones, text_lengths, feats, feats_lengths, the path of linear spectrogram features, the path of raw waves, speaker, and the id of each utterance. + +### Model Training +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} +``` +`./local/train.sh` calls `${BIN_DIR}/train.py`. +Here's the complete help message. +```text +usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] + [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] + [--ngpu NGPU] [--phones-dict PHONES_DICT] + +Train a VITS model. + +optional arguments: + -h, --help show this help message and exit + --config CONFIG config file to overwrite default config. + --train-metadata TRAIN_METADATA + training data. + --dev-metadata DEV_METADATA + dev data. + --output-dir OUTPUT_DIR + output dir. + --ngpu NGPU if ngpu == 0, use cpu. + --phones-dict PHONES_DICT + phone vocabulary file. +``` +1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. +2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. +3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory. +4.
`--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. +5. `--phones-dict` is the path of the phone vocabulary file. + +### Synthesizing + +`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`. + +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} +``` +```text +usage: synthesize.py [-h] [--config CONFIG] [--ckpt CKPT] + [--phones_dict PHONES_DICT] [--ngpu NGPU] + [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] + +Synthesize with VITS + +optional arguments: + -h, --help show this help message and exit + --config CONFIG Config of VITS. + --ckpt CKPT Checkpoint file of VITS. + --phones_dict PHONES_DICT + phone vocabulary file. + --ngpu NGPU if ngpu == 0, use cpu. + --test_metadata TEST_METADATA + test metadata. + --output_dir OUTPUT_DIR + output dir. +``` +`./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e.py`, which can synthesize waveform from text file. +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} +``` +```text +usage: synthesize_e2e.py [-h] [--config CONFIG] [--ckpt CKPT] + [--phones_dict PHONES_DICT] [--lang LANG] + [--inference_dir INFERENCE_DIR] [--ngpu NGPU] + [--text TEXT] [--output_dir OUTPUT_DIR] + +Synthesize with VITS + +optional arguments: + -h, --help show this help message and exit + --config CONFIG Config of VITS. + --ckpt CKPT Checkpoint file of VITS. + --phones_dict PHONES_DICT + phone vocabulary file. + --lang LANG Choose model language. zh or en + --inference_dir INFERENCE_DIR + dir to save inference models + --ngpu NGPU if ngpu == 0, use cpu. + --text TEXT text to synthesize, a 'utt_id sentence' pair per line. + --output_dir OUTPUT_DIR + output dir. +``` +1. `--config`, `--ckpt`, and `--phones_dict` are arguments for acoustic model, which correspond to the 3 files in the VITS pretrained model. +2. 
`--lang` is the model language, which can be `zh` or `en`. +3. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. +4. `--text` is the text file, which contains sentences to synthesize. +5. `--output_dir` is the directory to save synthesized audio files. +6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. + +## Pretrained Model diff --git a/examples/csmsc/voc1/README.md b/examples/csmsc/voc1/README.md index 77da5b185..d19fe8497 100644 --- a/examples/csmsc/voc1/README.md +++ b/examples/csmsc/voc1/README.md @@ -65,7 +65,7 @@ Train a ParallelWaveGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG ParallelWaveGAN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA diff --git a/examples/csmsc/voc3/README.md b/examples/csmsc/voc3/README.md index 12adaf7f4..eb7710362 100644 --- a/examples/csmsc/voc3/README.md +++ b/examples/csmsc/voc3/README.md @@ -63,7 +63,7 @@ Train a Multi-Band MelGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG Multi-Band MelGAN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA diff --git a/examples/csmsc/voc4/README.md b/examples/csmsc/voc4/README.md index b7add3e57..d9e86a88d 100644 --- a/examples/csmsc/voc4/README.md +++ b/examples/csmsc/voc4/README.md @@ -63,7 +63,7 @@ Train a Style MelGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG Style MelGAN config file. --train-metadata TRAIN_METADATA training data. 
--dev-metadata DEV_METADATA diff --git a/examples/csmsc/voc5/README.md b/examples/csmsc/voc5/README.md index 94f93b48b..e044a0c74 100644 --- a/examples/csmsc/voc5/README.md +++ b/examples/csmsc/voc5/README.md @@ -63,7 +63,7 @@ Train a HiFiGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG HiFiGAN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA diff --git a/examples/csmsc/voc6/README.md b/examples/csmsc/voc6/README.md index 7dcf133bd..f1a5ec3bb 100644 --- a/examples/csmsc/voc6/README.md +++ b/examples/csmsc/voc6/README.md @@ -63,7 +63,7 @@ Train a WaveRNN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG WaveRNN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA diff --git a/examples/ljspeech/tts0/README.md b/examples/ljspeech/tts0/README.md index ba7ad6193..581f7930f 100644 --- a/examples/ljspeech/tts0/README.md +++ b/examples/ljspeech/tts0/README.md @@ -103,12 +103,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ``` ```text usage: synthesize.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}] 
[--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--ngpu NGPU] [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] @@ -117,11 +117,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc} + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -133,10 +132,10 @@ optional arguments: speaker id map file. --voice-cloning VOICE_CLONING whether training voice cloning model. - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. 
@@ -152,12 +151,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp ``` ```text usage: synthesize_e2e.py [-h] - [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--lang LANG] [--inference_dir INFERENCE_DIR] [--ngpu NGPU] @@ -167,11 +166,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc} + --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -182,10 +180,10 @@ optional arguments: --speaker_dict SPEAKER_DICT speaker id map file. 
--spk_id SPK_ID spk id for multi speaker acoustic model - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -198,9 +196,9 @@ optional arguments: output dir. ``` 1. `--am` is acoustic model type with the format {model_name}_{dataset} -2. `--am_config`, `--am_checkpoint`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the Tacotron2 pretrained model. +2. `--am_config`, `--am_ckpt`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the Tacotron2 pretrained model. 3. `--voc` is vocoder type with the format {model_name}_{dataset} -4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. +4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. 5. `--lang` is the model language, which can be `zh` or `en`. 6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 7. `--text` is the text file, which contains sentences to synthesize. diff --git a/examples/ljspeech/tts1/README.md b/examples/ljspeech/tts1/README.md index 7f32522ac..f85991cba 100644 --- a/examples/ljspeech/tts1/README.md +++ b/examples/ljspeech/tts1/README.md @@ -61,7 +61,7 @@ Train a TransformerTTS model with LJSpeech TTS dataset. 
optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG TransformerTTS config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA diff --git a/examples/ljspeech/tts3/README.md b/examples/ljspeech/tts3/README.md index e028fa05d..a6724083d 100644 --- a/examples/ljspeech/tts3/README.md +++ b/examples/ljspeech/tts3/README.md @@ -109,12 +109,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ``` ``text usage: synthesize.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--ngpu NGPU] [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] @@ -123,11 +123,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. 
--am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -139,10 +138,10 @@ optional arguments: speaker id map file. --voice-cloning VOICE_CLONING whether training voice cloning model. - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -158,12 +157,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp ``` ```text usage: synthesize_e2e.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--lang LANG] [--inference_dir INFERENCE_DIR] [--ngpu NGPU] @@ -173,11 +172,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am 
{speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -188,10 +186,10 @@ optional arguments: --speaker_dict SPEAKER_DICT speaker id map file. --spk_id SPK_ID spk id for multi speaker acoustic model - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -204,9 +202,9 @@ optional arguments: output dir. ``` 1. `--am` is acoustic model type with the format {model_name}_{dataset} -2. `--am_config`, `--am_checkpoint`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the fastspeech2 pretrained model. +2. `--am_config`, `--am_ckpt`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the fastspeech2 pretrained model. 3. `--voc` is vocoder type with the format {model_name}_{dataset} -4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. +4. 
`--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. 5. `--lang` is the model language, which can be `zh` or `en`. 6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 7. `--text` is the text file, which contains sentences to synthesize. diff --git a/examples/ljspeech/voc1/README.md b/examples/ljspeech/voc1/README.md index 4513b2a05..6fd6cbe24 100644 --- a/examples/ljspeech/voc1/README.md +++ b/examples/ljspeech/voc1/README.md @@ -65,7 +65,7 @@ Train a ParallelWaveGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG ParallelWaveGAN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA diff --git a/examples/ljspeech/voc5/README.md b/examples/ljspeech/voc5/README.md index 9b31e2650..afc1bb8be 100644 --- a/examples/ljspeech/voc5/README.md +++ b/examples/ljspeech/voc5/README.md @@ -57,15 +57,13 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER] - [--run-benchmark RUN_BENCHMARK] - [--profiler_options PROFILER_OPTIONS] + [--ngpu NGPU] -Train a ParallelWaveGAN model. +Train a HiFiGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG HiFiGAN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA @@ -73,19 +71,6 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu == 0, use cpu. - -benchmark: - arguments related to benchmark. - - --batch-size BATCH_SIZE - batch size. - --max-iter MAX_ITER train max steps. 
- --run-benchmark RUN_BENCHMARK - runing benchmark or not, if True, use the --batch-size - and --max-iter. - --profiler_options PROFILER_OPTIONS - The option of profiler, which should be in format - "key1=value1;key2=value2;key3=value3". ``` 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. diff --git a/examples/vctk/tts3/README.md b/examples/vctk/tts3/README.md index f373ca6a3..379f5c0fd 100644 --- a/examples/vctk/tts3/README.md +++ b/examples/vctk/tts3/README.md @@ -112,12 +112,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ``` ```text usage: synthesize.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--ngpu NGPU] [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] @@ -126,11 +126,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3} Choose acoustic model type of tts task. 
--am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -142,10 +141,10 @@ optional arguments: speaker id map file. --voice-cloning VOICE_CLONING whether training voice cloning model. - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -161,12 +160,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp ``` ```text usage: synthesize_e2e.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--lang LANG] [--inference_dir INFERENCE_DIR] [--ngpu NGPU] @@ -176,11 +175,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this 
help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -191,10 +189,10 @@ optional arguments: --speaker_dict SPEAKER_DICT speaker id map file. --spk_id SPK_ID spk id for multi speaker acoustic model - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -207,9 +205,9 @@ optional arguments: output dir. ``` 1. `--am` is acoustic model type with the format {model_name}_{dataset} -2. `--am_config`, `--am_checkpoint`, `--am_stat`, `--phones_dict` `--speaker_dict` are arguments for acoustic model, which correspond to the 5 files in the fastspeech2 pretrained model. +2. `--am_config`, `--am_ckpt`, `--am_stat`, `--phones_dict` `--speaker_dict` are arguments for acoustic model, which correspond to the 5 files in the fastspeech2 pretrained model. 3. `--voc` is vocoder type with the format {model_name}_{dataset} -4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. +4. 
`--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. 5. `--lang` is the model language, which can be `zh` or `en`. 6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 7. `--text` is the text file, which contains sentences to synthesize. diff --git a/examples/vctk/voc1/README.md b/examples/vctk/voc1/README.md index 1c3016f88..c4c40d1d0 100644 --- a/examples/vctk/voc1/README.md +++ b/examples/vctk/voc1/README.md @@ -70,7 +70,7 @@ Train a ParallelWaveGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG ParallelWaveGAN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA diff --git a/examples/vctk/voc5/README.md b/examples/vctk/voc5/README.md index 4eb25c02d..c53d46325 100644 --- a/examples/vctk/voc5/README.md +++ b/examples/vctk/voc5/README.md @@ -62,15 +62,13 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER] - [--run-benchmark RUN_BENCHMARK] - [--profiler_options PROFILER_OPTIONS] + [--ngpu NGPU] -Train a ParallelWaveGAN model. +Train a HiFiGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG HiFiGAN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA @@ -78,19 +76,6 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu == 0, use cpu. - -benchmark: - arguments related to benchmark. - - --batch-size BATCH_SIZE - batch size. - --max-iter MAX_ITER train max steps. 
- --run-benchmark RUN_BENCHMARK - runing benchmark or not, if True, use the --batch-size - and --max-iter. - --profiler_options PROFILER_OPTIONS - The option of profiler, which should be in format - "key1=value1;key2=value2;key3=value3". ``` 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. diff --git a/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py b/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py index c70821e78..4c733dc9b 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py @@ -243,8 +243,7 @@ def main(): # parse args and config and redirect to train_sp parser = argparse.ArgumentParser(description="Train a HiFiGAN model.") - parser.add_argument( - "--config", type=str, help="config file to overwrite default config.") + parser.add_argument("--config", type=str, help="HiFiGAN config file.") parser.add_argument("--train-metadata", type=str, help="training data.") parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") diff --git a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py index 27ffded63..3b3ebb478 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py @@ -233,7 +233,7 @@ def main(): parser = argparse.ArgumentParser( description="Train a Multi-Band MelGAN model.") parser.add_argument( - "--config", type=str, help="config file to overwrite default config.") + "--config", type=str, help="Multi-Band MelGAN config file.") parser.add_argument("--train-metadata", type=str, help="training data.") parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") diff --git 
a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py index 92de7a2c4..b26407028 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py @@ -208,7 +208,7 @@ def main(): parser = argparse.ArgumentParser( description="Train a ParallelWaveGAN model.") parser.add_argument( - "--config", type=str, help="config file to overwrite default config.") + "--config", type=str, help="ParallelWaveGAN config file.") parser.add_argument("--train-metadata", type=str, help="training data.") parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") diff --git a/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py b/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py index be3ba7425..a87cc7a18 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py @@ -224,8 +224,7 @@ def main(): # parse args and config and redirect to train_sp parser = argparse.ArgumentParser(description="Train a Style MelGAN model.") - parser.add_argument( - "--config", type=str, help="config file to overwrite default config.") + parser.add_argument("--config", type=str, help="Style MelGAN config file.") parser.add_argument("--train-metadata", type=str, help="training data.") parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") diff --git a/paddlespeech/t2s/exps/transformer_tts/train.py b/paddlespeech/t2s/exps/transformer_tts/train.py index 45ecb269b..da48b6b99 100644 --- a/paddlespeech/t2s/exps/transformer_tts/train.py +++ b/paddlespeech/t2s/exps/transformer_tts/train.py @@ -160,7 +160,7 @@ def main(): parser = argparse.ArgumentParser(description="Train a TransformerTTS " "model with LJSpeech TTS dataset.") parser.add_argument( - 
"--config", type=str, help="config file to overwrite default config.") + "--config", type=str, help="TransformerTTS config file.") parser.add_argument("--train-metadata", type=str, help="training data.") parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") diff --git a/paddlespeech/t2s/exps/vits/train.py b/paddlespeech/t2s/exps/vits/train.py index b921f92af..dbda8b717 100644 --- a/paddlespeech/t2s/exps/vits/train.py +++ b/paddlespeech/t2s/exps/vits/train.py @@ -226,9 +226,8 @@ def train_sp(args, config): def main(): # parse args and config and redirect to train_sp - parser = argparse.ArgumentParser(description="Train a HiFiGAN model.") - parser.add_argument( - "--config", type=str, help="config file to overwrite default config.") + parser = argparse.ArgumentParser(description="Train a VITS model.") + parser.add_argument("--config", type=str, help="VITS config file") parser.add_argument("--train-metadata", type=str, help="training data.") parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") diff --git a/paddlespeech/t2s/exps/wavernn/train.py b/paddlespeech/t2s/exps/wavernn/train.py index 8661d311d..cf24ea268 100644 --- a/paddlespeech/t2s/exps/wavernn/train.py +++ b/paddlespeech/t2s/exps/wavernn/train.py @@ -180,8 +180,7 @@ def main(): # parse args and config and redirect to train_sp parser = argparse.ArgumentParser(description="Train a WaveRNN model.") - parser.add_argument( - "--config", type=str, help="config file to overwrite default config.") + parser.add_argument("--config", type=str, help="WaveRNN config file.") parser.add_argument("--train-metadata", type=str, help="training data.") parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") From 537aff9704c5c61e8f5bc334486599996279fa82 Mon Sep 17 00:00:00 2001 From: Yang Zhou Date: 
Wed, 25 May 2022 20:54:11 +0800 Subject: [PATCH 07/40] refactor example dir & add aishell build TLG --- speechx/examples/dev/CMakeLists.txt | 3 - speechx/examples/dev/glog/CMakeLists.txt | 8 - speechx/examples/dev/glog/README.md | 25 --- .../dev/glog/glog_logtostderr_test.cc | 25 --- speechx/examples/dev/glog/glog_test.cc | 23 --- speechx/examples/dev/glog/path.sh | 15 -- speechx/examples/dev/glog/run.sh | 22 --- speechx/examples/ds2_ol/aishell/README.md | 37 ++++ .../aishell}/local/aishell_train_lms.sh | 0 .../aishell}/local/text_to_lexicon.py | 0 speechx/examples/ds2_ol/aishell/path.sh | 12 +- .../examples/ds2_ol/aishell/run_build_tlg.sh | 141 +++++++++++++ speechx/examples/ds2_ol/aishell/run_fbank.sh | 1 - speechx/examples/ngram/.gitignore | 2 - speechx/examples/ngram/en/README.md | 0 speechx/examples/ngram/zh/README.md | 101 ---------- speechx/examples/ngram/zh/local/split_data.sh | 30 --- speechx/examples/ngram/zh/path.sh | 12 -- speechx/examples/ngram/zh/run.sh | 68 ------- speechx/examples/ngram/zh/utils | 1 - speechx/examples/wfst/.gitignore | 1 - speechx/examples/wfst/README.md | 186 ------------------ speechx/examples/wfst/path.sh | 19 -- speechx/examples/wfst/run.sh | 29 --- speechx/examples/wfst/utils | 1 - 25 files changed, 189 insertions(+), 573 deletions(-) delete mode 100644 speechx/examples/dev/CMakeLists.txt delete mode 100644 speechx/examples/dev/glog/CMakeLists.txt delete mode 100644 speechx/examples/dev/glog/README.md delete mode 100644 speechx/examples/dev/glog/glog_logtostderr_test.cc delete mode 100644 speechx/examples/dev/glog/glog_test.cc delete mode 100644 speechx/examples/dev/glog/path.sh delete mode 100755 speechx/examples/dev/glog/run.sh rename speechx/examples/{ngram/zh => ds2_ol/aishell}/local/aishell_train_lms.sh (100%) rename speechx/examples/{ngram/zh => ds2_ol/aishell}/local/text_to_lexicon.py (100%) create mode 100755 speechx/examples/ds2_ol/aishell/run_build_tlg.sh delete mode 100644 speechx/examples/ngram/.gitignore delete 
mode 100644 speechx/examples/ngram/en/README.md delete mode 100644 speechx/examples/ngram/zh/README.md delete mode 100755 speechx/examples/ngram/zh/local/split_data.sh delete mode 100644 speechx/examples/ngram/zh/path.sh delete mode 100755 speechx/examples/ngram/zh/run.sh delete mode 120000 speechx/examples/ngram/zh/utils delete mode 100644 speechx/examples/wfst/.gitignore delete mode 100644 speechx/examples/wfst/README.md delete mode 100644 speechx/examples/wfst/path.sh delete mode 100755 speechx/examples/wfst/run.sh delete mode 120000 speechx/examples/wfst/utils diff --git a/speechx/examples/dev/CMakeLists.txt b/speechx/examples/dev/CMakeLists.txt deleted file mode 100644 index c8445fb82..000000000 --- a/speechx/examples/dev/CMakeLists.txt +++ /dev/null @@ -1,3 +0,0 @@ -cmake_minimum_required(VERSION 3.14 FATAL_ERROR) - -add_subdirectory(glog) diff --git a/speechx/examples/dev/glog/CMakeLists.txt b/speechx/examples/dev/glog/CMakeLists.txt deleted file mode 100644 index b4b0e6358..000000000 --- a/speechx/examples/dev/glog/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -cmake_minimum_required(VERSION 3.14 FATAL_ERROR) - -add_executable(glog_test ${CMAKE_CURRENT_SOURCE_DIR}/glog_test.cc) -target_link_libraries(glog_test glog) - - -add_executable(glog_logtostderr_test ${CMAKE_CURRENT_SOURCE_DIR}/glog_logtostderr_test.cc) -target_link_libraries(glog_logtostderr_test glog) \ No newline at end of file diff --git a/speechx/examples/dev/glog/README.md b/speechx/examples/dev/glog/README.md deleted file mode 100644 index 996e192e9..000000000 --- a/speechx/examples/dev/glog/README.md +++ /dev/null @@ -1,25 +0,0 @@ -# [GLOG](https://rpg.ifi.uzh.ch/docs/glog.html) - -Unless otherwise specified, glog writes to the filename `/tmp/...log...