From 7fd5abd75d16852c5d6c1ab385b37603d9be7c77 Mon Sep 17 00:00:00 2001 From: megemini Date: Mon, 25 Nov 2024 11:25:37 +0800 Subject: [PATCH 01/36] [Fix] max between int and value (#3903) --- paddlespeech/t2s/modules/masked_fill.py | 22 ++++++++++++++++++- .../t2s/modules/transformer/embedding.py | 2 +- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/paddlespeech/t2s/modules/masked_fill.py b/paddlespeech/t2s/modules/masked_fill.py index 1445a926a..d143fe62f 100644 --- a/paddlespeech/t2s/modules/masked_fill.py +++ b/paddlespeech/t2s/modules/masked_fill.py @@ -29,7 +29,27 @@ def is_broadcastable(shp1, shp2): def broadcast_shape(shp1, shp2): result = [] for a, b in zip(shp1[::-1], shp2[::-1]): - result.append(max(a, b)) + is_a_int = isinstance(a, int) + is_b_int = isinstance(b, int) + + if is_a_int and is_b_int: + result.append(max(a, b)) + + else: + dtype = None + if hasattr(a, 'dtype'): + dtype = a.dtype + if hasattr(b, 'dtype'): + dtype = b.dtype + + if (is_a_int): + a = paddle.full((), a, dtype=dtype) + + if (is_b_int): + b = paddle.full((), b, dtype=dtype) + + result.append(paddle.maximum(a, b)) + return result[::-1] diff --git a/paddlespeech/t2s/modules/transformer/embedding.py b/paddlespeech/t2s/modules/transformer/embedding.py index f90eb44a4..e4331cff0 100644 --- a/paddlespeech/t2s/modules/transformer/embedding.py +++ b/paddlespeech/t2s/modules/transformer/embedding.py @@ -67,7 +67,7 @@ class PositionalEncoding(nn.Layer): pe[:, 0::2] = paddle.sin(position * div_term) pe[:, 1::2] = paddle.cos(position * div_term) pe = pe.unsqueeze(0) - self.pe = pe + self.pe = paddle.assign(pe) def forward(self, x: paddle.Tensor): """Add positional encoding. From 7dc806dc1d957ed7f316530c9ae0db626e19967d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Mon, 25 Nov 2024 11:27:48 +0800 Subject: [PATCH 02/36] run with aishell/asr3 (#3904) --- paddlespeech/s2t/exps/wav2vec2/bin/test.py | 2 -- paddlespeech/s2t/exps/wav2vec2/bin/test_wav.py | 5 ----- .../s2t/models/wav2vec2/modules/wav2vec2_model.py | 12 ++++++------ .../models/wav2vec2/processing/signal_processing.py | 4 ++-- .../wav2vec2/processing/speech_augmentation.py | 10 +++++----- 5 files changed, 13 insertions(+), 20 deletions(-) diff --git a/paddlespeech/s2t/exps/wav2vec2/bin/test.py b/paddlespeech/s2t/exps/wav2vec2/bin/test.py index c17cee0fd..55a241ffc 100644 --- a/paddlespeech/s2t/exps/wav2vec2/bin/test.py +++ b/paddlespeech/s2t/exps/wav2vec2/bin/test.py @@ -37,8 +37,6 @@ if __name__ == "__main__": # save asr result to parser.add_argument( '--dict-path', type=str, default=None, help='dict path.') - parser.add_argument( - "--result_file", type=str, help="path of save the asr result") args = parser.parse_args() print_arguments(args, globals()) diff --git a/paddlespeech/s2t/exps/wav2vec2/bin/test_wav.py b/paddlespeech/s2t/exps/wav2vec2/bin/test_wav.py index 0295713ff..7747b868b 100644 --- a/paddlespeech/s2t/exps/wav2vec2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/wav2vec2/bin/test_wav.py @@ -104,11 +104,6 @@ def main(config, args): if __name__ == "__main__": parser = default_argument_parser() - # save asr result to - parser.add_argument( - "--result_file", type=str, help="path of save the asr result") - parser.add_argument( - "--audio_file", type=str, help="path of the input audio file") args = parser.parse_args() config = CfgNode(new_allowed=True) diff --git a/paddlespeech/s2t/models/wav2vec2/modules/wav2vec2_model.py 
b/paddlespeech/s2t/models/wav2vec2/modules/wav2vec2_model.py index 3fbb9426b..b753f6c52 100644 --- a/paddlespeech/s2t/models/wav2vec2/modules/wav2vec2_model.py +++ b/paddlespeech/s2t/models/wav2vec2/modules/wav2vec2_model.py @@ -714,13 +714,13 @@ class MultiheadAttention(nn.Layer): else: if self.beam_size > 1 and bsz == key.size(1): # key is [T, bsz*beam_size, C], reduce to [T, bsz, C] - key = key.view( - key.size(0), -1, self.beam_size, - key.size(2))[:, :, 0, :] + key = key.reshape( + [key.size(0), -1, self.beam_size, + key.size(2)])[:, :, 0, :] if key_padding_mask is not None: - key_padding_mask = key_padding_mask.view( - -1, self.beam_size, - key_padding_mask.size(1))[:, 0, :] + key_padding_mask = key_padding_mask.reshape( + [-1, self.beam_size, + key_padding_mask.size(1)])[:, 0, :] k = self.k_proj(key) v = self.v_proj(key) diff --git a/paddlespeech/s2t/models/wav2vec2/processing/signal_processing.py b/paddlespeech/s2t/models/wav2vec2/processing/signal_processing.py index 7267e2211..a0e279c30 100644 --- a/paddlespeech/s2t/models/wav2vec2/processing/signal_processing.py +++ b/paddlespeech/s2t/models/wav2vec2/processing/signal_processing.py @@ -88,7 +88,7 @@ def compute_amplitude(waveforms, lengths=None, amp_type="avg", scale="linear"): out = paddle.mean(paddle.abs(waveforms), axis=1, keepdim=True) else: wav_sum = paddle.sum(paddle.abs(waveforms), axis=1, keepdim=True) - out = wav_sum / lengths + out = wav_sum / lengths.astype(wav_sum.dtype) elif amp_type == "peak": out = paddle.max(paddle.abs(waveforms), axis=1, keepdim=True)[0] else: @@ -248,4 +248,4 @@ def notch_filter(notch_freq, filter_width=101, notch_width=0.05): hhpf[pad] += 1 # Adding filters creates notch filter - return (hlpf + hhpf).view(1, -1, 1) + return (hlpf + hhpf).reshape([1, -1, 1]) diff --git a/paddlespeech/s2t/models/wav2vec2/processing/speech_augmentation.py b/paddlespeech/s2t/models/wav2vec2/processing/speech_augmentation.py index 50a95f0b1..e8a605610 100644 --- a/paddlespeech/s2t/models/wav2vec2/processing/speech_augmentation.py +++ b/paddlespeech/s2t/models/wav2vec2/processing/speech_augmentation.py @@ -743,7 +743,7 @@ class SpecAugment(paddle.nn.Layer): time = x.shape[2] if time - window <= window: - return x.view(*original_size) + return x.reshape([*original_size]) # compute center and corresponding window c = paddle.randint(window, time - window, (1, ))[0] @@ -762,7 +762,7 @@ class SpecAugment(paddle.nn.Layer): x[:, :, :w] = left x[:, :, w:] = right - return x.view(*original_size) + return x.reshape([*original_size]) def mask_along_axis(self, x, dim): """Mask along time or frequency axis. 
@@ -775,7 +775,7 @@ class SpecAugment(paddle.nn.Layer): """ original_size = x.shape if x.dim() == 4: - x = x.view(-1, x.shape[2], x.shape[3]) + x = x.reshape([-1, x.shape[2], x.shape[3]]) batch, time, fea = x.shape @@ -795,7 +795,7 @@ class SpecAugment(paddle.nn.Layer): (batch, n_mask)).unsqueeze(2) # compute masks - arange = paddle.arange(end=D).view(1, 1, -1) + arange = paddle.arange(end=D).reshape([1, 1, -1]) mask = (mask_pos <= arange) * (arange < (mask_pos + mask_len)) mask = mask.any(axis=1) @@ -811,7 +811,7 @@ class SpecAugment(paddle.nn.Layer): # same to x.masked_fill_(mask, val) y = paddle.full(x.shape, val, x.dtype) x = paddle.where(mask, y, x) - return x.view(*original_size) + return x.reshape([*original_size]) class TimeDomainSpecAugment(nn.Layer): From e22173f7391a5cce449ade9947f4eef67cdaaa63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Mon, 25 Nov 2024 13:57:49 +0800 Subject: [PATCH 03/36] [Hackathon 7th] Update Tiny README.md (#3896) * Update README.md * Update README.md --- examples/tiny/asr1/README.md | 6 ------ 1 file changed, 6 deletions(-) diff --git a/examples/tiny/asr1/README.md b/examples/tiny/asr1/README.md index 489f5bc3e..8eb45ce5e 100644 --- a/examples/tiny/asr1/README.md +++ b/examples/tiny/asr1/README.md @@ -26,7 +26,6 @@ The document below will describe the scripts in ```run.sh```in detail. The path.sh contains the environment variables. ```bash . ./path.sh -. ./cmd.sh ``` This script needs to be run first. And another script is also needed: ```bash @@ -64,7 +63,6 @@ bash run.sh --stage 0 --stop_stage 0 You can also just run these scripts in your command line. ```bash . ./path.sh -. ./cmd.sh bash ./local/data.sh ``` After processing the data, the ``data`` directory will look like this: @@ -100,7 +98,6 @@ bash run.sh --stage 0 --stop_stage 1 or you can run these scripts in the command line (only use CPU). ```bash . ./path.sh -. ./cmd.sh bash ./local/data.sh CUDA_VISIBLE_DEVICES= ./local/train.sh conf/transformer.yaml transformer ```## Stage 2: Top-k Models Averaging @@ -119,7 +116,6 @@ bash run.sh --stage 0 --stop_stage 2 or you can run these scripts in the command line (only use CPU). ```bash . ./path.sh -. ./cmd.sh bash ./local/data.sh CUDA_VISIBLE_DEVICES= ./local/train.sh conf/transformer.yaml transformer avg.sh best exp/transformer/checkpoints 1 @@ -139,7 +135,6 @@ bash run.sh --stage 0 --stop_stage 3 or you can run these scripts in the command line (only use CPU). ```bash . ./path.sh -. ./cmd.sh bash ./local/data.sh CUDA_VISIBLE_DEVICES= ./local/train.sh conf/transformer.yaml transformer avg.sh best exp/transformer/checkpoints 1 @@ -166,7 +161,6 @@ bash run.sh --stage 4 --stop_stage 4 or you can also use these scripts in the command line (only use CPU). ```bash . ./path.sh -. 
./cmd.sh bash ./local/data.sh CUDA_VISIBLE_DEVICES= ./local/train.sh conf/transformer.yaml transformer avg.sh best exp/transformer/checkpoints 1 From 61728f8db68cc0d9d167ca75dec41eaa19168ddf Mon Sep 17 00:00:00 2001 From: Wang Xin Date: Tue, 26 Nov 2024 17:00:49 +0800 Subject: [PATCH 04/36] NumPy compatibility enhancements (#3907) * NumPy compatibility enhancements * update code --- .../wav2vec2/modules/modeling_wav2vec2.py | 6 +- .../models/wav2vec2/modules/wav2vec2_model.py | 2 +- paddlespeech/s2t/models/wavlm/wavlm_paddle.py | 396 +++++++++--------- paddlespeech/t2s/datasets/get_feats.py | 2 +- paddlespeech/t2s/utils/internals.py | 2 +- 5 files changed, 193 insertions(+), 215 deletions(-) diff --git a/paddlespeech/s2t/models/wav2vec2/modules/modeling_wav2vec2.py b/paddlespeech/s2t/models/wav2vec2/modules/modeling_wav2vec2.py index 688bf5f84..797c23a0f 100644 --- a/paddlespeech/s2t/models/wav2vec2/modules/modeling_wav2vec2.py +++ b/paddlespeech/s2t/models/wav2vec2/modules/modeling_wav2vec2.py @@ -129,7 +129,7 @@ def _compute_mask_indices( [sequence_length for _ in range(batch_size)]) # SpecAugment mask to fill - spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=np.bool) + spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=np.bool_) spec_aug_mask_idxs = [] max_num_masked_span = compute_num_masked_span(sequence_length) @@ -207,9 +207,9 @@ def _sample_negative_indices(features_shape: Tuple, sampled_negative_indices = np.zeros( shape=(batch_size, sequence_length, num_negatives), dtype=np.int32) - mask_time_indices = (mask_time_indices.astype(np.bool) + mask_time_indices = (mask_time_indices.astype(np.bool_) if mask_time_indices is not None else - np.ones(features_shape, dtype=np.bool)) + np.ones(features_shape, dtype=np.bool_)) for batch_idx in range(batch_size): high = mask_time_indices[batch_idx].sum() - 1 diff --git a/paddlespeech/s2t/models/wav2vec2/modules/wav2vec2_model.py b/paddlespeech/s2t/models/wav2vec2/modules/wav2vec2_model.py index b753f6c52..be78b516a 100644 --- a/paddlespeech/s2t/models/wav2vec2/modules/wav2vec2_model.py +++ b/paddlespeech/s2t/models/wav2vec2/modules/wav2vec2_model.py @@ -1476,7 +1476,7 @@ def compute_mask_indices( lens = np.fromiter( (e - s if e - s >= length + min_space else 0 for s, e in parts), - np.int, ) + np.int_, ) l_sum = np.sum(lens) if l_sum == 0: break diff --git a/paddlespeech/s2t/models/wavlm/wavlm_paddle.py b/paddlespeech/s2t/models/wavlm/wavlm_paddle.py index 6ed9ecd0e..1a0fca531 100644 --- a/paddlespeech/s2t/models/wavlm/wavlm_paddle.py +++ b/paddlespeech/s2t/models/wavlm/wavlm_paddle.py @@ -6,40 +6,38 @@ # Based on fairseq code bases # https://github.com/pytorch/fairseq # -------------------------------------------------------- - -import math import logging -from typing import List, Optional, Tuple +import math +from typing import List +from typing import Optional +from typing import Tuple import numpy as np - import paddle import paddle.nn as nn import paddle.nn.functional as F -from paddle.nn import LayerNorm from paddle import Tensor -from .modules.modules import ( - MultiheadAttention, - SamePad, - get_activation_fn, - TransposeLast, - GLU_Linear, -) +from paddle.nn import LayerNorm + +from .modules.modules import get_activation_fn +from .modules.modules import GLU_Linear +from .modules.modules import MultiheadAttention +from .modules.modules import SamePad +from .modules.modules import TransposeLast logger = logging.getLogger(__name__) def compute_mask_indices( - shape: Tuple[int, int], - padding_mask: 
Optional[Tensor], - mask_prob: float, - mask_length: int, - mask_type: str = "static", - mask_other: float = 0.0, - min_masks: int = 0, - no_overlap: bool = False, - min_space: int = 0, -) -> np.ndarray: + shape: Tuple[int, int], + padding_mask: Optional[Tensor], + mask_prob: float, + mask_length: int, + mask_type: str="static", + mask_other: float=0.0, + min_masks: int=0, + no_overlap: bool=False, + min_space: int=0, ) -> np.ndarray: """ Computes random mask spans for a given shape @@ -65,9 +63,7 @@ def compute_mask_indices( all_num_mask = int( # add a random number for probabilistic rounding - mask_prob * all_sz / float(mask_length) - + np.random.rand() - ) + mask_prob * all_sz / float(mask_length) + np.random.rand()) all_num_mask = max(min_masks, all_num_mask) @@ -77,9 +73,7 @@ def compute_mask_indices( sz = all_sz - padding_mask[i].long().sum().item() num_mask = int( # add a random number for probabilistic rounding - mask_prob * sz / float(mask_length) - + np.random.rand() - ) + mask_prob * sz / float(mask_length) + np.random.rand()) num_mask = max(min_masks, num_mask) else: sz = all_sz @@ -88,7 +82,8 @@ def compute_mask_indices( if mask_type == "static": lengths = np.full(num_mask, mask_length) elif mask_type == "uniform": - lengths = np.random.randint(mask_other, mask_length * 2 + 1, size=num_mask) + lengths = np.random.randint( + mask_other, mask_length * 2 + 1, size=num_mask) elif mask_type == "normal": lengths = np.random.normal(mask_length, mask_other, size=num_mask) lengths = [max(1, int(round(x))) for x in lengths] @@ -119,9 +114,9 @@ def compute_mask_indices( min_length = min(lengths) for length in sorted(lengths, reverse=True): lens = np.fromiter( - (e - s if e - s >= length + min_space else 0 for s, e in parts), - np.int, - ) + (e - s if e - s >= length + min_space else 0 + for s, e in parts), + np.int_, ) l_sum = np.sum(lens) if l_sum == 0: break @@ -137,13 +132,10 @@ def compute_mask_indices( mask_idc = np.random.choice(sz - min_len, num_mask, replace=False) - mask_idc = np.asarray( - [ - mask_idc[j] + offset - for j in range(len(mask_idc)) - for offset in range(lengths[j]) - ] - ) + mask_idc = np.asarray([ + mask_idc[j] + offset + for j in range(len(mask_idc)) for offset in range(lengths[j]) + ]) mask_idcs.append(np.unique(mask_idc[mask_idc < sz])) @@ -158,54 +150,54 @@ def compute_mask_indices( class WavLMConfig: def __init__(self, cfg=None): - self.extractor_mode: str = "default" # mode for feature extractor. default has a single group norm with d groups in the first conv block, whereas layer_norm has layer norms in every block (meant to use with normalize=True) - self.encoder_layers: int = 12 # num encoder layers in the transformer + self.extractor_mode: str = "default" # mode for feature extractor. 
default has a single group norm with d groups in the first conv block, whereas layer_norm has layer norms in every block (meant to use with normalize=True) + self.encoder_layers: int = 12 # num encoder layers in the transformer - self.encoder_embed_dim: int = 768 # encoder embedding dimension - self.encoder_ffn_embed_dim: int = 3072 # encoder embedding dimension for FFN - self.encoder_attention_heads: int = 12 # num encoder attention heads - self.activation_fn: str = "gelu" # activation function to use + self.encoder_embed_dim: int = 768 # encoder embedding dimension + self.encoder_ffn_embed_dim: int = 3072 # encoder embedding dimension for FFN + self.encoder_attention_heads: int = 12 # num encoder attention heads + self.activation_fn: str = "gelu" # activation function to use - self.layer_norm_first: bool = False # apply layernorm first in the transformer - self.conv_feature_layers: str = "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2" # string describing convolutional feature extraction layers in form of a python list that contains [(dim, kernel_size, stride), ...] - self.conv_bias: bool = False # include bias in conv encoder - self.feature_grad_mult: float = 1.0 # multiply feature extractor var grads by this + self.layer_norm_first: bool = False # apply layernorm first in the transformer + self.conv_feature_layers: str = "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2" # string describing convolutional feature extraction layers in form of a python list that contains [(dim, kernel_size, stride), ...] + self.conv_bias: bool = False # include bias in conv encoder + self.feature_grad_mult: float = 1.0 # multiply feature extractor var grads by this self.normalize: bool = False # normalize input to have 0 mean and unit variance during training # dropouts - self.dropout: float = 0.1 # dropout probability for the transformer - self.attention_dropout: float = 0.1 # dropout probability for attention weights - self.activation_dropout: float = 0.0 # dropout probability after activation in FFN - self.encoder_layerdrop: float = 0.0 # probability of dropping a tarnsformer layer - self.dropout_input: float = 0.0 # dropout to apply to the input (after feat extr) - self.dropout_features: float = 0.0 # dropout to apply to the features (after feat extr) + self.dropout: float = 0.1 # dropout probability for the transformer + self.attention_dropout: float = 0.1 # dropout probability for attention weights + self.activation_dropout: float = 0.0 # dropout probability after activation in FFN + self.encoder_layerdrop: float = 0.0 # probability of dropping a tarnsformer layer + self.dropout_input: float = 0.0 # dropout to apply to the input (after feat extr) + self.dropout_features: float = 0.0 # dropout to apply to the features (after feat extr) # masking - self.mask_length: int = 10 # mask length - self.mask_prob: float = 0.65 # probability of replacing a token with mask - self.mask_selection: str = "static" # how to choose mask length - self.mask_other: float = 0 # secondary mask argument (used for more complex distributions), see help in compute_mask_indicesh - self.no_mask_overlap: bool = False # whether to allow masks to overlap - self.mask_min_space: int = 1 # min space between spans (if no overlap is enabled) + self.mask_length: int = 10 # mask length + self.mask_prob: float = 0.65 # probability of replacing a token with mask + self.mask_selection: str = "static" # how to choose mask length + self.mask_other: float = 0 # secondary mask argument (used for more complex distributions), see help in 
compute_mask_indicesh + self.no_mask_overlap: bool = False # whether to allow masks to overlap + self.mask_min_space: int = 1 # min space between spans (if no overlap is enabled) # channel masking - self.mask_channel_length: int = 10 # length of the mask for features (channels) - self.mask_channel_prob: float = 0.0 # probability of replacing a feature with 0 - self.mask_channel_selection: str = "static" # how to choose mask length for channel masking - self.mask_channel_other: float = 0 # secondary mask argument (used for more complex distributions), see help in compute_mask_indices - self.no_mask_channel_overlap: bool = False # whether to allow channel masks to overlap - self.mask_channel_min_space: int = 1 # min space between spans (if no overlap is enabled) + self.mask_channel_length: int = 10 # length of the mask for features (channels) + self.mask_channel_prob: float = 0.0 # probability of replacing a feature with 0 + self.mask_channel_selection: str = "static" # how to choose mask length for channel masking + self.mask_channel_other: float = 0 # secondary mask argument (used for more complex distributions), see help in compute_mask_indices + self.no_mask_channel_overlap: bool = False # whether to allow channel masks to overlap + self.mask_channel_min_space: int = 1 # min space between spans (if no overlap is enabled) # positional embeddings - self.conv_pos: int = 128 # number of filters for convolutional positional embeddings - self.conv_pos_groups: int = 16 # number of groups for convolutional positional embedding + self.conv_pos: int = 128 # number of filters for convolutional positional embeddings + self.conv_pos_groups: int = 16 # number of groups for convolutional positional embedding # relative position embedding - self.relative_position_embedding: bool = True # apply relative position embedding - self.num_buckets: int = 320 # number of buckets for relative position embedding - self.max_distance: int = 1280 # maximum distance for relative position embedding - self.gru_rel_pos: bool = True # apply gated relative position embedding + self.relative_position_embedding: bool = True # apply relative position embedding + self.num_buckets: int = 320 # number of buckets for relative position embedding + self.max_distance: int = 1280 # maximum distance for relative position embedding + self.gru_rel_pos: bool = True # apply gated relative position embedding if cfg is not None: self.update(cfg) @@ -216,9 +208,8 @@ class WavLMConfig: class WavLM(nn.Layer): def __init__( - self, - cfg: WavLMConfig, - ) -> None: + self, + cfg: WavLMConfig, ) -> None: super().__init__() logger.info(f"WavLM Config: {cfg.__dict__}") @@ -230,14 +221,11 @@ class WavLM(nn.Layer): conv_layers=feature_enc_layers, dropout=0.0, mode=cfg.extractor_mode, - conv_bias=cfg.conv_bias, - ) + conv_bias=cfg.conv_bias, ) - self.post_extract_proj = ( - nn.Linear(self.embed, cfg.encoder_embed_dim) - if self.embed != cfg.encoder_embed_dim - else None - ) + self.post_extract_proj = (nn.Linear(self.embed, cfg.encoder_embed_dim) + if self.embed != cfg.encoder_embed_dim else + None) self.mask_prob = cfg.mask_prob self.mask_selection = cfg.mask_selection @@ -260,8 +248,7 @@ class WavLM(nn.Layer): self.mask_emb = self.create_parameter( shape=[cfg.encoder_embed_dim], - default_initializer=nn.initializer.Uniform(), - ) + default_initializer=nn.initializer.Uniform(), ) self.encoder = TransformerEncoder(cfg) self.layer_norm = LayerNorm(self.embed) @@ -278,8 +265,7 @@ class WavLM(nn.Layer): self.mask_other, min_masks=2, 
no_overlap=self.no_mask_overlap, - min_space=self.mask_min_space, - ) + min_space=self.mask_min_space, ) # mask_indices = torch.from_numpy(mask_indices).to(x.device) mask_indices = paddle.to_tensor(mask_indices, dtype='int64') x[mask_indices] = self.mask_emb @@ -295,40 +281,35 @@ class WavLM(nn.Layer): self.mask_channel_selection, self.mask_channel_other, no_overlap=self.no_mask_channel_overlap, - min_space=self.mask_channel_min_space, - ) + min_space=self.mask_channel_min_space, ) mask_channel_indices = ( # torch.from_numpy(mask_channel_indices) paddle.to_tensor(mask_channel_indices, dtype='int64') - .to(x.device) - .unsqueeze(1) - .expand(-1, T, -1) - ) + .to(x.device).unsqueeze(1).expand(-1, T, -1)) x[mask_channel_indices] = 0 return x, mask_indices def forward_padding_mask( - self, features: Tensor, padding_mask: Tensor, - ) -> Tensor: + self, + features: Tensor, + padding_mask: Tensor, ) -> Tensor: extra = padding_mask.size(1) % features.size(1) if extra > 0: padding_mask = padding_mask[:, :-extra] padding_mask = padding_mask.view( - padding_mask.size(0), features.size(1), -1 - ) + padding_mask.size(0), features.size(1), -1) padding_mask = padding_mask.all(-1) return padding_mask def extract_features( - self, - source: Tensor, - padding_mask: Optional[Tensor] = None, - mask: bool = False, - ret_conv: bool = False, - output_layer: Optional[int] = None, - ret_layer_results: bool = False, - ): + self, + source: Tensor, + padding_mask: Optional[Tensor]=None, + mask: bool=False, + ret_conv: bool=False, + output_layer: Optional[int]=None, + ret_layer_results: bool=False, ): if self.feature_grad_mult > 0: features = self.feature_extractor(source) @@ -339,7 +320,7 @@ class WavLM(nn.Layer): with paddle.no_grad(): features = self.feature_extractor(source) - features = features.transpose([0, 2, 1]) # [1, 49, 512] + features = features.transpose([0, 2, 1]) # [1, 49, 512] features = self.layer_norm(features) if padding_mask is not None: @@ -351,9 +332,7 @@ class WavLM(nn.Layer): features = self.dropout_input(features) if mask: - x, mask_indices = self.apply_mask( - features, padding_mask - ) + x, mask_indices = self.apply_mask(features, padding_mask) else: x = features @@ -362,33 +341,35 @@ class WavLM(nn.Layer): # x: (B, T, D), float # padding_mask: (B, T), bool # mask_indices: (B, T), bool - + x, layer_results = self.encoder( x, padding_mask=padding_mask, - layer=None if output_layer is None else output_layer - 1 - ) + layer=None if output_layer is None else output_layer - 1) # print(f"Debugging: x.shape: {x.shape}, x.mean(): {x.mean()}, x.std(): {x.std()}") - res = {"x": x, "padding_mask": padding_mask, "features": features, "layer_results": layer_results} + res = { + "x": x, + "padding_mask": padding_mask, + "features": features, + "layer_results": layer_results + } feature = res["features"] if ret_conv else res["x"] if ret_layer_results: feature = (feature, res["layer_results"]) return feature, res["padding_mask"] - + def forward(self, x): return self.extract_features(x)[0] class ConvFeatureExtractionModel(nn.Layer): - def __init__( - self, - conv_layers: List[Tuple[int, int, int]], - dropout: float = 0.0, - mode: str = "default", - conv_bias: bool = False, - conv_type: str = "default" - ): + def __init__(self, + conv_layers: List[Tuple[int, int, int]], + dropout: float=0.0, + mode: str="default", + conv_bias: bool=False, + conv_type: str="default"): super().__init__() assert mode in {"default", "layer_norm"} @@ -400,17 +381,20 @@ class ConvFeatureExtractionModel(nn.Layer): stride, 
is_layer_norm=False, is_group_norm=False, - conv_bias=False, - ): + conv_bias=False, ): def make_conv(): - conv = nn.Conv1D(n_in, n_out, k, stride=stride, bias_attr=conv_bias, - weight_attr=nn.initializer.KaimingNormal()) + conv = nn.Conv1D( + n_in, + n_out, + k, + stride=stride, + bias_attr=conv_bias, + weight_attr=nn.initializer.KaimingNormal()) # nn.init.kaiming_normal_(conv.weight) return conv - assert ( - is_layer_norm and is_group_norm - ) == False, "layer norm and group norm are exclusive" + assert (is_layer_norm and is_group_norm + ) == False, "layer norm and group norm are exclusive" if is_layer_norm: return nn.Sequential( @@ -419,19 +403,18 @@ class ConvFeatureExtractionModel(nn.Layer): nn.Sequential( TransposeLast(), nn.LayerNorm(normalized_shape=dim, epsilon=1e-5), - TransposeLast(), - ), - nn.GELU(), - ) + TransposeLast(), ), + nn.GELU(), ) elif is_group_norm: return nn.Sequential( make_conv(), nn.Dropout(p=dropout), - nn.GroupNorm(num_groups=dim, num_channels=dim, epsilon=1e-5), - nn.GELU(), - ) + nn.GroupNorm( + num_groups=dim, num_channels=dim, epsilon=1e-5), + nn.GELU(), ) else: - return nn.Sequential(make_conv(), nn.Dropout(p=dropout), nn.GELU()) + return nn.Sequential( + make_conv(), nn.Dropout(p=dropout), nn.GELU()) self.conv_type = conv_type if self.conv_type == "default": @@ -449,9 +432,7 @@ class ConvFeatureExtractionModel(nn.Layer): stride, is_layer_norm=mode == "layer_norm", is_group_norm=mode == "default" and i == 0, - conv_bias=conv_bias, - ) - ) + conv_bias=conv_bias, )) in_d = dim elif self.conv_type == "conv2d": in_d = 1 @@ -460,9 +441,7 @@ class ConvFeatureExtractionModel(nn.Layer): assert len(cl) == 3 (dim, k, stride) = cl - self.conv_layers.append( - paddle.nn.Conv2D(in_d, dim, k, stride) - ) + self.conv_layers.append(paddle.nn.Conv2D(in_d, dim, k, stride)) self.conv_layers.append(paddle.nn.ReLU()) in_d = dim elif self.conv_type == "custom": @@ -473,17 +452,13 @@ class ConvFeatureExtractionModel(nn.Layer): assert len(cl) == 3 (dim, k, stride) = cl self.conv_layers.append( - paddle.nn.Conv2D(in_d, dim, k, stride, padding=1) - ) - self.conv_layers.append( - paddle.nn.LayerNorm([dim, idim]) - ) + paddle.nn.Conv2D(in_d, dim, k, stride, padding=1)) + self.conv_layers.append(paddle.nn.LayerNorm([dim, idim])) self.conv_layers.append(paddle.nn.ReLU()) in_d = dim if (i + 1) % 2 == 0: self.conv_layers.append( - paddle.nn.MaxPool2D(2, stride=2, ceil_mode=True) - ) + paddle.nn.MaxPool2D(2, stride=2, ceil_mode=True)) idim = int(math.ceil(idim / 2)) else: pass @@ -518,8 +493,8 @@ class TransformerEncoder(nn.Layer): self.dropout = args.dropout self.embedding_dim = args.encoder_embed_dim dropout = 0 - std = math.sqrt((4 * (1.0 - dropout)) / (args.conv_pos * self.embedding_dim)) - + std = math.sqrt( + (4 * (1.0 - dropout)) / (args.conv_pos * self.embedding_dim)) self.pos_conv = nn.Conv1D( self.embedding_dim, @@ -528,15 +503,16 @@ class TransformerEncoder(nn.Layer): padding=args.conv_pos // 2, groups=args.conv_pos_groups, weight_attr=nn.initializer.Normal(mean=0, std=std), - bias_attr=True - ) + bias_attr=True) # nn.init.normal_(self.pos_conv.weight, mean=0, std=std) # nn.init.constant_(self.pos_conv.bias, 0) # self.pos_conv = nn.utils.weight_norm(self.pos_conv, name="weight", dim=2) # self.pos_conv.weight_g = self.pos_conv.weight_g.unsqueeze(0).unsqueeze(0) - self.pos_conv = nn.utils.weight_norm(self.pos_conv, name="weight", dim=2) - self.pos_conv = nn.Sequential(self.pos_conv, SamePad(args.conv_pos), nn.GELU()) + self.pos_conv = nn.utils.weight_norm( + self.pos_conv, 
name="weight", dim=2) + self.pos_conv = nn.Sequential(self.pos_conv, + SamePad(args.conv_pos), nn.GELU()) if hasattr(args, "relative_position_embedding"): self.relative_position_embedding = args.relative_position_embedding @@ -547,25 +523,23 @@ class TransformerEncoder(nn.Layer): self.num_buckets = 0 self.max_distance = 0 - self.layers = nn.LayerList( - [ - TransformerSentenceEncoderLayer( - embedding_dim=self.embedding_dim, - ffn_embedding_dim=args.encoder_ffn_embed_dim, - num_attention_heads=args.encoder_attention_heads, - dropout=self.dropout, - attention_dropout=args.attention_dropout, - activation_dropout=args.activation_dropout, - activation_fn=args.activation_fn, - layer_norm_first=args.layer_norm_first, - has_relative_attention_bias=(self.relative_position_embedding and i == 0), - num_buckets=self.num_buckets, - max_distance=self.max_distance, - gru_rel_pos=args.gru_rel_pos, - ) - for i in range(args.encoder_layers) - ] - ) + self.layers = nn.LayerList([ + TransformerSentenceEncoderLayer( + embedding_dim=self.embedding_dim, + ffn_embedding_dim=args.encoder_ffn_embed_dim, + num_attention_heads=args.encoder_attention_heads, + dropout=self.dropout, + attention_dropout=args.attention_dropout, + activation_dropout=args.activation_dropout, + activation_fn=args.activation_fn, + layer_norm_first=args.layer_norm_first, + has_relative_attention_bias=( + self.relative_position_embedding and i == 0), + num_buckets=self.num_buckets, + max_distance=self.max_distance, + gru_rel_pos=args.gru_rel_pos, ) + for i in range(args.encoder_layers) + ]) self.layer_norm_first = args.layer_norm_first self.layer_norm = LayerNorm(self.embedding_dim) @@ -574,14 +548,19 @@ class TransformerEncoder(nn.Layer): # self.apply(init_bert_params) def forward(self, x, padding_mask=None, streaming_mask=None, layer=None): - x, layer_results = self.extract_features(x, padding_mask, streaming_mask, layer) + x, layer_results = self.extract_features(x, padding_mask, + streaming_mask, layer) # print("x.shape", x.shape) if self.layer_norm_first and layer is None: x = self.layer_norm(x) return x, layer_results - def extract_features(self, x, padding_mask=None, streaming_mask=None, tgt_layer=None): + def extract_features(self, + x, + padding_mask=None, + streaming_mask=None, + tgt_layer=None): if padding_mask is not None: x[padding_mask] = 0 @@ -598,7 +577,6 @@ class TransformerEncoder(nn.Layer): # x = x.transpose(0, 1) x = x.transpose([1, 0, 2]) - layer_results = [] z = None if tgt_layer is not None: @@ -608,7 +586,12 @@ class TransformerEncoder(nn.Layer): for i, layer in enumerate(self.layers): dropout_probability = np.random.random() if not self.training or (dropout_probability > self.layerdrop): - x, z, pos_bias = layer(x, self_attn_padding_mask=padding_mask, need_weights=False,self_attn_mask=streaming_mask, pos_bias=pos_bias) + x, z, pos_bias = layer( + x, + self_attn_padding_mask=padding_mask, + need_weights=False, + self_attn_mask=streaming_mask, + pos_bias=pos_bias) if tgt_layer is not None: layer_results.append((x, z)) if i == tgt_layer: @@ -633,20 +616,19 @@ class TransformerSentenceEncoderLayer(nn.Layer): def __init__( self, - embedding_dim: float = 768, - ffn_embedding_dim: float = 3072, - num_attention_heads: float = 8, - dropout: float = 0.1, - attention_dropout: float = 0.1, - activation_dropout: float = 0.1, - activation_fn: str = "relu", - layer_norm_first: bool = False, - has_relative_attention_bias: bool = True, - num_buckets: int = 0, - max_distance: int = 0, - rescale_init: bool = False, - gru_rel_pos: bool = 
True, - ) -> None: + embedding_dim: float=768, + ffn_embedding_dim: float=3072, + num_attention_heads: float=8, + dropout: float=0.1, + attention_dropout: float=0.1, + activation_dropout: float=0.1, + activation_fn: str="relu", + layer_norm_first: bool=False, + has_relative_attention_bias: bool=True, + num_buckets: int=0, + max_distance: int=0, + rescale_init: bool=False, + gru_rel_pos: bool=True, ) -> None: super().__init__() # Initialize parameters @@ -666,8 +648,7 @@ class TransformerSentenceEncoderLayer(nn.Layer): num_buckets=num_buckets, max_distance=max_distance, rescale_init=rescale_init, - gru_rel_pos=gru_rel_pos, - ) + gru_rel_pos=gru_rel_pos, ) self.dropout1 = nn.Dropout(dropout) self.dropout2 = nn.Dropout(self.activation_dropout) @@ -679,7 +660,8 @@ class TransformerSentenceEncoderLayer(nn.Layer): self.self_attn_layer_norm = LayerNorm(self.embedding_dim) if self.activation_name == "glu": - self.fc1 = GLU_Linear(self.embedding_dim, ffn_embedding_dim, "swish") + self.fc1 = GLU_Linear(self.embedding_dim, ffn_embedding_dim, + "swish") else: self.fc1 = nn.Linear(self.embedding_dim, ffn_embedding_dim) self.fc2 = nn.Linear(ffn_embedding_dim, self.embedding_dim) @@ -687,21 +669,19 @@ class TransformerSentenceEncoderLayer(nn.Layer): # layer norm associated with the position wise feed-forward NN self.final_layer_norm = LayerNorm(self.embedding_dim) - def forward( - self, - x: Tensor, - self_attn_mask: Tensor = None, - self_attn_padding_mask: Tensor = None, - need_weights: bool = False, - pos_bias=None - ): + def forward(self, + x: Tensor, + self_attn_mask: Tensor=None, + self_attn_padding_mask: Tensor=None, + need_weights: bool=False, + pos_bias=None): """ LayerNorm is applied either before or after the self-attention/ffn modules similar to the original Transformer imlementation. 
""" residual = x if self.layer_norm_first: - + x = self.self_attn_layer_norm(x) x, attn, pos_bias = self.self_attn( query=x, @@ -710,8 +690,7 @@ class TransformerSentenceEncoderLayer(nn.Layer): key_padding_mask=self_attn_padding_mask, need_weights=False, attn_mask=self_attn_mask, - position_bias=pos_bias - ) + position_bias=pos_bias) # import pdb; pdb.set_trace() x = self.dropout1(x) x = residual + x @@ -734,8 +713,7 @@ class TransformerSentenceEncoderLayer(nn.Layer): key_padding_mask=self_attn_padding_mask, need_weights=need_weights, attn_mask=self_attn_mask, - position_bias=pos_bias - ) + position_bias=pos_bias) x = self.dropout1(x) x = residual + x diff --git a/paddlespeech/t2s/datasets/get_feats.py b/paddlespeech/t2s/datasets/get_feats.py index ea273e245..116554350 100644 --- a/paddlespeech/t2s/datasets/get_feats.py +++ b/paddlespeech/t2s/datasets/get_feats.py @@ -138,7 +138,7 @@ class Pitch(): input: np.ndarray, use_continuous_f0: bool=True, use_log_f0: bool=True) -> np.ndarray: - input = input.astype(np.float) + input = input.astype(np.float_) frame_period = 1000 * self.hop_length / self.sr f0, timeaxis = pyworld.dio( input, diff --git a/paddlespeech/t2s/utils/internals.py b/paddlespeech/t2s/utils/internals.py index 830e8a80f..56b3ecaae 100644 --- a/paddlespeech/t2s/utils/internals.py +++ b/paddlespeech/t2s/utils/internals.py @@ -36,7 +36,7 @@ def convert_dtype_to_np_dtype_(dtype): elif dtype is core.VarDesc.VarType.FP16: return np.float16 elif dtype is core.VarDesc.VarType.BOOL: - return np.bool + return np.bool_ elif dtype is core.VarDesc.VarType.INT32: return np.int32 elif dtype is core.VarDesc.VarType.INT64: From 77dfdc439f0b9d938bfc6c07dd805b080c0059f1 Mon Sep 17 00:00:00 2001 From: megemini Date: Wed, 27 Nov 2024 11:13:29 +0800 Subject: [PATCH 05/36] [Update] tal_cs readme (#3911) --- examples/tal_cs/asr1/README.md | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/examples/tal_cs/asr1/README.md b/examples/tal_cs/asr1/README.md index 83a27ac1e..176925190 100644 --- a/examples/tal_cs/asr1/README.md +++ b/examples/tal_cs/asr1/README.md @@ -27,7 +27,6 @@ The document below will describe the scripts in `run.sh` in detail. The path.sh contains the environment variables. ```bash . ./path.sh -. ./cmd.sh ``` This script needs to be run first. And another script is also needed: ```bash @@ -67,7 +66,6 @@ bash run.sh --stage 0 --stop_stage 0 You can also just run these scripts in your command line. ```bash . ./path.sh -. ./cmd.sh bash ./local/data.sh ``` After processing the data, the `data` directory will look like this: @@ -103,7 +101,6 @@ bash run.sh --stage 0 --stop_stage 1 or you can run these scripts in the command line (only use CPU). ```bash . ./path.sh -. ./cmd.sh bash ./local/data.sh CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer ``` @@ -124,7 +121,6 @@ or you can run these scripts in the command line (only use CPU). ```bash . ./path.sh -. ./cmd.sh bash ./local/data.sh CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer avg.sh best exp/conformer/checkpoints 10 @@ -144,11 +140,10 @@ bash run.sh --stage 0 --stop_stage 3 or you can run these scripts in the command line (only use CPU). ```bash . ./path.sh -. 
./cmd.sh
 bash ./local/data.sh
 CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer
 avg.sh best exp/conformer/checkpoints 10
-CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml exp/conformer/checkpoints/avg_10
+CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml conf/tuning/decode.yaml exp/conformer/checkpoints/avg_10
 ```
 ## Pretrained Model
 You can get the pretrained transformer or conformer from [this](../../../docs/source/released_model.md).
@@ -163,7 +158,7 @@ source path.sh
 # If you have process the data and get the manifest file, you can skip the following 2 steps
 bash local/data.sh --stage -1 --stop_stage -1
 bash local/data.sh --stage 2 --stop_stage 2
-CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml exp/conformer/checkpoints/avg_10
+CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml conf/tuning/decode.yaml exp/conformer/checkpoints/avg_10
 ```
 The performance of the released models are shown in [here](./RESULTS.md).
@@ -186,5 +181,5 @@ wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wa
 ```
 You need to prepare an audio file or use the audio demo above, please confirm the sample rate of the audio is 16K. You can get the result of the audio demo by running the script below.
 ```bash
-CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/conformer.yaml exp/conformer/checkpoints/avg_10 data/demo_01_03.wav
+CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/conformer.yaml conf/tuning/decode.yaml exp/conformer/checkpoints/avg_10 data/demo_01_03.wav
 ```

From 3e53497a28ddfc69997d44def1e896df89e60353 Mon Sep 17 00:00:00 2001
From: megemini
Date: Fri, 29 Nov 2024 19:16:55 +0800
Subject: [PATCH 06/36] [Hackathon 7th] Fix the `spk_emb` dimension issue in
 vctk (#3916)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* [Fix] vctk spk_emb dim

* [Update] dim == 1
---
 paddlespeech/t2s/models/fastspeech2/fastspeech2.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
index a95a9b288..fcd54f0d2 100644
--- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
+++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
@@ -841,6 +841,9 @@ class FastSpeech2(nn.Layer):
             spk_emb = self.spk_projection(F.normalize(spk_emb))
             hs = hs + spk_emb.unsqueeze(1)
         elif self.spk_embed_integration_type == "concat":
+            # one wave `spk_emb` under synthesize, the dim is `1`
+            if spk_emb.dim() == 1:
+                spk_emb = spk_emb.unsqueeze(0)
             # concat hidden states with spk embeds and then apply projection
             spk_emb = F.normalize(spk_emb).unsqueeze(1).expand(
                 shape=[-1, paddle.shape(hs)[1], -1])

From a397ebe207b8a0de2a490b4e2f95c3b8754075e5 Mon Sep 17 00:00:00 2001
From: megemini
Date: Fri, 29 Nov 2024 19:23:45 +0800
Subject: [PATCH 07/36] [Fix] import print_arguments (#3918)

---
 paddlespeech/s2t/exps/hubert/bin/test.py  | 2 +-
 paddlespeech/s2t/exps/hubert/bin/train.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddlespeech/s2t/exps/hubert/bin/test.py b/paddlespeech/s2t/exps/hubert/bin/test.py
index e0ad09f0a..b08b0209a 100644
--- a/paddlespeech/s2t/exps/hubert/bin/test.py
+++ b/paddlespeech/s2t/exps/hubert/bin/test.py
@@ -18,7 +18,7 @@ from yacs.config import CfgNode
 from paddlespeech.s2t.exps.hubert.model import HubertASRTester as Tester
 from paddlespeech.s2t.training.cli import default_argument_parser
-from paddlespeech.s2t.utils.utility import print_arguments
+from paddlespeech.utils.argparse import print_arguments


 def main_sp(config, args):
diff --git a/paddlespeech/s2t/exps/hubert/bin/train.py b/paddlespeech/s2t/exps/hubert/bin/train.py
index b7c0a924f..391405674 100644
--- a/paddlespeech/s2t/exps/hubert/bin/train.py
+++ b/paddlespeech/s2t/exps/hubert/bin/train.py
@@ -19,7 +19,7 @@ from yacs.config import CfgNode
 from paddlespeech.s2t.exps.hubert.model import HubertASRTrainer as Trainer
 from paddlespeech.s2t.training.cli import default_argument_parser
-from paddlespeech.s2t.utils.utility import print_arguments
+from paddlespeech.utils.argparse import print_arguments


 def main_sp(config, args):

From 5e8c727fd6785e900feaa455e21b1ab93f7dc0b6 Mon Sep 17 00:00:00 2001
From: megemini
Date: Fri, 29 Nov 2024 19:29:46 +0800
Subject: [PATCH 08/36] [Hackathon 7th] Fix 0D tensor to 1D in tal_cs tests
 (#3913)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* [Fix] 0D tensor to 1D

* [Update] feat dim
---
 paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py | 2 +-
 paddlespeech/s2t/exps/u2/bin/quant.py             | 2 +-
 paddlespeech/s2t/exps/u2/bin/test_wav.py          | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py b/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py
index d087405d5..0b763684f 100644
--- a/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py
+++ b/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py
@@ -75,7 +75,7 @@ class DeepSpeech2Tester_hub():
         feat = self.preprocessing(audio, **self.preprocess_args)
         logger.info(f"feat shape: {feat.shape}")

-        audio_len = paddle.to_tensor(feat.shape[0])
+        audio_len = paddle.to_tensor(feat.shape[0]).unsqueeze(0)
         audio = paddle.to_tensor(feat, dtype='float32').unsqueeze(axis=0)

         result_transcripts = self.compute_result_transcripts(
diff --git a/paddlespeech/s2t/exps/u2/bin/quant.py b/paddlespeech/s2t/exps/u2/bin/quant.py
index 73a9794fc..72c64e467 100755
--- a/paddlespeech/s2t/exps/u2/bin/quant.py
+++ b/paddlespeech/s2t/exps/u2/bin/quant.py
@@ -75,7 +75,7 @@ class U2Infer():
             feat = self.preprocessing(audio, **self.preprocess_args)
             logger.info(f"feat shape: {feat.shape}")

-            ilen = paddle.to_tensor(feat.shape[0])
+            ilen = paddle.to_tensor(feat.shape[0]).unsqueeze(0)
             xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(0)
             decode_config = self.config.decode
             logger.info(f"decode cfg: {decode_config}")
diff --git a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py
index a6228a128..0d1a3b3cc 100644
--- a/paddlespeech/s2t/exps/u2/bin/test_wav.py
+++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py
@@ -78,7 +78,7 @@ class U2Infer():
             if self.args.debug:
                 np.savetxt("feat.transform.txt", feat)

-            ilen = paddle.to_tensor(feat.shape[0])
+            ilen = paddle.to_tensor(feat.shape[0]).unsqueeze(0)
             xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(0)
             decode_config = self.config.decode
             logger.info(f"decode cfg: {decode_config}")

From 4015676a425dab5e3f88e3dd02b54f51da4d7929 Mon Sep 17 00:00:00 2001
From: megemini
Date: Fri, 29 Nov 2024 19:30:50 +0800
Subject: [PATCH 09/36] [Hackathon 7th] Fix the `asr` `readme` in
 `librispeech` (#3917)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* [Fix] librispeech asr0 readme

* [Fix] librispeech asr1 readme
---
examples/librispeech/asr0/README.md | 4 ++-- examples/librispeech/asr1/README.md | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/librispeech/asr0/README.md b/examples/librispeech/asr0/README.md index 2d3836c6b..a097dd99f 100644 --- a/examples/librispeech/asr0/README.md +++ b/examples/librispeech/asr0/README.md @@ -144,7 +144,7 @@ source path.sh bash ./local/data.sh CUDA_VISIBLE_DEVICES= ./local/train.sh conf/deepspeech2.yaml deepspeech2 avg.sh best exp/deepspeech2/checkpoints 1 -CUDA_VISIBLE_DEVICES= ./local/test.sh conf/deepspeech2.yaml exp/deepspeech2/checkpoints/avg_1 +CUDA_VISIBLE_DEVICES= ./local/test.sh conf/deepspeech2.yaml conf/tuning/decode.yaml exp/deepspeech2/checkpoints/avg_1 ``` ## Stage 4: Static graph model Export This stage is to transform dygraph to static graph. @@ -185,5 +185,5 @@ wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.w ``` You can train a model by yourself, then you need to prepare an audio file or use the audio demo above, please confirm the sample rate of the audio is 16K. You can get the result of the audio demo by running the script below. ```bash -CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/deepspeech2.yaml exp/deepspeech2/checkpoints/avg_1 data/demo_002_en.wav +CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/deepspeech2.yaml conf/tuning/decode.yaml exp/deepspeech2/checkpoints/avg_1 data/demo_002_en.wav ``` diff --git a/examples/librispeech/asr1/README.md b/examples/librispeech/asr1/README.md index ca0081444..1b02698c7 100644 --- a/examples/librispeech/asr1/README.md +++ b/examples/librispeech/asr1/README.md @@ -148,7 +148,7 @@ or you can run these scripts in the command line (only use CPU). bash ./local/data.sh CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer avg.sh best exp/conformer/checkpoints 20 -CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml exp/conformer/checkpoints/avg_20 +CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml conf/tuning/decode.yaml exp/conformer/checkpoints/avg_20 ``` ## Pretrained Model You can get the pretrained transformer or conformer from [this](../../../docs/source/released_model.md). @@ -163,7 +163,7 @@ source path.sh # If you have process the data and get the manifest file, you can skip the following 2 steps bash local/data.sh --stage -1 --stop_stage -1 bash local/data.sh --stage 2 --stop_stage 2 -CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml exp/conformer/checkpoints/avg_20 +CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml conf/tuning/decode.yaml exp/conformer/checkpoints/avg_20 ``` The performance of the released models are shown in [here](./RESULTS.md). @@ -192,8 +192,8 @@ bash ./local/data.sh CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer avg.sh best exp/conformer/checkpoints 20 # test stage is optional -CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml exp/conformer/checkpoints/avg_20 -CUDA_VISIBLE_DEVICES= ./local/align.sh conf/conformer.yaml exp/conformer/checkpoints/avg_20 +CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml conf/tuning/decode.yaml exp/conformer/checkpoints/avg_20 +CUDA_VISIBLE_DEVICES= ./local/align.sh conf/conformer.yaml conf/tuning/decode.yaml exp/conformer/checkpoints/avg_20 ``` ## Stage 5: Single Audio File Inference In some situations, you want to use the trained model to do the inference for the single audio file. You can use stage 5. 
The code is shown below
@@ -214,5 +214,5 @@ wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.w
 ```
 You need to prepare an audio file or use the audio demo above, please confirm the sample rate of the audio is 16K. You can get the result of the audio demo by running the script below.
 ```bash
-CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/conformer.yaml exp/conformer/checkpoints/avg_20 data/demo_002_en.wav
+CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/conformer.yaml conf/tuning/decode.yaml exp/conformer/checkpoints/avg_20 data/demo_002_en.wav
 ```

From 5b3612f27300fa5e7a2bc9e62cd85ba6c5c8c5b1 Mon Sep 17 00:00:00 2001
From: yinfan98 <1106310035@qq.com>
Date: Mon, 2 Dec 2024 11:05:49 +0800
Subject: [PATCH 10/36] 【Hackathon 7th】fix whisper at Paddle 3.0 (#3880)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix whisper at Paddle 3.0

* fix whisper at Paddle 3.0

* fix whisper at Paddle 3.0

* fix lint

* fix

* fix whisper ci

* Update TTSCppFrontend

* Update utils

* Update steps

* Update utils

* Update __init__.py

* Update whisper.py

* Update utils

* Update utils
---
 demos/TTSArmLinux/src/TTSCppFrontend       |  2 +-
 examples/aishell/asr0/utils                |  2 +-
 examples/librispeech/asr2/steps            |  2 +-
 examples/voxceleb/sv0/utils                |  2 +-
 paddlespeech/s2t/models/whisper/whisper.py | 10 +++++-----
 runtime/examples/text_lm/utils             |  2 +-
 runtime/examples/u2pp_ol/wenetspeech/utils |  2 +-
 7 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/demos/TTSArmLinux/src/TTSCppFrontend b/demos/TTSArmLinux/src/TTSCppFrontend
index 25953976d..820985404 120000
--- a/demos/TTSArmLinux/src/TTSCppFrontend
+++ b/demos/TTSArmLinux/src/TTSCppFrontend
@@ -1 +1 @@
-../../TTSCppFrontend/
\ No newline at end of file
+../../TTSCppFrontend/
diff --git a/examples/aishell/asr0/utils b/examples/aishell/asr0/utils
index 256f914ab..94d118d25 120000
--- a/examples/aishell/asr0/utils
+++ b/examples/aishell/asr0/utils
@@ -1 +1 @@
-../../../utils/
\ No newline at end of file
+../../../utils/
diff --git a/examples/librispeech/asr2/steps b/examples/librispeech/asr2/steps
index 995eeccb7..7cb6e568e 120000
--- a/examples/librispeech/asr2/steps
+++ b/examples/librispeech/asr2/steps
@@ -1 +1 @@
-../../../tools/kaldi/egs/wsj/s5/steps/
\ No newline at end of file
+../../../tools/kaldi/egs/wsj/s5/steps/
diff --git a/examples/voxceleb/sv0/utils b/examples/voxceleb/sv0/utils
index 256f914ab..94d118d25 120000
--- a/examples/voxceleb/sv0/utils
+++ b/examples/voxceleb/sv0/utils
@@ -1 +1 @@
-../../../utils/
\ No newline at end of file
+../../../utils/
diff --git a/paddlespeech/s2t/models/whisper/whisper.py b/paddlespeech/s2t/models/whisper/whisper.py
index 9925e7cd5..d20cc04b6 100644
--- a/paddlespeech/s2t/models/whisper/whisper.py
+++ b/paddlespeech/s2t/models/whisper/whisper.py
@@ -109,11 +109,11 @@ class MultiheadAttention(nn.Layer):
         n_batch, n_ctx, n_state = q.shape
         scale = (n_state // self.n_head)**-0.25
         q = paddle.transpose(
-            q.view(*q.shape[:2], self.n_head, -1), (0, 2, 1, 3)) * scale
+            q.reshape([*q.shape[:2], self.n_head, -1]), (0, 2, 1, 3)) * scale
         k = paddle.transpose(
-            k.view(*k.shape[:2], self.n_head, -1), (0, 2, 3, 1)) * scale
+            k.reshape([*k.shape[:2], self.n_head, -1]), (0, 2, 3, 1)) * scale
         v = paddle.transpose(
-            v.view(*v.shape[:2], self.n_head, -1), (0, 2, 1, 3))
+            v.reshape([*v.shape[:2], self.n_head, -1]), (0, 2, 1, 3))

         qk = q @ k
         if mask is not None:
@@ -823,7 +823,7 @@ class BeamSearchDecoder(TokenDecoder):
         if
self.finished_sequences is None: # for the first update self.finished_sequences = [{} for _ in range(batch_size)] - logprobs = F.log_softmax(logits, axis=-1, dtype=paddle.float32) + logprobs = F.log_softmax(logits, axis=-1, dtype='float32') next_tokens, source_indices, finished_sequences = [], [], [] for i in range(batch_size): scores, sources, finished = {}, {}, {} @@ -969,7 +969,7 @@ class ApplyTimestampRules(LogitFilter): logits[:, last_allowed + 1:] = -np.inf # if sum of probability over timestamps is above any other token, sample timestamp - logprobs = F.log_softmax(logits, axis=-1, dtype=paddle.float32) + logprobs = F.log_softmax(logits, axis=-1, dtype='float32') for k in range(tokens.shape[0]): # When using paddle.logsumexp on a 32GB Tesla-V100 GPU, we encountered CUDA error 700. # To bypass this issue in CI, we have decomposed the operation into separate steps. diff --git a/runtime/examples/text_lm/utils b/runtime/examples/text_lm/utils index 256f914ab..94d118d25 120000 --- a/runtime/examples/text_lm/utils +++ b/runtime/examples/text_lm/utils @@ -1 +1 @@ -../../../utils/ \ No newline at end of file +../../../utils/ diff --git a/runtime/examples/u2pp_ol/wenetspeech/utils b/runtime/examples/u2pp_ol/wenetspeech/utils index c2519a9dd..758320d41 120000 --- a/runtime/examples/u2pp_ol/wenetspeech/utils +++ b/runtime/examples/u2pp_ol/wenetspeech/utils @@ -1 +1 @@ -../../../../utils/ \ No newline at end of file +../../../../utils/ From 890c87ea93f3146666c6825306ceb8e21b18d099 Mon Sep 17 00:00:00 2001 From: megemini Date: Mon, 2 Dec 2024 11:08:28 +0800 Subject: [PATCH 11/36] [Fix] import TimeDomainSpecAugment (#3919) --- paddlespeech/s2t/exps/wavlm/model.py | 43 ++++++++++++++++------------ 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/paddlespeech/s2t/exps/wavlm/model.py b/paddlespeech/s2t/exps/wavlm/model.py index 6ed2c5d87..606867eae 100644 --- a/paddlespeech/s2t/exps/wavlm/model.py +++ b/paddlespeech/s2t/exps/wavlm/model.py @@ -33,7 +33,7 @@ from paddlespeech.s2t.io.speechbrain import data_pipeline from paddlespeech.s2t.io.speechbrain import dataio from paddlespeech.s2t.io.speechbrain import dataset from paddlespeech.s2t.io.speechbrain.dataloader import make_dataloader -from paddlespeech.s2t.models.wavlm.processing.speech_augmentation import TimeDomainSpecAugment +from paddlespeech.s2t.models.wav2vec2.processing.speech_augmentation import TimeDomainSpecAugment from paddlespeech.s2t.models.wavlm.wavlm_asr import WavLMASR from paddlespeech.s2t.training.optimizer import OptimizerFactory from paddlespeech.s2t.training.reporter import ObsScope @@ -211,7 +211,7 @@ class WavLMASRTrainer(Trainer): loss.backward() layer_tools.print_grads(self.model, print_func=None) - + # NOTE: the code below asserted that the backward() is problematic, and as more steps are accumulated, the output from wavlm alone will be the same for all frames # optimizer step old if (batch_index + 1) % train_conf.accum_grad == 0: @@ -428,8 +428,7 @@ class WavLMASRTrainer(Trainer): report("epoch", self.epoch) report('step', self.iteration) report("model_lr", self.model_optimizer.get_lr()) - report("wavlm_lr", - self.wavlm_optimizer.get_lr()) + report("wavlm_lr", self.wavlm_optimizer.get_lr()) self.train_batch(batch_index, batch, msg) self.after_train_batch() report('iter', batch_index + 1) @@ -680,8 +679,7 @@ class WavLMASRTrainer(Trainer): logger.info("optim_model:{},{}", model_optim_type, model_optim_conf) wavlm_optim_type = train_config.wavlm_optim wavlm_optim_conf = train_config.wavlm_optim_conf - 
logger.info("optim_model:{},{}", wavlm_optim_type, - wavlm_optim_conf) + logger.info("optim_model:{},{}", wavlm_optim_type, wavlm_optim_conf) model_scheduler_type = train_config.model_scheduler model_scheduler_conf = train_config.model_scheduler_conf @@ -698,8 +696,8 @@ class WavLMASRTrainer(Trainer): model_lr_scheduler = LRSchedulerFactory.from_args(model_scheduler_type, model_scheduler_args) - wavlm_lr_scheduler = LRSchedulerFactory.from_args( - wavlm_scheduler_type, wavlm_scheduler_args) + wavlm_lr_scheduler = LRSchedulerFactory.from_args(wavlm_scheduler_type, + wavlm_scheduler_args) def optimizer_args( config, @@ -716,24 +714,31 @@ class WavLMASRTrainer(Trainer): }) return optim_arg - model_optimizer_args = optimizer_args( - config, model_optim_type, - model_optim_conf, - [{'params': model._layers.enc.parameters()}, {'params': model._layers.ctc.parameters()}] if self.parallel else [{'params': model.enc.parameters()}, {'params': model.ctc.parameters()}], - model_lr_scheduler - ) - # [{'params': model._layers.ctc.parameters()}] if self.parallel else [{'params': model.ctc.parameters()}], model_lr_scheduler) - + model_optimizer_args = optimizer_args(config, model_optim_type, + model_optim_conf, [{ + 'params': + model._layers.enc.parameters() + }, { + 'params': + model._layers.ctc.parameters() + }] if self.parallel else [{ + 'params': + model.enc.parameters() + }, { + 'params': + model.ctc.parameters() + }], model_lr_scheduler) + # [{'params': model._layers.ctc.parameters()}] if self.parallel else [{'params': model.ctc.parameters()}], model_lr_scheduler) wavlm_optimizer_args = optimizer_args( config, wavlm_optim_type, wavlm_optim_conf, - model._layers.wavlm.parameters() if self.parallel else - model.wavlm.parameters(), wavlm_lr_scheduler) + model._layers.wavlm.parameters() + if self.parallel else model.wavlm.parameters(), wavlm_lr_scheduler) model_optimizer = OptimizerFactory.from_args(model_optim_type, model_optimizer_args) wavlm_optimizer = OptimizerFactory.from_args(wavlm_optim_type, - wavlm_optimizer_args) + wavlm_optimizer_args) self.model_optimizer = model_optimizer self.wavlm_optimizer = wavlm_optimizer From c33d9bfb50be94119827a01857a17ec477b88956 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Mon, 2 Dec 2024 11:29:46 +0800 Subject: [PATCH 12/36] fix vits with CSMSC (#3920) --- paddlespeech/t2s/models/vits/generator.py | 8 +++++--- paddlespeech/t2s/models/vits/posterior_encoder.py | 1 + paddlespeech/t2s/models/vits/text_encoder.py | 1 + paddlespeech/t2s/modules/nets_utils.py | 8 +++++++- 4 files changed, 14 insertions(+), 4 deletions(-) diff --git a/paddlespeech/t2s/models/vits/generator.py b/paddlespeech/t2s/models/vits/generator.py index 427ae09ed..d82d78e7c 100644 --- a/paddlespeech/t2s/models/vits/generator.py +++ b/paddlespeech/t2s/models/vits/generator.py @@ -577,8 +577,9 @@ class VITSGenerator(nn.Layer): # decoder z_p = m_p + paddle.randn( paddle.shape(m_p)) * paddle.exp(logs_p) * noise_scale - z = self.flow(z_p, y_mask, g=g, inverse=True) - wav = self.decoder((z * y_mask)[:, :, :max_len], g=g) + z = self.flow(z_p, y_mask.astype(z_p.dtype), g=g, inverse=True) + wav = self.decoder( + (z * y_mask.astype(z.dtype))[:, :, :max_len], g=g) return wav.squeeze(1), attn.squeeze(1), dur.squeeze(1) @@ -695,4 +696,5 @@ class VITSGenerator(nn.Layer): path = paddle.cast(path, dtype='float32') pad_tmp = self.pad1d(path)[:, :-1] path = path - pad_tmp - return path.unsqueeze(1).transpose([0, 1, 3, 2]) * mask + return 
path.unsqueeze(1).transpose(
+            [0, 1, 3, 2]) * mask.astype(path.dtype)
diff --git a/paddlespeech/t2s/models/vits/posterior_encoder.py b/paddlespeech/t2s/models/vits/posterior_encoder.py
index 5e3d6b9ce..b0a071b23 100644
--- a/paddlespeech/t2s/models/vits/posterior_encoder.py
+++ b/paddlespeech/t2s/models/vits/posterior_encoder.py
@@ -129,6 +129,7 @@ class PosteriorEncoder(nn.Layer):
         """
         x_mask = make_non_pad_mask(x_lengths).unsqueeze(1)
+        x_mask = x_mask.astype(x.dtype)
         x = self.input_conv(x) * x_mask
         x = self.encoder(x, x_mask, g=g)
         stats = self.proj(x) * x_mask
diff --git a/paddlespeech/t2s/models/vits/text_encoder.py b/paddlespeech/t2s/models/vits/text_encoder.py
index 015ed76c6..5b9de95a9 100644
--- a/paddlespeech/t2s/models/vits/text_encoder.py
+++ b/paddlespeech/t2s/models/vits/text_encoder.py
@@ -155,6 +155,7 @@ class TextEncoder(nn.Layer):
         """
         x = self.emb(x) * math.sqrt(self.attention_dim)
         x_mask = make_non_pad_mask(x_lengths).unsqueeze(1)
+        x_mask = x_mask.astype(x.dtype)
         # encoder assume the channel last (B, T_text, attention_dim)
         # but mask shape shoud be (B, 1, T_text)
         x, _ = self.encoder(x, x_mask)
diff --git a/paddlespeech/t2s/modules/nets_utils.py b/paddlespeech/t2s/modules/nets_utils.py
index 57c46e3a8..0a66a1c88 100644
--- a/paddlespeech/t2s/modules/nets_utils.py
+++ b/paddlespeech/t2s/modules/nets_utils.py
@@ -181,6 +181,10 @@ def make_pad_mask(lengths, xs=None, length_dim=-1):
     if length_dim == 0:
         raise ValueError("length_dim cannot be 0: {}".format(length_dim))
 
+    # check if ilens is 0-dim tensor, if so, add a dimension
+    if lengths.ndim == 0:
+        lengths = lengths.unsqueeze(0)
+
     bs = paddle.shape(lengths)
     if xs is None:
         maxlen = paddle.cast(lengths.max(), dtype=bs.dtype)
@@ -348,7 +352,9 @@ def get_random_segments(
     """
     b, c, t = paddle.shape(x)
     max_start_idx = x_lengths - segment_size
-    start_idxs = paddle.cast(paddle.rand([b]) * max_start_idx, 'int64')
+    rand_number = paddle.rand([b])
+    start_idxs = paddle.cast(rand_number *
+                             max_start_idx.astype(rand_number.dtype), 'int64')
     segments = get_segments(x, start_idxs, segment_size)
     return segments, start_idxs
 

From 67ae7c8dd2317882806c439c038c4cdff3aba896 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com>
Date: Mon, 2 Dec 2024 11:33:16 +0800
Subject: [PATCH 13/36] [Hackathon 7th] fix Voc5/Jets/TTS2 with CSMSC (#3906)

* fix Voc5/Jets with CSMSC

* fix Voc5/Jets with CSMSC

* Update README.md

* Update README.md

* Update README.md

* Update iSTFTNet.md

* Apply suggestions from code review

* Apply suggestions from code review

* Apply suggestions from code review
---
 examples/csmsc/jets/README.md                    | 13 ++++++++++++-
 examples/csmsc/tts2/README.md                    | 11 +++++++++++
 examples/csmsc/voc5/README.md                    | 11 +++++++++++
 examples/csmsc/voc5/iSTFTNet.md                  | 11 +++++++++++
 paddlespeech/t2s/exps/gan_vocoder/preprocess.py  |  2 +-
 paddlespeech/t2s/models/jets/length_regulator.py |  4 +++-
 6 files changed, 49 insertions(+), 3 deletions(-)

diff --git a/examples/csmsc/jets/README.md b/examples/csmsc/jets/README.md
index 07dade0e6..20314cec0 100644
--- a/examples/csmsc/jets/README.md
+++ b/examples/csmsc/jets/README.md
@@ -3,7 +3,18 @@ This example contains code used to train a [JETS](https://arxiv.org/abs/2203.168
 
 ## Dataset
 ### Download and Extract
-Download CSMSC from it's [Official Website](https://test.data-baker.com/data/index/source).
+Download CSMSC from its [official website](https://test.data-baker.com/data/index/TNtts/) and extract it to `~/datasets`.
Then the dataset is in the directory `~/datasets/BZNSYP`. + +The structure of the folder is listed below. + +```text +└─ Wave + └─ .wav files (audio speech) +└─ PhoneLabeling + └─ .interval files (alignment between phoneme and duration) +└─ ProsodyLabeling + └─ 000001-010000.txt (text with prosodic by pinyin) +``` ### Get MFA Result and Extract We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get phonemes and durations for JETS. diff --git a/examples/csmsc/tts2/README.md b/examples/csmsc/tts2/README.md index 969567762..7f7cdde0e 100644 --- a/examples/csmsc/tts2/README.md +++ b/examples/csmsc/tts2/README.md @@ -5,6 +5,17 @@ This example contains code used to train a [SpeedySpeech](http://arxiv.org/abs/2 ### Download and Extract Download CSMSC from it's [Official Website](https://test.data-baker.com/data/index/TNtts/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/BZNSYP`. +The structure of the folder is listed below. + +```text +└─ Wave + └─ .wav files (audio speech) +└─ PhoneLabeling + └─ .interval files (alignment between phoneme and duration) +└─ ProsodyLabeling + └─ 000001-010000.txt (text with prosodic by pinyin) +``` + ### Get MFA Result and Extract We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for SPEEDYSPEECH. You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. diff --git a/examples/csmsc/voc5/README.md b/examples/csmsc/voc5/README.md index 3347c6473..e4d100619 100644 --- a/examples/csmsc/voc5/README.md +++ b/examples/csmsc/voc5/README.md @@ -4,6 +4,17 @@ This example contains code used to train a [HiFiGAN](https://arxiv.org/abs/2010. ### Download and Extract Download CSMSC from it's [official website](https://test.data-baker.com/data/index/TNtts/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/BZNSYP`. +The structure of the folder is listed below. + +```text +└─ Wave + └─ .wav files (audio speech) +└─ PhoneLabeling + └─ .interval files (alignment between phoneme and duration) +└─ ProsodyLabeling + └─ 000001-010000.txt (text with prosodic by pinyin) +``` + ### Get MFA Result and Extract We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence at the edge of audio. You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. diff --git a/examples/csmsc/voc5/iSTFTNet.md b/examples/csmsc/voc5/iSTFTNet.md index 8f121938a..693950c54 100644 --- a/examples/csmsc/voc5/iSTFTNet.md +++ b/examples/csmsc/voc5/iSTFTNet.md @@ -6,6 +6,17 @@ This example contains code used to train a [iSTFTNet](https://arxiv.org/abs/2203 ### Download and Extract Download CSMSC from it's [official website](https://test.data-baker.com/data/index/TNtts/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/BZNSYP`. +The structure of the folder is listed below. 
+ +```text +└─ Wave + └─ .wav files (audio speech) +└─ PhoneLabeling + └─ .interval files (alignment between phoneme and duration) +└─ ProsodyLabeling + └─ 000001-010000.txt (text with prosodic by pinyin) +``` + ### Get MFA Result and Extract We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence at the edge of audio. You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. diff --git a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py index a2629a900..c1513e0c4 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py +++ b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py @@ -203,9 +203,9 @@ def main(): sentences, speaker_set = get_phn_dur(dur_file) merge_silence(sentences) - # split data into 3 sections if args.dataset == "baker": wav_files = sorted(list((rootdir / "Wave").rglob("*.wav"))) + # split data into 3 sections num_train = 9800 num_dev = 100 train_wav_files = wav_files[:num_train] diff --git a/paddlespeech/t2s/models/jets/length_regulator.py b/paddlespeech/t2s/models/jets/length_regulator.py index f7a395a64..f8629382c 100644 --- a/paddlespeech/t2s/models/jets/length_regulator.py +++ b/paddlespeech/t2s/models/jets/length_regulator.py @@ -55,7 +55,9 @@ class GaussianUpsampling(nn.Layer): if h_masks is not None: t = t * paddle.to_tensor(h_masks, dtype="float32") - c = ds.cumsum(axis=-1) - ds / 2 + ds_cumsum = ds.cumsum(axis=-1) + ds_half = ds / 2 + c = ds_cumsum.astype(ds_half.dtype) - ds_half energy = -1 * self.delta * (t.unsqueeze(-1) - c.unsqueeze(1))**2 if d_masks is not None: d_masks = ~(d_masks.unsqueeze(1)) From f582cb6299173c5a0d52128c0da899792cd5a48c Mon Sep 17 00:00:00 2001 From: megemini Date: Thu, 5 Dec 2024 11:17:16 +0800 Subject: [PATCH 14/36] =?UTF-8?q?[Hackathon=207th]=20=E4=BF=AE=E5=A4=8D=20?= =?UTF-8?q?`panns`=20=E4=B8=AD=20`predict.py`=20=E5=AF=B9=E4=BA=8E=20pir?= =?UTF-8?q?=20=E7=9A=84=20json=20=E6=A8=A1=E5=9E=8B=E8=B7=AF=E5=BE=84=20(#?= =?UTF-8?q?3914)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [Fix] panns predict.py * [Update] path exists * [Fix] disable mkldnn and transpose dimension * [Update] model_file check json first * [Update] satisty version * [Update] satisty version * [Update] satisty version * [Update] config disable_mkldnn * [Update] unsqueeze --- paddlespeech/cls/exps/panns/deploy/predict.py | 19 +++++++++++++------ paddlespeech/cls/exps/panns/export_model.py | 3 ++- paddlespeech/utils/__init__.py | 16 ++++++++++++++++ 3 files changed, 31 insertions(+), 7 deletions(-) diff --git a/paddlespeech/cls/exps/panns/deploy/predict.py b/paddlespeech/cls/exps/panns/deploy/predict.py index 1dd0fb531..a6b735335 100644 --- a/paddlespeech/cls/exps/panns/deploy/predict.py +++ b/paddlespeech/cls/exps/panns/deploy/predict.py @@ -15,12 +15,15 @@ import argparse import os import numpy as np +import paddle from paddle import inference from paddle.audio.datasets import ESC50 from paddle.audio.features import LogMelSpectrogram from paddleaudio.backends import soundfile_load as load_audio from scipy.special import softmax +import paddlespeech.utils + # yapf: disable parser = argparse.ArgumentParser() parser.add_argument("--model_dir", type=str, required=True, default="./export", help="The directory to 
static model.") @@ -56,7 +59,6 @@ def extract_features(files: str, **kwargs): feature_extractor = LogMelSpectrogram(sr, **kwargs) feat = feature_extractor(paddle.to_tensor(waveforms[i])) feat = paddle.transpose(feat, perm=[1, 0]).unsqueeze(0) - feats.append(feat) return np.stack(feats, axis=0) @@ -73,13 +75,18 @@ class Predictor(object): enable_mkldnn=False): self.batch_size = batch_size - model_file = os.path.join(model_dir, "inference.pdmodel") - params_file = os.path.join(model_dir, "inference.pdiparams") + if paddlespeech.utils.satisfy_paddle_version('3.0.0-beta'): + config = inference.Config(model_dir, 'inference') + config.disable_mkldnn() + else: + model_file = os.path.join(model_dir, 'inference.pdmodel') + params_file = os.path.join(model_dir, "inference.pdiparams") + + assert os.path.isfile(model_file) and os.path.isfile( + params_file), 'Please check model and parameter files.' - assert os.path.isfile(model_file) and os.path.isfile( - params_file), 'Please check model and parameter files.' + config = inference.Config(model_file, params_file) - config = inference.Config(model_file, params_file) if device == "gpu": # set GPU configs accordingly # such as intialize the gpu memory, enable tensorrt diff --git a/paddlespeech/cls/exps/panns/export_model.py b/paddlespeech/cls/exps/panns/export_model.py index 63b22981a..e860b54aa 100644 --- a/paddlespeech/cls/exps/panns/export_model.py +++ b/paddlespeech/cls/exps/panns/export_model.py @@ -39,7 +39,8 @@ if __name__ == '__main__': input_spec=[ paddle.static.InputSpec( shape=[None, None, 64], dtype=paddle.float32) - ]) + ], + full_graph=True) # Save in static graph model. paddle.jit.save(model, os.path.join(args.output_dir, "inference")) diff --git a/paddlespeech/utils/__init__.py b/paddlespeech/utils/__init__.py index 185a92b8d..66c492779 100644 --- a/paddlespeech/utils/__init__.py +++ b/paddlespeech/utils/__init__.py @@ -11,3 +11,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+from packaging.version import Version + + +def satisfy_version(source: str, target: str, dev_allowed: bool=True) -> bool: + if dev_allowed and source.startswith('0.0.0'): + target_version = Version('0.0.0') + else: + target_version = Version(target) + + source_version = Version(source) + return source_version >= target_version + + +def satisfy_paddle_version(target: str, dev_allowed: bool=True) -> bool: + import paddle + return satisfy_version(paddle.__version__, target, dev_allowed) From c0fafd0647f88280158ae972cdefbdbc5986242c Mon Sep 17 00:00:00 2001 From: zxcd <228587199@qq.com> Date: Thu, 5 Dec 2024 15:46:50 +0800 Subject: [PATCH 15/36] support new inference interface (#3927) * pir infer * add version control * fix --- .../t2s/exps/speedyspeech/inference.py | 24 ++++++++++++++----- 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/paddlespeech/t2s/exps/speedyspeech/inference.py b/paddlespeech/t2s/exps/speedyspeech/inference.py index d4958bc49..7a6a43406 100644 --- a/paddlespeech/t2s/exps/speedyspeech/inference.py +++ b/paddlespeech/t2s/exps/speedyspeech/inference.py @@ -18,6 +18,7 @@ from pathlib import Path import soundfile as sf from paddle import inference +import paddlespeech.utils from paddlespeech.t2s.frontend.zh_frontend import Frontend @@ -48,16 +49,27 @@ def main(): phone_vocab_path=args.phones_dict, tone_vocab_path=args.tones_dict) print("frontend done!") - speedyspeech_config = inference.Config( - str(Path(args.inference_dir) / "speedyspeech.pdmodel"), - str(Path(args.inference_dir) / "speedyspeech.pdiparams")) + # after paddle 3.0, support new inference interface + if paddlespeech.utils.satisfy_paddle_version('3.0.0-beta'): + speedyspeech_config = inference.Config( + str(Path(args.inference_dir)), "speedyspeech") + else: + speedyspeech_config = inference.Config( + str(Path(args.inference_dir) / "speedyspeech.pdmodel"), + str(Path(args.inference_dir) / "speedyspeech.pdiparams")) + speedyspeech_config.enable_use_gpu(100, 0) speedyspeech_config.enable_memory_optim() speedyspeech_predictor = inference.create_predictor(speedyspeech_config) - pwg_config = inference.Config( - str(Path(args.inference_dir) / "pwg.pdmodel"), - str(Path(args.inference_dir) / "pwg.pdiparams")) + # after paddle 3.0, support new inference interface + if paddlespeech.utils.satisfy_paddle_version('3.0.0-beta'): + pwg_config = inference.Config(str(Path(args.inference_dir)), "pwg") + else: + pwg_config = inference.Config( + str(Path(args.inference_dir) / "pwg.pdmodel"), + str(Path(args.inference_dir) / "pwg.pdiparams")) + pwg_config.enable_use_gpu(100, 0) pwg_config.enable_memory_optim() pwg_predictor = inference.create_predictor(pwg_config) From ff539ef007abbc64b6b6c9846bc0c0fb28203d23 Mon Sep 17 00:00:00 2001 From: megemini Date: Thu, 5 Dec 2024 20:44:30 +0800 Subject: [PATCH 16/36] [Fix] transpose use numpy (#3933) --- paddlespeech/s2t/models/wav2vec2/modules/wav2vec2_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddlespeech/s2t/models/wav2vec2/modules/wav2vec2_model.py b/paddlespeech/s2t/models/wav2vec2/modules/wav2vec2_model.py index be78b516a..99cadc3ac 100644 --- a/paddlespeech/s2t/models/wav2vec2/modules/wav2vec2_model.py +++ b/paddlespeech/s2t/models/wav2vec2/modules/wav2vec2_model.py @@ -1267,7 +1267,7 @@ class TransposeLast(nn.Layer): def forward(self, x): if self.deconstruct_idx is not None: x = x[self.deconstruct_idx] - trans_dim = paddle.arange(x.dim()) + trans_dim = np.arange(x.dim()) trans_dim[-1], trans_dim[-2] = trans_dim[-2], trans_dim[-1] return 
x.transpose(trans_dim)

From 2985c4daeac906fbea8f0f4beefb141b74f9dfa1 Mon Sep 17 00:00:00 2001
From: zxcd <228587199@qq.com>
Date: Fri, 6 Dec 2024 14:49:41 +0800
Subject: [PATCH 17/36] =?UTF-8?q?=E3=80=90asr=E3=80=91add=20chunk=20config?=
 =?UTF-8?q?=20for=20tal=5Fcs=20(#3936)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* add chunk config

* fix

* fix
---
 .../tal_cs/asr1/conf/chunk_conformer.yaml     | 96 +++++++++++++++++++
 1 file changed, 96 insertions(+)
 create mode 100644 examples/tal_cs/asr1/conf/chunk_conformer.yaml

diff --git a/examples/tal_cs/asr1/conf/chunk_conformer.yaml b/examples/tal_cs/asr1/conf/chunk_conformer.yaml
new file mode 100644
index 000000000..ba0dbb49b
--- /dev/null
+++ b/examples/tal_cs/asr1/conf/chunk_conformer.yaml
@@ -0,0 +1,96 @@
+############################################
+#       Network Architecture               #
+############################################
+cmvn_file:
+cmvn_file_type: "json"
+# encoder related
+encoder: conformer
+encoder_conf:
+    output_size: 512    # dimension of attention
+    attention_heads: 8
+    linear_units: 2048  # the number of units of position-wise feed forward
+    num_blocks: 12      # the number of encoder blocks
+    dropout_rate: 0.1   # sublayer output dropout
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.0
+    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+    normalize_before: True
+    cnn_module_kernel: 15
+    use_cnn_module: True
+    activation_type: 'swish'
+    pos_enc_layer_type: 'rel_pos'
+    selfattention_layer_type: 'rel_selfattn'
+    causal: true
+    use_dynamic_chunk: true
+    cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
+    use_dynamic_left_chunk: false
+# decoder related
+decoder: transformer
+decoder_conf:
+    attention_heads: 8
+    linear_units: 2048
+    num_blocks: 6
+    dropout_rate: 0.1   # sublayer output dropout
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.0
+    src_attention_dropout_rate: 0.0
+# hybrid CTC/attention
+model_conf:
+    ctc_weight: 0.3
+    lsm_weight: 0.1     # label smoothing option
+    length_normalized_loss: false
+    init_type: 'kaiming_uniform' # !Warning: needed for convergence
+
+###########################################
+#                 Data                    #
+###########################################
+
+train_manifest: data/manifest.train
+dev_manifest: data/manifest.dev
+test_manifest: data/manifest.test
+
+
+###########################################
+#              Dataloader                 #
+###########################################
+
+vocab_filepath: data/lang_char/vocab.txt
+spm_model_prefix: 'data/lang_char/bpe_bpe_11297'
+unit_type: 'spm'
+preprocess_config: conf/preprocess.yaml
+feat_dim: 80
+stride_ms: 20.0
+window_ms: 30.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+batch_size: 32
+maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 2
+subsampling_factor: 1
+num_encs: 1
+
+###########################################
+#               Training                  #
+###########################################
+n_epoch: 100
+accum_grad: 4
+global_grad_clip: 5.0
+dist_sampler: False
+optim: adam
+optim_conf:
+    lr: 0.002
+    weight_decay: 1.0e-6
+scheduler: warmuplr
+scheduler_conf:
+    warmup_steps: 25000
+    lr_decay: 1.0
+log_interval: 100
+checkpoint:
+    kbest_n: 50
+
latest_n: 5 From d17361cf8c44fe21cca444366cf35f45e9f84ccd Mon Sep 17 00:00:00 2001 From: megemini Date: Fri, 6 Dec 2024 16:48:17 +0800 Subject: [PATCH 18/36] [Fix] duplicated arg (#3934) --- paddlespeech/s2t/exps/hubert/bin/test.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/paddlespeech/s2t/exps/hubert/bin/test.py b/paddlespeech/s2t/exps/hubert/bin/test.py index b08b0209a..019741f9d 100644 --- a/paddlespeech/s2t/exps/hubert/bin/test.py +++ b/paddlespeech/s2t/exps/hubert/bin/test.py @@ -37,8 +37,6 @@ if __name__ == "__main__": # save asr result to parser.add_argument( '--dict-path', type=str, default=None, help='dict path.') - parser.add_argument( - "--result_file", type=str, help="path of save the asr result") args = parser.parse_args() print_arguments(args, globals()) From a34bf501a5d342b0cfc5e82723c8621ca9f726b7 Mon Sep 17 00:00:00 2001 From: cyberslack_lee Date: Fri, 6 Dec 2024 17:06:11 +0800 Subject: [PATCH 19/36] =?UTF-8?q?[Hackathon=207th]=20=E4=BF=AE=E5=A4=8D=20?= =?UTF-8?q?opencopop=E7=9A=84svs1=E4=B8=AD=E7=9A=84shape=E9=97=AE=E9=A2=98?= =?UTF-8?q?=20(#3912)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix svs1 * fix * fix * fix * fix * add comment --- paddlespeech/t2s/modules/diffnet.py | 6 +++++- paddlespeech/t2s/modules/nets_utils.py | 7 ++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/paddlespeech/t2s/modules/diffnet.py b/paddlespeech/t2s/modules/diffnet.py index 2f433ad68..deae4847f 100644 --- a/paddlespeech/t2s/modules/diffnet.py +++ b/paddlespeech/t2s/modules/diffnet.py @@ -120,7 +120,11 @@ class SinusoidalPosEmb(nn.Layer): self.dim = dim def forward(self, x: paddle.Tensor): - x = paddle.cast(x, 'float32') + # check if x is 0-dim tensor, if so, add a dimension + if x.ndim == 0: + x = paddle.cast(x.unsqueeze(0), 'float32') + else: + x = paddle.cast(x, 'float32') half_dim = self.dim // 2 emb = math.log(10000) / (half_dim - 1) emb = paddle.exp(paddle.arange(half_dim) * -emb) diff --git a/paddlespeech/t2s/modules/nets_utils.py b/paddlespeech/t2s/modules/nets_utils.py index 0a66a1c88..a3c6947b8 100644 --- a/paddlespeech/t2s/modules/nets_utils.py +++ b/paddlespeech/t2s/modules/nets_utils.py @@ -181,11 +181,12 @@ def make_pad_mask(lengths, xs=None, length_dim=-1): if length_dim == 0: raise ValueError("length_dim cannot be 0: {}".format(length_dim)) - # check if ilens is 0-dim tensor, if so, add a dimension + # check if lengths is 0-dim tensor, if so, add a dimension if lengths.ndim == 0: - lengths = lengths.unsqueeze(0) + bs = paddle.shape(lengths.unsqueeze(0)) + else: + bs = paddle.shape(lengths) - bs = paddle.shape(lengths) if xs is None: maxlen = paddle.cast(lengths.max(), dtype=bs.dtype) else: From e3c4d4bd7e7fed5aae656f3a947bfe04a8b7520f Mon Sep 17 00:00:00 2001 From: megemini Date: Mon, 9 Dec 2024 15:52:50 +0800 Subject: [PATCH 20/36] [Fix] use reshape instead of view (#3939) --- audio/paddleaudio/utils/tensor_utils.py | 5 +- .../s2t/decoders/scorers/ctc_prefix_score.py | 40 +- .../s2t/decoders/scorers/scorer_interface.py | 2 +- paddlespeech/s2t/models/hubert/hubert_ASR.py | 2 +- paddlespeech/s2t/models/lm/transformer.py | 8 +- paddlespeech/s2t/models/u2_st/u2_st.py | 4 +- .../s2t/models/wavlm/modules/modules.py | 360 +++++++++--------- paddlespeech/s2t/models/wavlm/wavlm_asr.py | 2 +- paddlespeech/s2t/models/wavlm/wavlm_paddle.py | 11 +- paddlespeech/s2t/utils/tensor_utils.py | 5 +- paddlespeech/t2s/models/jets/generator.py | 4 +- 11 files changed, 224 insertions(+), 219 deletions(-) diff --git 
a/audio/paddleaudio/utils/tensor_utils.py b/audio/paddleaudio/utils/tensor_utils.py index 16f60810e..cfd490b9a 100644 --- a/audio/paddleaudio/utils/tensor_utils.py +++ b/audio/paddleaudio/utils/tensor_utils.py @@ -177,8 +177,9 @@ def th_accuracy(pad_outputs: paddle.Tensor, Returns: float: Accuracy value (0.0 - 1.0). """ - pad_pred = pad_outputs.view(pad_targets.shape[0], pad_targets.shape[1], - pad_outputs.shape[1]).argmax(2) + pad_pred = pad_outputs.reshape( + [pad_targets.shape[0], pad_targets.shape[1], + pad_outputs.shape[1]]).argmax(2) mask = pad_targets != ignore_label #TODO(Hui Zhang): sum not support bool type # numerator = paddle.sum( diff --git a/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py b/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py index a994412e0..2664765da 100644 --- a/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py +++ b/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py @@ -86,7 +86,7 @@ class CTCPrefixScorePD(): dtype=self.dtype, ) # (T, 2, B, W) r_prev[:, 1] = paddle.cumsum(self.x[0, :, :, self.blank], 0).unsqueeze(2) - r_prev = r_prev.view(-1, 2, n_bh) # (T, 2, BW) + r_prev = r_prev.reshape([-1, 2, n_bh]) # (T, 2, BW) s_prev = 0.0 # score f_min_prev = 0 # eq. 22-23 f_max_prev = 1 # eq. 22-23 @@ -100,23 +100,23 @@ class CTCPrefixScorePD(): (n_bh, self.odim), -1, dtype=paddle.long) snum = self.scoring_num if self.idx_bh is None or n_bh > len(self.idx_bh): - self.idx_bh = paddle.arange(n_bh).view(-1, 1) # (BW, 1) + self.idx_bh = paddle.arange(n_bh).reshape([-1, 1]) # (BW, 1) scoring_idmap[self.idx_bh[:n_bh], scoring_ids] = paddle.arange(snum) scoring_idx = ( - scoring_ids + self.idx_bo.repeat(1, n_hyps).view(-1, - 1) # (BW,1) - ).view(-1) # (BWO) + scoring_ids + self.idx_bo.repeat(1, n_hyps).reshape( + [-1, 1]) # (BW,1) + ).reshape([-1]) # (BWO) # x_ shape (2, T, B*W, O) x_ = paddle.index_select( - self.x.view(2, -1, self.batch * self.odim), scoring_idx, - 2).view(2, -1, n_bh, snum) + self.x.reshape([2, -1, self.batch * self.odim]), scoring_idx, + 2).reshape([2, -1, n_bh, snum]) else: scoring_ids = None scoring_idmap = None snum = self.odim # x_ shape (2, T, B*W, O) - x_ = self.x.unsqueeze(3).repeat(1, 1, 1, n_hyps, 1).view(2, -1, - n_bh, snum) + x_ = self.x.unsqueeze(3).repeat(1, 1, 1, n_hyps, 1).reshape( + [2, -1, n_bh, snum]) # new CTC forward probs are prepared as a (T x 2 x BW x S) tensor # that corresponds to r_t^n(h) and r_t^b(h) in a batch. 
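The hunks in this patch largely apply one mechanical rewrite: the torch-style `Tensor.view(d0, d1, ...)` calls, which take dimensions as separate arguments, become Paddle's `Tensor.reshape([...])`, which takes a single shape list. A minimal sketch of the equivalence (illustrative only, not part of the patch):

```python
import paddle

x = paddle.arange(24, dtype='float32')

# torch-style (removed):  x.view(2, 3, 4)       -- dims as varargs
# paddle-style (added):   x.reshape([2, 3, 4])  -- dims in one list
y = x.reshape([2, 3, 4])
assert y.shape == [2, 3, 4]

# -1 still infers one dimension, e.g. when flattening batch and beam
# into a single axis as the scorer code above does
z = y.reshape([-1, 4])
assert z.shape == [6, 4]
```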
@@ -154,8 +154,8 @@ class CTCPrefixScorePD(): # compute forward probabilities log(r_t^n(h)) and log(r_t^b(h)) for t in range(start, end): rp = r[t - 1] # (2 x BW x O') - rr = paddle.stack([rp[0], log_phi[t - 1], rp[0], rp[1]]).view( - 2, 2, n_bh, snum) # (2,2,BW,O') + rr = paddle.stack([rp[0], log_phi[t - 1], rp[0], rp[1]]).reshape( + [2, 2, n_bh, snum]) # (2,2,BW,O') r[t] = paddle.logsumexp(rr, 1) + x_[:, t] # compute log prefix probabilities log(psi) @@ -197,25 +197,27 @@ class CTCPrefixScorePD(): # convert ids to BHO space n_bh = len(s) n_hyps = n_bh // self.batch - vidx = (best_ids + (self.idx_b * - (n_hyps * self.odim)).view(-1, 1)).view(-1) + vidx = (best_ids + + (self.idx_b * + (n_hyps * self.odim)).reshape([-1, 1])).reshape([-1]) # select hypothesis scores - s_new = paddle.index_select(s.view(-1), vidx, 0) - s_new = s_new.view(-1, 1).repeat(1, self.odim).view(n_bh, self.odim) + s_new = paddle.index_select(s.reshape([-1]), vidx, 0) + s_new = s_new.reshape([-1, 1]).repeat(1, self.odim).reshape( + [n_bh, self.odim]) # convert ids to BHS space (S: scoring_num) if scoring_idmap is not None: snum = self.scoring_num hyp_idx = (best_ids // self.odim + - (self.idx_b * n_hyps).view(-1, 1)).view(-1) - label_ids = paddle.fmod(best_ids, self.odim).view(-1) + (self.idx_b * n_hyps).reshape([-1, 1])).reshape([-1]) + label_ids = paddle.fmod(best_ids, self.odim).reshape([-1]) score_idx = scoring_idmap[hyp_idx, label_ids] score_idx[score_idx == -1] = 0 vidx = score_idx + hyp_idx * snum else: snum = self.odim # select forward probabilities - r_new = paddle.index_select(r.view(-1, 2, n_bh * snum), vidx, 2).view( - -1, 2, n_bh) + r_new = paddle.index_select(r.reshape([-1, 2, n_bh * snum]), vidx, + 2).reshape([-1, 2, n_bh]) return r_new, s_new, f_min, f_max def extend_prob(self, x): diff --git a/paddlespeech/s2t/decoders/scorers/scorer_interface.py b/paddlespeech/s2t/decoders/scorers/scorer_interface.py index 3272e6b7a..6e62ca398 100644 --- a/paddlespeech/s2t/decoders/scorers/scorer_interface.py +++ b/paddlespeech/s2t/decoders/scorers/scorer_interface.py @@ -135,7 +135,7 @@ class BatchScorerInterface(ScorerInterface): score, outstate = self.score(y, state, x) outstates.append(outstate) scores.append(score) - scores = paddle.cat(scores, 0).view(ys.shape[0], -1) + scores = paddle.cat(scores, 0).reshape([ys.shape[0], -1]) return scores, outstates diff --git a/paddlespeech/s2t/models/hubert/hubert_ASR.py b/paddlespeech/s2t/models/hubert/hubert_ASR.py index 4a0dc2aa6..9581879d0 100644 --- a/paddlespeech/s2t/models/hubert/hubert_ASR.py +++ b/paddlespeech/s2t/models/hubert/hubert_ASR.py @@ -213,7 +213,7 @@ class HubertASR(nn.Layer): x_lens = x.shape[1] ctc_probs = self.ctc.log_softmax(x) # (B, maxlen, vocab_size) topk_prob, topk_index = ctc_probs.topk(1, axis=2) # (B, maxlen, 1) - topk_index = topk_index.view(batch_size, x_lens) # (B, maxlen) + topk_index = topk_index.reshape([batch_size, x_lens]) # (B, maxlen) hyps = [hyp.tolist() for hyp in topk_index] hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] diff --git a/paddlespeech/s2t/models/lm/transformer.py b/paddlespeech/s2t/models/lm/transformer.py index 04ddddf86..5bdb1f2fe 100644 --- a/paddlespeech/s2t/models/lm/transformer.py +++ b/paddlespeech/s2t/models/lm/transformer.py @@ -122,10 +122,12 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface): h, _ = self.encoder(emb, xlen) y = self.decoder(h) loss = F.cross_entropy( - y.view(-1, paddle.shape(y)[-1]), t.view(-1), reduction="none") + y.reshape([-1, paddle.shape(y)[-1]]), + 
t.reshape([-1]), + reduction="none") mask = xm.to(loss.dtype) - logp = loss * mask.view(-1) - nll = logp.view(batch_size, -1).sum(-1) + logp = loss * mask.reshape([-1]) + nll = logp.reshape([batch_size, -1]).sum(-1) nll_count = mask.sum(-1) logp = logp.sum() count = mask.sum() diff --git a/paddlespeech/s2t/models/u2_st/u2_st.py b/paddlespeech/s2t/models/u2_st/u2_st.py index b4c8c255f..339af4b74 100644 --- a/paddlespeech/s2t/models/u2_st/u2_st.py +++ b/paddlespeech/s2t/models/u2_st/u2_st.py @@ -176,7 +176,7 @@ class U2STBaseModel(nn.Layer): # 2. Compute attention loss loss_att = self.criterion_att(decoder_out, ys_out_pad) acc_att = th_accuracy( - decoder_out.view(-1, self.vocab_size), + decoder_out.reshape([-1, self.vocab_size]), ys_out_pad, ignore_label=self.ignore_id, ) return loss_att, acc_att @@ -209,7 +209,7 @@ class U2STBaseModel(nn.Layer): # 2. Compute attention loss loss_att = self.criterion_att(decoder_out, ys_out_pad) acc_att = th_accuracy( - decoder_out.view(-1, self.vocab_size), + decoder_out.reshape([-1, self.vocab_size]), ys_out_pad, ignore_label=self.ignore_id, ) return loss_att, acc_att diff --git a/paddlespeech/s2t/models/wavlm/modules/modules.py b/paddlespeech/s2t/models/wavlm/modules/modules.py index f14e4016f..c41342d6a 100644 --- a/paddlespeech/s2t/models/wavlm/modules/modules.py +++ b/paddlespeech/s2t/models/wavlm/modules/modules.py @@ -6,17 +6,18 @@ # Based on fairseq code bases # https://github.com/pytorch/fairseq # -------------------------------------------------------- - import math import warnings -from typing import Dict, Optional, Tuple -from .functional import multi_head_attention_forward_paddle +from typing import Dict +from typing import Optional +from typing import Tuple import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import Tensor +from .functional import multi_head_attention_forward_paddle class TransposeLast(nn.Layer): @@ -40,8 +41,7 @@ class Fp32LayerNorm(nn.LayerNorm): self.normalized_shape, self.weight.float() if self.weight is not None else None, self.bias.float() if self.bias is not None else None, - self.eps, - ) + self.eps, ) return output.type_as(input) @@ -55,12 +55,10 @@ class Fp32GroupNorm(nn.GroupNorm): self.num_groups, self.weight.float() if self.weight is not None else None, self.bias.float() if self.bias is not None else None, - self.eps, - ) + self.eps, ) return output.type_as(input) - class SamePad(nn.Layer): def __init__(self, kernel_size, causal=False): super().__init__() @@ -71,7 +69,7 @@ class SamePad(nn.Layer): def forward(self, x): if self.remove > 0: - x = x[:, :, : -self.remove] + x = x[:, :, :-self.remove] return x @@ -89,7 +87,11 @@ class Swish(nn.Layer): class GLU_Linear(nn.Layer): - def __init__(self, input_dim, output_dim, glu_type="sigmoid", bias_in_glu=True): + def __init__(self, + input_dim, + output_dim, + glu_type="sigmoid", + bias_in_glu=True): super(GLU_Linear, self).__init__() self.glu_type = glu_type @@ -114,9 +116,11 @@ class GLU_Linear(nn.Layer): x = self.linear(x) if self.glu_type == "bilinear": - x = (x[:, :, 0:self.output_dim] * x[:, :, self.output_dim:self.output_dim * 2]) + x = (x[:, :, 0:self.output_dim] * + x[:, :, self.output_dim:self.output_dim * 2]) else: - x = (x[:, :, 0:self.output_dim] * self.glu_act(x[:, :, self.output_dim:self.output_dim * 2])) + x = (x[:, :, 0:self.output_dim] * + self.glu_act(x[:, :, self.output_dim:self.output_dim * 2])) return x @@ -124,9 +128,8 @@ class GLU_Linear(nn.Layer): def gelu_accurate(x): if not hasattr(gelu_accurate, "_a"): 
gelu_accurate._a = math.sqrt(2 / math.pi) - return ( - 0.5 * x * (1 + paddle.tanh(gelu_accurate._a * (x + 0.044715 * paddle.pow(x, 3)))) - ) + return (0.5 * x * (1 + paddle.tanh(gelu_accurate._a * + (x + 0.044715 * paddle.pow(x, 3))))) def gelu(x: Tensor) -> Tensor: @@ -142,8 +145,7 @@ def get_activation_fn(activation: str): return gelu elif activation == "gelu_fast": warnings.warn( - "--activation-fn=gelu_fast has been renamed to gelu_accurate" - ) + "--activation-fn=gelu_fast has been renamed to gelu_accurate") return gelu_accurate elif activation == "gelu_accurate": return gelu_accurate @@ -154,7 +156,8 @@ def get_activation_fn(activation: str): elif activation == "glu": return lambda x: x else: - raise RuntimeError("--activation-fn {} not supported".format(activation)) + raise RuntimeError( + "--activation-fn {} not supported".format(activation)) def quant_noise(module, p, block_size): @@ -190,16 +193,15 @@ def quant_noise(module, p, block_size): # 2D matrix if not is_conv: assert ( - module.weight.size(1) % block_size == 0 - ), "Input features must be a multiple of block sizes" + module.weight.size(1) % + block_size == 0), "Input features must be a multiple of block sizes" # 4D matrix else: # 1x1 convolutions if module.kernel_size == (1, 1): - assert ( - module.in_channels % block_size == 0 - ), "Input channels must be a multiple of block sizes" + assert (module.in_channels % block_size == 0 + ), "Input channels must be a multiple of block sizes" # regular convolutions else: k = module.kernel_size[0] * module.kernel_size[1] @@ -216,10 +218,11 @@ def quant_noise(module, p, block_size): # split weight matrix into blocks and randomly drop selected blocks mask = paddle.zeros( - in_features // block_size * out_features, device=weight.device - ) + in_features // block_size * out_features, + device=weight.device) mask.bernoulli_(p) - mask = mask.repeat_interleave(block_size, -1).view(-1, in_features) + mask = mask.repeat_interleave(block_size, -1).reshape( + [-1, in_features]) else: # gather weight and sizes @@ -231,26 +234,21 @@ def quant_noise(module, p, block_size): if mod.kernel_size == (1, 1): mask = paddle.zeros( int(in_channels // block_size * out_channels), - device=weight.device, - ) + device=weight.device, ) mask.bernoulli_(p) - mask = mask.repeat_interleave(block_size, -1).view(-1, in_channels) + mask = mask.repeat_interleave(block_size, -1).reshape( + [-1, in_channels]) else: mask = paddle.zeros( - weight.size(0), weight.size(1), device=weight.device - ) + weight.size(0), weight.size(1), device=weight.device) mask.bernoulli_(p) mask = ( - mask.unsqueeze(2) - .unsqueeze(3) - .repeat(1, 1, mod.kernel_size[0], mod.kernel_size[1]) - ) + mask.unsqueeze(2).unsqueeze(3) + .repeat(1, 1, mod.kernel_size[0], mod.kernel_size[1])) # scale weights and apply mask - mask = mask.to( - paddle.bool - ) + mask = mask.to(paddle.bool) s = 1 / (1 - p) mod.weight.data = s * weight.masked_fill(mask, 0) @@ -282,8 +280,7 @@ class MultiheadAttention(nn.Layer): num_buckets=32, max_distance=128, gru_rel_pos=True, - rescale_init=False, - ): + rescale_init=False, ): super().__init__() self.embed_dim = embed_dim self.kdim = kdim if kdim is not None else embed_dim @@ -302,17 +299,16 @@ class MultiheadAttention(nn.Layer): self.head_dim = embed_dim // num_heads self.q_head_dim = self.head_dim self.k_head_dim = self.head_dim - assert ( - self.head_dim * num_heads == self.embed_dim - ), "embed_dim must be divisible by num_heads" - self.scaling = self.head_dim ** -0.5 + assert (self.head_dim * num_heads == 
self.embed_dim + ), "embed_dim must be divisible by num_heads" + self.scaling = self.head_dim**-0.5 self.self_attention = self_attention self.encoder_decoder_attention = encoder_decoder_attention assert not self.self_attention or self.qkv_same_dim, ( - "Self-attention requires query, key and " "value to be of the same size" - ) + "Self-attention requires query, key and " + "value to be of the same size") k_bias = True if rescale_init: @@ -322,26 +318,24 @@ class MultiheadAttention(nn.Layer): q_embed_dim = embed_dim self.k_proj = quant_noise( - nn.Linear(self.kdim, k_embed_dim, bias_attr=k_bias), q_noise, qn_block_size - ) + nn.Linear(self.kdim, k_embed_dim, bias_attr=k_bias), q_noise, + qn_block_size) self.v_proj = quant_noise( - nn.Linear(self.vdim, embed_dim, bias_attr=bias), q_noise, qn_block_size - ) + nn.Linear(self.vdim, embed_dim, bias_attr=bias), q_noise, + qn_block_size) self.q_proj = quant_noise( - nn.Linear(embed_dim, q_embed_dim, bias_attr=bias), q_noise, qn_block_size - ) + nn.Linear(embed_dim, q_embed_dim, bias_attr=bias), q_noise, + qn_block_size) self.out_proj = quant_noise( - nn.Linear(embed_dim, embed_dim, bias_attr=bias), q_noise, qn_block_size - ) + nn.Linear(embed_dim, embed_dim, bias_attr=bias), q_noise, + qn_block_size) if add_bias_kv: self.bias_k = self.create_parameter( - shape=[1, 1, embed_dim], dtype="float32" - ) + shape=[1, 1, embed_dim], dtype="float32") self.bias_v = self.create_parameter( - shape=[1, 1, embed_dim], dtype="float32" - ) + shape=[1, 1, embed_dim], dtype="float32") else: self.bias_k = self.bias_v = None @@ -352,40 +346,41 @@ class MultiheadAttention(nn.Layer): if self.gru_rel_pos: self.grep_linear = nn.Linear(self.q_head_dim, 8) self.grep_a = self.create_parameter( - shape=[1, num_heads, 1, 1], dtype="float32" - ) - + shape=[1, num_heads, 1, 1], dtype="float32") self.reset_parameters() def reset_parameters(self): pass - - def _relative_positions_bucket(self, relative_positions, bidirectional=True): + + def _relative_positions_bucket(self, relative_positions, + bidirectional=True): num_buckets = self.num_buckets max_distance = self.max_distance relative_buckets = 0 if bidirectional: num_buckets = num_buckets // 2 - relative_buckets += (relative_positions > 0).astype("int64") * num_buckets + relative_buckets += ( + relative_positions > 0).astype("int64") * num_buckets relative_positions = paddle.abs(relative_positions) else: - relative_positions = -paddle.minimum(relative_positions, paddle.zeros_like(relative_positions)) + relative_positions = -paddle.minimum( + relative_positions, paddle.zeros_like(relative_positions)) max_exact = num_buckets // 2 is_small = relative_positions < max_exact relative_postion_if_large = max_exact + ( - paddle.log(relative_positions.astype("float32") / max_exact) - / math.log(max_distance / max_exact) - * (num_buckets - max_exact) - ).astype("int64") + paddle.log(relative_positions.astype("float32") / + max_exact) / math.log(max_distance / max_exact) * + (num_buckets - max_exact)).astype("int64") relative_postion_if_large = paddle.minimum( - relative_postion_if_large, paddle.full_like(relative_postion_if_large, num_buckets - 1) - ) + relative_postion_if_large, + paddle.full_like(relative_postion_if_large, num_buckets - 1)) - relative_buckets += paddle.where(is_small, relative_positions, relative_postion_if_large) + relative_buckets += paddle.where(is_small, relative_positions, + relative_postion_if_large) return relative_buckets def compute_bias(self, query_length, key_length): @@ -393,28 +388,26 @@ class 
MultiheadAttention(nn.Layer): memory_position = paddle.arange(key_length, dtype="int64")[None, :] relative_position = memory_position - context_position relative_position_bucket = self._relative_positions_bucket( - relative_position, - bidirectional=True - ) + relative_position, bidirectional=True) # relative_position_bucket = relative_position_bucket.to(self.relative_attention_bias.weight.device) values = self.relative_attention_bias(relative_position_bucket) values = values.transpose([2, 0, 1]) return values - def forward( - self, - query, - key: Optional[Tensor], - value: Optional[Tensor], - key_padding_mask: Optional[Tensor] = None, - incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, - need_weights: bool = True, - static_kv: bool = False, - attn_mask: Optional[Tensor] = None, - before_softmax: bool = False, - need_head_weights: bool = False, - position_bias: Optional[Tensor] = None - ) -> Tuple[Tensor, Optional[Tensor], Optional[Tensor]]: + def forward(self, + query, + key: Optional[Tensor], + value: Optional[Tensor], + key_padding_mask: Optional[Tensor]=None, + incremental_state: Optional[Dict[str, Dict[str, Optional[ + Tensor]]]]=None, + need_weights: bool=True, + static_kv: bool=False, + attn_mask: Optional[Tensor]=None, + before_softmax: bool=False, + need_head_weights: bool=False, + position_bias: Optional[Tensor]=None + ) -> Tuple[Tensor, Optional[Tensor], Optional[Tensor]]: """Input shape: Time x Batch x Channel Args: @@ -441,17 +434,16 @@ class MultiheadAttention(nn.Layer): assert list(query.shape) == [tgt_len, bsz, embed_dim] if key is not None: src_len, key_bsz, _ = key.shape - + if self.has_relative_attention_bias and position_bias is None: position_bias = self.compute_bias(tgt_len, src_len) position_bias_ = position_bias.unsqueeze(0) - position_bias = paddle.concat([position_bias_ for _ in range(bsz)], axis=0) - position_bias = position_bias.reshape([bsz * self.num_heads, tgt_len, src_len]) - if ( - incremental_state is None - and not static_kv - and self.q_head_dim == self.head_dim - ): + position_bias = paddle.concat( + [position_bias_ for _ in range(bsz)], axis=0) + position_bias = position_bias.reshape( + [bsz * self.num_heads, tgt_len, src_len]) + if (incremental_state is None and not static_kv and + self.q_head_dim == self.head_dim): assert key is not None and value is not None assert attn_mask is None @@ -465,17 +457,21 @@ class MultiheadAttention(nn.Layer): query_layer = query_layer.transpose([0, 2, 1, 3]) _B, _H, _L, __ = query_layer.shape - gate_a, gate_b = paddle.nn.functional.sigmoid(self.grep_linear(query_layer).reshape([_B, _H, _L, 2, 4]).sum(-1, keepdim=False)).chunk(2, axis=-1) - + gate_a, gate_b = paddle.nn.functional.sigmoid( + self.grep_linear(query_layer).reshape( + [_B, _H, _L, 2, 4]).sum(-1, keepdim=False)).chunk( + 2, axis=-1) + gate_a_1 = gate_a * (gate_b * self.grep_a - 1.0) + 2.0 - attn_mask_rel_pos = gate_a_1.reshape([bsz * self.num_heads, -1, 1]) * position_bias + attn_mask_rel_pos = gate_a_1.reshape( + [bsz * self.num_heads, -1, 1]) * position_bias - attn_mask_rel_pos = attn_mask_rel_pos.reshape((-1, tgt_len, tgt_len)) + attn_mask_rel_pos = attn_mask_rel_pos.reshape( + (-1, tgt_len, tgt_len)) k_proj_bias = self.k_proj.bias if k_proj_bias is None: k_proj_bias = paddle.zeros_like(self.q_proj.bias) - x, attn = multi_head_attention_forward_paddle( query, key, @@ -483,7 +479,9 @@ class MultiheadAttention(nn.Layer): self.embed_dim, self.num_heads, paddle.empty([0]), - paddle.concat((self.q_proj.bias, self.k_proj.bias, 
self.v_proj.bias), axis=0), + paddle.concat( + (self.q_proj.bias, self.k_proj.bias, self.v_proj.bias), + axis=0), self.bias_k, self.bias_v, self.add_zero_attn, @@ -497,9 +495,8 @@ class MultiheadAttention(nn.Layer): use_separate_proj_weight=True, q_proj_weight=self.q_proj.weight, k_proj_weight=self.k_proj.weight, - v_proj_weight=self.v_proj.weight, - ) - + v_proj_weight=self.v_proj.weight, ) + return x, attn, position_bias if incremental_state is not None: @@ -540,8 +537,8 @@ class MultiheadAttention(nn.Layer): v = paddle.concat([v, self.bias_v.repeat(1, bsz, 1)], axis=0) if attn_mask is not None: attn_mask = paddle.concat( - [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], axis=1 - ) + [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], + axis=1) if key_padding_mask is not None: key_padding_mask = paddle.concat( @@ -549,33 +546,27 @@ class MultiheadAttention(nn.Layer): key_padding_mask, key_padding_mask.new_zeros(key_padding_mask.size(0), 1), ], - axis=1, - ) - - q = ( - q.contiguous() - .view(tgt_len, bsz * self.num_heads, self.q_head_dim) - .transpose([1, 0, 2]) - ) + axis=1, ) + + q = (q.contiguous() + .reshape([tgt_len, bsz * self.num_heads, self.q_head_dim]) + .transpose([1, 0, 2])) if k is not None: - k = ( - k.contiguous() - .view(-1, bsz * self.num_heads, self.k_head_dim) - .transpose([1, 0, 2]) - ) + k = (k.contiguous() + .reshape([-1, bsz * self.num_heads, self.k_head_dim]) + .transpose([1, 0, 2])) if v is not None: - v = ( - v.contiguous() - .view(-1, bsz * self.num_heads, self.head_dim) - .transpose([1, 0, 2]) - ) + v = (v.contiguous() + .reshape([-1, bsz * self.num_heads, self.head_dim]) + .transpose([1, 0, 2])) if saved_state is not None: # saved states are stored with shape (bsz, num_heads, seq_len, head_dim) if "prev_key" in saved_state: _prev_key = saved_state["prev_key"] assert _prev_key is not None - prev_key = _prev_key.view(bsz * self.num_heads, -1, self.head_dim) + prev_key = _prev_key.reshape( + [bsz * self.num_heads, -1, self.head_dim]) if static_kv: k = prev_key else: @@ -585,7 +576,8 @@ class MultiheadAttention(nn.Layer): if "prev_value" in saved_state: _prev_value = saved_state["prev_value"] assert _prev_value is not None - prev_value = _prev_value.view(bsz * self.num_heads, -1, self.head_dim) + prev_value = _prev_value.reshape( + [bsz * self.num_heads, -1, self.head_dim]) if static_kv: v = prev_value else: @@ -600,15 +592,17 @@ class MultiheadAttention(nn.Layer): prev_key_padding_mask=prev_key_padding_mask, batch_size=bsz, src_len=k.size(1), - static_kv=static_kv, - ) + static_kv=static_kv, ) - saved_state["prev_key"] = k.view(bsz, self.num_heads, -1, self.head_dim) - saved_state["prev_value"] = v.view(bsz, self.num_heads, -1, self.head_dim) + saved_state["prev_key"] = k.reshape( + [bsz, self.num_heads, -1, self.head_dim]) + saved_state["prev_value"] = v.reshape( + [bsz, self.num_heads, -1, self.head_dim]) saved_state["prev_key_padding_mask"] = key_padding_mask # In this branch incremental_state is never None assert incremental_state is not None - incremental_state = self._set_input_buffer(incremental_state, saved_state) + incremental_state = self._set_input_buffer(incremental_state, + saved_state) assert k is not None assert k.size(1) == src_len @@ -624,30 +618,31 @@ class MultiheadAttention(nn.Layer): if self.add_zero_attn: assert v is not None src_len += 1 - k = paddle.concat([k, k.new_zeros((k.size(0), 1) + k.shape[2:])], axis=1) - v = paddle.concat([v, v.new_zeros((v.size(0), 1) + v.shape[2:])], axis=1) + k = paddle.concat( + [k, 
k.new_zeros((k.size(0), 1) + k.shape[2:])], axis=1) + v = paddle.concat( + [v, v.new_zeros((v.size(0), 1) + v.shape[2:])], axis=1) if attn_mask is not None: attn_mask = paddle.concat( - [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], axis=1 - ) + [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], + axis=1) if key_padding_mask is not None: key_padding_mask = paddle.concat( [ key_padding_mask, - paddle.zeros(key_padding_mask.size(0), 1).type_as( - key_padding_mask - ), + paddle.zeros(key_padding_mask.size(0), + 1).type_as(key_padding_mask), ], - axis=1, - ) - + axis=1, ) attn_weights = paddle.matmul(q, k.transpose([0, 2, 1])) - attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz) + attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, + bsz) - assert list(attn_weights.shape) == [bsz * self.num_heads, tgt_len, src_len] + assert list( + attn_weights.shape) == [bsz * self.num_heads, tgt_len, src_len] if attn_mask is not None: attn_mask = attn_mask.unsqueeze(0) @@ -655,46 +650,49 @@ class MultiheadAttention(nn.Layer): if key_padding_mask is not None: # don't attend to padding symbols - attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights.reshape( + [bsz, self.num_heads, tgt_len, src_len]) attn_weights = attn_weights.masked_fill( key_padding_mask.unsqueeze(1).unsqueeze(2).to(paddle.bool), - float("-inf"), - ) - attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + float("-inf"), ) + attn_weights = attn_weights.reshape( + [bsz * self.num_heads, tgt_len, src_len]) if before_softmax: return attn_weights, v, position_bias if position_bias is not None: if self.gru_rel_pos == 1: - query_layer = q.view(bsz, self.num_heads, tgt_len, self.q_head_dim) + query_layer = q.reshape( + [bsz, self.num_heads, tgt_len, self.q_head_dim]) _B, _H, _L, __ = query_layer.shape - gate_a, gate_b = paddle.sigmoid(self.grep_linear(query_layer).view( - _B, _H, _L, 2, 4).sum(-1, keepdim=False)).chunk(2, axis=-1) - + gate_a, gate_b = paddle.sigmoid( + self.grep_linear(query_layer).reshape([_B, _H, _L, 2, 4]) + .sum(-1, keepdim=False)).chunk( + 2, axis=-1) + gate_a_1 = gate_a * (gate_b * self.grep_a - 1.0) + 2.0 - position_bias = gate_a_1.view(bsz * self.num_heads, -1, 1) * position_bias + position_bias = gate_a_1.reshape( + [bsz * self.num_heads, -1, 1]) * position_bias - position_bias = position_bias.view(attn_weights.shape) + position_bias = position_bias.reshape(attn_weights.shape) attn_weights = attn_weights + position_bias - attn_weights_float = F.softmax( - attn_weights, dim=-1 - ) + attn_weights_float = F.softmax(attn_weights, dim=-1) attn_weights = attn_weights_float.type_as(attn_weights) attn_probs = self.dropout_module(attn_weights) assert v is not None attn = paddle.bmm(attn_probs, v) - assert list(attn.shape) == [bsz * self.num_heads, tgt_len, self.head_dim] + assert list( + attn.shape) == [bsz * self.num_heads, tgt_len, self.head_dim] attn = attn.transpose([1, 0, 2]).reshape([tgt_len, bsz, embed_dim]) attn = self.out_proj(attn) attn_weights: Optional[Tensor] = None if need_weights: - attn_weights = attn_weights_float.view( - bsz, self.num_heads, tgt_len, src_len - ).transpose([1, 0, 2, 3]) + attn_weights = attn_weights_float.reshape( + [bsz, self.num_heads, tgt_len, src_len]).transpose([1, 0, 2, 3]) if not need_head_weights: # average attention weights over heads attn_weights = attn_weights.mean(dim=0) @@ -707,15 +705,14 @@ class MultiheadAttention(nn.Layer): prev_key_padding_mask: 
Optional[Tensor], batch_size: int, src_len: int, - static_kv: bool, - ) -> Optional[Tensor]: + static_kv: bool, ) -> Optional[Tensor]: # saved key padding masks have shape (bsz, seq_len) if prev_key_padding_mask is not None and static_kv: new_key_padding_mask = prev_key_padding_mask elif prev_key_padding_mask is not None and key_padding_mask is not None: new_key_padding_mask = paddle.concat( - [prev_key_padding_mask.float(), key_padding_mask.float()], axis=1 - ) + [prev_key_padding_mask.float(), key_padding_mask.float()], + axis=1) # During incremental decoding, as the padding token enters and # leaves the frame, there will be a time when prev or current # is None @@ -723,11 +720,9 @@ class MultiheadAttention(nn.Layer): if src_len > prev_key_padding_mask.size(1): filler = paddle.zeros( (batch_size, src_len - prev_key_padding_mask.size(1)), - device=prev_key_padding_mask.device, - ) + device=prev_key_padding_mask.device, ) new_key_padding_mask = paddle.concat( - [prev_key_padding_mask.float(), filler.float()], axis=1 - ) + [prev_key_padding_mask.float(), filler.float()], axis=1) else: new_key_padding_mask = prev_key_padding_mask.float() @@ -735,11 +730,9 @@ class MultiheadAttention(nn.Layer): if src_len > key_padding_mask.size(1): filler = paddle.zeros( (batch_size, src_len - key_padding_mask.size(1)), - device=key_padding_mask.device, - ) + device=key_padding_mask.device, ) new_key_padding_mask = paddle.concat( - [filler.float(), key_padding_mask.float()], axis=1 - ) + [filler.float(), key_padding_mask.float()], axis=1) else: new_key_padding_mask = key_padding_mask.float() @@ -748,7 +741,8 @@ class MultiheadAttention(nn.Layer): return new_key_padding_mask def _get_input_buffer( - self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] + self, + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] ) -> Dict[str, Optional[Tensor]]: result = self.get_incremental_state(incremental_state, "attn_state") if result is not None: @@ -760,9 +754,13 @@ class MultiheadAttention(nn.Layer): def _set_input_buffer( self, incremental_state: Dict[str, Dict[str, Optional[Tensor]]], - buffer: Dict[str, Optional[Tensor]], - ): - return self.set_incremental_state(incremental_state, "attn_state", buffer) - - def apply_sparse_mask(self, attn_weights, tgt_len: int, src_len: int, bsz: int): - return attn_weights \ No newline at end of file + buffer: Dict[str, Optional[Tensor]], ): + return self.set_incremental_state(incremental_state, "attn_state", + buffer) + + def apply_sparse_mask(self, + attn_weights, + tgt_len: int, + src_len: int, + bsz: int): + return attn_weights diff --git a/paddlespeech/s2t/models/wavlm/wavlm_asr.py b/paddlespeech/s2t/models/wavlm/wavlm_asr.py index 53dd498d5..de2d32d7c 100644 --- a/paddlespeech/s2t/models/wavlm/wavlm_asr.py +++ b/paddlespeech/s2t/models/wavlm/wavlm_asr.py @@ -188,7 +188,7 @@ class WavLMASR(nn.Layer): x_lens = x.shape[1] ctc_probs = self.ctc.log_softmax(x) # (B, maxlen, vocab_size) topk_prob, topk_index = ctc_probs.topk(1, axis=2) # (B, maxlen, 1) - topk_index = topk_index.view(batch_size, x_lens) # (B, maxlen) + topk_index = topk_index.reshape([batch_size, x_lens]) # (B, maxlen) hyps = [hyp.tolist() for hyp in topk_index] hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] diff --git a/paddlespeech/s2t/models/wavlm/wavlm_paddle.py b/paddlespeech/s2t/models/wavlm/wavlm_paddle.py index 1a0fca531..02233557f 100644 --- a/paddlespeech/s2t/models/wavlm/wavlm_paddle.py +++ b/paddlespeech/s2t/models/wavlm/wavlm_paddle.py @@ -297,8 +297,8 @@ 
class WavLM(nn.Layer): extra = padding_mask.size(1) % features.size(1) if extra > 0: padding_mask = padding_mask[:, :-extra] - padding_mask = padding_mask.view( - padding_mask.size(0), features.size(1), -1) + padding_mask = padding_mask.reshape( + [padding_mask.size(0), features.size(1), -1]) padding_mask = padding_mask.all(-1) return padding_mask @@ -475,14 +475,15 @@ class ConvFeatureExtractionModel(nn.Layer): else: x = conv(x) x = x.transpose([0, 1, 3, 2]).contiguous() - x = x.view(x.size(0), -1, x.size(-1)) + x = x.reshape([x.size(0), -1, x.size(-1)]) else: for conv in self.conv_layers: x = conv(x) if self.conv_type == "conv2d": b, c, t, f = x.size() - # x = x.transpose(2, 3).contiguous().view(b, c * f, t) - x = x.transpose([0, 1, 3, 2]).contiguous().view(b, c * f, t) + # x = x.transpose(2, 3).contiguous().reshape([b, c * f, t]) + x = x.transpose([0, 1, 3, 2]).contiguous().reshape( + [b, c * f, t]) return x diff --git a/paddlespeech/s2t/utils/tensor_utils.py b/paddlespeech/s2t/utils/tensor_utils.py index 3ac102f3c..0d91b9cfb 100644 --- a/paddlespeech/s2t/utils/tensor_utils.py +++ b/paddlespeech/s2t/utils/tensor_utils.py @@ -181,8 +181,9 @@ def th_accuracy(pad_outputs: paddle.Tensor, Returns: float: Accuracy value (0.0 - 1.0). """ - pad_pred = pad_outputs.view(pad_targets.shape[0], pad_targets.shape[1], - pad_outputs.shape[1]).argmax(2) + pad_pred = pad_outputs.reshape( + [pad_targets.shape[0], pad_targets.shape[1], + pad_outputs.shape[1]]).argmax(2) mask = pad_targets != ignore_label numerator = paddle.sum( diff --git a/paddlespeech/t2s/models/jets/generator.py b/paddlespeech/t2s/models/jets/generator.py index 9580d17d1..1b8e0ce6e 100644 --- a/paddlespeech/t2s/models/jets/generator.py +++ b/paddlespeech/t2s/models/jets/generator.py @@ -751,10 +751,10 @@ class JETSGenerator(nn.Layer): # integrate with SID and LID embeddings if self.spks is not None: - sid_embs = self.sid_emb(sids.view(-1)) + sid_embs = self.sid_emb(sids.reshape([-1])) hs = hs + sid_embs.unsqueeze(1) if self.langs is not None: - lid_embs = self.lid_emb(lids.view(-1)) + lid_embs = self.lid_emb(lids.reshape([-1])) hs = hs + lid_embs.unsqueeze(1) # integrate speaker embedding From 5069111e6dd32308938e03cd2d9457ac0d00864d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Mon, 9 Dec 2024 15:53:34 +0800 Subject: [PATCH 21/36] =?UTF-8?q?[Hackathon=207th]=20=E4=BF=AE=E5=A4=8D=20?= =?UTF-8?q?deepspeech2online=20=E7=9A=84=E5=AF=BC=E5=87=BA=E9=97=AE?= =?UTF-8?q?=E9=A2=98=20(#3935)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update model.py * Update model.py --- paddlespeech/s2t/exps/deepspeech2/model.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/paddlespeech/s2t/exps/deepspeech2/model.py b/paddlespeech/s2t/exps/deepspeech2/model.py index 710757115..7836d3adf 100644 --- a/paddlespeech/s2t/exps/deepspeech2/model.py +++ b/paddlespeech/s2t/exps/deepspeech2/model.py @@ -23,6 +23,7 @@ import paddle from paddle import distributed as dist from paddle import inference +import paddlespeech.utils from paddlespeech.audio.text.text_featurizer import TextFeaturizer from paddlespeech.s2t.io.dataloader import BatchDataLoader from paddlespeech.s2t.models.ds2 import DeepSpeech2InferModel @@ -629,9 +630,19 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): def setup_model(self): super().setup_model() - deepspeech_config = inference.Config( - self.args.export_path + ".pdmodel", - 
self.args.export_path + ".pdiparams") + + # after paddle 3.0, support new inference interface + if paddlespeech.utils.satisfy_paddle_version('3.0.0-beta'): + model_dir = os.path.dirname(self.args.export_path) + model_prefix = os.path.basename(self.args.export_path) + deepspeech_config = inference.Config(model_dir, model_prefix) + else: + deepspeech_config = inference.Config( + self.args.export_path + ".pdmodel", + self.args.export_path + ".pdiparams") + + deepspeech_config.disable_mkldnn() + if (os.environ['CUDA_VISIBLE_DEVICES'].strip() != ''): deepspeech_config.enable_use_gpu(100, 0) deepspeech_config.enable_memory_optim() From 9d16002c235cb4240ca93d4384193be37f93273b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Mon, 9 Dec 2024 15:57:22 +0800 Subject: [PATCH 22/36] [Hackathon 7th] fix aishell3/vctk vc0/ernie (#3928) * fix aishell3-vc0 * fix aishell3-vc0 * Apply suggestions from code review * Apply suggestions from code review --- paddlespeech/t2s/modules/losses.py | 3 ++- paddlespeech/t2s/modules/nets_utils.py | 7 ++++--- paddlespeech/t2s/modules/tacotron2/attentions.py | 3 ++- paddlespeech/t2s/modules/tacotron2/encoder.py | 2 ++ 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/paddlespeech/t2s/modules/losses.py b/paddlespeech/t2s/modules/losses.py index b4d78364c..1f9399b75 100644 --- a/paddlespeech/t2s/modules/losses.py +++ b/paddlespeech/t2s/modules/losses.py @@ -1115,7 +1115,8 @@ class MLMLoss(nn.Layer): paddle.reshape(xs_pad, (-1, self.odim))), axis=-1) mlm_loss = paddle.sum((loss * paddle.reshape( - mlm_loss_pos, [-1]))) / paddle.sum((mlm_loss_pos) + 1e-10) + mlm_loss_pos, + [-1]).astype(loss.dtype))) / paddle.sum((mlm_loss_pos) + 1e-10) text_mlm_loss = None diff --git a/paddlespeech/t2s/modules/nets_utils.py b/paddlespeech/t2s/modules/nets_utils.py index a3c6947b8..4c86d74f5 100644 --- a/paddlespeech/t2s/modules/nets_utils.py +++ b/paddlespeech/t2s/modules/nets_utils.py @@ -466,7 +466,7 @@ def phones_masking(xs_pad: paddle.Tensor, for s, e in zip(masked_start, masked_end): masked_pos[idx, s:e] = 1 non_eos_mask = paddle.reshape(src_mask, paddle.shape(xs_pad)[:2]) - masked_pos = masked_pos * non_eos_mask + masked_pos = masked_pos * non_eos_mask.astype(masked_pos.dtype) masked_pos = paddle.cast(masked_pos, 'bool') return masked_pos @@ -550,10 +550,11 @@ def phones_text_masking(xs_pad: paddle.Tensor, for s, e in zip(masked_start, masked_end): masked_pos[idx, s:e] = 1 non_eos_mask = paddle.reshape(src_mask, shape=paddle.shape(xs_pad)[:2]) - masked_pos = masked_pos * non_eos_mask + masked_pos = masked_pos * non_eos_mask.astype(masked_pos.dtype) non_eos_text_mask = paddle.reshape( text_mask, shape=paddle.shape(text_pad)[:2]) - text_masked_pos = text_masked_pos * non_eos_text_mask + text_masked_pos = text_masked_pos * non_eos_text_mask.astype( + text_masked_pos.dtype) masked_pos = paddle.cast(masked_pos, 'bool') text_masked_pos = paddle.cast(text_masked_pos, 'bool') diff --git a/paddlespeech/t2s/modules/tacotron2/attentions.py b/paddlespeech/t2s/modules/tacotron2/attentions.py index 5d1a24845..86407e778 100644 --- a/paddlespeech/t2s/modules/tacotron2/attentions.py +++ b/paddlespeech/t2s/modules/tacotron2/attentions.py @@ -171,7 +171,8 @@ class AttLoc(nn.Layer): if paddle.sum(att_prev) == 0: # if no bias, 0 0-pad goes 0 att_prev = 1.0 - make_pad_mask(enc_hs_len) - att_prev = att_prev / enc_hs_len.unsqueeze(-1) + att_prev = att_prev / enc_hs_len.unsqueeze(-1).astype( + att_prev.dtype) # att_prev: 
(utt, frame) -> (utt, 1, 1, frame) # -> (utt, att_conv_chans, 1, frame) diff --git a/paddlespeech/t2s/modules/tacotron2/encoder.py b/paddlespeech/t2s/modules/tacotron2/encoder.py index 224c82400..7683def83 100644 --- a/paddlespeech/t2s/modules/tacotron2/encoder.py +++ b/paddlespeech/t2s/modules/tacotron2/encoder.py @@ -162,6 +162,8 @@ class Encoder(nn.Layer): return xs.transpose([0, 2, 1]) if not isinstance(ilens, paddle.Tensor): ilens = paddle.to_tensor(ilens) + if ilens.ndim == 0: + ilens = ilens.unsqueeze(0) xs = xs.transpose([0, 2, 1]) # for dygraph to static graph # self.blstm.flatten_parameters() From b84e86d718dbecdf9679f4be9ac7361aa9df0006 Mon Sep 17 00:00:00 2001 From: cyberslack_lee Date: Mon, 9 Dec 2024 16:00:39 +0800 Subject: [PATCH 23/36] [hackathon7] Add timit/asr1 readme.md (#3930) * add timit md file * fix * fix --- examples/timit/README.md | 6 +- examples/timit/asr1/README.md | 195 ++++++++++++++++++++++++++++++++++ examples/timit/asr1/run.sh | 2 +- 3 files changed, 197 insertions(+), 6 deletions(-) create mode 100644 examples/timit/asr1/README.md diff --git a/examples/timit/README.md b/examples/timit/README.md index 51fcfd57c..4f376cc38 100644 --- a/examples/timit/README.md +++ b/examples/timit/README.md @@ -1,7 +1,3 @@ # TIMIT -asr model with phone unit - -* asr0 - deepspeech2 Streaming/Non-Streaming -* asr1 - transformer/conformer Streaming/Non-Streaming -* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature +* asr1 - transformer Streaming/Non-Streaming diff --git a/examples/timit/asr1/README.md b/examples/timit/asr1/README.md new file mode 100644 index 000000000..b725894cd --- /dev/null +++ b/examples/timit/asr1/README.md @@ -0,0 +1,195 @@ +# Transformer ASR with Timit +The phoneme-based continuous speech corpus is a collaboration between Texas Instruments, MIT, and SRI International. The [Timit](https://catalog.ldc.upenn.edu/docs/LDC93S1/) dataset has a voice sampling frequency of 16 khz and contains a total of 6,300 sentences, with 630 people from 8 major U.S. dialects speaking a given 10 sentences each, all sentences are manually segmented and marked at the phone level. Seventy percent of the speakers are male; most of the speakers are white adults. + +## Dataset +### Download and Extract +Download TIMIT from it's [official website](https://catalog.ldc.upenn.edu/LDC93S1) and extract it to `~/datasets`. Assume unzip the dataset in the directory `~/datasets/timit`. + +## Overview +All the scripts you need are in `run.sh`. There are several stages in `run.sh`, and each stage has its function. +| Stage | Function | +|:---- |:----------------------------------------------------------- | +| 0 | Process data. It includes:
(1) Download the dataset <br> (2) Calculate the CMVN of the train dataset <br> (3) Get the vocabulary file <br>
(4) Get the manifest files of the train, development and test dataset | +| 1 | Train the model | +| 2 | Get the final model by averaging the top-k models, set k = 1 means to choose the best model | +| 3 | Test the final model performance | +| 4 | Get ctc alignment of test data using the final model | + +You can choose to run a range of stages by setting `stage` and `stop_stage `. + +For example, if you want to execute the code in stage 2 and stage 3, you can run this script: +```bash +bash run.sh --stage 2 --stop_stage 3 +``` +Or you can set `stage` equal to `stop-stage` to only run one stage. +For example, if you only want to run `stage 0`, you can use the script below: +```bash +bash run.sh --stage 0 --stop_stage 0 +``` +The document below will describe the scripts in `run.sh` in detail. +## The Environment Variables +The path.sh contains the environment variables. +```bash +source path.sh +``` +This script needs to be run first. And another script is also needed: +```bash +source ${MAIN_ROOT}/utils/parse_options.sh +``` +It will support the way of using `--variable value` in the shell scripts. +## The Local Variables +Some local variables are set in `run.sh`. +`gpus` denotes the GPU number you want to use. If you set `gpus=`, it means you only use CPU. +`stage` denotes the number of the stage you want to start from in the experiments. +`stop stage` denotes the number of the stage you want to end at in the experiments. +`conf_path` denotes the config path of the model. +`avg_num` denotes the number K of top-K models you want to average to get the final model. +`audio_file` denotes the file path of the single file you want to infer in stage 5 +`ckpt` denotes the checkpoint prefix of the model, e.g. "conformer" +You can set the local variables (except `ckpt`) when you use `run.sh` + +For example, you can set the `gpus` and `avg_num` when you use the command line.: +```bash +bash run.sh --gpus 0,1,2,3 --avg_num 10 +``` +## Stage 0: Data Processing +To use this example, you need to process data firstly and you can use stage 0 in `run.sh` to do this. The code is shown below: +```bash + if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + bash ./local/timit_data_prep.sh ${TIMIT_path} + bash ./local/data.sh || exit -1 + fi +``` + +Stage 0 is for processing the data. + +If you only want to process the data. You can run +```bash +bash run.sh --stage 0 --stop_stage 0 +``` +You can also just run these scripts in your command line. +```bash +source path.sh +bash ./local/timit_data_prep.sh ${TIMIT_path} +bash ./local/data.sh +``` +After processing the data, the ``data`` directory will look like this: +```bash +data/ +|-- lang_char +| `-- vocab.txt +|-- local +| `-- dev_sph.flist +| `-- dev_sph.scp +| `-- dev.text +| `-- dev.trans +| `-- dev.uttids +| `-- test_sph.flist +| `-- test_sph.scp +| `-- test.text +| `-- test.trans +| `-- test.uttids +| `-- train_sph.flist +| `-- train_sph.scp +| `-- train.text +| `-- train.trans +| `-- train.uttids +|-- manifest.dev +|-- manifest.dev.raw +|-- manifest.test +|-- manifest.test.raw +|-- manifest.train +|-- manifest.train.raw +|-- mean_std.json +|-- test.meta +``` +## Stage 1: Model Training +If you want to train the model. you can use stage 1 in `run.sh`. The code is shown below. 
+```bash +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `exp` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} + fi +``` +If you want to train the model, you can use the script below to execute stage 0 and stage 1: +```bash +bash run.sh --stage 0 --stop_stage 1 +``` +or you can run these scripts in the command line. +```bash +source path.sh +bash ./local/timit_data_prep.sh ${TIMIT_path} +bash ./local/data.sh +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh conf/transformer.yaml transformer +``` +## Stage 2: Top-k Models Averaging +After training the model, we need to get the final model for testing and inference. In every epoch, the model checkpoint is saved, so we can choose the best model from them based on the validation loss or we can sort them and average the parameters of the top-k models to get the final model. We can use stage 2 to do this, and the code is shown below: +```bash + if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # avg n best model + avg.sh best exp/${ckpt}/checkpoints ${avg_num} + fi +``` +The `avg.sh`is in the `../../../utils/` which is define in the `path.sh`. +If you want to get the final model, you can use the script below to execute stage 0, stage 1, and stage 2: +```bash +bash run.sh --stage 0 --stop_stage 2 +``` +or you can run these scripts in the command line. +```bash +bash ./local/timit_data_prep.sh ${TIMIT_path} +source path.sh +bash ./local/data.sh +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh conf/transformer.yaml transformer +avg.sh best exp/conformer/checkpoints 10 +``` +## Stage 3: Model Testing +The test stage is to evaluate the model performance. The code of the test stage is shown below: +```bash + if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # test ckpt avg_n + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + fi +``` +If you want to train a model and test it, you can use the script below to execute stage 0, stage 1, stage 2, and stage 3 : +```bash +bash run.sh --stage 0 --stop_stage 3 +``` +or you can run these scripts in the command line. +```bash +source path.sh +bash ./local/timit_data_prep.sh ${TIMIT_path} +bash ./local/data.sh +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh conf/transformer.yaml transformer +avg.sh best exp/transformer/checkpoints 10 +CUDA_VISIBLE_DEVICES=0 ./local/test.sh conf/transformer.yaml exp/transformer/checkpoints/avg_10 +``` +## Stage 4: CTC Alignment +If you want to get the alignment between the audio and the text, you can use the ctc alignment. The code of this stage is shown below: +```bash + if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + # ctc alignment of test data + CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + fi +``` +If you want to train the model, test it and do the alignment, you can use the script below to execute stage 0, stage 1, stage 2, and stage 3 : +```bash +bash run.sh --stage 0 --stop_stage 4 +``` +or if you only need to train a model and do the alignment, you can use these scripts to escape stage 3(test stage): +```bash +bash run.sh --stage 0 --stop_stage 2 +bash run.sh --stage 4 --stop_stage 4 +``` +or you can also use these scripts in the command line. 
+```bash +source path.sh +bash ./local/timit_data_prep.sh ${TIMIT_path} +bash ./local/data.sh +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh conf/transformer.yaml transformer +avg.sh best exp/transformer/checkpoints 10 +# test stage is optional +CUDA_VISIBLE_DEVICES=0 ./local/test.sh conf/transformer.yaml exp/transformer/checkpoints/avg_10 +CUDA_VISIBLE_DEVICES=0 ./local/align.sh conf/transformer.yaml exp/transformer/checkpoints/avg_10 +``` diff --git a/examples/timit/asr1/run.sh b/examples/timit/asr1/run.sh index 0d84be9f3..ebced07d6 100755 --- a/examples/timit/asr1/run.sh +++ b/examples/timit/asr1/run.sh @@ -9,7 +9,7 @@ stop_stage=50 conf_path=conf/transformer.yaml decode_conf_path=conf/tuning/decode.yaml avg_num=10 -TIMIT_path=/path/to/TIMIT +TIMIT_path=~/datasets/timit/data/lisa/data/timit/raw/TIMIT . ${MAIN_ROOT}/utils/parse_options.sh || exit 1; From e4038b4b6e931edccc0e4b2a483d37a864ffa42c Mon Sep 17 00:00:00 2001 From: megemini Date: Mon, 9 Dec 2024 20:22:04 +0800 Subject: [PATCH 24/36] =?UTF-8?q?[Hackathon=207th]=20=E4=BF=AE=E5=A4=8D=20?= =?UTF-8?q?`vctk`=20=E7=9A=84=20`ernie=5Fsat`=20=E8=AE=AD=E7=BB=83?= =?UTF-8?q?=E6=97=B6=E5=87=BA=E7=8E=B0=E7=9A=84=E7=B1=BB=E5=9E=8B=E6=8F=90?= =?UTF-8?q?=E5=8D=87=E9=97=AE=E9=A2=98=20(#3943)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [Fix] vctk type promotion * [Fix] type promotion --- paddlespeech/t2s/modules/losses.py | 1 + 1 file changed, 1 insertion(+) diff --git a/paddlespeech/t2s/modules/losses.py b/paddlespeech/t2s/modules/losses.py index 1f9399b75..e675dcab7 100644 --- a/paddlespeech/t2s/modules/losses.py +++ b/paddlespeech/t2s/modules/losses.py @@ -1114,6 +1114,7 @@ class MLMLoss(nn.Layer): paddle.reshape(after_outs, (-1, self.odim)), paddle.reshape(xs_pad, (-1, self.odim))), axis=-1) + mlm_loss_pos = (mlm_loss_pos).astype(loss.dtype) mlm_loss = paddle.sum((loss * paddle.reshape( mlm_loss_pos, [-1]).astype(loss.dtype))) / paddle.sum((mlm_loss_pos) + 1e-10) From f0b7f5b995f6d1987b604d9e6e3da299f75c3fab Mon Sep 17 00:00:00 2001 From: megemini Date: Mon, 9 Dec 2024 20:29:20 +0800 Subject: [PATCH 25/36] [Fix] type promotion (#3944) --- paddlespeech/vector/io/signal_processing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddlespeech/vector/io/signal_processing.py b/paddlespeech/vector/io/signal_processing.py index ee939bdb1..143d2a80a 100644 --- a/paddlespeech/vector/io/signal_processing.py +++ b/paddlespeech/vector/io/signal_processing.py @@ -37,7 +37,7 @@ def compute_amplitude(waveforms, lengths=None, amp_type="avg", scale="linear"): out = paddle.mean(paddle.abs(waveforms), axis=1, keepdim=True) else: wav_sum = paddle.sum(paddle.abs(waveforms), axis=1, keepdim=True) - out = wav_sum / lengths + out = wav_sum / lengths.astype(wav_sum.dtype) elif amp_type == "peak": out = paddle.max(paddle.abs(waveforms), axis=1, keepdim=True) else: From 2d7cf7f0e66c60a6e24cb59aedfef2abb571a8d9 Mon Sep 17 00:00:00 2001 From: megemini Date: Mon, 9 Dec 2024 20:33:41 +0800 Subject: [PATCH 26/36] =?UTF-8?q?[Hackathon=207th]=20=E4=BF=AE=E5=A4=8D=20?= =?UTF-8?q?`asr4`=20=E7=9A=84=20`test=5Fwav`=20=E5=A4=9A=E4=BD=99=E7=9A=84?= =?UTF-8?q?=20argument=20(#3940)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [Fix] test_wav parse arg * [Fix] remove line --- paddlespeech/s2t/exps/hubert/bin/test_wav.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/paddlespeech/s2t/exps/hubert/bin/test_wav.py b/paddlespeech/s2t/exps/hubert/bin/test_wav.py index 
94d7f76a8..58910eabb 100644 --- a/paddlespeech/s2t/exps/hubert/bin/test_wav.py +++ b/paddlespeech/s2t/exps/hubert/bin/test_wav.py @@ -97,11 +97,6 @@ def main(config, args): if __name__ == "__main__": parser = default_argument_parser() - # save asr result to - parser.add_argument( - "--result_file", type=str, help="path of save the asr result") - parser.add_argument( - "--audio_file", type=str, help="path of the input audio file") args = parser.parse_args() config = CfgNode(new_allowed=True) From 73beb187da28cad56a46c87dd2beb1a271fd144b Mon Sep 17 00:00:00 2001 From: cyberslack_lee Date: Thu, 12 Dec 2024 15:32:39 +0800 Subject: [PATCH 27/36] [Hackathon 7th] fix voc1 readme.md in CSMSC (#3915) * fix * fix * fix md * fix --- examples/csmsc/voc1/README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/examples/csmsc/voc1/README.md b/examples/csmsc/voc1/README.md index 252c2b920..6c148fe9b 100644 --- a/examples/csmsc/voc1/README.md +++ b/examples/csmsc/voc1/README.md @@ -4,6 +4,18 @@ This example contains code used to train a [parallel wavegan](http://arxiv.org/a ### Download and Extract Download CSMSC from it's [official website](https://test.data-baker.com/data/index/TNtts/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/BZNSYP`. +After processing the data, the ``BZNSYP`` directory will look like this: +```text +BZNSYP +├── Wave +│ └─ *.wav files (audio speech) +├── PhoneLabeling +│ └─ *.interval files (alignment between phoneme and duration) +└── ProsodyLabeling + └─ 000001-010000.txt (text with prosodic by pinyin) +``` +This experiment only uses *.wav files from the Wave file + ### Get MFA Result and Extract We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence at the edge of audio. You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. 
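
The type-promotion fixes above (the `.astype(...)` casts added in `losses.py`, `nets_utils.py`, and `signal_processing.py`) all guard against the same pitfall: newer Paddle releases no longer silently promote dtypes when an integer tensor meets a float tensor in an elementwise op. A minimal sketch of the failure mode and the fix, assuming a paddlepaddle build with strict type promotion (the exact behavior depends on the installed version):

```python
import paddle

# float32 amplitude sums meeting int64 lengths, as in compute_amplitude()
wav_sum = paddle.rand([4, 1])                            # float32
lengths = paddle.to_tensor([[100], [120], [90], [110]])  # int64

# out = wav_sum / lengths  # may raise a dtype-promotion error on recent Paddle
out = wav_sum / lengths.astype(wav_sum.dtype)            # explicit cast, as in the fix
print(out.dtype)  # paddle.float32
```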
From 8ee3a7ee40f528f5c81e59e5b391fd246ae6a235 Mon Sep 17 00:00:00 2001 From: megemini Date: Mon, 16 Dec 2024 14:25:41 +0800 Subject: [PATCH 28/36] [Fix] fastspeech2 0d (#3951) --- paddlespeech/t2s/models/fastspeech2/fastspeech2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py index fcd54f0d2..91bfc540a 100644 --- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py +++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py @@ -903,14 +903,14 @@ class FastSpeech2(nn.Layer): # initialize alpha in scaled positional encoding if self.encoder_type == "transformer" and self.use_scaled_pos_enc: - init_enc_alpha = paddle.to_tensor(init_enc_alpha) + init_enc_alpha = paddle.to_tensor(init_enc_alpha).reshape([1]) self.encoder.embed[-1].alpha = paddle.create_parameter( shape=init_enc_alpha.shape, dtype=str(init_enc_alpha.numpy().dtype), default_initializer=paddle.nn.initializer.Assign( init_enc_alpha)) if self.decoder_type == "transformer" and self.use_scaled_pos_enc: - init_dec_alpha = paddle.to_tensor(init_dec_alpha) + init_dec_alpha = paddle.to_tensor(init_dec_alpha).reshape([1]) self.decoder.embed[-1].alpha = paddle.create_parameter( shape=init_dec_alpha.shape, dtype=str(init_dec_alpha.numpy().dtype), From 1b9217f9f6a5202243ed19f8bcc83c718d45f780 Mon Sep 17 00:00:00 2001 From: Wang Xin Date: Wed, 18 Dec 2024 14:23:29 +0800 Subject: [PATCH 29/36] fix example led_en_zh st1 (#3955) --- examples/ted_en_zh/st1/local/data.sh | 2 +- utils/addjson.py | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/examples/ted_en_zh/st1/local/data.sh b/examples/ted_en_zh/st1/local/data.sh index 511ebd231..9076f53bd 100755 --- a/examples/ted_en_zh/st1/local/data.sh +++ b/examples/ted_en_zh/st1/local/data.sh @@ -203,7 +203,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then echo "stage 3: Format the Json Data" for (( i=0; i<${#x[*]}; ++i)); do python3 ${MAIN_ROOT}/utils/espnet_json_to_manifest.py \ - --json-file ${x[$i]}/data_${bpemode}${nbpe}.json + --json-file ${x[$i]}/data_${bpemode}${nbpe}.json \ --manifest-file data/manifest.${y[$i]} done fi diff --git a/utils/addjson.py b/utils/addjson.py index f90f7afab..5c87080ac 100755 --- a/utils/addjson.py +++ b/utils/addjson.py @@ -11,8 +11,6 @@ import json import logging import sys -from espnet.utils.cli_utils import get_commandline_args - from paddlespeech.utils.argparse import strtobool is_python2 = sys.version_info[0] == 2 @@ -44,7 +42,7 @@ if __name__ == "__main__": logging.basicConfig(level=logging.INFO, format=logfmt) else: logging.basicConfig(level=logging.WARN, format=logfmt) - logging.info(get_commandline_args()) + logging.info(args) # make intersection set for utterance keys js = [] From b4c2f3bae3d158442fc47ea6e27dc2f024919c83 Mon Sep 17 00:00:00 2001 From: megemini Date: Wed, 18 Dec 2024 14:32:37 +0800 Subject: [PATCH 30/36] =?UTF-8?q?[Hackathon=207th]=20=E4=BF=AE=E5=A4=8D=20?= =?UTF-8?q?`s2t`=20=E7=A4=BA=E4=BE=8B=E9=94=99=E8=AF=AF=20(#3950)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [Fix] s2t * [Fix] s2t test --- examples/ted_en_zh/st0/README.md | 2 +- paddlespeech/s2t/exps/u2_st/bin/test.py | 3 --- .../s2t/frontend/featurizer/text_featurizer.py | 4 ++++ paddlespeech/s2t/io/dataloader.py | 6 ++++++ paddlespeech/s2t/models/u2_st/u2_st.py | 8 ++++---- paddlespeech/s2t/modules/decoder.py | 16 ++++++++-------- 6 files changed, 23 insertions(+), 16 deletions(-) diff 
--git a/examples/ted_en_zh/st0/README.md b/examples/ted_en_zh/st0/README.md index 112d63c71..4c08e0fe1 100644 --- a/examples/ted_en_zh/st0/README.md +++ b/examples/ted_en_zh/st0/README.md @@ -127,7 +127,7 @@ source path.h bash ./local/data.sh CUDA_VISIBLE_DEVICES= ./local/train.sh conf/transformer_mtl_noam.yaml transformer_mtl_noam avg.sh latest exp/transformer_mtl_noam/checkpoints 5 -CUDA_VISIBLE_DEVICES= ./local/test.sh conf/transformer_mtl_noam.yaml exp/transformer_mtl_noam/checkpoints/avg_5 +CUDA_VISIBLE_DEVICES= ./local/test.sh conf/transformer_mtl_noam.yaml conf/tuning/decode.yaml exp/transformer_mtl_noam/checkpoints/avg_5 ``` The performance of the released models are shown below: ### Transformer diff --git a/paddlespeech/s2t/exps/u2_st/bin/test.py b/paddlespeech/s2t/exps/u2_st/bin/test.py index 30a903ceb..a2e37e84d 100644 --- a/paddlespeech/s2t/exps/u2_st/bin/test.py +++ b/paddlespeech/s2t/exps/u2_st/bin/test.py @@ -34,9 +34,6 @@ def main(config, args): if __name__ == "__main__": parser = default_argument_parser() - # save asr result to - parser.add_argument( - "--result_file", type=str, help="path of save the asr result") args = parser.parse_args() print_arguments(args, globals()) diff --git a/paddlespeech/s2t/frontend/featurizer/text_featurizer.py b/paddlespeech/s2t/frontend/featurizer/text_featurizer.py index 7623d0b87..0db0d63b9 100644 --- a/paddlespeech/s2t/frontend/featurizer/text_featurizer.py +++ b/paddlespeech/s2t/frontend/featurizer/text_featurizer.py @@ -115,6 +115,10 @@ class TextFeaturizer(): """ assert self.vocab_path_or_list, "toidx need vocab path or vocab list" tokens = [] + # unwrap `idxs`` like `[[1,2,3]]` + if idxs and isinstance(idxs[0], (list, tuple)) and len(idxs) == 1: + idxs = idxs[0] + for idx in idxs: if idx == self.eos_id: break diff --git a/paddlespeech/s2t/io/dataloader.py b/paddlespeech/s2t/io/dataloader.py index db6292f2c..5065c31ed 100644 --- a/paddlespeech/s2t/io/dataloader.py +++ b/paddlespeech/s2t/io/dataloader.py @@ -404,6 +404,12 @@ class DataLoaderFactory(): config['subsampling_factor'] = 1 config['num_encs'] = 1 config['shortest_first'] = False + config['minibatches'] = 0 + config['batch_count'] = 'auto' + config['batch_bins'] = 0 + config['batch_frames_in'] = 0 + config['batch_frames_out'] = 0 + config['batch_frames_inout'] = 0 elif mode == 'valid': config['manifest'] = config.dev_manifest config['train_mode'] = False diff --git a/paddlespeech/s2t/models/u2_st/u2_st.py b/paddlespeech/s2t/models/u2_st/u2_st.py index 339af4b74..3fe1d352f 100644 --- a/paddlespeech/s2t/models/u2_st/u2_st.py +++ b/paddlespeech/s2t/models/u2_st/u2_st.py @@ -170,8 +170,8 @@ class U2STBaseModel(nn.Layer): ys_in_lens = ys_pad_lens + 1 # 1. Forward decoder - decoder_out, _ = self.st_decoder(encoder_out, encoder_mask, ys_in_pad, - ys_in_lens) + decoder_out, *_ = self.st_decoder(encoder_out, encoder_mask, ys_in_pad, + ys_in_lens) # 2. Compute attention loss loss_att = self.criterion_att(decoder_out, ys_out_pad) @@ -203,8 +203,8 @@ class U2STBaseModel(nn.Layer): ys_in_lens = ys_pad_lens + 1 # 1. Forward decoder - decoder_out, _ = self.decoder(encoder_out, encoder_mask, ys_in_pad, - ys_in_lens) + decoder_out, *_ = self.decoder(encoder_out, encoder_mask, ys_in_pad, + ys_in_lens) # 2. 
Compute attention loss loss_att = self.criterion_att(decoder_out, ys_out_pad) diff --git a/paddlespeech/s2t/modules/decoder.py b/paddlespeech/s2t/modules/decoder.py index 4ddf057b6..1881a865c 100644 --- a/paddlespeech/s2t/modules/decoder.py +++ b/paddlespeech/s2t/modules/decoder.py @@ -110,14 +110,14 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer): concat_after=concat_after, ) for _ in range(num_blocks) ]) - def forward( - self, - memory: paddle.Tensor, - memory_mask: paddle.Tensor, - ys_in_pad: paddle.Tensor, - ys_in_lens: paddle.Tensor, - r_ys_in_pad: paddle.Tensor=paddle.empty([0]), - reverse_weight: float=0.0) -> Tuple[paddle.Tensor, paddle.Tensor]: + def forward(self, + memory: paddle.Tensor, + memory_mask: paddle.Tensor, + ys_in_pad: paddle.Tensor, + ys_in_lens: paddle.Tensor, + r_ys_in_pad: paddle.Tensor=paddle.empty([0]), + reverse_weight: float=0.0 + ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: """Forward decoder. Args: memory: encoded memory, float32 (batch, maxlen_in, feat) From c11b19df9002b5664d57707656108229613def61 Mon Sep 17 00:00:00 2001 From: Wang Xin Date: Wed, 18 Dec 2024 14:42:41 +0800 Subject: [PATCH 31/36] [Hackathon 7th] updating the paths to utility scripts and modifying method parameters (#3942) * fix path error * Update examples/aishell/asr0/local/test.sh * remove some broken symbolic link --- examples/aishell/asr0/local/test.sh | 10 +++++----- examples/aishell/asr0/utils | 1 - examples/aishell/asr1/local/test.sh | 14 +++++++------- examples/aishell/asr3/local/test.sh | 10 +++++----- examples/librispeech/asr0/local/test.sh | 10 +++++----- examples/librispeech/asr1/local/test.sh | 16 ++++++++-------- examples/librispeech/asr2/utils | 1 - examples/librispeech/asr3/local/test.sh | 10 +++++----- examples/librispeech/asr4/local/test.sh | 10 +++++----- examples/librispeech/asr5/utils | 1 - paddlespeech/s2t/exps/deepspeech2/model.py | 1 - paddlespeech/s2t/models/ds2/deepspeech2.py | 8 +++++--- 12 files changed, 45 insertions(+), 47 deletions(-) delete mode 120000 examples/aishell/asr0/utils delete mode 120000 examples/librispeech/asr2/utils delete mode 100644 examples/librispeech/asr5/utils diff --git a/examples/aishell/asr0/local/test.sh b/examples/aishell/asr0/local/test.sh index 778c7142e..fd7b062db 100755 --- a/examples/aishell/asr0/local/test.sh +++ b/examples/aishell/asr0/local/test.sh @@ -22,7 +22,7 @@ fi if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # format the reference test file - python3 utils/format_rsl.py \ + python3 ${MAIN_ROOT}/utils/format_rsl.py \ --origin_ref data/manifest.test.raw \ --trans_ref data/manifest.test.text @@ -39,20 +39,20 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then fi # format the hyp file - python3 utils/format_rsl.py \ + python3 ${MAIN_ROOT}/utils/format_rsl.py \ --origin_hyp ${ckpt_prefix}.rsl \ --trans_hyp ${ckpt_prefix}.rsl.text - python3 utils/compute-wer.py --char=1 --v=1 \ + python3 ${MAIN_ROOT}/utils/compute-wer.py --char=1 --v=1 \ data/manifest.test.text ${ckpt_prefix}.rsl.text > ${ckpt_prefix}.error fi if [ ${stage} -le 101 ] && [ ${stop_stage} -ge 101 ]; then - python3 utils/format_rsl.py \ + python3 ${MAIN_ROOT}/utils/format_rsl.py \ --origin_ref data/manifest.test.raw \ --trans_ref_sclite data/manifest.test.text.sclite - python3 utils/format_rsl.py \ + python3 ${MAIN_ROOT}/utils/format_rsl.py \ --origin_hyp ${ckpt_prefix}.rsl \ --trans_hyp_sclite ${ckpt_prefix}.rsl.text.sclite diff --git a/examples/aishell/asr0/utils b/examples/aishell/asr0/utils deleted file mode 120000 index 
94d118d25..000000000 --- a/examples/aishell/asr0/utils +++ /dev/null @@ -1 +0,0 @@ -../../../utils/ diff --git a/examples/aishell/asr1/local/test.sh b/examples/aishell/asr1/local/test.sh index 8487e9904..2f55f48a8 100755 --- a/examples/aishell/asr1/local/test.sh +++ b/examples/aishell/asr1/local/test.sh @@ -34,7 +34,7 @@ fi if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # format the reference test file - python utils/format_rsl.py \ + python ${MAIN_ROOT}/utils/format_rsl.py \ --origin_ref data/manifest.test.raw \ --trans_ref data/manifest.test.text @@ -63,10 +63,10 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then fi # format the hyp file - python utils/format_rsl.py \ + python ${MAIN_ROOT}/utils/format_rsl.py \ --origin_hyp ${output_dir}/${type}.rsl \ --trans_hyp ${output_dir}/${type}.rsl.text - python utils/compute-wer.py --char=1 --v=1 \ + python ${MAIN_ROOT}/utils/compute-wer.py --char=1 --v=1 \ data/manifest.test.text ${output_dir}/${type}.rsl.text > ${output_dir}/${type}.error done @@ -89,10 +89,10 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then echo "Failed in evaluation!" exit 1 fi - python utils/format_rsl.py \ + python ${MAIN_ROOT}/utils/format_rsl.py \ --origin_hyp ${output_dir}/${type}.rsl \ --trans_hyp ${output_dir}/${type}.rsl.text - python utils/compute-wer.py --char=1 --v=1 \ + python ${MAIN_ROOT}/utils/compute-wer.py --char=1 --v=1 \ data/manifest.test.text ${output_dir}/${type}.rsl.text > ${output_dir}/${type}.error done fi @@ -100,13 +100,13 @@ fi if [ ${stage} -le 101 ] && [ ${stop_stage} -ge 101 ]; then echo "using sclite to compute cer..." # format the reference test file for sclite - python utils/format_rsl.py \ + python ${MAIN_ROOT}/utils/format_rsl.py \ --origin_ref data/manifest.test.raw \ --trans_ref_sclite data/manifest.test.text.sclite output_dir=${ckpt_prefix} for type in attention ctc_greedy_search ctc_prefix_beam_search attention_rescoring; do - python utils/format_rsl.py \ + python ${MAIN_ROOT}/utils/format_rsl.py \ --origin_hyp ${output_dir}/${type}.rsl \ --trans_hyp_sclite ${output_dir}/${type}.rsl.text.sclite diff --git a/examples/aishell/asr3/local/test.sh b/examples/aishell/asr3/local/test.sh index 91e1c5457..b3a4cf5d2 100755 --- a/examples/aishell/asr3/local/test.sh +++ b/examples/aishell/asr3/local/test.sh @@ -22,7 +22,7 @@ source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; # exit 1 #fi -python3 utils/format_rsl.py \ +python3 ${MAIN_ROOT}/utils/format_rsl.py \ --origin_ref data/manifest.test.raw \ --trans_ref data/manifest.test.text @@ -43,11 +43,11 @@ for type in ctc_greedy_search; do echo "Failed in evaluation!" exit 1 fi - python3 utils/format_rsl.py \ + python3 ${MAIN_ROOT}/utils/format_rsl.py \ --origin_hyp ${ckpt_prefix}.${type}.rsl \ --trans_hyp ${ckpt_prefix}.${type}.rsl.text - python3 utils/compute-wer.py --char=1 --v=1 \ + python3 ${MAIN_ROOT}/utils/compute-wer.py --char=1 --v=1 \ data/manifest.test.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error echo "decoding ${type} done." done @@ -68,11 +68,11 @@ for type in ctc_prefix_beam_search; do echo "Failed in evaluation!" exit 1 fi - python3 utils/format_rsl.py \ + python3 ${MAIN_ROOT}/utils/format_rsl.py \ --origin_hyp ${ckpt_prefix}.${type}.rsl \ --trans_hyp ${ckpt_prefix}.${type}.rsl.text - python3 utils/compute-wer.py --char=1 --v=1 \ + python3 ${MAIN_ROOT}/utils/compute-wer.py --char=1 --v=1 \ data/manifest.test.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error echo "decoding ${type} done." 
done diff --git a/examples/librispeech/asr0/local/test.sh b/examples/librispeech/asr0/local/test.sh index 728569d1f..d6258f5c8 100755 --- a/examples/librispeech/asr0/local/test.sh +++ b/examples/librispeech/asr0/local/test.sh @@ -22,7 +22,7 @@ fi if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # format the reference test file - python3 utils/format_rsl.py \ + python3 ${MAIN_ROOT}/utils/format_rsl.py \ --origin_ref data/manifest.test-clean.raw \ --trans_ref data/manifest.test-clean.text @@ -38,20 +38,20 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then exit 1 fi - python3 utils/format_rsl.py \ + python3 ${MAIN_ROOT}/utils/format_rsl.py \ --origin_hyp ${ckpt_prefix}.rsl \ --trans_hyp ${ckpt_prefix}.rsl.text - python3 utils/compute-wer.py --char=1 --v=1 \ + python3 ${MAIN_ROOT}/utils/compute-wer.py --char=1 --v=1 \ data/manifest.test-clean.text ${ckpt_prefix}.rsl.text > ${ckpt_prefix}.error fi if [ ${stage} -le 101 ] && [ ${stop_stage} -ge 101 ]; then - python3 utils/format_rsl.py \ + python3 ${MAIN_ROOT}/utils/format_rsl.py \ --origin_ref data/manifest.test-clean.raw \ --trans_ref_sclite data/manifest.test.text-clean.sclite - python3 utils/format_rsl.py \ + python3 ${MAIN_ROOT}/utils/format_rsl.py \ --origin_hyp ${ckpt_prefix}.rsl \ --trans_hyp_sclite ${ckpt_prefix}.rsl.text.sclite diff --git a/examples/librispeech/asr1/local/test.sh b/examples/librispeech/asr1/local/test.sh index 03cef9a62..491c8ae77 100755 --- a/examples/librispeech/asr1/local/test.sh +++ b/examples/librispeech/asr1/local/test.sh @@ -43,7 +43,7 @@ echo "chunk mode ${chunk_mode}" if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # format the reference test file - python3 utils/format_rsl.py \ + python3 ${MAIN_ROOT}/utils/format_rsl.py \ --origin_ref data/manifest.test-clean.raw \ --trans_ref data/manifest.test-clean.text @@ -68,11 +68,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then echo "Failed in evaluation!" exit 1 fi - python3 utils/format_rsl.py \ + python3 ${MAIN_ROOT}/utils/format_rsl.py \ --origin_hyp ${ckpt_prefix}.${type}.rsl \ --trans_hyp ${ckpt_prefix}.${type}.rsl.text - python3 utils/compute-wer.py --char=1 --v=1 \ + python3 ${MAIN_ROOT}/utils/compute-wer.py --char=1 --v=1 \ data/manifest.test-clean.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error echo "decoding ${type} done." done @@ -98,7 +98,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then echo "Failed in evaluation!" exit 1 fi - python3 utils/format_rsl.py \ + python3 ${MAIN_ROOT}/utils/format_rsl.py \ --origin_hyp ${ckpt_prefix}.${type}.rsl \ --trans_hyp ${ckpt_prefix}.${type}.rsl.text @@ -125,25 +125,25 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then echo "Failed in evaluation!" exit 1 fi - python3 utils/format_rsl.py \ + python3 ${MAIN_ROOT}/utils/format_rsl.py \ --origin_hyp ${ckpt_prefix}.${type}.rsl \ --trans_hyp ${ckpt_prefix}.${type}.rsl.text - python3 utils/compute-wer.py --char=1 --v=1 \ + python3 ${MAIN_ROOT}/utils/compute-wer.py --char=1 --v=1 \ data/manifest.test-clean.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error echo "decoding ${type} done." 
done fi if [ ${stage} -le 101 ] && [ ${stop_stage} -ge 101 ]; then - python3 utils/format_rsl.py \ + python3 ${MAIN_ROOT}/utils/format_rsl.py \ --origin_ref data/manifest.test-clean.raw \ --trans_ref_sclite data/manifest.test.text-clean.sclite output_dir=${ckpt_prefix} for type in attention ctc_greedy_search ctc_prefix_beam_search attention_rescoring; do - python utils/format_rsl.py \ + python ${MAIN_ROOT}/utils/format_rsl.py \ --origin_hyp ${output_dir}/${type}.rsl \ --trans_hyp_sclite ${output_dir}/${type}.rsl.text.sclite diff --git a/examples/librispeech/asr2/utils b/examples/librispeech/asr2/utils deleted file mode 120000 index f49247da8..000000000 --- a/examples/librispeech/asr2/utils +++ /dev/null @@ -1 +0,0 @@ -../../../tools/kaldi/egs/wsj/s5/utils \ No newline at end of file diff --git a/examples/librispeech/asr3/local/test.sh b/examples/librispeech/asr3/local/test.sh index ccc0d84de..c59376771 100755 --- a/examples/librispeech/asr3/local/test.sh +++ b/examples/librispeech/asr3/local/test.sh @@ -24,7 +24,7 @@ source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; # exit 1 #fi -python3 utils/format_rsl.py \ +python3 ${MAIN_ROOT}/utils/format_rsl.py \ --origin_ref data/manifest.test-clean.raw \ --trans_ref data/manifest.test-clean.text @@ -45,11 +45,11 @@ for type in ctc_greedy_search; do echo "Failed in evaluation!" exit 1 fi - python3 utils/format_rsl.py \ + python3 ${MAIN_ROOT}/utils/format_rsl.py \ --origin_hyp ${ckpt_prefix}.${type}.rsl \ --trans_hyp ${ckpt_prefix}.${type}.rsl.text - python3 utils/compute-wer.py --char=1 --v=1 \ + python3 ${MAIN_ROOT}/utils/compute-wer.py --char=1 --v=1 \ data/manifest.test-clean.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error echo "decoding ${type} done." done @@ -70,11 +70,11 @@ for type in ctc_prefix_beam_search; do echo "Failed in evaluation!" exit 1 fi - python3 utils/format_rsl.py \ + python3 ${MAIN_ROOT}/utils/format_rsl.py \ --origin_hyp ${ckpt_prefix}.${type}.rsl \ --trans_hyp ${ckpt_prefix}.${type}.rsl.text - python3 utils/compute-wer.py --char=1 --v=1 \ + python3 ${MAIN_ROOT}/utils/compute-wer.py --char=1 --v=1 \ data/manifest.test-clean.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error echo "decoding ${type} done." done diff --git a/examples/librispeech/asr4/local/test.sh b/examples/librispeech/asr4/local/test.sh index dfbd56ac2..8c17bd350 100755 --- a/examples/librispeech/asr4/local/test.sh +++ b/examples/librispeech/asr4/local/test.sh @@ -23,7 +23,7 @@ source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; # exit 1 #fi -python3 utils/format_rsl.py \ +python3 ${MAIN_ROOT}/utils/format_rsl.py \ --origin_ref data/manifest.test-clean.raw \ --trans_ref data/manifest.test-clean.text @@ -44,11 +44,11 @@ for type in ctc_greedy_search; do echo "Failed in evaluation!" exit 1 fi - python3 utils/format_rsl.py \ + python3 ${MAIN_ROOT}/utils/format_rsl.py \ --origin_hyp ${ckpt_prefix}.${type}.rsl \ --trans_hyp ${ckpt_prefix}.${type}.rsl.text - python3 utils/compute-wer.py --char=1 --v=1 \ + python3 ${MAIN_ROOT}/utils/compute-wer.py --char=1 --v=1 \ data/manifest.test-clean.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error echo "decoding ${type} done." done @@ -69,11 +69,11 @@ for type in ctc_prefix_beam_search; do echo "Failed in evaluation!" 
exit 1 fi - python3 utils/format_rsl.py \ + python3 ${MAIN_ROOT}/utils/format_rsl.py \ --origin_hyp ${ckpt_prefix}.${type}.rsl \ --trans_hyp ${ckpt_prefix}.${type}.rsl.text - python3 utils/compute-wer.py --char=1 --v=1 \ + python3 ${MAIN_ROOT}/utils/compute-wer.py --char=1 --v=1 \ data/manifest.test-clean.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error echo "decoding ${type} done." done diff --git a/examples/librispeech/asr5/utils b/examples/librispeech/asr5/utils deleted file mode 100644 index 973afe674..000000000 --- a/examples/librispeech/asr5/utils +++ /dev/null @@ -1 +0,0 @@ -../../../utils \ No newline at end of file diff --git a/paddlespeech/s2t/exps/deepspeech2/model.py b/paddlespeech/s2t/exps/deepspeech2/model.py index 7836d3adf..283680a94 100644 --- a/paddlespeech/s2t/exps/deepspeech2/model.py +++ b/paddlespeech/s2t/exps/deepspeech2/model.py @@ -422,7 +422,6 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): else: raise Exception("wrong model type") - self.predictor.clear_intermediate_tensor() self.predictor.try_shrink_memory() #replace the with ' ' diff --git a/paddlespeech/s2t/models/ds2/deepspeech2.py b/paddlespeech/s2t/models/ds2/deepspeech2.py index b7ee80a7d..050a79185 100644 --- a/paddlespeech/s2t/models/ds2/deepspeech2.py +++ b/paddlespeech/s2t/models/ds2/deepspeech2.py @@ -398,14 +398,15 @@ class DeepSpeech2InferModel(DeepSpeech2Model): paddle.static.InputSpec( shape=[None, None, self.encoder.feat_size ], #[B, chunk_size, feat_dim] - dtype='float32'), + dtype='float32', ), paddle.static.InputSpec(shape=[None], dtype='int64'), # audio_length, [B] paddle.static.InputSpec( shape=[None, None, None], dtype='float32'), paddle.static.InputSpec( shape=[None, None, None], dtype='float32') - ]) + ], + full_graph=True) elif self.encoder.rnn_direction == "bidirect": static_model = paddle.jit.to_static( self, @@ -415,7 +416,8 @@ class DeepSpeech2InferModel(DeepSpeech2Model): dtype='float32'), # audio, [B,T,D] paddle.static.InputSpec(shape=[None], dtype='int64'), # audio_length, [B] - ]) + ], + full_graph=True) else: raise Exception("wrong model type") return static_model From 9752f0a03b4553621450298c40907b19d4b9afa1 Mon Sep 17 00:00:00 2001 From: megemini Date: Wed, 18 Dec 2024 14:43:28 +0800 Subject: [PATCH 32/36] =?UTF-8?q?[Hackathon=207th]=20=E4=BF=AE=E5=A4=8D=20?= =?UTF-8?q?`asr5`=20=E7=9A=84=20`test.sh`=20=E8=84=9A=E6=9C=AC=E8=B7=AF?= =?UTF-8?q?=E5=BE=84=E9=94=99=E8=AF=AF=20(#3941)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [Fix] test parsearg and test.sh * [Update] use short path --- examples/librispeech/asr5/local/test.sh | 6 +++--- paddlespeech/s2t/exps/wavlm/bin/test.py | 5 ++--- paddlespeech/s2t/exps/wavlm/bin/test_wav.py | 4 ---- 3 files changed, 5 insertions(+), 10 deletions(-) diff --git a/examples/librispeech/asr5/local/test.sh b/examples/librispeech/asr5/local/test.sh index 18158bd50..07c0deec9 100644 --- a/examples/librispeech/asr5/local/test.sh +++ b/examples/librispeech/asr5/local/test.sh @@ -23,7 +23,7 @@ source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; # exit 1 #fi -python3 format_rsl.py \ +python3 ${MAIN_ROOT}/utils/format_rsl.py \ --origin_ref data/manifest.test-clean.raw \ --trans_ref data/manifest.test-clean.text @@ -44,7 +44,7 @@ for type in ctc_greedy_search; do echo "Failed in evaluation!" 
exit 1 fi - python3 format_rsl.py \ + python3 ${MAIN_ROOT}/utils/format_rsl.py \ --origin_hyp ${ckpt_prefix}.${type}.rsl \ --trans_hyp ${ckpt_prefix}.${type}.rsl.text @@ -69,7 +69,7 @@ for type in ctc_prefix_beam_search; do echo "Failed in evaluation!" exit 1 fi - python3 format_rsl.py \ + python3 ${MAIN_ROOT}/utils/format_rsl.py \ --origin_hyp ${ckpt_prefix}.${type}.rsl \ --trans_hyp ${ckpt_prefix}.${type}.rsl.text diff --git a/paddlespeech/s2t/exps/wavlm/bin/test.py b/paddlespeech/s2t/exps/wavlm/bin/test.py index f56b418bc..b84421b54 100644 --- a/paddlespeech/s2t/exps/wavlm/bin/test.py +++ b/paddlespeech/s2t/exps/wavlm/bin/test.py @@ -18,7 +18,8 @@ from yacs.config import CfgNode from paddlespeech.s2t.exps.wavlm.model import WavLMASRTester as Tester from paddlespeech.s2t.training.cli import default_argument_parser -from paddlespeech.utils.argparse import print_arguments, add_arguments +from paddlespeech.utils.argparse import add_arguments +from paddlespeech.utils.argparse import print_arguments def main_sp(config, args): @@ -37,8 +38,6 @@ if __name__ == "__main__": # save asr result to parser.add_argument( '--dict-path', type=str, default=None, help='dict path.') - parser.add_argument( - "--result_file", type=str, help="path of save the asr result") args = parser.parse_args() print_arguments(args, globals()) diff --git a/paddlespeech/s2t/exps/wavlm/bin/test_wav.py b/paddlespeech/s2t/exps/wavlm/bin/test_wav.py index e6c07629d..2f4728b09 100644 --- a/paddlespeech/s2t/exps/wavlm/bin/test_wav.py +++ b/paddlespeech/s2t/exps/wavlm/bin/test_wav.py @@ -105,10 +105,6 @@ def main(config, args): if __name__ == "__main__": parser = default_argument_parser() # save asr result to - parser.add_argument( - "--result_file", type=str, help="path of save the asr result") - parser.add_argument( - "--audio_file", type=str, help="path of the input audio file") args = parser.parse_args() config = CfgNode(new_allowed=True) From fac4adb0b5ff31a2040a6e494655a3a65ad3209c Mon Sep 17 00:00:00 2001 From: zxcd <228587199@qq.com> Date: Mon, 23 Dec 2024 19:48:03 +0800 Subject: [PATCH 33/36] =?UTF-8?q?=E3=80=90TTS=E3=80=91add=20some=20PIR=20m?= =?UTF-8?q?odel=20(#3956)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add pir model * add more --- examples/aishell3/tts3/README.md | 3 +++ examples/aishell3/voc1/README.md | 3 +++ examples/aishell3/voc5/README.md | 3 +++ examples/csmsc/voc3/README.md | 3 +++ examples/csmsc/voc5/README.md | 3 +++ examples/vctk/voc5/README.md | 3 +++ 6 files changed, 18 insertions(+) diff --git a/examples/aishell3/tts3/README.md b/examples/aishell3/tts3/README.md index c33d665c8..f97a84b50 100644 --- a/examples/aishell3/tts3/README.md +++ b/examples/aishell3/tts3/README.md @@ -223,6 +223,9 @@ Pretrained FastSpeech2 model with no silence in the edge of audios: The static model can be downloaded here: - [fastspeech2_aishell3_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_static_1.1.0.zip) +The PIR static model can be downloaded here: +- [fastspeech2_aishell3_static_pir_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_static_pir_1.1.0.zip) (Run PIR model need to set FLAGS_enable_pir_api=1, and PIR model only worked with paddlepaddle>=3.0.0b2) + The ONNX model can be downloaded here: - [fastspeech2_aishell3_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_onnx_1.1.0.zip) diff 
--git a/examples/aishell3/voc1/README.md b/examples/aishell3/voc1/README.md index 467653cbe..e453c8ae8 100644 --- a/examples/aishell3/voc1/README.md +++ b/examples/aishell3/voc1/README.md @@ -136,6 +136,9 @@ Pretrained models can be downloaded here: The static model can be downloaded here: - [pwgan_aishell3_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_aishell3_static_1.1.0.zip) +The PIR static model can be downloaded here: +- [pwgan_aishell3_static_pir_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_aishell3_static_pir_1.1.0.zip) (Run PIR model need to set FLAGS_enable_pir_api=1, and PIR model only worked with paddlepaddle>=3.0.0b2) + The ONNX model can be downloaded here: - [pwgan_aishell3_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_aishell3_onnx_1.1.0.zip) diff --git a/examples/aishell3/voc5/README.md b/examples/aishell3/voc5/README.md index 7f62ed0d0..676f56c28 100644 --- a/examples/aishell3/voc5/README.md +++ b/examples/aishell3/voc5/README.md @@ -119,6 +119,9 @@ The pretrained model can be downloaded here: The static model can be downloaded here: - [hifigan_aishell3_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_static_1.1.0.zip) +The PIR static model can be downloaded here: +- [hifigan_aishell3_static_pir_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_static_pir_1.1.0.zip) (Run PIR model need to set FLAGS_enable_pir_api=1, and PIR model only worked with paddlepaddle>=3.0.0b2) + The ONNX model can be downloaded here: - [hifigan_aishell3_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_onnx_1.1.0.zip) diff --git a/examples/csmsc/voc3/README.md b/examples/csmsc/voc3/README.md index f2a1eef7f..a5319a368 100644 --- a/examples/csmsc/voc3/README.md +++ b/examples/csmsc/voc3/README.md @@ -161,6 +161,9 @@ The finetuned model can be downloaded here: The static model can be downloaded here: - [mb_melgan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip) +The PIR static model can be downloaded here: +- [mb_melgan_csmsc_static_pir_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_pir_0.1.1.zip) (Run PIR model need to set FLAGS_enable_pir_api=1, and PIR model only worked with paddlepaddle>=3.0.0b2) + The ONNX model can be downloaded here: - [mb_melgan_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_onnx_0.2.0.zip) diff --git a/examples/csmsc/voc5/README.md b/examples/csmsc/voc5/README.md index e4d100619..eed8c670c 100644 --- a/examples/csmsc/voc5/README.md +++ b/examples/csmsc/voc5/README.md @@ -129,6 +129,9 @@ The pretrained model can be downloaded here: The static model can be downloaded here: - [hifigan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip) +The PIR static model can be downloaded here: +- [hifigan_csmsc_static_pir_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_pir_0.1.1.zip) (Run PIR model need to set FLAGS_enable_pir_api=1, and PIR model only worked with paddlepaddle>=3.0.0b2) + The ONNX model can be downloaded here: - 
[hifigan_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_onnx_0.2.0.zip) diff --git a/examples/vctk/voc5/README.md b/examples/vctk/voc5/README.md index 5a104f56f..f91bc99aa 100644 --- a/examples/vctk/voc5/README.md +++ b/examples/vctk/voc5/README.md @@ -124,6 +124,9 @@ The pretrained model can be downloaded here: The static model can be downloaded here: - [hifigan_vctk_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_static_1.1.0.zip) +The PIR static model can be downloaded here: +- [hifigan_vctk_static_pir_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_static_pir_1.1.0.zip) (Run PIR model need to set FLAGS_enable_pir_api=1, and PIR model only worked with paddlepaddle>=3.0.0b2) + The ONNX model can be downloaded here: - [hifigan_vctk_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_onnx_1.1.0.zip) From cf6a23d5fa0ef962d0b7c06c9a96413efdcaace4 Mon Sep 17 00:00:00 2001 From: cyberslack_lee Date: Tue, 24 Dec 2024 14:21:49 +0800 Subject: [PATCH 34/36] =?UTF-8?q?[Hackathon=207th]=20=E6=B7=BB=E5=8A=A0csm?= =?UTF-8?q?sc/voc1=E7=9A=84synthesize=5Fe2e.sh=EF=BC=8C=E4=BF=AE=E5=A4=8Dr?= =?UTF-8?q?un.sh=20(#3945)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix * fix * fix * fix * fix --- examples/csmsc/voc1/README.md | 103 ++++++++++++++++++++ examples/csmsc/voc1/local/synthesize_e2e.sh | 22 +++++ examples/csmsc/voc1/run.sh | 7 +- 3 files changed, 131 insertions(+), 1 deletion(-) create mode 100644 examples/csmsc/voc1/local/synthesize_e2e.sh diff --git a/examples/csmsc/voc1/README.md b/examples/csmsc/voc1/README.md index 6c148fe9b..30102cd9e 100644 --- a/examples/csmsc/voc1/README.md +++ b/examples/csmsc/voc1/README.md @@ -29,6 +29,7 @@ Run the command below to 3. train the model. 4. synthesize wavs. - synthesize waveform from `metadata.jsonl`. + - synthesize waveform from text file. ```bash ./run.sh ``` @@ -106,6 +107,18 @@ benchmark: 4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ### Synthesizing +We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder. +Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip) and unzip it. +```bash +unzip pwg_baker_ckpt_0.4.zip +``` +Parallel WaveGAN checkpoint contains files listed below. +```text +pwg_baker_ckpt_0.4 +├── pwg_default.yaml # default config used to train parallel wavegan +├── pwg_snapshot_iter_400000.pdz # model parameters of parallel wavegan +└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan +``` `./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`. ```bash CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} @@ -138,18 +151,97 @@ optional arguments: 4. `--output-dir` is the directory to save the synthesized audio files. 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. +We use [Fastspeech2](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3) as the acoustic model. 
+Download pretrained fastspeech2_nosil model from [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip)and unzip it. +```bash +unzip fastspeech2_nosil_baker_ckpt_0.4.zip +``` +Fastspeech2 checkpoint contains files listed below. +```text +fastspeech2_nosil_baker_ckpt_0.4 +├── default.yaml # default config used to train fastspeech2 +├── phone_id_map.txt # phone vocabulary file when training fastspeech2 +├── snapshot_iter_76000.pdz # model parameters and optimizer states +└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2 +``` + +`./local/synthesize_e2e.sh` calls `${BIN_DIR}/../synthesize_e2e.py`, which can synthesize waveform from text file. +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} +``` +```text +usage: synthesize_e2e.py [-h] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}] + [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] + [--am_stat AM_STAT] [--phones_dict PHONES_DICT] + [--tones_dict TONES_DICT] + [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}] + [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] + [--voc_stat VOC_STAT] [--lang LANG] + [--inference_dir INFERENCE_DIR] [--ngpu NGPU] + [--text TEXT] [--output_dir OUTPUT_DIR] + +Synthesize with acoustic model & vocoder + +optional arguments: + -h, --help show this help message and exit + --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech} + Choose acoustic model type of tts task. + --am_config AM_CONFIG + Config of acoustic model. + --am_ckpt AM_CKPT Checkpoint file of acoustic model. + --am_stat AM_STAT mean and standard deviation used to normalize + spectrogram when training acoustic model. + --phones_dict PHONES_DICT + phone vocabulary file. + --tones_dict TONES_DICT + tone vocabulary file. + --speaker_dict SPEAKER_DICT + speaker id map file. + --spk_id SPK_ID spk id for multi speaker acoustic model + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc} + Choose vocoder type of tts task. + --voc_config VOC_CONFIG + Config of voc. + --voc_ckpt VOC_CKPT Checkpoint file of voc. + --voc_stat VOC_STAT mean and standard deviation used to normalize + spectrogram when training voc. + --lang LANG Choose model language. zh or en + --inference_dir INFERENCE_DIR + dir to save inference models + --ngpu NGPU if ngpu == 0, use cpu. + --text TEXT text to synthesize, a 'utt_id sentence' pair per line. + --output_dir OUTPUT_DIR + output dir. + +``` +1. `--am` is acoustic model type with the format {model_name}_{dataset} +2. `--am_config`, `--am_ckpt`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the fastspeech2 pretrained model. +3. `--voc` is vocoder type with the format {model_name}_{dataset} +4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. +5. `--lang` is the model language, which can be `zh` or `en`. 
+6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. +7. `--text` is the text file, which contains sentences to synthesize. +8. `--output_dir` is the directory to save synthesized audio files. +9. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. + ## Pretrained Models The pretrained model can be downloaded here: - [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip) +- [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip) The static model can be downloaded here: - [pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip) +- [fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip) The ONNX model can be downloaded here: - [pwgan_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_csmsc_onnx_0.2.0.zip) +- [fastspeech2_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_onnx_0.2.0.zip) The Paddle-Lite model can be downloaded here: - [pwgan_csmsc_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_csmsc_pdlite_1.3.0.zip) +- [fastspeech2_csmsc_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_pdlite_1.3.0.zip) Model | Step | eval/generator_loss | eval/log_stft_magnitude_loss| eval/spectral_convergence_loss :-------------:| :------------:| :-----: | :-----: | :--------: @@ -163,5 +255,16 @@ pwg_baker_ckpt_0.4 ├── pwg_snapshot_iter_400000.pdz # generator parameters of parallel wavegan └── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan ``` + +FastSpeech2 checkpoint contains files listed below. + +```text +fastspeech2_nosil_baker_ckpt_0.4 +├── default.yaml # default config used to train fastspeech2 +├── phone_id_map.txt # phone vocabulary file when training fastspeech2 +├── snapshot_iter_76000.pdz # model parameters and optimizer states +└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2 +``` + ## Acknowledgement We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN. 
diff --git a/examples/csmsc/voc1/local/synthesize_e2e.sh b/examples/csmsc/voc1/local/synthesize_e2e.sh new file mode 100644 index 000000000..428c234ff --- /dev/null +++ b/examples/csmsc/voc1/local/synthesize_e2e.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/../../synthesize_e2e.py \ + --am=fastspeech2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=pwgan_csmsc \ + --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \ + --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ + --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --inference_dir=${train_output_path}/inference \ No newline at end of file diff --git a/examples/csmsc/voc1/run.sh b/examples/csmsc/voc1/run.sh index d11226202..89826bbca 100755 --- a/examples/csmsc/voc1/run.sh +++ b/examples/csmsc/voc1/run.sh @@ -31,7 +31,12 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 fi -# PTQ_static if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # synthesize_e2e, vocoder is pwgan by default + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +# PTQ_static +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} pwgan_csmsc || exit -1 fi From ee4f15826bb4556c6407e8882012776191782a23 Mon Sep 17 00:00:00 2001 From: Ryan Date: Tue, 24 Dec 2024 14:24:48 +0800 Subject: [PATCH 35/36] [Hackathon 7th No.55] Add `fft_conv1d` to `PaddleSpeech` (#3947) * add fft_conv1d * add unitest 2 shell * fix paddle version * rename * add comment * bias -> bias_attr * fix unitest * fix sth --- paddlespeech/t2s/modules/__init__.py | 1 + paddlespeech/t2s/modules/fftconv1d.py | 214 ++++++++++++++++++++++++++ tests/unit/ci.sh | 1 + tests/unit/tts/test_fftconv1d.py | 128 +++++++++++++++ 4 files changed, 344 insertions(+) create mode 100644 paddlespeech/t2s/modules/fftconv1d.py create mode 100644 tests/unit/tts/test_fftconv1d.py diff --git a/paddlespeech/t2s/modules/__init__.py b/paddlespeech/t2s/modules/__init__.py index 1e3312002..88e74db7a 100644 --- a/paddlespeech/t2s/modules/__init__.py +++ b/paddlespeech/t2s/modules/__init__.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from .conv import * +from .fftconv1d import * from .geometry import * from .losses import * from .positional_encoding import * diff --git a/paddlespeech/t2s/modules/fftconv1d.py b/paddlespeech/t2s/modules/fftconv1d.py new file mode 100644 index 000000000..cbdb84bda --- /dev/null +++ b/paddlespeech/t2s/modules/fftconv1d.py @@ -0,0 +1,214 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+import typing
+from typing import Optional
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from ...utils import satisfy_paddle_version
+
+__all__ = [
+    "fft_conv1d",
+    "FFTConv1D",
+]
+
+
+def __unfold(x, kernel_size: int, stride: int):
+    """1D-only unfolding, similar to the one in PaddlePaddle.
+
+    Notes
+    ------
+    Given a tensor `x` of size `[*, T]` this will return
+    a tensor `[*, F, K]` with `K` the kernel size, and `F` the number
+    of frames. The i-th frame is a view onto `i * stride: i * stride + kernel_size`.
+    `x` is automatically padded so that all of its entries are covered at least once.
+
+    Args:
+        x (Tensor):
+            tensor for which to return the frames.
+        kernel_size (int):
+            size of each frame.
+        stride (int):
+            stride between each frame.
+    """
+    shape = list(x.shape)
+    length = shape.pop(-1)
+    n_frames = math.ceil((max(length, kernel_size) - kernel_size) / stride) + 1
+    tgt_length = (n_frames - 1) * stride + kernel_size
+    padded = F.pad(x, (0, tgt_length - length), data_format="NCL")
+    strides: typing.List[int] = []
+    for dim in range(padded.dim()):
+        strides.append(padded.strides[dim])
+    assert strides.pop(-1) == 1, "data should be contiguous"
+    strides = strides + [stride, 1]
+    return padded.as_strided(shape + [n_frames, kernel_size], strides)
+
+
+def fft_conv1d(
+        x: paddle.Tensor,
+        weight: paddle.Tensor,
+        bias: Optional[paddle.Tensor]=None,
+        stride: int=1,
+        padding: int=0,
+        block_ratio: float=5, ):
+    """
+    Same as `paddle.nn.functional.conv1d` but using FFT for the convolution.
+    Please check PaddlePaddle documentation for more information.
+
+    Notes
+    ------
+    This function is faster than `paddle.nn.functional.conv1d` only in specific cases.
+    Typically, the kernel size should be of the order of 256 to see any real gain,
+    for a stride of 1.
+    Dilation and groups are not supported at the moment. This function might use
+    more memory than the default Conv1D implementation.
+
+    Args:
+        x (Tensor):
+            input signal of shape `[B, C, T]`.
+        weight (Tensor):
+            weight of the convolution `[D, C, K]` with `D` the number of output channels.
+        bias (Tensor or None):
+            if not None, bias term for the convolution.
+        stride (int):
+            stride of convolution.
+        padding (int):
+            padding to apply to `x`.
+        block_ratio (float):
+            can be tuned for speed. `x` is split into chunks of size `int(block_ratio * kernel_size)`.
+
+    Shape:
+
+        - Inputs: `x` is `[B, C, T]`, `weight` is `[D, C, K]`, and bias is `[D]`.
+        - Output: `[B, D, T']`, with `T' = (T + 2 * padding - K) // stride + 1`.
+    """
+    x = F.pad(x, (padding, padding), data_format="NCL")
+    batch, _, length = x.shape
+    out_channels, _, kernel_size = weight.shape
+
+    if length < kernel_size:
+        raise RuntimeError(
+            f"Input should be at least as large as the kernel size {kernel_size}, "
+            f"but it is only {length} samples long.")
+    if block_ratio < 1:
+        raise RuntimeError("Block ratio must be at least 1.")
+
+    block_size: int = min(int(kernel_size * block_ratio), length)
+    fold_stride = block_size - kernel_size + 1
+
+    weight = F.pad(
+        weight, (0, block_size - weight.shape[-1]),
+        mode="constant",
+        value=0.0,
+        data_format="NCL")
+
+    weight_z = paddle.fft.rfft(weight, axis=-1)
+
+    # Split the padded `x` into overlapping frames, on which the FFT is applied.
+    frames = __unfold(x, block_size, fold_stride)
+
+    frames_z = paddle.fft.rfft(frames, axis=-1)
+    # Conjugating the kernel spectrum turns the frequency-domain product into a
+    # cross-correlation, which is what Conv1D computes.
+    weight_z_conj = paddle.conj(weight_z)
+    out_z = paddle.einsum("bcft,dct->bdft", frames_z, weight_z_conj)
+    out = paddle.fft.irfft(out_z, n=block_size, axis=-1)
+
+    # The last kernel_size - 1 samples of each block are invalid, because the
+    # FFT computes a circular, not a linear, convolution.
+    out = out[..., :-kernel_size + 1]
+    out = out.reshape([batch, out_channels, -1])
+    out = out[..., ::stride]
+    target_length = (length - kernel_size) // stride + 1
+    out = out[..., :target_length]
+    if bias is not None:
+        out += bias[:, None]
+    return out
+
+
+class FFTConv1D(paddle.nn.Layer):
+    """
+    Same as `paddle.nn.Conv1D` but based on a custom FFT-based convolution.
+    Please check PaddlePaddle documentation for more information on `paddle.nn.Conv1D`.
+
+    Notes
+    ------
+    This module is faster than `paddle.nn.Conv1D` only in specific cases.
+    Typically, `kernel_size` should be of the order of 256 to see any real gain,
+    for a stride of 1.
+    Dilation and groups are not supported at the moment. This module might use
+    more memory than the default Conv1D implementation.
+
+    Args:
+        in_channels (int):
+            number of input channels.
+        out_channels (int):
+            number of output channels.
+        kernel_size (int):
+            kernel size of convolution.
+        stride (int):
+            stride of convolution.
+        padding (int):
+            padding to apply to the input.
+        bias_attr (bool):
+            if True, use a bias term.
+
+    Examples:
+        >>> fftconv = FFTConv1D(12, 24, 128, 4)
+        >>> x = paddle.randn([4, 12, 1024])
+        >>> print(list(fftconv(x).shape))
+        [4, 24, 225]
+    """
+
+    def __init__(
+            self,
+            in_channels: int,
+            out_channels: int,
+            kernel_size: int,
+            stride: int=1,
+            padding: int=0,
+            bias_attr: bool=True, ):
+        super(FFTConv1D, self).__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = kernel_size
+        self.stride = stride
+        self.padding = padding
+
+        # Create a Conv1D layer to initialize weights and bias
+        conv = paddle.nn.Conv1D(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            bias_attr=bias_attr)
+        self.weight = conv.weight
+        if bias_attr:
+            self.bias = conv.bias
+        else:
+            self.bias = None
+
+    def forward(self, x: paddle.Tensor):
+        return fft_conv1d(x, self.weight, self.bias, self.stride, self.padding)
+
+
+# Currently, the `unfold` API in Paddle is extremely slow, so `__unfold` is
+# implemented with the `.strides` and `.as_strided` APIs. However, these are only
+# supported in Paddle 2.6 and above, so `F.conv1d` and `nn.Conv1D` are used as
+# replacements on older versions.
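+#
+# Worked example of the block scheme implemented above (numbers chosen for
+# illustration, not taken from the original sources): for x of length T = 10
+# after padding, kernel_size K = 4, stride = 2 and block_ratio = 5, we get
+# block_size = min(5 * 4, 10) = 10 and fold_stride = 10 - 4 + 1 = 7, so
+# `__unfold` returns a single frame x[..., 0:10]; rfft/irfft of that block
+# give 10 samples, the last K - 1 = 3 of which are circular-convolution
+# residue and are dropped, leaving 7 valid samples that the stride then
+# subsamples to (10 - 4) // 2 + 1 = 4 outputs.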
+if not satisfy_paddle_version('2.6'): + fft_conv1d = F.conv1d + FFTConv1D = nn.Conv1D diff --git a/tests/unit/ci.sh b/tests/unit/ci.sh index 72b4678d6..daf40f721 100644 --- a/tests/unit/ci.sh +++ b/tests/unit/ci.sh @@ -14,6 +14,7 @@ function main(){ cd ${speech_ci_path}/tts python test_data_table.py python test_enfrontend.py + python test_fftconv1d.py python test_mixfrontend.py echo "End TTS" diff --git a/tests/unit/tts/test_fftconv1d.py b/tests/unit/tts/test_fftconv1d.py new file mode 100644 index 000000000..88ea397ec --- /dev/null +++ b/tests/unit/tts/test_fftconv1d.py @@ -0,0 +1,128 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import sys +import unittest + +import numpy as np +import paddle +from paddle.nn import Conv1D + +from paddlespeech.t2s.modules import fft_conv1d +from paddlespeech.t2s.modules import FFTConv1D + + +class TestFFTConv1D(unittest.TestCase): + def setUp(self): + self.batch_size = 4 + self.in_channels = 3 + self.out_channels = 16 + self.kernel_size = 5 + self.stride = 1 + self.padding = 1 + self.input_length = 32 + + def _init_models(self, in_channels, out_channels, kernel_size, stride, + padding): + x = paddle.randn([self.batch_size, in_channels, self.input_length]) + conv1d = paddle.nn.Conv1D( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding) + fft_conv1d = FFTConv1D( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding) + fft_conv1d.weight.set_value(conv1d.weight.numpy()) + if conv1d.bias is not None: + fft_conv1d.bias.set_value(conv1d.bias.numpy()) + return x, conv1d, fft_conv1d + + def test_fft_conv1d_vs_conv1d_default(self): + x, conv1d, fft_conv1d = self._init_models( + self.in_channels, self.out_channels, self.kernel_size, self.stride, + self.padding) + out_conv1d = conv1d(x) + out_fft_conv1d = fft_conv1d(x) + self.assertTrue( + np.allclose(out_conv1d.numpy(), out_fft_conv1d.numpy(), atol=1e-6)) + + def test_fft_conv1d_vs_conv1d_no_padding(self): + x, conv1d, fft_conv1d = self._init_models( + self.in_channels, self.out_channels, self.kernel_size, self.stride, + 0) + out_conv1d = conv1d(x) + out_fft_conv1d = fft_conv1d(x) + self.assertTrue( + np.allclose(out_conv1d.numpy(), out_fft_conv1d.numpy(), atol=1e-6)) + + def test_fft_conv1d_vs_conv1d_large_kernel(self): + kernel_size = 256 + padding = kernel_size - 1 + x, conv1d, fft_conv1d = self._init_models( + self.in_channels, self.out_channels, kernel_size, self.stride, + padding) + out_conv1d = conv1d(x) + out_fft_conv1d = fft_conv1d(x) + self.assertTrue( + np.allclose(out_conv1d.numpy(), out_fft_conv1d.numpy(), atol=1e-6)) + + def test_fft_conv1d_vs_conv1d_stride_2(self): + x, conv1d, fft_conv1d = self._init_models( + self.in_channels, self.out_channels, self.kernel_size, 2, + self.padding) + out_conv1d = conv1d(x) + out_fft_conv1d = fft_conv1d(x) + self.assertTrue( + np.allclose(out_conv1d.numpy(), out_fft_conv1d.numpy(), atol=1e-6)) + + def 
test_fft_conv1d_vs_conv1d_different_input_length(self): + input_length = 1024 + x, conv1d, fft_conv1d = self._init_models( + self.in_channels, self.out_channels, self.kernel_size, self.stride, + self.padding) + x = paddle.randn([self.batch_size, self.in_channels, input_length]) + out_conv1d = conv1d(x) + out_fft_conv1d = fft_conv1d(x) + self.assertTrue( + np.allclose(out_conv1d.numpy(), out_fft_conv1d.numpy(), atol=1e-6)) + + def test_fft_conv1d_vs_conv1d_no_bias(self): + conv1d = paddle.nn.Conv1D( + self.in_channels, + self.out_channels, + self.kernel_size, + stride=self.stride, + padding=self.padding, + bias_attr=False) + fft_conv1d = FFTConv1D( + self.in_channels, + self.out_channels, + self.kernel_size, + stride=self.stride, + padding=self.padding, + bias_attr=False) + fft_conv1d.weight.set_value(conv1d.weight.numpy()) + x = paddle.randn([self.batch_size, self.in_channels, self.input_length]) + out_conv1d = conv1d(x) + out_fft_conv1d = fft_conv1d(x) + self.assertTrue( + np.allclose(out_conv1d.numpy(), out_fft_conv1d.numpy(), atol=1e-6)) + + +if __name__ == '__main__': + unittest.main() From c7d5b39ccfba214dafe2981bf5c6a5f450e45870 Mon Sep 17 00:00:00 2001 From: Guspan Tanadi <36249910+guspan-tanadi@users.noreply.github.com> Date: Fri, 27 Dec 2024 10:25:41 +0700 Subject: [PATCH 36/36] docs: text frontend intended links (#3958) * docs(install_cn): bit misspell * docs(install): bit misspell * docs(models_introduction): text frontend links --- docs/source/install.md | 2 +- docs/source/install_cn.md | 2 +- docs/source/tts/models_introduction.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/install.md b/docs/source/install.md index 3607d7185..205d3e600 100644 --- a/docs/source/install.md +++ b/docs/source/install.md @@ -19,7 +19,7 @@ There are 3 ways to use `PaddleSpeech`. According to the degree of difficulty, t - If you are newer to `PaddleSpeech` and want to experience it easily without your machine. We recommend you to use [AI Studio](https://aistudio.baidu.com/aistudio/index) to experience it. There is a step-by-step [tutorial](https://aistudio.baidu.com/aistudio/education/group/info/25130) for `PaddleSpeech`, and you can use the basic function of `PaddleSpeech` with a free machine. - If you want to use the command line function of Paddlespeech, you need to complete the following steps to install `PaddleSpeech`. For more information about how to use the command line function, you can see the [cli](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/cli). ### Install Conda -Conda is a management system of the environment. You can go to [minicoda](https://docs.conda.io/en/latest/miniconda.html) (select a version py>=3.7) to download and install the conda. +Conda is a management system of the environment. You can go to [miniconda](https://docs.conda.io/en/latest/miniconda.html) (select a version py>=3.7) to download and install the conda. 
And then Install conda dependencies for `paddlespeech` : ```bash diff --git a/docs/source/install_cn.md b/docs/source/install_cn.md index 01ae21fe7..ecfb22f59 100644 --- a/docs/source/install_cn.md +++ b/docs/source/install_cn.md @@ -17,7 +17,7 @@ - 如果你是一个刚刚接触 `PaddleSpeech` 的新人并且想要很方便地体验一下该项目。我们建议你体验一下 [AI Studio](https://aistudio.baidu.com/aistudio/index)。我们在 AI Studio上面建立了一个让你一步一步运行体验来使用 `PaddleSpeech` 的[教程](https://aistudio.baidu.com/aistudio/education/group/info/25130)。 - 如果你想使用 `PaddleSpeech` 的命令行功能,你需要跟随下面的步骤来安装 `PaddleSpeech`。如果你想了解更多关于使用 `PaddleSpeech` 命令行功能的信息,你可以参考 [cli](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/cli)。 ### 安装 Conda -Conda是一个包管理的环境。你可以前往 [minicoda](https://docs.conda.io/en/latest/miniconda.html) 去下载并安装 conda(请下载 py>=3.7 的版本)。 +Conda是一个包管理的环境。你可以前往 [miniconda](https://docs.conda.io/en/latest/miniconda.html) 去下载并安装 conda(请下载 py>=3.7 的版本)。 然后你需要安装 `paddlespeech` 的 conda 依赖: ```bash conda install -y -c conda-forge sox libsndfile bzip2 diff --git a/docs/source/tts/models_introduction.md b/docs/source/tts/models_introduction.md index 52c514801..d031ac826 100644 --- a/docs/source/tts/models_introduction.md +++ b/docs/source/tts/models_introduction.md @@ -1,5 +1,5 @@ # Models introduction -TTS system mainly includes three modules: `Text Frontend`, `Acoustic model` and `Vocoder`. We introduce a rule-based Chinese text frontend in [cn_text_frontend.md](./cn_text_frontend.md). Here, we will introduce acoustic models and vocoders, which are trainable. +TTS system mainly includes three modules: `Text Frontend`, `Acoustic model` and `Vocoder`. We introduce a rule-based Chinese text frontend in [zh_text_frontend](./zh_text_frontend.md). Here, we will introduce acoustic models and vocoders, which are trainable. The main processes of TTS include: 1. Convert the original text into characters/phonemes, through the `text frontend` module.
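Stepping back to PATCH 35: `fft_conv1d` rests on a standard identity, namely that a zero-padded FFT product reproduces linear cross-correlation once the circularly wrapped tail is discarded. A minimal NumPy sketch of that identity (self-contained and independent of Paddle; all values are illustrative):

```python
import numpy as np

rng = np.random.default_rng(0)
x = rng.standard_normal(32)  # input signal, length T
w = rng.standard_normal(5)   # kernel, length K

# Direct "valid" cross-correlation, which is what Conv1D computes (no kernel flip).
direct = np.array(
    [np.dot(x[i:i + len(w)], w) for i in range(len(x) - len(w) + 1)])

# Frequency-domain version: conjugating the kernel spectrum turns circular
# convolution into circular cross-correlation, mirroring paddle.conj(weight_z).
n = len(x)
spectral = np.fft.irfft(np.fft.rfft(x, n) * np.conj(np.fft.rfft(w, n)), n)

# The last K - 1 samples wrap around (circular residue) and must be dropped,
# just as `out[..., :-kernel_size + 1]` does in the patch.
spectral = spectral[:len(x) - len(w) + 1]

print(np.allclose(direct, spectral, atol=1e-8))  # True
```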