diff --git a/examples/aishell/s1/run.sh b/examples/aishell/s1/run.sh index f28779452..0b40e0649 100644 --- a/examples/aishell/s1/run.sh +++ b/examples/aishell/s1/run.sh @@ -41,10 +41,10 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi -if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then - # export ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit -fi +# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then +# # export ckpt avg_n +# CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit +# fi # Optionally, you can add LM and test it with runtime. if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then diff --git a/examples/aishell3/tts3/conf/default.yaml b/examples/aishell3/tts3/conf/default.yaml index 90816e7d7..0159c12f9 100644 --- a/examples/aishell3/tts3/conf/default.yaml +++ b/examples/aishell3/tts3/conf/default.yaml @@ -24,7 +24,7 @@ f0max: 400 # Minimum f0 for pitch extraction. # DATA SETTING # ########################################################### batch_size: 64 -num_workers: 4 +num_workers: 2 ########################################################### @@ -45,7 +45,6 @@ model: postnet_layers: 5 # number of layers of postnset postnet_filts: 5 # filter size of conv layers in postnet postnet_chans: 256 # number of channels of conv layers in postnet - use_masking: True # whether to apply masking for padded part in loss calculation use_scaled_pos_enc: True # whether to use scaled positional encoding encoder_normalize_before: True # whether to perform layer normalization before the input decoder_normalize_before: True # whether to perform layer normalization before the input diff --git a/examples/aishell3/tts3/run.sh b/examples/aishell3/tts3/run.sh index 656710763..95e4d38fe 100755 --- a/examples/aishell3/tts3/run.sh +++ b/examples/aishell3/tts3/run.sh @@ -7,7 +7,6 @@ gpus=0,1 stage=0 stop_stage=100 - conf_path=conf/default.yaml train_output_path=exp/default ckpt_name=snapshot_iter_482.pdz diff --git a/examples/aishell3/vc0/local/preprocess.sh b/examples/aishell3/vc0/local/preprocess.sh index eeb1923f1..5bf880667 100755 --- a/examples/aishell3/vc0/local/preprocess.sh +++ b/examples/aishell3/vc0/local/preprocess.sh @@ -9,7 +9,7 @@ alignment=$3 ge2e_ckpt_path=$4 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - python3 ${BIN_DIR}/../../ge2e/inference.py \ + python3 ${MAIN_ROOT}/paddlespeech/vector/exps/ge2e/inference.py \ --input=${input}/wav \ --output=${preprocess_path}/embed \ --checkpoint_path=${ge2e_ckpt_path} diff --git a/examples/aishell3/vc1/conf/default.yaml b/examples/aishell3/vc1/conf/default.yaml index bdd2a765e..78c325257 100644 --- a/examples/aishell3/vc1/conf/default.yaml +++ b/examples/aishell3/vc1/conf/default.yaml @@ -45,7 +45,6 @@ model: postnet_layers: 5 # number of layers of postnset postnet_filts: 5 # filter size of conv layers in postnet postnet_chans: 256 # number of channels of conv layers in postnet - use_masking: True # whether to apply masking for padded part in loss calculation use_scaled_pos_enc: True # whether to use scaled positional encoding encoder_normalize_before: True # whether to perform layer normalization before the input decoder_normalize_before: True # whether to perform layer normalization before the input diff --git a/examples/csmsc/tts3/conf/conformer.yaml 
b/examples/csmsc/tts3/conf/conformer.yaml
new file mode 100644
index 000000000..a34ef318d
--- /dev/null
+++ b/examples/csmsc/tts3/conf/conformer.yaml
@@ -0,0 +1,109 @@
+###########################################################
+#                FEATURE EXTRACTION SETTING                #
+###########################################################
+
+fs: 24000 # sr
+n_fft: 2048 # FFT size.
+n_shift: 300 # Hop size.
+win_length: 1200 # Window length.
+                 # If set to null, it will be the same as fft_size.
+window: "hann" # Window function.
+
+# Only used for feats_type != raw
+
+fmin: 80 # Minimum frequency of Mel basis.
+fmax: 7600 # Maximum frequency of Mel basis.
+n_mels: 80 # The number of mel basis.
+
+# Only used for the model using pitch features (e.g. FastSpeech2)
+f0min: 80 # Minimum f0 for pitch extraction.
+f0max: 400 # Maximum f0 for pitch extraction.
+
+
+###########################################################
+#                       DATA SETTING                       #
+###########################################################
+batch_size: 64
+num_workers: 4
+
+
+###########################################################
+#                       MODEL SETTING                      #
+###########################################################
+model:
+    adim: 384 # attention dimension
+    aheads: 2 # number of attention heads
+    elayers: 4 # number of encoder layers
+    eunits: 1536 # number of encoder ff units
+    dlayers: 4 # number of decoder layers
+    dunits: 1536 # number of decoder ff units
+    positionwise_layer_type: conv1d # type of position-wise layer
+    positionwise_conv_kernel_size: 3 # kernel size of position wise conv layer
+    duration_predictor_layers: 2 # number of layers of duration predictor
+    duration_predictor_chans: 256 # number of channels of duration predictor
+    duration_predictor_kernel_size: 3 # filter size of duration predictor
+    postnet_layers: 5 # number of layers of postnet
+    postnet_filts: 5 # filter size of conv layers in postnet
+    postnet_chans: 256 # number of channels of conv layers in postnet
+    encoder_normalize_before: True # whether to perform layer normalization before the input
+    decoder_normalize_before: True # whether to perform layer normalization before the input
+    reduction_factor: 1 # reduction factor
+    encoder_type: conformer # encoder type
+    decoder_type: conformer # decoder type
+    conformer_pos_enc_layer_type: rel_pos # conformer positional encoding type
+    conformer_self_attn_layer_type: rel_selfattn # conformer self-attention type
+    conformer_activation_type: swish # conformer activation type
+    use_macaron_style_in_conformer: true # whether to use macaron style in conformer
+    use_cnn_in_conformer: true # whether to use CNN in conformer
+    conformer_enc_kernel_size: 7 # kernel size in CNN module of conformer-based encoder
+    conformer_dec_kernel_size: 31 # kernel size in CNN module of conformer-based decoder
+    init_type: xavier_uniform # initialization type
+    transformer_enc_dropout_rate: 0.2 # dropout rate for transformer encoder layer
+    transformer_enc_positional_dropout_rate: 0.2 # dropout rate for transformer encoder positional encoding
+    transformer_enc_attn_dropout_rate: 0.2 # dropout rate for transformer encoder attention layer
+    transformer_dec_dropout_rate: 0.2 # dropout rate for transformer decoder layer
+    transformer_dec_positional_dropout_rate: 0.2 # dropout rate for transformer decoder positional encoding
+    transformer_dec_attn_dropout_rate: 0.2 # dropout rate for transformer decoder attention layer
+    pitch_predictor_layers: 5 # number of conv layers in pitch predictor
+    pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor
+    pitch_predictor_kernel_size: 5 # kernel size of conv layers in pitch predictor
+    pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
+    pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
+    pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
+    stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder
+    energy_predictor_layers: 2 # number of conv layers in energy predictor
+    energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
+    energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
+    energy_predictor_dropout: 0.5 # dropout rate in energy predictor
+    energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
+    energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy
+    stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder
+
+
+
+###########################################################
+#                      UPDATER SETTING                     #
+###########################################################
+updater:
+    use_masking: True # whether to apply masking for padded part in loss calculation
+
+
+
+###########################################################
+#                     OPTIMIZER SETTING                    #
+###########################################################
+optimizer:
+    optim: adam # optimizer type
+    learning_rate: 0.001 # learning rate
+
+###########################################################
+#                     TRAINING SETTING                     #
+###########################################################
+max_epoch: 1000
+num_snapshots: 5
+
+
+###########################################################
+#                       OTHER SETTING                      #
+###########################################################
+seed: 10086
diff --git a/examples/csmsc/tts3/conf/default.yaml b/examples/csmsc/tts3/conf/default.yaml
index 32e58c4c6..55dca6d85 100644
--- a/examples/csmsc/tts3/conf/default.yaml
+++ b/examples/csmsc/tts3/conf/default.yaml
@@ -45,7 +45,6 @@ model:
     postnet_layers: 5 # number of layers of postnset
     postnet_filts: 5 # filter size of conv layers in postnet
     postnet_chans: 256 # number of channels of conv layers in postnet
-    use_masking: True # whether to apply masking for padded part in loss calculation
     use_scaled_pos_enc: True # whether to use scaled positional encoding
     encoder_normalize_before: True # whether to perform layer normalization before the input
     decoder_normalize_before: True # whether to perform layer normalization before the input
diff --git a/examples/csmsc/voc1/conf/default.yaml b/examples/csmsc/voc1/conf/default.yaml
index 5628b7f7c..1363b454f 100644
--- a/examples/csmsc/voc1/conf/default.yaml
+++ b/examples/csmsc/voc1/conf/default.yaml
@@ -80,7 +80,7 @@ lambda_adv: 4.0 # Loss balancing coefficient.
 batch_size: 8 # Batch size.
 batch_max_steps: 25500 # Length of each audio in batch. Make sure dividable by hop_size.
 pin_memory: true # Whether to pin memory in Pytorch DataLoader.
-num_workers: 4 # Number of workers in Pytorch DataLoader.
+num_workers: 2 # Number of workers in Pytorch DataLoader.
 remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps.
 allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory.
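For orientation, a minimal sketch of how a config like the new conf/conformer.yaml above is consumed by the FastSpeech2 example scripts touched later in this patch (synthesize.py builds the model from config["model"] and n_mels). It is not part of the patch: the vocab size and file path are placeholders, and loading through yacs CfgNode mirrors the existing exps scripts rather than anything introduced here.

# Illustrative sketch only, not part of the patch.
# Assumes paddlespeech is importable; vocab_size is a placeholder.
import yaml
from yacs.config import CfgNode

from paddlespeech.t2s.models.fastspeech2 import FastSpeech2

with open("conf/conformer.yaml") as f:
    config = CfgNode(yaml.safe_load(f))

vocab_size = 268  # placeholder: size of the phone id map produced by preprocessing
model = FastSpeech2(
    idim=vocab_size,
    odim=config.n_mels,     # 80 mel bins in the config above
    **config["model"])      # picks up encoder_type/decoder_type: conformer, dropout rates, etc.
model.eval()

Note that use_masking now lives under updater: rather than model:, so it is no longer forwarded into the FastSpeech2 constructor, matching the use_masking removals in the default.yaml files of this patch.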
diff --git a/examples/librispeech/s1/run.sh b/examples/librispeech/s1/run.sh index 4396597f6..74f7cbc1a 100755 --- a/examples/librispeech/s1/run.sh +++ b/examples/librispeech/s1/run.sh @@ -43,10 +43,10 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi -if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then - # export ckpt avg_n - CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit -fi +# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then +# # export ckpt avg_n +# CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit +# fi if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then # test a single .wav file diff --git a/examples/librispeech/s2/conf/transformer.yaml b/examples/librispeech/s2/conf/transformer.yaml index d77329f50..de1ac347a 100644 --- a/examples/librispeech/s2/conf/transformer.yaml +++ b/examples/librispeech/s2/conf/transformer.yaml @@ -1,6 +1,8 @@ # https://yaml.org/type/float.html # network architecture model: + cmvn_file: + cmvn_file_type: "json" # encoder related encoder: transformer encoder_conf: diff --git a/examples/librispeech/s2/run.sh b/examples/librispeech/s2/run.sh index 0c5b585b8..facaafcb4 100755 --- a/examples/librispeech/s2/run.sh +++ b/examples/librispeech/s2/run.sh @@ -48,10 +48,10 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${dict_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi -if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then - # export ckpt avg_n - ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit -fi +# if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then +# # export ckpt avg_n +# ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit +# fi if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then ./local/cacu_perplexity.sh || exit -1 diff --git a/examples/ljspeech/tts3/conf/default.yaml b/examples/ljspeech/tts3/conf/default.yaml index cabcca80b..e96422a19 100644 --- a/examples/ljspeech/tts3/conf/default.yaml +++ b/examples/ljspeech/tts3/conf/default.yaml @@ -45,7 +45,6 @@ model: postnet_layers: 5 # number of layers of postnset postnet_filts: 5 # filter size of conv layers in postnet postnet_chans: 256 # number of channels of conv layers in postnet - use_masking: True # whether to apply masking for padded part in loss calculation use_scaled_pos_enc: True # whether to use scaled positional encoding encoder_normalize_before: True # whether to perform layer normalization before the input decoder_normalize_before: True # whether to perform layer normalization before the input diff --git a/examples/other/ge2e/path.sh b/examples/other/ge2e/path.sh index b4f779859..24305ef78 100755 --- a/examples/other/ge2e/path.sh +++ b/examples/other/ge2e/path.sh @@ -10,4 +10,4 @@ export PYTHONIOENCODING=UTF-8 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} MODEL=ge2e -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} +export BIN_DIR=${MAIN_ROOT}/paddlespeech/vector/exps/${MODEL} diff --git a/examples/ted_en_zh/t0/run.sh b/examples/ted_en_zh/t0/run.sh index 2e2bc37d6..ed9ab5f87 100755 --- a/examples/ted_en_zh/t0/run.sh +++ b/examples/ted_en_zh/t0/run.sh @@ -35,7 +35,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then 
CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi -if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then - # export ckpt avg_n - CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit -fi +# if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then +# # export ckpt avg_n +# CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit +# fi diff --git a/examples/timit/s1/run.sh b/examples/timit/s1/run.sh index 74226c53f..a95b5f3ad 100755 --- a/examples/timit/s1/run.sh +++ b/examples/timit/s1/run.sh @@ -42,7 +42,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi -if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then - # export ckpt avg_n - CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit -fi +# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then +# # export ckpt avg_n +# CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit +# fi diff --git a/examples/tiny/s1/run.sh b/examples/tiny/s1/run.sh index 23b2206cf..155eca171 100755 --- a/examples/tiny/s1/run.sh +++ b/examples/tiny/s1/run.sh @@ -39,8 +39,8 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then CUDA_VISIBLE_DEVICES=${gpus} ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi -if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then - # export ckpt avg_n - CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit -fi +# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then +# # export ckpt avg_n +# CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit +# fi diff --git a/examples/vctk/tts3/conf/default.yaml b/examples/vctk/tts3/conf/default.yaml index 09bd34833..4f945a31c 100644 --- a/examples/vctk/tts3/conf/default.yaml +++ b/examples/vctk/tts3/conf/default.yaml @@ -45,7 +45,6 @@ model: postnet_layers: 5 # number of layers of postnset postnet_filts: 5 # filter size of conv layers in postnet postnet_chans: 256 # number of channels of conv layers in postnet - use_masking: True # whether to apply masking for padded part in loss calculation use_scaled_pos_enc: True # whether to use scaled positional encoding encoder_normalize_before: True # whether to perform layer normalization before the input decoder_normalize_before: True # whether to perform layer normalization before the input diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/setup.py b/paddlespeech/s2t/decoders/ctcdecoder/swig/setup.py index c089f96cd..d06125b7b 100644 --- a/paddlespeech/s2t/decoders/ctcdecoder/swig/setup.py +++ b/paddlespeech/s2t/decoders/ctcdecoder/swig/setup.py @@ -126,8 +126,12 @@ decoders_module = [ ] setup( - name='swig_decoders', - version='1.1', - description="""CTC decoders""", + name='paddlespeech_ctcdecoders', + version='0.0.1a', + description="CTC decoders in paddlespeech", + author="PaddlePaddle Speech and Language Team", + author_email="paddlesl@baidu.com", + url="https://github.com/PaddlePaddle/PaddleSpeech", + license='Apache 2.0', ext_modules=decoders_module, - py_modules=['swig_decoders'], ) + 
py_modules=['swig_decoders']) diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index fd9982716..9977cecc4 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -860,7 +860,7 @@ class U2Model(U2DecodeModel): int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc """ # cmvn - if configs['cmvn_file'] is not None: + if 'cmvn_file' in configs and configs['cmvn_file'] is not None: mean, istd = load_cmvn(configs['cmvn_file'], configs['cmvn_file_type']) global_cmvn = GlobalCMVN( diff --git a/paddlespeech/t2s/datasets/am_batch_fn.py b/paddlespeech/t2s/datasets/am_batch_fn.py index 5ed9aa7af..9470f9234 100644 --- a/paddlespeech/t2s/datasets/am_batch_fn.py +++ b/paddlespeech/t2s/datasets/am_batch_fn.py @@ -100,7 +100,7 @@ def fastspeech2_single_spk_batch_fn(examples): def fastspeech2_multi_spk_batch_fn(examples): - # fields = ["text", "text_lengths", "speech", "speech_lengths", "durations", "pitch", "energy", "spk_id"] + # fields = ["text", "text_lengths", "speech", "speech_lengths", "durations", "pitch", "energy", "spk_id"/"spk_emb"] text = [np.array(item["text"], dtype=np.int64) for item in examples] speech = [np.array(item["speech"], dtype=np.float32) for item in examples] pitch = [np.array(item["pitch"], dtype=np.float32) for item in examples] @@ -114,7 +114,6 @@ def fastspeech2_multi_spk_batch_fn(examples): speech_lengths = [ np.array(item["speech_lengths"], dtype=np.int64) for item in examples ] - spk_id = [np.array(item["spk_id"], dtype=np.int64) for item in examples] text = batch_sequences(text) pitch = batch_sequences(pitch) @@ -130,7 +129,6 @@ def fastspeech2_multi_spk_batch_fn(examples): energy = paddle.to_tensor(energy) text_lengths = paddle.to_tensor(text_lengths) speech_lengths = paddle.to_tensor(speech_lengths) - spk_id = paddle.to_tensor(spk_id) batch = { "text": text, @@ -139,9 +137,20 @@ def fastspeech2_multi_spk_batch_fn(examples): "speech": speech, "speech_lengths": speech_lengths, "pitch": pitch, - "energy": energy, - "spk_id": spk_id + "energy": energy } + # spk_emb has a higher priority than spk_id + if "spk_emb" in examples[0]: + spk_emb = [ + np.array(item["spk_emb"], dtype=np.float32) for item in examples + ] + spk_emb = batch_sequences(spk_emb) + spk_emb = paddle.to_tensor(spk_emb) + batch["spk_emb"] = spk_emb + elif "spk_id" in examples[0]: + spk_id = [np.array(item["spk_id"], dtype=np.int64) for item in examples] + spk_id = paddle.to_tensor(spk_id) + batch["spk_id"] = spk_id return batch diff --git a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py index ee9fe0579..1839415e9 100644 --- a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py +++ b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py @@ -46,14 +46,14 @@ def evaluate(args, fastspeech2_config, pwg_config): print("vocab_size:", vocab_size) with open(args.speaker_dict, 'rt') as f: spk_id = [line.strip().split() for line in f.readlines()] - num_speakers = len(spk_id) - print("num_speakers:", num_speakers) + spk_num = len(spk_id) + print("spk_num:", spk_num) odim = fastspeech2_config.n_mels model = FastSpeech2( idim=vocab_size, odim=odim, - num_speakers=num_speakers, + spk_num=spk_num, **fastspeech2_config["model"]) model.set_state_dict( diff --git a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e_en.py b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e_en.py index b5d0ce171..095d20821 100644 --- 
a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e_en.py +++ b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e_en.py @@ -51,14 +51,14 @@ def evaluate(args, fastspeech2_config, pwg_config): print("vocab_size:", vocab_size) with open(args.speaker_dict, 'rt') as f: spk_id = [line.strip().split() for line in f.readlines()] - num_speakers = len(spk_id) - print("num_speakers:", num_speakers) + spk_num = len(spk_id) + print("spk_num:", spk_num) odim = fastspeech2_config.n_mels model = FastSpeech2( idim=vocab_size, odim=odim, - num_speakers=num_speakers, + spk_num=spk_num, **fastspeech2_config["model"]) model.set_state_dict( diff --git a/paddlespeech/t2s/exps/fastspeech2/normalize.py b/paddlespeech/t2s/exps/fastspeech2/normalize.py index 7283f6b43..8ec20ebf0 100644 --- a/paddlespeech/t2s/exps/fastspeech2/normalize.py +++ b/paddlespeech/t2s/exps/fastspeech2/normalize.py @@ -167,6 +167,10 @@ def main(): "pitch": str(pitch_path), "energy": str(energy_path) } + # add spk_emb for voice cloning + if "spk_emb" in item: + record["spk_emb"] = str(item["spk_emb"]) + output_metadata.append(record) output_metadata.sort(key=itemgetter('utt_id')) output_metadata_path = Path(args.dumpdir) / "metadata.jsonl" diff --git a/paddlespeech/t2s/exps/fastspeech2/preprocess.py b/paddlespeech/t2s/exps/fastspeech2/preprocess.py index 3702ecd31..b874b3a70 100644 --- a/paddlespeech/t2s/exps/fastspeech2/preprocess.py +++ b/paddlespeech/t2s/exps/fastspeech2/preprocess.py @@ -44,7 +44,8 @@ def process_sentence(config: Dict[str, Any], mel_extractor=None, pitch_extractor=None, energy_extractor=None, - cut_sil: bool=True): + cut_sil: bool=True, + spk_emb_dir: Path=None): utt_id = fp.stem # for vctk if utt_id.endswith("_mic2"): @@ -116,6 +117,14 @@ def process_sentence(config: Dict[str, Any], "energy": str(energy_path), "speaker": speaker } + if spk_emb_dir: + if speaker in os.listdir(spk_emb_dir): + embed_name = utt_id + ".npy" + embed_path = spk_emb_dir / speaker / embed_name + if embed_path.is_file(): + record["spk_emb"] = str(embed_path) + else: + return None return record @@ -127,13 +136,14 @@ def process_sentences(config, pitch_extractor=None, energy_extractor=None, nprocs: int=1, - cut_sil: bool=True): + cut_sil: bool=True, + spk_emb_dir: Path=None): if nprocs == 1: results = [] for fp in fps: record = process_sentence(config, fp, sentences, output_dir, mel_extractor, pitch_extractor, - energy_extractor, cut_sil) + energy_extractor, cut_sil, spk_emb_dir) if record: results.append(record) else: @@ -144,7 +154,7 @@ def process_sentences(config, future = pool.submit(process_sentence, config, fp, sentences, output_dir, mel_extractor, pitch_extractor, energy_extractor, - cut_sil) + cut_sil, spk_emb_dir) future.add_done_callback(lambda p: progress.update()) futures.append(future) @@ -202,6 +212,11 @@ def main(): default=True, help="whether cut sil in the edge of audio") + parser.add_argument( + "--spk_emb_dir", + default=None, + type=str, + help="directory to speaker embedding files.") args = parser.parse_args() rootdir = Path(args.rootdir).expanduser() @@ -211,6 +226,11 @@ def main(): dumpdir.mkdir(parents=True, exist_ok=True) dur_file = Path(args.dur_file).expanduser() + if args.spk_emb_dir: + spk_emb_dir = Path(args.spk_emb_dir).expanduser().resolve() + else: + spk_emb_dir = None + assert rootdir.is_dir() assert dur_file.is_file() @@ -251,6 +271,7 @@ def main(): test_wav_files += wav_files[-sub_num_dev:] else: train_wav_files += wav_files + elif args.dataset == "ljspeech": wav_files = sorted(list((rootdir / 
"wavs").rglob("*.wav"))) # split data into 3 sections @@ -317,7 +338,8 @@ def main(): pitch_extractor, energy_extractor, nprocs=args.num_cpu, - cut_sil=args.cut_sil) + cut_sil=args.cut_sil, + spk_emb_dir=spk_emb_dir) if dev_wav_files: process_sentences( config, @@ -327,7 +349,8 @@ def main(): mel_extractor, pitch_extractor, energy_extractor, - cut_sil=args.cut_sil) + cut_sil=args.cut_sil, + spk_emb_dir=spk_emb_dir) if test_wav_files: process_sentences( config, @@ -338,7 +361,8 @@ def main(): pitch_extractor, energy_extractor, nprocs=args.num_cpu, - cut_sil=args.cut_sil) + cut_sil=args.cut_sil, + spk_emb_dir=spk_emb_dir) if __name__ == "__main__": diff --git a/paddlespeech/t2s/exps/fastspeech2/synthesize.py b/paddlespeech/t2s/exps/fastspeech2/synthesize.py index 207275f90..249845e4d 100644 --- a/paddlespeech/t2s/exps/fastspeech2/synthesize.py +++ b/paddlespeech/t2s/exps/fastspeech2/synthesize.py @@ -40,16 +40,19 @@ def evaluate(args, fastspeech2_config, pwg_config): fields = ["utt_id", "text"] + spk_num = None if args.speaker_dict is not None: print("multiple speaker fastspeech2!") with open(args.speaker_dict, 'rt') as f: spk_id = [line.strip().split() for line in f.readlines()] - num_speakers = len(spk_id) + spk_num = len(spk_id) fields += ["spk_id"] + elif args.voice_cloning: + print("voice cloning!") + fields += ["spk_emb"] else: print("single speaker fastspeech2!") - num_speakers = None - print("num_speakers:", num_speakers) + print("spk_num:", spk_num) test_dataset = DataTable(data=test_metadata, fields=fields) @@ -62,7 +65,7 @@ def evaluate(args, fastspeech2_config, pwg_config): model = FastSpeech2( idim=vocab_size, odim=odim, - num_speakers=num_speakers, + spk_num=spk_num, **fastspeech2_config["model"]) model.set_state_dict( @@ -96,12 +99,15 @@ def evaluate(args, fastspeech2_config, pwg_config): for datum in test_dataset: utt_id = datum["utt_id"] text = paddle.to_tensor(datum["text"]) - if "spk_id" in datum: + spk_emb = None + spk_id = None + if args.voice_cloning and "spk_emb" in datum: + spk_emb = paddle.to_tensor(np.load(datum["spk_emb"])) + elif "spk_id" in datum: spk_id = paddle.to_tensor(datum["spk_id"]) - else: - spk_id = None with paddle.no_grad(): - wav = pwg_inference(fastspeech2_inference(text, spk_id=spk_id)) + wav = pwg_inference( + fastspeech2_inference(text, spk_id=spk_id, spk_emb=spk_emb)) sf.write( str(output_dir / (utt_id + ".wav")), wav.numpy(), @@ -142,6 +148,15 @@ def main(): type=str, default=None, help="speaker id map file for multiple speaker model.") + + def str2bool(str): + return True if str.lower() == 'true' else False + + parser.add_argument( + "--voice-cloning", + type=str2bool, + default=False, + help="whether training voice cloning model.") parser.add_argument("--test-metadata", type=str, help="test metadata.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( diff --git a/paddlespeech/t2s/exps/fastspeech2/train.py b/paddlespeech/t2s/exps/fastspeech2/train.py index 38ac2fe3f..fafded6fc 100644 --- a/paddlespeech/t2s/exps/fastspeech2/train.py +++ b/paddlespeech/t2s/exps/fastspeech2/train.py @@ -61,18 +61,24 @@ def train_sp(args, config): "text", "text_lengths", "speech", "speech_lengths", "durations", "pitch", "energy" ] + converters = {"speech": np.load, "pitch": np.load, "energy": np.load} + spk_num = None if args.speaker_dict is not None: print("multiple speaker fastspeech2!") collate_fn = fastspeech2_multi_spk_batch_fn with open(args.speaker_dict, 'rt') as f: spk_id = [line.strip().split() for line in 
f.readlines()] - num_speakers = len(spk_id) + spk_num = len(spk_id) fields += ["spk_id"] + elif args.voice_cloning: + print("Training voice cloning!") + collate_fn = fastspeech2_multi_spk_batch_fn + fields += ["spk_emb"] + converters["spk_emb"] = np.load else: print("single speaker fastspeech2!") collate_fn = fastspeech2_single_spk_batch_fn - num_speakers = None - print("num_speakers:", num_speakers) + print("spk_num:", spk_num) # dataloader has been too verbose logging.getLogger("DataLoader").disabled = True @@ -83,17 +89,13 @@ def train_sp(args, config): train_dataset = DataTable( data=train_metadata, fields=fields, - converters={"speech": np.load, - "pitch": np.load, - "energy": np.load}, ) + converters=converters, ) with jsonlines.open(args.dev_metadata, 'r') as reader: dev_metadata = list(reader) dev_dataset = DataTable( data=dev_metadata, fields=fields, - converters={"speech": np.load, - "pitch": np.load, - "energy": np.load}, ) + converters=converters, ) # collate function and dataloader @@ -127,10 +129,7 @@ def train_sp(args, config): odim = config.n_mels model = FastSpeech2( - idim=vocab_size, - odim=odim, - num_speakers=num_speakers, - **config["model"]) + idim=vocab_size, odim=odim, spk_num=spk_num, **config["model"]) if world_size > 1: model = DataParallel(model) print("model done!") @@ -184,6 +183,15 @@ def main(): default=None, help="speaker id map file for multiple speaker model.") + def str2bool(str): + return True if str.lower() == 'true' else False + + parser.add_argument( + "--voice-cloning", + type=str2bool, + default=False, + help="whether training voice cloning model.") + args = parser.parse_args() with open(args.config) as f: diff --git a/paddlespeech/t2s/exps/ge2e/random_cycle.py b/paddlespeech/t2s/exps/ge2e/random_cycle.py deleted file mode 100644 index 290fd2fa2..000000000 --- a/paddlespeech/t2s/exps/ge2e/random_cycle.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import random - - -def cycle(iterable): - # cycle('ABCD') --> A B C D A B C D A B C D ... - saved = [] - for element in iterable: - yield element - saved.append(element) - while saved: - for element in saved: - yield element - - -def random_cycle(iterable): - # cycle('ABCD') --> A B C D B C D A A D B C ... - saved = [] - for element in iterable: - yield element - saved.append(element) - random.shuffle(saved) - while saved: - for element in saved: - yield element - random.shuffle(saved) diff --git a/paddlespeech/t2s/exps/ge2e/speaker_verification_dataset.py b/paddlespeech/t2s/exps/ge2e/speaker_verification_dataset.py deleted file mode 100644 index a13219969..000000000 --- a/paddlespeech/t2s/exps/ge2e/speaker_verification_dataset.py +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import random -from pathlib import Path - -import numpy as np -from paddle.io import BatchSampler -from paddle.io import Dataset - -from paddlespeech.t2s.exps.ge2e.random_cycle import random_cycle - - -class MultiSpeakerMelDataset(Dataset): - """A 2 layer directory thatn contains mel spectrograms in *.npy format. - An Example file structure tree is shown below. We prefer to preprocess - raw datasets and organized them like this. - - dataset_root/ - speaker1/ - utterance1.npy - utterance2.npy - utterance3.npy - speaker2/ - utterance1.npy - utterance2.npy - utterance3.npy - """ - - def __init__(self, dataset_root: Path): - self.root = Path(dataset_root).expanduser() - speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()] - - speaker_utterances = { - speaker_dir: list(speaker_dir.glob("*.npy")) - for speaker_dir in speaker_dirs - } - - self.speaker_dirs = speaker_dirs - self.speaker_to_utterances = speaker_utterances - - # meta data - self.num_speakers = len(self.speaker_dirs) - self.num_utterances = np.sum( - len(utterances) - for speaker, utterances in self.speaker_to_utterances.items()) - - def get_example_by_index(self, speaker_index, utterance_index): - speaker_dir = self.speaker_dirs[speaker_index] - fpath = self.speaker_to_utterances[speaker_dir][utterance_index] - return self[fpath] - - def __getitem__(self, fpath): - return np.load(fpath) - - def __len__(self): - return int(self.num_utterances) - - -class MultiSpeakerSampler(BatchSampler): - """A multi-stratal sampler designed for speaker verification task. - First, N speakers from all speakers are sampled randomly. Then, for each - speaker, randomly sample M utterances from their corresponding utterances. 
- """ - - def __init__(self, - dataset: MultiSpeakerMelDataset, - speakers_per_batch: int, - utterances_per_speaker: int): - self._speakers = list(dataset.speaker_dirs) - self._speaker_to_utterances = dataset.speaker_to_utterances - - self.speakers_per_batch = speakers_per_batch - self.utterances_per_speaker = utterances_per_speaker - - def __iter__(self): - # yield list of Paths - speaker_generator = iter(random_cycle(self._speakers)) - speaker_utterances_generator = { - s: iter(random_cycle(us)) - for s, us in self._speaker_to_utterances.items() - } - - while True: - speakers = [] - for _ in range(self.speakers_per_batch): - speakers.append(next(speaker_generator)) - - utterances = [] - for s in speakers: - us = speaker_utterances_generator[s] - for _ in range(self.utterances_per_speaker): - utterances.append(next(us)) - yield utterances - - -class RandomClip(object): - def __init__(self, frames): - self.frames = frames - - def __call__(self, spec): - # spec [T, C] - T = spec.shape[0] - start = random.randint(0, T - self.frames) - return spec[start:start + self.frames, :] - - -class Collate(object): - def __init__(self, num_frames): - self.random_crop = RandomClip(num_frames) - - def __call__(self, examples): - frame_clips = [self.random_crop(mel) for mel in examples] - batced_clips = np.stack(frame_clips) - return batced_clips - - -if __name__ == "__main__": - mydataset = MultiSpeakerMelDataset( - Path("/home/chenfeiyu/datasets/SV2TTS/encoder")) - print(mydataset.get_example_by_index(0, 10)) diff --git a/paddlespeech/t2s/exps/ge2e/train.py b/paddlespeech/t2s/exps/ge2e/train.py deleted file mode 100644 index 55c6daf73..000000000 --- a/paddlespeech/t2s/exps/ge2e/train.py +++ /dev/null @@ -1,123 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import time - -from paddle import DataParallel -from paddle import distributed as dist -from paddle.io import DataLoader -from paddle.nn.clip import ClipGradByGlobalNorm -from paddle.optimizer import Adam - -from paddlespeech.t2s.exps.ge2e.config import get_cfg_defaults -from paddlespeech.t2s.exps.ge2e.speaker_verification_dataset import Collate -from paddlespeech.t2s.exps.ge2e.speaker_verification_dataset import MultiSpeakerMelDataset -from paddlespeech.t2s.exps.ge2e.speaker_verification_dataset import MultiSpeakerSampler -from paddlespeech.t2s.models.lstm_speaker_encoder import LSTMSpeakerEncoder -from paddlespeech.t2s.training import default_argument_parser -from paddlespeech.t2s.training import ExperimentBase - - -class Ge2eExperiment(ExperimentBase): - def setup_model(self): - config = self.config - model = LSTMSpeakerEncoder(config.data.n_mels, config.model.num_layers, - config.model.hidden_size, - config.model.embedding_size) - optimizer = Adam( - config.training.learning_rate_init, - parameters=model.parameters(), - grad_clip=ClipGradByGlobalNorm(3)) - self.model = DataParallel(model) if self.parallel else model - self.model_core = model - self.optimizer = optimizer - - def setup_dataloader(self): - config = self.config - train_dataset = MultiSpeakerMelDataset(self.args.data) - sampler = MultiSpeakerSampler(train_dataset, - config.training.speakers_per_batch, - config.training.utterances_per_speaker) - train_loader = DataLoader( - train_dataset, - batch_sampler=sampler, - collate_fn=Collate(config.data.partial_n_frames), - num_workers=16) - - self.train_dataset = train_dataset - self.train_loader = train_loader - - def train_batch(self): - start = time.time() - batch = self.read_batch() - data_loader_time = time.time() - start - - self.optimizer.clear_grad() - self.model.train() - specs = batch - loss, eer = self.model(specs, self.config.training.speakers_per_batch) - loss.backward() - self.model_core.do_gradient_ops() - self.optimizer.step() - iteration_time = time.time() - start - - # logging - loss_value = float(loss) - msg = "Rank: {}, ".format(dist.get_rank()) - msg += "step: {}, ".format(self.iteration) - msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time, - iteration_time) - msg += 'loss: {:>.6f} err: {:>.6f}'.format(loss_value, eer) - self.logger.info(msg) - - if dist.get_rank() == 0: - self.visualizer.add_scalar("train/loss", loss_value, self.iteration) - self.visualizer.add_scalar("train/eer", eer, self.iteration) - self.visualizer.add_scalar("param/w", - float(self.model_core.similarity_weight), - self.iteration) - self.visualizer.add_scalar("param/b", - float(self.model_core.similarity_bias), - self.iteration) - - def valid(self): - pass - - -def main_sp(config, args): - exp = Ge2eExperiment(config, args) - exp.setup() - exp.resume_or_load() - exp.run() - - -def main(config, args): - if args.ngpu > 1: - dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) - else: - main_sp(config, args) - - -if __name__ == "__main__": - config = get_cfg_defaults() - parser = default_argument_parser() - args = parser.parse_args() - if args.config: - config.merge_from_file(args.config) - if args.opts: - config.merge_from_list(args.opts) - config.freeze() - print(config) - print(args) - - main(config, args) diff --git a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/voice_cloning.py b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/voice_cloning.py index 2f005e723..4e6b8d362 100644 --- a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/voice_cloning.py +++ 
b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/voice_cloning.py @@ -20,14 +20,14 @@ import paddle import soundfile as sf from matplotlib import pyplot as plt -from paddlespeech.t2s.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.aishell3 import voc_phones from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.aishell3 import voc_tones from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.chinese_g2p import convert_sentence -from paddlespeech.t2s.models.lstm_speaker_encoder import LSTMSpeakerEncoder from paddlespeech.t2s.models.tacotron2 import Tacotron2 from paddlespeech.t2s.models.waveflow import ConditionalWaveFlow from paddlespeech.t2s.utils import display +from paddlespeech.vector.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor +from paddlespeech.vector.models.lstm_speaker_encoder import LSTMSpeakerEncoder def voice_cloning(args): diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py index 2e52c1037..8ff07fa5c 100644 --- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py +++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py @@ -32,9 +32,7 @@ from paddlespeech.t2s.modules.predictor.duration_predictor import DurationPredic from paddlespeech.t2s.modules.predictor.length_regulator import LengthRegulator from paddlespeech.t2s.modules.predictor.variance_predictor import VariancePredictor from paddlespeech.t2s.modules.tacotron2.decoder import Postnet -from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding -from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding -from paddlespeech.t2s.modules.transformer.encoder import Encoder as TransformerEncoder +from paddlespeech.t2s.modules.transformer.encoder import Encoder class FastSpeech2(nn.Layer): @@ -66,6 +64,7 @@ class FastSpeech2(nn.Layer): postnet_layers: int=5, postnet_chans: int=512, postnet_filts: int=5, + postnet_dropout_rate: float=0.5, positionwise_layer_type: str="conv1d", positionwise_conv_kernel_size: int=1, use_scaled_pos_enc: bool=True, @@ -77,10 +76,27 @@ class FastSpeech2(nn.Layer): reduction_factor: int=1, encoder_type: str="transformer", decoder_type: str="transformer", + # for transformer + transformer_enc_dropout_rate: float=0.1, + transformer_enc_positional_dropout_rate: float=0.1, + transformer_enc_attn_dropout_rate: float=0.1, + transformer_dec_dropout_rate: float=0.1, + transformer_dec_positional_dropout_rate: float=0.1, + transformer_dec_attn_dropout_rate: float=0.1, + # for conformer + conformer_pos_enc_layer_type: str="rel_pos", + conformer_self_attn_layer_type: str="rel_selfattn", + conformer_activation_type: str="swish", + use_macaron_style_in_conformer: bool=True, + use_cnn_in_conformer: bool=True, + zero_triu: bool=False, + conformer_enc_kernel_size: int=7, + conformer_dec_kernel_size: int=31, # duration predictor duration_predictor_layers: int=2, duration_predictor_chans: int=384, duration_predictor_kernel_size: int=3, + duration_predictor_dropout_rate: float=0.1, # energy predictor energy_predictor_layers: int=2, energy_predictor_chans: int=384, @@ -98,28 +114,150 @@ class FastSpeech2(nn.Layer): pitch_embed_dropout: float=0.5, stop_gradient_from_pitch_predictor: bool=False, # spk emb - num_speakers: int=None, + spk_num: int=None, spk_embed_dim: int=None, spk_embed_integration_type: str="add", - # tone emb - num_tones: int=None, + # tone emb + tone_num: int=None, tone_embed_dim: int=None, 
tone_embed_integration_type: str="add", # training related - transformer_enc_dropout_rate: float=0.1, - transformer_enc_positional_dropout_rate: float=0.1, - transformer_enc_attn_dropout_rate: float=0.1, - transformer_dec_dropout_rate: float=0.1, - transformer_dec_positional_dropout_rate: float=0.1, - transformer_dec_attn_dropout_rate: float=0.1, - duration_predictor_dropout_rate: float=0.1, - postnet_dropout_rate: float=0.5, init_type: str="xavier_uniform", init_enc_alpha: float=1.0, - init_dec_alpha: float=1.0, - use_masking: bool=False, - use_weighted_masking: bool=False, ): - """Initialize FastSpeech2 module.""" + init_dec_alpha: float=1.0, ): + """Initialize FastSpeech2 module. + Parameters + ---------- + idim : int + Dimension of the inputs. + odim : int + Dimension of the outputs. + adim : int + Attention dimension. + aheads : int + Number of attention heads. + elayers : int + Number of encoder layers. + eunits : int + Number of encoder hidden units. + dlayers : int + Number of decoder layers. + dunits : int + Number of decoder hidden units. + postnet_layers : int + Number of postnet layers. + postnet_chans : int + Number of postnet channels. + postnet_filts : int + Kernel size of postnet. + postnet_dropout_rate : float + Dropout rate in postnet. + use_scaled_pos_enc : bool + Whether to use trainable scaled pos encoding. + use_batch_norm : bool + Whether to use batch normalization in encoder prenet. + encoder_normalize_before : bool + Whether to apply layernorm layer before encoder block. + decoder_normalize_before : bool + Whether to apply layernorm layer before + decoder block. + encoder_concat_after : bool + Whether to concatenate attention layer's input and output in encoder. + decoder_concat_after : bool + Whether to concatenate attention layer's input and output in decoder. + reduction_factor : int + Reduction factor. + encoder_type : str + Encoder type ("transformer" or "conformer"). + decoder_type : str + Decoder type ("transformer" or "conformer"). + transformer_enc_dropout_rate : float + Dropout rate in encoder except attention and positional encoding. + transformer_enc_positional_dropout_rate (float): Dropout rate after encoder + positional encoding. + transformer_enc_attn_dropout_rate (float): Dropout rate in encoder + self-attention module. + transformer_dec_dropout_rate (float): Dropout rate in decoder except + attention & positional encoding. + transformer_dec_positional_dropout_rate (float): Dropout rate after decoder + positional encoding. + transformer_dec_attn_dropout_rate (float): Dropout rate in decoder + self-attention module. + conformer_pos_enc_layer_type : str + Pos encoding layer type in conformer. + conformer_self_attn_layer_type : str + Self-attention layer type in conformer + conformer_activation_type : str + Activation function type in conformer. + use_macaron_style_in_conformer : bool + Whether to use macaron style FFN. + use_cnn_in_conformer : bool + Whether to use CNN in conformer. + zero_triu : bool + Whether to use zero triu in relative self-attention module. + conformer_enc_kernel_size : int + Kernel size of encoder conformer. + conformer_dec_kernel_size : int + Kernel size of decoder conformer. + duration_predictor_layers : int + Number of duration predictor layers. + duration_predictor_chans : int + Number of duration predictor channels. + duration_predictor_kernel_size : int + Kernel size of duration predictor. + duration_predictor_dropout_rate : float + Dropout rate in duration predictor. 
+ pitch_predictor_layers : int + Number of pitch predictor layers. + pitch_predictor_chans : int + Number of pitch predictor channels. + pitch_predictor_kernel_size : int + Kernel size of pitch predictor. + pitch_predictor_dropout_rate : float + Dropout rate in pitch predictor. + pitch_embed_kernel_size : float + Kernel size of pitch embedding. + pitch_embed_dropout_rate : float + Dropout rate for pitch embedding. + stop_gradient_from_pitch_predictor : bool + Whether to stop gradient from pitch predictor to encoder. + energy_predictor_layers : int + Number of energy predictor layers. + energy_predictor_chans : int + Number of energy predictor channels. + energy_predictor_kernel_size : int + Kernel size of energy predictor. + energy_predictor_dropout_rate : float + Dropout rate in energy predictor. + energy_embed_kernel_size : float + Kernel size of energy embedding. + energy_embed_dropout_rate : float + Dropout rate for energy embedding. + stop_gradient_from_energy_predictor : bool + Whether to stop gradient from energy predictor to encoder. + spk_num : Optional[int] + Number of speakers. If not None, assume that the spk_embed_dim is not None, + spk_ids will be provided as the input and use spk_embedding_table. + spk_embed_dim : Optional[int] + Speaker embedding dimension. If not None, + assume that spk_emb will be provided as the input or spk_num is not None. + spk_embed_integration_type : str + How to integrate speaker embedding. + tone_num : Optional[int] + Number of tones. If not None, assume that the + tone_ids will be provided as the input and use tone_embedding_table. + tone_embed_dim : Optional[int] + Tone embedding dimension. If not None, assume that tone_num is not None. + tone_embed_integration_type : str + How to integrate tone embedding. + init_type : str + How to initialize transformer parameters. + init_enc_alpha : float + Initial value of alpha in scaled pos encoding of the encoder. + init_dec_alpha : float + Initial value of alpha in scaled pos encoding of the decoder. 
+ + """ assert check_argument_types() super().__init__() @@ -148,30 +286,50 @@ class FastSpeech2(nn.Layer): # initialize parameters initialize(self, init_type) - if self.spk_embed_dim is not None: + if spk_num and self.spk_embed_dim: self.spk_embedding_table = nn.Embedding( - num_embeddings=num_speakers, + num_embeddings=spk_num, embedding_dim=self.spk_embed_dim, padding_idx=self.padding_idx) if self.tone_embed_dim is not None: self.tone_embedding_table = nn.Embedding( - num_embeddings=num_tones, + num_embeddings=tone_num, embedding_dim=self.tone_embed_dim, padding_idx=self.padding_idx) - # get positional encoding class - pos_enc_class = (ScaledPositionalEncoding - if self.use_scaled_pos_enc else PositionalEncoding) + # get positional encoding layer type + transformer_pos_enc_layer_type = "scaled_abs_pos" if self.use_scaled_pos_enc else "abs_pos" # define encoder encoder_input_layer = nn.Embedding( num_embeddings=idim, embedding_dim=adim, padding_idx=self.padding_idx) - + # add encoder type here + # 测试模型还能跑通不 + # 记得改 transformer tts if encoder_type == "transformer": - self.encoder = TransformerEncoder( + print("encoder_type is transformer") + self.encoder = Encoder( + idim=idim, + attention_dim=adim, + attention_heads=aheads, + linear_units=eunits, + num_blocks=elayers, + input_layer=encoder_input_layer, + dropout_rate=transformer_enc_dropout_rate, + positional_dropout_rate=transformer_enc_positional_dropout_rate, + attention_dropout_rate=transformer_enc_attn_dropout_rate, + pos_enc_layer_type=transformer_pos_enc_layer_type, + normalize_before=encoder_normalize_before, + concat_after=encoder_concat_after, + positionwise_layer_type=positionwise_layer_type, + positionwise_conv_kernel_size=positionwise_conv_kernel_size, + encoder_type=encoder_type) + elif encoder_type == "conformer": + print("encoder_type is conformer") + self.encoder = Encoder( idim=idim, attention_dim=adim, attention_heads=aheads, @@ -181,11 +339,18 @@ class FastSpeech2(nn.Layer): dropout_rate=transformer_enc_dropout_rate, positional_dropout_rate=transformer_enc_positional_dropout_rate, attention_dropout_rate=transformer_enc_attn_dropout_rate, - pos_enc_class=pos_enc_class, normalize_before=encoder_normalize_before, concat_after=encoder_concat_after, positionwise_layer_type=positionwise_layer_type, - positionwise_conv_kernel_size=positionwise_conv_kernel_size, ) + positionwise_conv_kernel_size=positionwise_conv_kernel_size, + macaron_style=use_macaron_style_in_conformer, + pos_enc_layer_type=conformer_pos_enc_layer_type, + selfattention_layer_type=conformer_self_attn_layer_type, + activation_type=conformer_activation_type, + use_cnn_module=use_cnn_in_conformer, + cnn_module_kernel=conformer_enc_kernel_size, + zero_triu=zero_triu, + encoder_type=encoder_type) else: raise ValueError(f"{encoder_type} is not supported.") @@ -251,7 +416,8 @@ class FastSpeech2(nn.Layer): # NOTE: we use encoder as decoder # because fastspeech's decoder is the same as encoder if decoder_type == "transformer": - self.decoder = TransformerEncoder( + print("decoder_type is transformer") + self.decoder = Encoder( idim=0, attention_dim=adim, attention_heads=aheads, @@ -262,11 +428,35 @@ class FastSpeech2(nn.Layer): dropout_rate=transformer_dec_dropout_rate, positional_dropout_rate=transformer_dec_positional_dropout_rate, attention_dropout_rate=transformer_dec_attn_dropout_rate, - pos_enc_class=pos_enc_class, + pos_enc_layer_type=transformer_pos_enc_layer_type, normalize_before=decoder_normalize_before, concat_after=decoder_concat_after, 
positionwise_layer_type=positionwise_layer_type, - positionwise_conv_kernel_size=positionwise_conv_kernel_size, ) + positionwise_conv_kernel_size=positionwise_conv_kernel_size, + encoder_type=decoder_type) + elif decoder_type == "conformer": + print("decoder_type is conformer") + self.decoder = Encoder( + idim=0, + attention_dim=adim, + attention_heads=aheads, + linear_units=dunits, + num_blocks=dlayers, + input_layer=None, + dropout_rate=transformer_dec_dropout_rate, + positional_dropout_rate=transformer_dec_positional_dropout_rate, + attention_dropout_rate=transformer_dec_attn_dropout_rate, + normalize_before=decoder_normalize_before, + concat_after=decoder_concat_after, + positionwise_layer_type=positionwise_layer_type, + positionwise_conv_kernel_size=positionwise_conv_kernel_size, + macaron_style=use_macaron_style_in_conformer, + pos_enc_layer_type=conformer_pos_enc_layer_type, + selfattention_layer_type=conformer_self_attn_layer_type, + activation_type=conformer_activation_type, + use_cnn_module=use_cnn_in_conformer, + cnn_module_kernel=conformer_dec_kernel_size, + encoder_type=decoder_type) else: raise ValueError(f"{decoder_type} is not supported.") @@ -299,7 +489,7 @@ class FastSpeech2(nn.Layer): pitch: paddle.Tensor, energy: paddle.Tensor, tone_id: paddle.Tensor=None, - spembs: paddle.Tensor=None, + spk_emb: paddle.Tensor=None, spk_id: paddle.Tensor=None ) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]: """Calculate forward propagation. @@ -322,7 +512,7 @@ class FastSpeech2(nn.Layer): Batch of padded token-averaged energy (B, Tmax, 1). tone_id : Tensor, optional(int64) Batch of padded tone ids (B, Tmax). - spembs : Tensor, optional + spk_emb : Tensor, optional Batch of speaker embeddings (B, spk_embed_dim). spk_id : Tnesor, optional(int64) Batch of speaker ids (B,) @@ -366,7 +556,7 @@ class FastSpeech2(nn.Layer): ps, es, is_inference=False, - spembs=spembs, + spk_emb=spk_emb, spk_id=spk_id, tone_id=tone_id) # modify mod part of groundtruth @@ -387,7 +577,7 @@ class FastSpeech2(nn.Layer): es: paddle.Tensor=None, is_inference: bool=False, alpha: float=1.0, - spembs=None, + spk_emb=None, spk_id=None, tone_id=None) -> Sequence[paddle.Tensor]: # forward encoder @@ -397,11 +587,12 @@ class FastSpeech2(nn.Layer): # integrate speaker embedding if self.spk_embed_dim is not None: - if spembs is not None: - hs = self._integrate_with_spk_embed(hs, spembs) + # spk_emb has a higher priority than spk_id + if spk_emb is not None: + hs = self._integrate_with_spk_embed(hs, spk_emb) elif spk_id is not None: - spembs = self.spk_embedding_table(spk_id) - hs = self._integrate_with_spk_embed(hs, spembs) + spk_emb = self.spk_embedding_table(spk_id) + hs = self._integrate_with_spk_embed(hs, spk_emb) # integrate tone embedding if self.tone_embed_dim is not None: @@ -489,7 +680,7 @@ class FastSpeech2(nn.Layer): energy: paddle.Tensor=None, alpha: float=1.0, use_teacher_forcing: bool=False, - spembs=None, + spk_emb=None, spk_id=None, tone_id=None, ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: @@ -512,7 +703,7 @@ class FastSpeech2(nn.Layer): use_teacher_forcing : bool, optional Whether to use teacher forcing. If true, groundtruth of duration, pitch and energy will be used. - spembs : Tensor, optional + spk_emb : Tensor, optional peaker embedding vector (spk_embed_dim,). spk_id : Tensor, optional(int64) Batch of padded spk ids (1,). 
@@ -527,7 +718,6 @@ class FastSpeech2(nn.Layer): # input of embedding must be int64 x = paddle.cast(text, 'int64') y = speech - spemb = spembs d, p, e = durations, pitch, energy # setup batch axis ilens = paddle.shape(x)[0] @@ -537,8 +727,8 @@ class FastSpeech2(nn.Layer): if y is not None: ys = y.unsqueeze(0) - if spemb is not None: - spembs = spemb.unsqueeze(0) + if spk_emb is not None: + spk_emb = spk_emb.unsqueeze(0) if tone_id is not None: tone_id = tone_id.unsqueeze(0) @@ -548,7 +738,7 @@ class FastSpeech2(nn.Layer): ds = d.unsqueeze(0) if d is not None else None ps = p.unsqueeze(0) if p is not None else None es = e.unsqueeze(0) if e is not None else None - # ds, ps, es = , p.unsqueeze(0), e.unsqueeze(0) + # (1, L, odim) _, outs, d_outs, p_outs, e_outs = self._forward( xs, @@ -557,7 +747,7 @@ class FastSpeech2(nn.Layer): ds=ds, ps=ps, es=es, - spembs=spembs, + spk_emb=spk_emb, spk_id=spk_id, tone_id=tone_id, is_inference=True) @@ -569,19 +759,19 @@ class FastSpeech2(nn.Layer): ys, is_inference=True, alpha=alpha, - spembs=spembs, + spk_emb=spk_emb, spk_id=spk_id, tone_id=tone_id) return outs[0], d_outs[0], p_outs[0], e_outs[0] - def _integrate_with_spk_embed(self, hs, spembs): + def _integrate_with_spk_embed(self, hs, spk_emb): """Integrate speaker embedding with hidden states. Parameters ---------- hs : Tensor Batch of hidden state sequences (B, Tmax, adim). - spembs : Tensor + spk_emb : Tensor Batch of speaker embeddings (B, spk_embed_dim). Returns @@ -591,13 +781,13 @@ class FastSpeech2(nn.Layer): """ if self.spk_embed_integration_type == "add": # apply projection and then add to hidden states - spembs = self.spk_projection(F.normalize(spembs)) - hs = hs + spembs.unsqueeze(1) + spk_emb = self.spk_projection(F.normalize(spk_emb)) + hs = hs + spk_emb.unsqueeze(1) elif self.spk_embed_integration_type == "concat": # concat hidden states with spk embeds and then apply projection - spembs = F.normalize(spembs).unsqueeze(1).expand( + spk_emb = F.normalize(spk_emb).unsqueeze(1).expand( shape=[-1, hs.shape[1], -1]) - hs = self.spk_projection(paddle.concat([hs, spembs], axis=-1)) + hs = self.spk_projection(paddle.concat([hs, spk_emb], axis=-1)) else: raise NotImplementedError("support only add or concat.") @@ -682,9 +872,9 @@ class FastSpeech2Inference(nn.Layer): self.normalizer = normalizer self.acoustic_model = model - def forward(self, text, spk_id=None): + def forward(self, text, spk_id=None, spk_emb=None): normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference( - text, spk_id=spk_id) + text, spk_id=spk_id, spk_emb=spk_emb) logmel = self.normalizer.inverse(normalized_mel) return logmel diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py index 4297c8b61..0dabf934c 100644 --- a/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py +++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py @@ -54,6 +54,10 @@ class FastSpeech2Updater(StandardUpdater): losses_dict = {} # spk_id!=None in multiple spk fastspeech2 spk_id = batch["spk_id"] if "spk_id" in batch else None + spk_emb = batch["spk_emb"] if "spk_emb" in batch else None + # No explicit speaker identifier labels are used during voice cloning training. 
+ if spk_emb is not None: + spk_id = None before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens = self.model( text=batch["text"], @@ -63,7 +67,8 @@ class FastSpeech2Updater(StandardUpdater): durations=batch["durations"], pitch=batch["pitch"], energy=batch["energy"], - spk_id=spk_id) + spk_id=spk_id, + spk_emb=spk_emb) l1_loss, duration_loss, pitch_loss, energy_loss = self.criterion( after_outs=after_outs, @@ -126,6 +131,9 @@ class FastSpeech2Evaluator(StandardEvaluator): losses_dict = {} # spk_id!=None in multiple spk fastspeech2 spk_id = batch["spk_id"] if "spk_id" in batch else None + spk_emb = batch["spk_emb"] if "spk_emb" in batch else None + if spk_emb is not None: + spk_id = None before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens = self.model( text=batch["text"], @@ -135,7 +143,8 @@ class FastSpeech2Evaluator(StandardEvaluator): durations=batch["durations"], pitch=batch["pitch"], energy=batch["energy"], - spk_id=spk_id) + spk_id=spk_id, + spk_emb=spk_emb) l1_loss, duration_loss, pitch_loss, energy_loss = self.criterion( after_outs=after_outs, diff --git a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py index 03620fd4e..e8adafb29 100644 --- a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py +++ b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py @@ -257,9 +257,9 @@ class TransformerTTS(nn.Layer): self.padding_idx = 0 # set_global_initializer 会影响后面的全局,包括 create_parameter initialize(self, init_type) - # get positional encoding class - pos_enc_class = (ScaledPositionalEncoding - if self.use_scaled_pos_enc else PositionalEncoding) + + # get positional encoding layer type + transformer_pos_enc_layer_type = "scaled_abs_pos" if self.use_scaled_pos_enc else "abs_pos" # define transformer encoder if eprenet_conv_layers != 0: @@ -291,7 +291,7 @@ class TransformerTTS(nn.Layer): dropout_rate=transformer_enc_dropout_rate, positional_dropout_rate=transformer_enc_positional_dropout_rate, attention_dropout_rate=transformer_enc_attn_dropout_rate, - pos_enc_class=pos_enc_class, + pos_enc_layer_type=transformer_pos_enc_layer_type, normalize_before=encoder_normalize_before, concat_after=encoder_concat_after, positionwise_layer_type=positionwise_layer_type, @@ -330,6 +330,9 @@ class TransformerTTS(nn.Layer): nn.Linear(dprenet_units, adim), ) else: decoder_input_layer = "linear" + # get positional encoding class + pos_enc_class = (ScaledPositionalEncoding + if self.use_scaled_pos_enc else PositionalEncoding) self.decoder = Decoder( odim=odim, # odim is needed when no prenet is used attention_dim=adim, @@ -391,7 +394,7 @@ class TransformerTTS(nn.Layer): text_lengths: paddle.Tensor, speech: paddle.Tensor, speech_lengths: paddle.Tensor, - spembs: paddle.Tensor=None, + spk_emb: paddle.Tensor=None, ) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]: """Calculate forward propagation. @@ -405,7 +408,7 @@ class TransformerTTS(nn.Layer): Batch of padded target features (B, Lmax, odim). speech_lengths : Tensor(int64) Batch of the lengths of each target (B,). - spembs : Tensor, optional + spk_emb : Tensor, optional Batch of speaker embeddings (B, spk_embed_dim). 
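Pulling the batch-handling rule from the updater and evaluator hunks above into one place: whenever a speaker embedding is present (for example a GE2E vector during voice-cloning training), the integer speaker id is dropped so only one conditioning path is active. A minimal sketch with a hypothetical helper name:

def resolve_speaker_conditioning(batch: dict):
    # spk_emb (voice cloning) takes priority over spk_id (multi-speaker training)
    spk_id = batch.get("spk_id")
    spk_emb = batch.get("spk_emb")
    if spk_emb is not None:
        spk_id = None
    return spk_id, spk_emb

# usage: a voice-cloning batch carries embeddings but no usable speaker ids
spk_id, spk_emb = resolve_speaker_conditioning({"spk_emb": "ge2e_embedding"})
assert spk_id is None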
Returns @@ -439,7 +442,7 @@ class TransformerTTS(nn.Layer): # calculate transformer outputs after_outs, before_outs, logits = self._forward(xs, ilens, ys, olens, - spembs) + spk_emb) # modifiy mod part of groundtruth @@ -467,7 +470,7 @@ class TransformerTTS(nn.Layer): ilens: paddle.Tensor, ys: paddle.Tensor, olens: paddle.Tensor, - spembs: paddle.Tensor, + spk_emb: paddle.Tensor, ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: # forward encoder x_masks = self._source_mask(ilens) @@ -480,7 +483,7 @@ class TransformerTTS(nn.Layer): # integrate speaker embedding if self.spk_embed_dim is not None: - hs = self._integrate_with_spk_embed(hs, spembs) + hs = self._integrate_with_spk_embed(hs, spk_emb) # thin out frames for reduction factor (B, Lmax, odim) -> (B, Lmax//r, odim) if self.reduction_factor > 1: @@ -514,7 +517,7 @@ class TransformerTTS(nn.Layer): self, text: paddle.Tensor, speech: paddle.Tensor=None, - spembs: paddle.Tensor=None, + spk_emb: paddle.Tensor=None, threshold: float=0.5, minlenratio: float=0.0, maxlenratio: float=10.0, @@ -528,7 +531,7 @@ class TransformerTTS(nn.Layer): Input sequence of characters (T,). speech : Tensor, optional Feature sequence to extract style (N, idim). - spembs : Tensor, optional + spk_emb : Tensor, optional Speaker embedding vector (spk_embed_dim,). threshold : float, optional Threshold in inference. @@ -551,7 +554,6 @@ class TransformerTTS(nn.Layer): """ # input of embedding must be int64 y = speech - spemb = spembs # add eos at the last of sequence text = numpy.pad( @@ -564,12 +566,12 @@ class TransformerTTS(nn.Layer): # get teacher forcing outputs xs, ys = x.unsqueeze(0), y.unsqueeze(0) - spembs = None if spemb is None else spemb.unsqueeze(0) + spk_emb = None if spk_emb is None else spk_emb.unsqueeze(0) ilens = paddle.to_tensor( [xs.shape[1]], dtype=paddle.int64, place=xs.place) olens = paddle.to_tensor( [ys.shape[1]], dtype=paddle.int64, place=ys.place) - outs, *_ = self._forward(xs, ilens, ys, olens, spembs) + outs, *_ = self._forward(xs, ilens, ys, olens, spk_emb) # get attention weights att_ws = [] @@ -590,9 +592,9 @@ class TransformerTTS(nn.Layer): hs = hs + style_embs.unsqueeze(1) # integrate speaker embedding - if self.spk_embed_dim is not None: - spembs = spemb.unsqueeze(0) - hs = self._integrate_with_spk_embed(hs, spembs) + if spk_emb is not None: + spk_emb = spk_emb.unsqueeze(0) + hs = self._integrate_with_spk_embed(hs, spk_emb) # set limits of length maxlen = int(hs.shape[1] * maxlenratio / self.reduction_factor) @@ -726,14 +728,14 @@ class TransformerTTS(nn.Layer): def _integrate_with_spk_embed(self, hs: paddle.Tensor, - spembs: paddle.Tensor) -> paddle.Tensor: + spk_emb: paddle.Tensor) -> paddle.Tensor: """Integrate speaker embedding with hidden states. Parameters ---------- hs : Tensor Batch of hidden state sequences (B, Tmax, adim). - spembs : Tensor + spk_emb : Tensor Batch of speaker embeddings (B, spk_embed_dim). 
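For single-utterance synthesis, `inference()` adds the batch axis itself before delegating to the batched `_forward()`, as the hunks above show. A small sketch of that reshaping (the 256-dimensional vector stands in for `spk_embed_dim`; values are illustrative):

import paddle

text = paddle.to_tensor([5, 12, 7, 3], dtype="int64")       # phone ids, shape (T,)
spk_emb = paddle.randn([256])                                # (spk_embed_dim,) or None

xs = text.unsqueeze(0)                                       # (1, T)
spk_emb = None if spk_emb is None else spk_emb.unsqueeze(0)  # (1, spk_embed_dim)
ilens = paddle.to_tensor([xs.shape[1]], dtype="int64")       # per-utterance lengths
print(xs.shape, int(ilens[0]), spk_emb.shape if spk_emb is not None else None)

With `use_teacher_forcing`, `ys` and `olens` are built from the reference `speech` in the same way.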
Returns @@ -744,13 +746,13 @@ class TransformerTTS(nn.Layer): """ if self.spk_embed_integration_type == "add": # apply projection and then add to hidden states - spembs = self.projection(F.normalize(spembs)) - hs = hs + spembs.unsqueeze(1) + spk_emb = self.projection(F.normalize(spk_emb)) + hs = hs + spk_emb.unsqueeze(1) elif self.spk_embed_integration_type == "concat": # concat hidden states with spk embeds and then apply projection - spembs = F.normalize(spembs).unsqueeze(1).expand(-1, hs.shape[1], - -1) - hs = self.projection(paddle.concat([hs, spembs], axis=-1)) + spk_emb = F.normalize(spk_emb).unsqueeze(1).expand(-1, hs.shape[1], + -1) + hs = self.projection(paddle.concat([hs, spk_emb], axis=-1)) else: raise NotImplementedError("support only add or concat.") diff --git a/paddlespeech/t2s/modules/conformer/encoder.py b/paddlespeech/t2s/modules/conformer/encoder.py deleted file mode 100644 index 568597ba5..000000000 --- a/paddlespeech/t2s/modules/conformer/encoder.py +++ /dev/null @@ -1,274 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from espnet(https://github.com/espnet/espnet) -"""Encoder definition.""" -import logging - -import paddle - -from paddlespeech.t2s.modules.conformer.convolution import ConvolutionModule -from paddlespeech.t2s.modules.conformer.encoder_layer import EncoderLayer -from paddlespeech.t2s.modules.layer_norm import LayerNorm -from paddlespeech.t2s.modules.nets_utils import get_activation -from paddlespeech.t2s.modules.transformer.attention import LegacyRelPositionMultiHeadedAttention -from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention -from paddlespeech.t2s.modules.transformer.attention import RelPositionMultiHeadedAttention -from paddlespeech.t2s.modules.transformer.embedding import LegacyRelPositionalEncoding -from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding -from paddlespeech.t2s.modules.transformer.embedding import RelPositionalEncoding -from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding -from paddlespeech.t2s.modules.transformer.multi_layer_conv import Conv1dLinear -from paddlespeech.t2s.modules.transformer.multi_layer_conv import MultiLayeredConv1d -from paddlespeech.t2s.modules.transformer.positionwise_feed_forward import PositionwiseFeedForward -from paddlespeech.t2s.modules.transformer.repeat import repeat -from paddlespeech.t2s.modules.transformer.subsampling import Conv2dSubsampling - - -class Encoder(paddle.nn.Layer): - """Conformer encoder module. - Parameters - ---------- - idim : int - Input dimension. - attention_dim : int - Dimension of attention. - attention_heads : int - The number of heads of multi head attention. - linear_units : int - The number of units of position-wise feed forward. - num_blocks : int - The number of decoder blocks. - dropout_rate : float - Dropout rate. - positional_dropout_rate : float - Dropout rate after adding positional encoding. 
- attention_dropout_rate : float - Dropout rate in attention. - input_layer : Union[str, paddle.nn.Layer] - Input layer type. - normalize_before : bool - Whether to use layer_norm before the first block. - concat_after : bool - Whether to concat attention layer's input and output. - if True, additional linear will be applied. - i.e. x -> x + linear(concat(x, att(x))) - if False, no additional linear will be applied. i.e. x -> x + att(x) - positionwise_layer_type : str - "linear", "conv1d", or "conv1d-linear". - positionwise_conv_kernel_size : int - Kernel size of positionwise conv1d layer. - macaron_style : bool - Whether to use macaron style for positionwise layer. - pos_enc_layer_type : str - Encoder positional encoding layer type. - selfattention_layer_type : str - Encoder attention layer type. - activation_type : str - Encoder activation function type. - use_cnn_module : bool - Whether to use convolution module. - zero_triu : bool - Whether to zero the upper triangular part of attention matrix. - cnn_module_kernel : int - Kernerl size of convolution module. - padding_idx : int - Padding idx for input_layer=embed. - stochastic_depth_rate : float - Maximum probability to skip the encoder layer. - intermediate_layers : Union[List[int], None] - indices of intermediate CTC layer. - indices start from 1. - if not None, intermediate outputs are returned (which changes return type - signature.) - """ - - def __init__( - self, - idim, - attention_dim=256, - attention_heads=4, - linear_units=2048, - num_blocks=6, - dropout_rate=0.1, - positional_dropout_rate=0.1, - attention_dropout_rate=0.0, - input_layer="conv2d", - normalize_before=True, - concat_after=False, - positionwise_layer_type="linear", - positionwise_conv_kernel_size=1, - macaron_style=False, - pos_enc_layer_type="abs_pos", - selfattention_layer_type="selfattn", - activation_type="swish", - use_cnn_module=False, - zero_triu=False, - cnn_module_kernel=31, - padding_idx=-1, - stochastic_depth_rate=0.0, - intermediate_layers=None, ): - """Construct an Encoder object.""" - super(Encoder, self).__init__() - - activation = get_activation(activation_type) - if pos_enc_layer_type == "abs_pos": - pos_enc_class = PositionalEncoding - elif pos_enc_layer_type == "scaled_abs_pos": - pos_enc_class = ScaledPositionalEncoding - elif pos_enc_layer_type == "rel_pos": - assert selfattention_layer_type == "rel_selfattn" - pos_enc_class = RelPositionalEncoding - elif pos_enc_layer_type == "legacy_rel_pos": - pos_enc_class = LegacyRelPositionalEncoding - assert selfattention_layer_type == "legacy_rel_selfattn" - else: - raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) - - self.conv_subsampling_factor = 1 - if input_layer == "linear": - self.embed = paddle.nn.Sequential( - paddle.nn.Linear(idim, attention_dim), - paddle.nn.LayerNorm(attention_dim), - paddle.nn.Dropout(dropout_rate), - pos_enc_class(attention_dim, positional_dropout_rate), ) - elif input_layer == "conv2d": - self.embed = Conv2dSubsampling( - idim, - attention_dim, - dropout_rate, - pos_enc_class(attention_dim, positional_dropout_rate), ) - self.conv_subsampling_factor = 4 - - elif input_layer == "embed": - self.embed = paddle.nn.Sequential( - paddle.nn.Embedding( - idim, attention_dim, padding_idx=padding_idx), - pos_enc_class(attention_dim, positional_dropout_rate), ) - elif isinstance(input_layer, paddle.nn.Layer): - self.embed = paddle.nn.Sequential( - input_layer, - pos_enc_class(attention_dim, positional_dropout_rate), ) - elif input_layer is None: - self.embed = 
paddle.nn.Sequential( - pos_enc_class(attention_dim, positional_dropout_rate)) - else: - raise ValueError("unknown input_layer: " + input_layer) - self.normalize_before = normalize_before - - # self-attention module definition - if selfattention_layer_type == "selfattn": - logging.info("encoder self-attention layer type = self-attention") - encoder_selfattn_layer = MultiHeadedAttention - encoder_selfattn_layer_args = (attention_heads, attention_dim, - attention_dropout_rate, ) - elif selfattention_layer_type == "legacy_rel_selfattn": - assert pos_enc_layer_type == "legacy_rel_pos" - encoder_selfattn_layer = LegacyRelPositionMultiHeadedAttention - encoder_selfattn_layer_args = (attention_heads, attention_dim, - attention_dropout_rate, ) - elif selfattention_layer_type == "rel_selfattn": - logging.info( - "encoder self-attention layer type = relative self-attention") - assert pos_enc_layer_type == "rel_pos" - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = (attention_heads, attention_dim, - attention_dropout_rate, zero_triu, ) - else: - raise ValueError("unknown encoder_attn_layer: " + - selfattention_layer_type) - - # feed-forward module definition - if positionwise_layer_type == "linear": - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = (attention_dim, linear_units, - dropout_rate, activation, ) - elif positionwise_layer_type == "conv1d": - positionwise_layer = MultiLayeredConv1d - positionwise_layer_args = (attention_dim, linear_units, - positionwise_conv_kernel_size, - dropout_rate, ) - elif positionwise_layer_type == "conv1d-linear": - positionwise_layer = Conv1dLinear - positionwise_layer_args = (attention_dim, linear_units, - positionwise_conv_kernel_size, - dropout_rate, ) - else: - raise NotImplementedError("Support only linear or conv1d.") - - # convolution module definition - convolution_layer = ConvolutionModule - convolution_layer_args = (attention_dim, cnn_module_kernel, activation) - - self.encoders = repeat( - num_blocks, - lambda lnum: EncoderLayer( - attention_dim, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer(*positionwise_layer_args) if macaron_style else None, - convolution_layer(*convolution_layer_args) if use_cnn_module else None, - dropout_rate, - normalize_before, - concat_after, - stochastic_depth_rate * float(1 + lnum) / num_blocks, ), ) - if self.normalize_before: - self.after_norm = LayerNorm(attention_dim) - - self.intermediate_layers = intermediate_layers - - def forward(self, xs, masks): - """Encode input sequence. - Parameters - ---------- - xs : paddle.Tensor - Input tensor (#batch, time, idim). - masks (paddle.Tensor): Mask tensor (#batch, 1, time). - Returns - ---------- - paddle.Tensor - Output tensor (#batch, time, attention_dim). - paddle.Tensor - Mask tensor (#batch, time). - """ - if isinstance(self.embed, (Conv2dSubsampling)): - xs, masks = self.embed(xs, masks) - else: - xs = self.embed(xs) - - if self.intermediate_layers is None: - xs, masks = self.encoders(xs, masks) - else: - intermediate_outputs = [] - for layer_idx, encoder_layer in enumerate(self.encoders): - xs, masks = encoder_layer(xs, masks) - - if (self.intermediate_layers is not None and - layer_idx + 1 in self.intermediate_layers): - # intermediate branches also require normalization. 
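One detail worth noting in the deleted constructor above (a convention the unified encoder that replaces this file presumably keeps): each of the `num_blocks` layers built by `repeat()` receives a stochastic-depth skip probability that grows linearly with depth, `stochastic_depth_rate * (1 + lnum) / num_blocks`. A quick numeric check with illustrative values:

num_blocks, stochastic_depth_rate = 6, 0.3
per_layer = [stochastic_depth_rate * float(1 + lnum) / num_blocks
             for lnum in range(num_blocks)]
print([round(p, 3) for p in per_layer])
# [0.05, 0.1, 0.15, 0.2, 0.25, 0.3] -- deeper layers are skipped more often

The same lambda also switches the macaron-style second feed-forward and the convolution module on or off per layer via `macaron_style` and `use_cnn_module`.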
- encoder_output = xs - if isinstance(encoder_output, tuple): - encoder_output = encoder_output[0] - if self.normalize_before: - encoder_output = self.after_norm(encoder_output) - intermediate_outputs.append(encoder_output) - - if isinstance(xs, tuple): - xs = xs[0] - - if self.normalize_before: - xs = self.after_norm(xs) - - if self.intermediate_layers is not None: - return xs, masks, intermediate_outputs - return xs, masks diff --git a/paddlespeech/t2s/modules/transformer/attention.py b/paddlespeech/t2s/modules/transformer/attention.py index b11329b03..34386f2a5 100644 --- a/paddlespeech/t2s/modules/transformer/attention.py +++ b/paddlespeech/t2s/modules/transformer/attention.py @@ -37,7 +37,7 @@ class MultiHeadedAttention(nn.Layer): def __init__(self, n_head, n_feat, dropout_rate): """Construct an MultiHeadedAttention object.""" - super(MultiHeadedAttention, self).__init__() + super().__init__() assert n_feat % n_head == 0 # We assume d_v always equals d_k self.d_k = n_feat // n_head @@ -70,7 +70,7 @@ class MultiHeadedAttention(nn.Layer): paddle.Tensor Transformed value tensor (#batch, n_head, time2, d_k). """ - n_batch = query.shape[0] + n_batch = paddle.shape(query)[0] q = paddle.reshape( self.linear_q(query), [n_batch, -1, self.h, self.d_k]) @@ -104,7 +104,7 @@ class MultiHeadedAttention(nn.Layer): Transformed value (#batch, time1, d_model) weighted by the attention score (#batch, time1, time2). """ - n_batch = value.shape[0] + n_batch = paddle.shape(value)[0] softmax = paddle.nn.Softmax(axis=-1) if mask is not None: mask = mask.unsqueeze(1) @@ -126,8 +126,8 @@ class MultiHeadedAttention(nn.Layer): # (batch, time1, d_model) x = (paddle.reshape( x.transpose((0, 2, 1, 3)), (n_batch, -1, self.h * self.d_k))) - - return self.linear_out(x) # (batch, time1, d_model) + # (batch, time1, d_model) + return self.linear_out(x) def forward(self, query, key, value, mask=None): """Compute scaled dot product attention. @@ -153,3 +153,113 @@ class MultiHeadedAttention(nn.Layer): (0, 1, 3, 2))) / math.sqrt(self.d_k) return self.forward_attention(v, scores, mask) + + +class RelPositionMultiHeadedAttention(MultiHeadedAttention): + """Multi-Head Attention layer with relative position encoding (new implementation). + Details can be found in https://github.com/espnet/espnet/pull/2816. + Paper: https://arxiv.org/abs/1901.02860 + Parameters + ---------- + n_head : int + The number of heads. + n_feat : int + The number of features. + dropout_rate : float + Dropout rate. + zero_triu : bool + Whether to zero the upper triangular part of attention matrix. + """ + + def __init__(self, n_head, n_feat, dropout_rate, zero_triu=False): + """Construct an RelPositionMultiHeadedAttention object.""" + super().__init__(n_head, n_feat, dropout_rate) + self.zero_triu = zero_triu + # linear transformation for positional encoding + self.linear_pos = nn.Linear(n_feat, n_feat, bias_attr=False) + # these two learnable bias are used in matrix c and matrix d + # as described in https://arxiv.org/abs/1901.02860 Section 3.3 + + self.pos_bias_u = paddle.create_parameter( + shape=(self.h, self.d_k), + dtype='float32', + default_initializer=paddle.nn.initializer.XavierUniform()) + self.pos_bias_v = paddle.create_parameter( + shape=(self.h, self.d_k), + dtype='float32', + default_initializer=paddle.nn.initializer.XavierUniform()) + + def rel_shift(self, x): + """Compute relative positional encoding. + Parameters + ---------- + x : paddle.Tensor + Input tensor (batch, head, time1, 2*time1-1). + time1 means the length of query vector. 
+ Returns + ---------- + paddle.Tensor + Output tensor. + """ + b, h, t1, t2 = paddle.shape(x) + zero_pad = paddle.zeros((b, h, t1, 1)) + x_padded = paddle.concat([zero_pad, x], axis=-1) + x_padded = x_padded.reshape([b, h, t2 + 1, t1]) + # only keep the positions from 0 to time2 + x = x_padded[:, :, 1:].reshape([b, h, t1, t2])[:, :, :, :t2 // 2 + 1] + + if self.zero_triu: + ones = paddle.ones((t1, t2)) + x = x * paddle.tril(ones, t2 - 1)[None, None, :, :] + + return x + + def forward(self, query, key, value, pos_emb, mask): + """Compute 'Scaled Dot Product Attention' with rel. positional encoding. + Parameters + ---------- + query : paddle.Tensor + Query tensor (#batch, time1, size). + key : paddle.Tensor + Key tensor (#batch, time2, size). + value : paddle.Tensor + Value tensor (#batch, time2, size). + pos_emb : paddle.Tensor + Positional embedding tensor + (#batch, 2*time1-1, size). + mask : paddle.Tensor + Mask tensor (#batch, 1, time2) or + (#batch, time1, time2). + Returns + ---------- + paddle.Tensor + Output tensor (#batch, time1, d_model). + """ + q, k, v = self.forward_qkv(query, key, value) + # (batch, time1, head, d_k) + q = q.transpose([0, 2, 1, 3]) + + n_batch_pos = paddle.shape(pos_emb)[0] + p = self.linear_pos(pos_emb).reshape( + [n_batch_pos, -1, self.h, self.d_k]) + # (batch, head, 2*time1-1, d_k) + p = p.transpose([0, 2, 1, 3]) + # (batch, head, time1, d_k) + q_with_bias_u = (q + self.pos_bias_u).transpose([0, 2, 1, 3]) + # (batch, head, time1, d_k) + q_with_bias_v = (q + self.pos_bias_v).transpose([0, 2, 1, 3]) + + # compute attention score + # first compute matrix a and matrix c + # as described in https://arxiv.org/abs/1901.02860 Section 3.3 + # (batch, head, time1, time2) + matrix_ac = paddle.matmul(q_with_bias_u, k.transpose([0, 1, 3, 2])) + + # compute matrix b and matrix d + # (batch, head, time1, 2*time1-1) + matrix_bd = paddle.matmul(q_with_bias_v, p.transpose([0, 1, 3, 2])) + matrix_bd = self.rel_shift(matrix_bd) + # (batch, head, time1, time2) + scores = (matrix_ac + matrix_bd) / math.sqrt(self.d_k) + + return self.forward_attention(v, scores, mask) diff --git a/paddlespeech/t2s/modules/transformer/embedding.py b/paddlespeech/t2s/modules/transformer/embedding.py index f26c9dcba..3c3f36168 100644 --- a/paddlespeech/t2s/modules/transformer/embedding.py +++ b/paddlespeech/t2s/modules/transformer/embedding.py @@ -96,14 +96,14 @@ class ScaledPositionalEncoding(PositionalEncoding): Parameters ---------- - d_model : int - Embedding dimension. - dropout_rate : float - Dropout rate. - max_len : int - Maximum input length. - dtype : str - dtype of param + d_model : int + Embedding dimension. + dropout_rate : float + Dropout rate. + max_len : int + Maximum input length. + dtype : str + dtype of param """ def __init__(self, d_model, dropout_rate, max_len=5000, dtype="float32"): @@ -128,14 +128,87 @@ class ScaledPositionalEncoding(PositionalEncoding): Parameters ---------- - x : paddle.Tensor - Input tensor (batch, time, `*`). + x : paddle.Tensor + Input tensor (batch, time, `*`). Returns ---------- - paddle.Tensor - Encoded tensor (batch, time, `*`). + paddle.Tensor + Encoded tensor (batch, time, `*`). """ self.extend_pe(x) T = paddle.shape(x)[1] x = x + self.alpha * self.pe[:, :T] return self.dropout(x) + + +class RelPositionalEncoding(paddle.nn.Layer): + """Relative positional encoding module (new implementation). + Details can be found in https://github.com/espnet/espnet/pull/2816. 
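The `rel_shift` trick above realigns attention scores computed against a (2*time1-1)-long relative-position axis into an ordinary (time1, time1) matrix. A toy check that re-implements the same pad-and-reshape steps, with the value at each relative index simply encoding that offset so the realignment is visible (the real module stores sinusoidal embeddings instead):

import paddle

def rel_shift(x):   # x: (batch, head, T, 2*T-1), same steps as the hunk above
    b, h, t1, t2 = x.shape
    zero_pad = paddle.zeros([b, h, t1, 1], dtype=x.dtype)
    x_padded = paddle.concat([zero_pad, x], axis=-1)
    x_padded = x_padded.reshape([b, h, t2 + 1, t1])
    return x_padded[:, :, 1:].reshape([b, h, t1, t2])[:, :, :, :t2 // 2 + 1]

T = 3
rel = paddle.arange(-(T - 1), T, dtype="float32")            # offsets -2..2
x = paddle.tile(rel.reshape([1, 1, 1, 2 * T - 1]), [1, 1, T, 1])
print(rel_shift(x).numpy()[0, 0])
# row i now reads offsets j - i for j = 0..T-1:
# [[ 0.  1.  2.]
#  [-1.  0.  1.]
#  [-2. -1.  0.]]

The new `RelPositionalEncoding` introduced below supplies the (2*time1-1)-long `pos_emb` table these scores index into, and the switch to `paddle.shape(...)[0]` for the batch dimension in `MultiHeadedAttention` is presumably there to stay valid under dynamic-to-static export, where `Tensor.shape` entries can come back as -1 while `paddle.shape` returns the runtime shape.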
+ See : Appendix B in https://arxiv.org/abs/1901.02860 + Parameters + ---------- + d_model : int + Embedding dimension. + dropout_rate : float + Dropout rate. + max_len : int + Maximum input length. + """ + + def __init__(self, d_model, dropout_rate, max_len=5000, dtype="float32"): + """Construct an PositionalEncoding object.""" + super(RelPositionalEncoding, self).__init__() + self.d_model = d_model + self.xscale = math.sqrt(self.d_model) + self.dropout = paddle.nn.Dropout(p=dropout_rate) + self.pe = None + self.dtype = dtype + self.extend_pe(paddle.expand(paddle.zeros([1]), (1, max_len))) + + def extend_pe(self, x): + """Reset the positional encodings.""" + if self.pe is not None: + # self.pe contains both positive and negative parts + # the length of self.pe is 2 * input_len - 1 + if paddle.shape(self.pe)[1] >= paddle.shape(x)[1] * 2 - 1: + return + # Suppose `i` means to the position of query vecotr and `j` means the + # position of key vector. We use position relative positions when keys + # are to the left (i>j) and negative relative positions otherwise (i