From c088b9a304811775d26dd252f3fa987662917f0c Mon Sep 17 00:00:00 2001 From: TianYuan Date: Wed, 12 Jan 2022 04:02:56 +0000 Subject: [PATCH 001/107] add csmsc tacotron2 --- examples/aishell3/tts3/conf/default.yaml | 5 +- examples/aishell3/vc1/conf/default.yaml | 4 +- examples/aishell3/voc1/conf/default.yaml | 6 +- examples/csmsc/tts0/README.md | 264 +++++++++ examples/csmsc/tts0/conf/default.yaml | 95 ++++ examples/csmsc/tts0/local/preprocess.sh | 62 +++ examples/csmsc/tts0/local/synthesize.sh | 20 + examples/csmsc/tts0/local/synthesize_e2e.sh | 91 +++ examples/csmsc/tts0/local/train.sh | 12 + examples/csmsc/tts0/path.sh | 13 + examples/csmsc/tts0/run.sh | 37 ++ examples/csmsc/tts3/conf/conformer.yaml | 8 +- examples/csmsc/tts3/conf/default.yaml | 5 +- examples/csmsc/tts3/run.sh | 2 +- examples/csmsc/voc1/conf/default.yaml | 10 +- examples/csmsc/voc3/conf/default.yaml | 6 +- examples/csmsc/voc3/conf/finetune.yaml | 6 +- examples/csmsc/voc4/conf/default.yaml | 6 +- examples/csmsc/voc5/conf/default.yaml | 32 +- examples/csmsc/voc5/conf/finetune.yaml | 32 +- examples/ljspeech/tts1/conf/default.yaml | 4 +- examples/ljspeech/tts3/conf/default.yaml | 4 +- examples/ljspeech/voc1/conf/default.yaml | 6 +- examples/vctk/tts3/conf/default.yaml | 4 +- examples/vctk/voc1/conf/default.yaml | 6 +- paddlespeech/t2s/datasets/am_batch_fn.py | 31 +- .../t2s/exps/fastspeech2/gen_gta_mel.py | 36 +- .../t2s/exps/new_tacotron2/normalize.py | 1 + .../t2s/exps/new_tacotron2/preprocess.py | 353 ++++++++++++ paddlespeech/t2s/exps/new_tacotron2/train.py | 190 +++++++ paddlespeech/t2s/exps/synthesize.py | 13 +- paddlespeech/t2s/exps/synthesize_e2e.py | 10 +- .../t2s/models/fastspeech2/fastspeech2.py | 3 +- .../models/fastspeech2/fastspeech2_updater.py | 10 +- .../t2s/models/new_tacotron2/__init__.py | 15 + .../t2s/models/new_tacotron2/tacotron2.py | 496 +++++++++++++++++ .../models/new_tacotron2/tacotron2_updater.py | 217 ++++++++ .../transformer_tts_updater.py | 32 +- paddlespeech/t2s/modules/losses.py | 244 ++++++++ .../t2s/modules/tacotron2/attentions.py | 519 +++++++++++++++++ paddlespeech/t2s/modules/tacotron2/decoder.py | 527 ++++++++++++++++++ paddlespeech/t2s/modules/tacotron2/encoder.py | 8 +- paddlespeech/t2s/training/optimizer.py | 24 +- 43 files changed, 3335 insertions(+), 134 deletions(-) create mode 100644 examples/csmsc/tts0/README.md create mode 100644 examples/csmsc/tts0/conf/default.yaml create mode 100755 examples/csmsc/tts0/local/preprocess.sh create mode 100755 examples/csmsc/tts0/local/synthesize.sh create mode 100755 examples/csmsc/tts0/local/synthesize_e2e.sh create mode 100755 examples/csmsc/tts0/local/train.sh create mode 100755 examples/csmsc/tts0/path.sh create mode 100755 examples/csmsc/tts0/run.sh create mode 120000 paddlespeech/t2s/exps/new_tacotron2/normalize.py create mode 100644 paddlespeech/t2s/exps/new_tacotron2/preprocess.py create mode 100644 paddlespeech/t2s/exps/new_tacotron2/train.py create mode 100644 paddlespeech/t2s/models/new_tacotron2/__init__.py create mode 100644 paddlespeech/t2s/models/new_tacotron2/tacotron2.py create mode 100644 paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py create mode 100644 paddlespeech/t2s/modules/tacotron2/attentions.py diff --git a/examples/aishell3/tts3/conf/default.yaml b/examples/aishell3/tts3/conf/default.yaml index 3a57e902..69307049 100644 --- a/examples/aishell3/tts3/conf/default.yaml +++ b/examples/aishell3/tts3/conf/default.yaml @@ -64,14 +64,14 @@ model: pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor 
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch - stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder + stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder energy_predictor_layers: 2 # number of conv layers in energy predictor energy_predictor_chans: 256 # number of channels of conv layers in energy predictor energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor energy_predictor_dropout: 0.5 # dropout rate in energy predictor energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy - stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder + stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder spk_embed_dim: 256 # speaker embedding dimension spk_embed_integration_type: concat # speaker embedding integration type @@ -84,7 +84,6 @@ updater: use_masking: True # whether to apply masking for padded part in loss calculation - ########################################################### # OPTIMIZER SETTING # ########################################################### diff --git a/examples/aishell3/vc1/conf/default.yaml b/examples/aishell3/vc1/conf/default.yaml index 557a5a0a..69307049 100644 --- a/examples/aishell3/vc1/conf/default.yaml +++ b/examples/aishell3/vc1/conf/default.yaml @@ -64,14 +64,14 @@ model: pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch - stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder + stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder energy_predictor_layers: 2 # number of conv layers in energy predictor energy_predictor_chans: 256 # number of channels of conv layers in energy predictor energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor energy_predictor_dropout: 0.5 # dropout rate in energy predictor energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy - stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder + stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder spk_embed_dim: 256 # speaker embedding dimension spk_embed_integration_type: concat # speaker embedding integration type diff --git a/examples/aishell3/voc1/conf/default.yaml b/examples/aishell3/voc1/conf/default.yaml index 7fbffbdd..e2102d6e 100644 --- a/examples/aishell3/voc1/conf/default.yaml +++ b/examples/aishell3/voc1/conf/default.yaml @@ -33,7 +33,7 @@ generator_params: aux_context_window: 2 # Context window size for auxiliary feature. # If set to 2, previous 2 and future 2 frames will be considered. dropout: 0.0 # Dropout rate. 0.0 means no dropout applied. - use_weight_norm: true # Whether to use weight norm. + use_weight_norm: True # Whether to use weight norm. # If set to true, it will be applied to all of the conv layers. upsample_scales: [4, 5, 3, 5] # Upsampling scales. 
prod(upsample_scales) == n_shift
@@ -46,8 +46,8 @@ discriminator_params:
     kernel_size: 3        # Number of output channels.
     layers: 10            # Number of conv layers.
     conv_channels: 64     # Number of chnn layers.
-    bias: true            # Whether to use bias parameter in conv.
-    use_weight_norm: true # Whether to use weight norm.
+    bias: True            # Whether to use bias parameter in conv.
+    use_weight_norm: True # Whether to use weight norm.
                           # If set to true, it will be applied to all of the conv layers.
     nonlinear_activation: "leakyrelu" # Nonlinear function after each conv.
     nonlinear_activation_params:      # Nonlinear function parameters
diff --git a/examples/csmsc/tts0/README.md b/examples/csmsc/tts0/README.md
new file mode 100644
index 00000000..13d291b5
--- /dev/null
+++ b/examples/csmsc/tts0/README.md
@@ -0,0 +1,264 @@
+# FastSpeech2 with CSMSC
+This example contains code used to train a [FastSpeech2](https://arxiv.org/abs/2006.04558) model with the [Chinese Standard Mandarin Speech Corpus](https://www.data-baker.com/open_source.html).
+
+## Dataset
+### Download and Extract
+Download CSMSC from its [Official Website](https://test.data-baker.com/data/index/source).
+
+### Get MFA Result and Extract
+We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for fastspeech2.
+You can download it here: [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo.
+
+## Get Started
+Assume the path to the dataset is `~/datasets/BZNSYP`.
+Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`.
+Run the command below to
+1. **source path**.
+2. preprocess the dataset.
+3. train the model.
+4. synthesize wavs.
+    - synthesize waveform from `metadata.jsonl`.
+    - synthesize waveform from a text file.
+5. inference using the static model.
+```bash
+./run.sh
+```
+You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage. For example, running the following command will only preprocess the dataset.
+```bash
+./run.sh --stage 0 --stop-stage 0
+```
+### Data Preprocessing
+```bash
+./local/preprocess.sh ${conf_path}
+```
+When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below.
+
+```text
+dump
+├── dev
+│   ├── norm
+│   └── raw
+├── phone_id_map.txt
+├── speaker_id_map.txt
+├── test
+│   ├── norm
+│   └── raw
+└── train
+    ├── energy_stats.npy
+    ├── norm
+    ├── pitch_stats.npy
+    ├── raw
+    └── speech_stats.npy
+```
+The dataset is split into 3 parts, namely `train`, `dev`, and `test`, each of which contains a `norm` and `raw` subfolder. The raw folder contains speech, pitch, and energy features of each utterance, while the norm folder contains the normalized ones. The statistics used to normalize features are computed from the training set and are located in `dump/train/*_stats.npy`.
+
+Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains phones, text_lengths, speech_lengths, durations, the path of speech features, the path of pitch features, the path of energy features, speaker, and the id of each utterance.
+
+### Model Training
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
+```
+`./local/train.sh` calls `${BIN_DIR}/train.py`.
+Here's the complete help message.
+```text +usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] + [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] + [--ngpu NGPU] [--phones-dict PHONES_DICT] + [--speaker-dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] + +Train a FastSpeech2 model. + +optional arguments: + -h, --help show this help message and exit + --config CONFIG fastspeech2 config file. + --train-metadata TRAIN_METADATA + training data. + --dev-metadata DEV_METADATA + dev data. + --output-dir OUTPUT_DIR + output dir. + --ngpu NGPU if ngpu=0, use cpu. + --phones-dict PHONES_DICT + phone vocabulary file. + --speaker-dict SPEAKER_DICT + speaker id map file for multiple speaker model. + --voice-cloning VOICE_CLONING + whether training voice cloning model. +``` +1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. +2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. +3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. +5. `--phones-dict` is the path of the phone vocabulary file. + +### Synthesizing +We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder. +Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip) and unzip it. +```bash +unzip pwg_baker_ckpt_0.4.zip +``` +Parallel WaveGAN checkpoint contains files listed below. +```text +pwg_baker_ckpt_0.4 +├── pwg_default.yaml # default config used to train parallel wavegan +├── pwg_snapshot_iter_400000.pdz # model parameters of parallel wavegan +└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan +``` +`./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`. +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} +``` +```text +usage: synthesize.py [-h] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] + [--am_stat AM_STAT] [--phones_dict PHONES_DICT] + [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] + [--voice-cloning VOICE_CLONING] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] + [--voc_stat VOC_STAT] [--ngpu NGPU] + [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] + +Synthesize with acoustic model & vocoder + +optional arguments: + -h, --help show this help message and exit + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + Choose acoustic model type of tts task. + --am_config AM_CONFIG + Config of acoustic model. Use deault config when it is + None. + --am_ckpt AM_CKPT Checkpoint file of acoustic model. + --am_stat AM_STAT mean and standard deviation used to normalize + spectrogram when training acoustic model. + --phones_dict PHONES_DICT + phone vocabulary file. + --tones_dict TONES_DICT + tone vocabulary file. + --speaker_dict SPEAKER_DICT + speaker id map file. + --voice-cloning VOICE_CLONING + whether training voice cloning model. 
+ --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + Choose vocoder type of tts task. + --voc_config VOC_CONFIG + Config of voc. Use deault config when it is None. + --voc_ckpt VOC_CKPT Checkpoint file of voc. + --voc_stat VOC_STAT mean and standard deviation used to normalize + spectrogram when training voc. + --ngpu NGPU if ngpu == 0, use cpu. + --test_metadata TEST_METADATA + test metadata. + --output_dir OUTPUT_DIR + output dir. +``` +`./local/synthesize_e2e.sh` calls `${BIN_DIR}/../synthesize_e2e.py`, which can synthesize waveform from text file. +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} +``` +```text +usage: synthesize_e2e.py [-h] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] + [--am_stat AM_STAT] [--phones_dict PHONES_DICT] + [--tones_dict TONES_DICT] + [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] + [--voc_stat VOC_STAT] [--lang LANG] + [--inference_dir INFERENCE_DIR] [--ngpu NGPU] + [--text TEXT] [--output_dir OUTPUT_DIR] + +Synthesize with acoustic model & vocoder + +optional arguments: + -h, --help show this help message and exit + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + Choose acoustic model type of tts task. + --am_config AM_CONFIG + Config of acoustic model. Use deault config when it is + None. + --am_ckpt AM_CKPT Checkpoint file of acoustic model. + --am_stat AM_STAT mean and standard deviation used to normalize + spectrogram when training acoustic model. + --phones_dict PHONES_DICT + phone vocabulary file. + --tones_dict TONES_DICT + tone vocabulary file. + --speaker_dict SPEAKER_DICT + speaker id map file. + --spk_id SPK_ID spk id for multi speaker acoustic model + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + Choose vocoder type of tts task. + --voc_config VOC_CONFIG + Config of voc. Use deault config when it is None. + --voc_ckpt VOC_CKPT Checkpoint file of voc. + --voc_stat VOC_STAT mean and standard deviation used to normalize + spectrogram when training voc. + --lang LANG Choose model language. zh or en + --inference_dir INFERENCE_DIR + dir to save inference models + --ngpu NGPU if ngpu == 0, use cpu. + --text TEXT text to synthesize, a 'utt_id sentence' pair per line. + --output_dir OUTPUT_DIR + output dir. +``` +1. `--am` is acoustic model type with the format {model_name}_{dataset} +2. `--am_config`, `--am_checkpoint`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the fastspeech2 pretrained model. +3. `--voc` is vocoder type with the format {model_name}_{dataset} +4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. +5. `--lang` is the model language, which can be `zh` or `en`. +6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. +7. `--text` is the text file, which contains sentences to synthesize. +8. `--output_dir` is the directory to save synthesized audio files. +9. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. 
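+
+For reference, the `--text` file expects one `utt_id sentence` pair per line, as noted in item 7 above. The snippet below is a minimal, hypothetical sketch of how such a file could be parsed; the helper name and the example sentence are made up for illustration and are not part of `synthesize_e2e.py`.
+```python
+# Illustrative only: read a file in the "utt_id sentence" format used by --text,
+# e.g. a line such as "001 欢迎使用语音合成系统。"
+def read_sentences(path):
+    items = []
+    with open(path, "rt", encoding="utf-8") as f:
+        for line in f:
+            line = line.strip()
+            if not line:
+                continue
+            # split on the first whitespace only; the rest of the line is the sentence
+            utt_id, sentence = line.split(maxsplit=1)
+            items.append((utt_id, sentence))
+    return items
+
+# usage (hypothetical file name):
+# for utt_id, sentence in read_sentences("sentences.txt"):
+#     print(utt_id, sentence)
+```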
+
+### Inferencing
+After synthesizing, we will get static models of fastspeech2 and pwgan in `${train_output_path}/inference`.
+`./local/inference.sh` calls `${BIN_DIR}/inference.py`, which provides a paddle static model inference example for fastspeech2 + pwgan synthesis.
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path}
+```
+
+## Pretrained Model
+Pretrained FastSpeech2 models with no silence at the edges of audios:
+- [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip)
+- [fastspeech2_conformer_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_conformer_baker_ckpt_0.5.zip)
+
+The static model can be downloaded here: [fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip).
+
+Model | Step | eval/loss | eval/l1_loss | eval/duration_loss | eval/pitch_loss | eval/energy_loss
+:-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------:
+default| 2(gpu) x 76000|1.0991|0.59132|0.035815|0.31915|0.15287|
+conformer| 2(gpu) x 76000|1.0675|0.56103|0.035869|0.31553|0.15509|
+
+The FastSpeech2 checkpoint contains the files listed below.
+```text
+fastspeech2_nosil_baker_ckpt_0.4
+├── default.yaml            # default config used to train fastspeech2
+├── phone_id_map.txt        # phone vocabulary file when training fastspeech2
+├── snapshot_iter_76000.pdz # model parameters and optimizer states
+└── speech_stats.npy        # statistics used to normalize spectrogram when training fastspeech2
+```
+You can use the following script to synthesize the sentences in `${BIN_DIR}/../sentences.txt` with the pretrained fastspeech2 and parallel wavegan models.
+```bash
+source path.sh
+
+FLAGS_allocator_strategy=naive_best_fit \
+FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+python3 ${BIN_DIR}/../synthesize_e2e.py \
+  --am=fastspeech2_csmsc \
+  --am_config=fastspeech2_nosil_baker_ckpt_0.4/default.yaml \
+  --am_ckpt=fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \
+  --am_stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \
+  --voc=pwgan_csmsc \
+  --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
+  --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
+  --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
+  --lang=zh \
+  --text=${BIN_DIR}/../sentences.txt \
+  --output_dir=exp/default/test_e2e \
+  --inference_dir=exp/default/inference \
+  --phones_dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
+```
diff --git a/examples/csmsc/tts0/conf/default.yaml b/examples/csmsc/tts0/conf/default.yaml
new file mode 100644
index 00000000..171aee88
--- /dev/null
+++ b/examples/csmsc/tts0/conf/default.yaml
@@ -0,0 +1,95 @@
+# This configuration is for Paddle to train Tacotron 2. Compared to the
+# original paper, this configuration additionally uses the guided attention
+# loss to accelerate the learning of the diagonal attention. It requires
+# only a single GPU with 12 GB memory and it takes ~1 day to finish the
+# training on a Titan V.
+
+###########################################################
+#                FEATURE EXTRACTION SETTING               #
+###########################################################
+
+fs: 24000          # Sampling rate.
+n_fft: 2048        # FFT size (samples).
+n_shift: 300       # Hop size (samples). 12.5ms
+win_length: 1200   # Window length (samples). 50ms
+                   # If set to null, it will be the same as fft_size.
+window: "hann"     # Window function.
+
+# Only used for feats_type != raw
+
+fmin: 80           # Minimum frequency of Mel basis.
+fmax: 7600         # Maximum frequency of Mel basis.
+n_mels: 80         # The number of mel basis.
+
+# Only used for the model using pitch features (e.g. FastSpeech2)
+f0min: 80          # Minimum f0 for pitch extraction.
+f0max: 400         # Maximum f0 for pitch extraction.
+
+###########################################################
+#                       DATA SETTING                      #
+###########################################################
+batch_size: 64
+num_workers: 2
+
+###########################################################
+#                       MODEL SETTING                     #
+###########################################################
+model: # keyword arguments for the selected model
+    embed_dim: 512          # char or phn embedding dimension
+    elayers: 1              # number of blstm layers in encoder
+    eunits: 512             # number of blstm units
+    econv_layers: 3         # number of convolutional layers in encoder
+    econv_chans: 512        # number of channels in convolutional layer
+    econv_filts: 5          # filter size of convolutional layer
+    atype: location         # attention function type
+    adim: 512               # attention dimension
+    aconv_chans: 32         # number of channels in convolutional layer of attention
+    aconv_filts: 15         # filter size of convolutional layer of attention
+    cumulate_att_w: True    # whether to cumulate attention weight
+    dlayers: 2              # number of lstm layers in decoder
+    dunits: 1024            # number of lstm units in decoder
+    prenet_layers: 2        # number of layers in prenet
+    prenet_units: 256       # number of units in prenet
+    postnet_layers: 5       # number of layers in postnet
+    postnet_chans: 512      # number of channels in postnet
+    postnet_filts: 5        # filter size of postnet layer
+    output_activation: null # activation function for the final output
+    use_batch_norm: True    # whether to use batch normalization in encoder
+    use_concate: True       # whether to concatenate encoder embedding with decoder outputs
+    use_residual: False     # whether to use residual connection in encoder
+    dropout_rate: 0.5       # dropout rate
+    zoneout_rate: 0.1       # zoneout rate
+    reduction_factor: 1     # reduction factor
+    spk_embed_dim: null     # speaker embedding dimension
+
+
+###########################################################
+#                       UPDATER SETTING                   #
+###########################################################
+updater:
+    use_masking: True              # whether to apply masking for padded part in loss calculation
+    bce_pos_weight: 5.0            # weight of positive sample in binary cross entropy calculation
+    use_guided_attn_loss: True     # whether to use guided attention loss
+    guided_attn_loss_sigma: 0.4    # sigma of guided attention loss
+    guided_attn_loss_lambda: 1.0   # strength of guided attention loss
+
+
+###########################################################
+#                     OPTIMIZER SETTING                   #
+###########################################################
+optimizer:
+    optim: adam            # optimizer type
+    learning_rate: 1.0e-03 # learning rate
+    epsilon: 1.0e-06       # epsilon
+    weight_decay: 0.0      # weight decay coefficient
+
+###########################################################
+#                     TRAINING SETTING                    #
+###########################################################
+max_epoch: 200
+num_snapshots: 5
+
+###########################################################
+#                       OTHER SETTING                     #
+###########################################################
+seed: 42
\ No newline at end of file
diff --git a/examples/csmsc/tts0/local/preprocess.sh b/examples/csmsc/tts0/local/preprocess.sh
new file mode 100755
index 00000000..8a4b8dd9
--- /dev/null
+++ b/examples/csmsc/tts0/local/preprocess.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+
+stage=0
+stop_stage=100
+
+config_path=$1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # get durations from MFA's result + echo "Generate durations.txt from MFA results ..." + python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ + --inputdir=./baker_alignment_tone \ + --output=durations.txt \ + --config=${config_path} +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # extract features + echo "Extract features ..." + python3 ${BIN_DIR}/preprocess.py \ + --dataset=baker \ + --rootdir=~/datasets/BZNSYP/ \ + --dumpdir=dump \ + --dur-file=durations.txt \ + --config=${config_path} \ + --num-cpu=20 \ + --cut-sil=True +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # get features' stats(mean and std) + echo "Get features' stats ..." + python3 ${MAIN_ROOT}/utils/compute_statistics.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --field-name="speech" + +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # normalize and covert phone to id, dev and test should use train's stats + echo "Normalize ..." + python3 ${BIN_DIR}/normalize.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --dumpdir=dump/train/norm \ + --speech-stats=dump/train/speech_stats.npy \ + --phones-dict=dump/phone_id_map.txt \ + --speaker-dict=dump/speaker_id_map.txt + + python3 ${BIN_DIR}/normalize.py \ + --metadata=dump/dev/raw/metadata.jsonl \ + --dumpdir=dump/dev/norm \ + --speech-stats=dump/train/speech_stats.npy \ + --phones-dict=dump/phone_id_map.txt \ + --speaker-dict=dump/speaker_id_map.txt + + python3 ${BIN_DIR}/normalize.py \ + --metadata=dump/test/raw/metadata.jsonl \ + --dumpdir=dump/test/norm \ + --speech-stats=dump/train/speech_stats.npy \ + --phones-dict=dump/phone_id_map.txt \ + --speaker-dict=dump/speaker_id_map.txt +fi diff --git a/examples/csmsc/tts0/local/synthesize.sh b/examples/csmsc/tts0/local/synthesize.sh new file mode 100755 index 00000000..4be06dd8 --- /dev/null +++ b/examples/csmsc/tts0/local/synthesize.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/../synthesize.py \ + --am=tacotron2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=pwgan_csmsc \ + --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \ + --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ + --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt diff --git a/examples/csmsc/tts0/local/synthesize_e2e.sh b/examples/csmsc/tts0/local/synthesize_e2e.sh new file mode 100755 index 00000000..fe5d11d4 --- /dev/null +++ b/examples/csmsc/tts0/local/synthesize_e2e.sh @@ -0,0 +1,91 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +stage=0 +stop_stage=0 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=tacotron2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=pwgan_csmsc \ + --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \ + --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ + --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ + --lang=zh \ + 
--text=${BIN_DIR}/../sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --inference_dir=${train_output_path}/inference \ + --phones_dict=dump/phone_id_map.txt +fi + +# for more GAN Vocoders +# multi band melgan +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=fastspeech2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=mb_melgan_csmsc \ + --voc_config=mb_melgan_baker_finetune_ckpt_0.5/finetune.yaml \ + --voc_ckpt=mb_melgan_baker_finetune_ckpt_0.5/snapshot_iter_2000000.pdz\ + --voc_stat=mb_melgan_baker_finetune_ckpt_0.5/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --inference_dir=${train_output_path}/inference \ + --phones_dict=dump/phone_id_map.txt +fi + +# the pretrained models haven't release now +# style melgan +# style melgan's Dygraph to Static Graph is not ready now +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=fastspeech2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=style_melgan_csmsc \ + --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \ + --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt + # --inference_dir=${train_output_path}/inference +fi + +# hifigan +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + echo "in hifigan syn_e2e" + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=fastspeech2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=hifigan_csmsc \ + --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \ + --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --inference_dir=${train_output_path}/inference \ + --phones_dict=dump/phone_id_map.txt +fi diff --git a/examples/csmsc/tts0/local/train.sh b/examples/csmsc/tts0/local/train.sh new file mode 100755 index 00000000..f90db915 --- /dev/null +++ b/examples/csmsc/tts0/local/train.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 + +python3 ${BIN_DIR}/train.py \ + --train-metadata=dump/train/norm/metadata.jsonl \ + --dev-metadata=dump/dev/norm/metadata.jsonl \ + --config=${config_path} \ + --output-dir=${train_output_path} \ + --ngpu=1 \ + --phones-dict=dump/phone_id_map.txt \ No newline at end of file diff --git a/examples/csmsc/tts0/path.sh b/examples/csmsc/tts0/path.sh new file mode 100755 index 00000000..9cdbe256 --- /dev/null +++ b/examples/csmsc/tts0/path.sh @@ -0,0 +1,13 @@ +#!/bin/bash +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# 
Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +MODEL=new_tacotron2 +export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/csmsc/tts0/run.sh b/examples/csmsc/tts0/run.sh new file mode 100755 index 00000000..86800920 --- /dev/null +++ b/examples/csmsc/tts0/run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +set -e +source path.sh + +gpus=0,1 +stage=0 +stop_stage=100 + +conf_path=conf/default.yaml +train_output_path=exp/default +ckpt_name=snapshot_iter_153.pdz + +# with the following command, you can choose the stage range you want to run +# such as `./run.sh --stage 0 --stop-stage 0` +# this can not be mixed use with `$1`, `$2` ... +source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + ./local/preprocess.sh ${conf_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # synthesize, vocoder is pwgan + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # synthesize_e2e, vocoder is pwgan + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi diff --git a/examples/csmsc/tts3/conf/conformer.yaml b/examples/csmsc/tts3/conf/conformer.yaml index 252f634d..03e4f2e3 100644 --- a/examples/csmsc/tts3/conf/conformer.yaml +++ b/examples/csmsc/tts3/conf/conformer.yaml @@ -53,8 +53,8 @@ model: conformer_pos_enc_layer_type: rel_pos # conformer positional encoding type conformer_self_attn_layer_type: rel_selfattn # conformer self-attention type conformer_activation_type: swish # conformer activation type - use_macaron_style_in_conformer: true # whether to use macaron style in conformer - use_cnn_in_conformer: true # whether to use CNN in conformer + use_macaron_style_in_conformer: True # whether to use macaron style in conformer + use_cnn_in_conformer: True # whether to use CNN in conformer conformer_enc_kernel_size: 7 # kernel size in CNN module of conformer-based encoder conformer_dec_kernel_size: 31 # kernel size in CNN module of conformer-based decoder init_type: xavier_uniform # initialization type @@ -70,14 +70,14 @@ model: pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch - stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder + stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder energy_predictor_layers: 2 # number of conv layers in energy predictor energy_predictor_chans: 256 # number of channels of conv layers in energy predictor energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor energy_predictor_dropout: 0.5 # dropout rate in energy predictor energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy - stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to 
encoder + stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder diff --git a/examples/csmsc/tts3/conf/default.yaml b/examples/csmsc/tts3/conf/default.yaml index 1f723d67..ce2b24d9 100644 --- a/examples/csmsc/tts3/conf/default.yaml +++ b/examples/csmsc/tts3/conf/default.yaml @@ -64,14 +64,14 @@ model: pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch - stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder + stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder energy_predictor_layers: 2 # number of conv layers in energy predictor energy_predictor_chans: 256 # number of channels of conv layers in energy predictor energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor energy_predictor_dropout: 0.5 # dropout rate in energy predictor energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy - stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder + stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder @@ -82,7 +82,6 @@ updater: use_masking: True # whether to apply masking for padded part in loss calculation - ########################################################### # OPTIMIZER SETTING # ########################################################### diff --git a/examples/csmsc/tts3/run.sh b/examples/csmsc/tts3/run.sh index c1ddd3b9..8f06e933 100755 --- a/examples/csmsc/tts3/run.sh +++ b/examples/csmsc/tts3/run.sh @@ -18,7 +18,7 @@ source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # prepare data - bash ./local/preprocess.sh ${conf_path} || exit -1 + ./local/preprocess.sh ${conf_path} || exit -1 fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then diff --git a/examples/csmsc/voc1/conf/default.yaml b/examples/csmsc/voc1/conf/default.yaml index 28d218ff..703be21b 100644 --- a/examples/csmsc/voc1/conf/default.yaml +++ b/examples/csmsc/voc1/conf/default.yaml @@ -34,10 +34,10 @@ generator_params: aux_context_window: 2 # Context window size for auxiliary feature. # If set to 2, previous 2 and future 2 frames will be considered. dropout: 0.0 # Dropout rate. 0.0 means no dropout applied. - bias: true # use bias in residual blocks - use_weight_norm: true # Whether to use weight norm. + bias: True # use bias in residual blocks + use_weight_norm: True # Whether to use weight norm. # If set to true, it will be applied to all of the conv layers. - use_causal_conv: false # use causal conv in residual blocks and upsample layers + use_causal_conv: False # use causal conv in residual blocks and upsample layers upsample_scales: [4, 5, 3, 5] # Upsampling scales. Prodcut of these must be the same as hop size. interpolate_mode: "nearest" # upsample net interpolate mode freq_axis_kernel_size: 1 # upsamling net: convolution kernel size in frequencey axis @@ -53,8 +53,8 @@ discriminator_params: kernel_size: 3 # Number of output channels. layers: 10 # Number of conv layers. conv_channels: 64 # Number of chnn layers. - bias: true # Whether to use bias parameter in conv. - use_weight_norm: true # Whether to use weight norm. 
+ bias: True # Whether to use bias parameter in conv. + use_weight_norm: True # Whether to use weight norm. # If set to true, it will be applied to all of the conv layers. nonlinear_activation: "leakyrelu" # Nonlinear function after each conv. nonlinear_activation_params: # Nonlinear function parameters diff --git a/examples/csmsc/voc3/conf/default.yaml b/examples/csmsc/voc3/conf/default.yaml index 27e97664..fbff54f1 100644 --- a/examples/csmsc/voc3/conf/default.yaml +++ b/examples/csmsc/voc3/conf/default.yaml @@ -63,13 +63,13 @@ discriminator_params: ########################################################### # STFT LOSS SETTING # ########################################################### -use_stft_loss: true +use_stft_loss: True stft_loss_params: fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss. hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss. win_lengths: [600, 1200, 240] # List of window length for STFT-based loss. window: "hann" # Window function for STFT-based loss -use_subband_stft_loss: true +use_subband_stft_loss: True subband_stft_loss_params: fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss. hop_sizes: [30, 60, 10] # List of hop size for STFT-based loss @@ -79,7 +79,7 @@ subband_stft_loss_params: ########################################################### # ADVERSARIAL LOSS SETTING # ########################################################### -use_feat_match_loss: false # Whether to use feature matching loss. +use_feat_match_loss: False # Whether to use feature matching loss. lambda_adv: 2.5 # Loss balancing coefficient for adversarial loss. ########################################################### diff --git a/examples/csmsc/voc3/conf/finetune.yaml b/examples/csmsc/voc3/conf/finetune.yaml index a3b1d8b1..0a38c282 100644 --- a/examples/csmsc/voc3/conf/finetune.yaml +++ b/examples/csmsc/voc3/conf/finetune.yaml @@ -63,13 +63,13 @@ discriminator_params: ########################################################### # STFT LOSS SETTING # ########################################################### -use_stft_loss: true +use_stft_loss: True stft_loss_params: fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss. hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss win_lengths: [600, 1200, 240] # List of window length for STFT-based loss. window: "hann" # Window function for STFT-based loss -use_subband_stft_loss: true +use_subband_stft_loss: True subband_stft_loss_params: fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss. hop_sizes: [30, 60, 10] # List of hop size for STFT-based loss. @@ -79,7 +79,7 @@ subband_stft_loss_params: ########################################################### # ADVERSARIAL LOSS SETTING # ########################################################### -use_feat_match_loss: false # Whether to use feature matching loss. +use_feat_match_loss: False # Whether to use feature matching loss. lambda_adv: 2.5 # Loss balancing coefficient for adversarial loss. 
########################################################### diff --git a/examples/csmsc/voc4/conf/default.yaml b/examples/csmsc/voc4/conf/default.yaml index c9abf78d..cd8f8e28 100644 --- a/examples/csmsc/voc4/conf/default.yaml +++ b/examples/csmsc/voc4/conf/default.yaml @@ -65,7 +65,7 @@ discriminator_params: ########################################################### # STFT LOSS SETTING # ########################################################### -use_stft_loss: true +use_stft_loss: True stft_loss_params: fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss. hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss @@ -78,9 +78,9 @@ lambda_aux: 1.0 # Loss balancing coefficient for aux loss. ########################################################### lambda_adv: 1.0 # Loss balancing coefficient for adv loss. generator_adv_loss_params: - average_by_discriminators: false # Whether to average loss by #discriminators. + average_by_discriminators: False # Whether to average loss by #discriminators. discriminator_adv_loss_params: - average_by_discriminators: false # Whether to average loss by #discriminators. + average_by_discriminators: False # Whether to average loss by #discriminators. ########################################################### # DATA LOADER SETTING # diff --git a/examples/csmsc/voc5/conf/default.yaml b/examples/csmsc/voc5/conf/default.yaml index f42fc385..38b94cf5 100644 --- a/examples/csmsc/voc5/conf/default.yaml +++ b/examples/csmsc/voc5/conf/default.yaml @@ -35,12 +35,12 @@ generator_params: - [1, 3, 5] - [1, 3, 5] - [1, 3, 5] - use_additional_convs: true # Whether to use additional conv layer in residual blocks. - bias: true # Whether to use bias parameter in conv. + use_additional_convs: True # Whether to use additional conv layer in residual blocks. + bias: True # Whether to use bias parameter in conv. nonlinear_activation: "leakyrelu" # Nonlinear activation type. nonlinear_activation_params: # Nonlinear activation paramters. negative_slope: 0.1 - use_weight_norm: true # Whether to apply weight normalization. + use_weight_norm: True # Whether to apply weight normalization. ########################################################### @@ -60,12 +60,12 @@ discriminator_params: channels: 128 # Initial number of channels. max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. max_groups: 16 # Maximum number of groups in downsampling conv layers. - bias: true + bias: True downsample_scales: [4, 4, 4, 4, 1] # Downsampling scales. nonlinear_activation: "leakyrelu" # Nonlinear activation. nonlinear_activation_params: negative_slope: 0.1 - follow_official_norm: true # Whether to follow the official norm setting. + follow_official_norm: True # Whether to follow the official norm setting. periods: [2, 3, 5, 7, 11] # List of period for multi-period discriminator. period_discriminator_params: in_channels: 1 # Number of input channels. @@ -74,19 +74,19 @@ discriminator_params: channels: 32 # Initial number of channels. downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales. max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. - bias: true # Whether to use bias parameter in conv layer." + bias: True # Whether to use bias parameter in conv layer." nonlinear_activation: "leakyrelu" # Nonlinear activation. nonlinear_activation_params: # Nonlinear activation paramters. negative_slope: 0.1 - use_weight_norm: true # Whether to apply weight normalization. 
- use_spectral_norm: false # Whether to apply spectral normalization. + use_weight_norm: True # Whether to apply weight normalization. + use_spectral_norm: False # Whether to apply spectral normalization. ########################################################### # STFT LOSS SETTING # ########################################################### -use_stft_loss: false # Whether to use multi-resolution STFT loss. -use_mel_loss: true # Whether to use Mel-spectrogram loss. +use_stft_loss: False # Whether to use multi-resolution STFT loss. +use_mel_loss: True # Whether to use Mel-spectrogram loss. mel_loss_params: fs: 24000 fft_size: 2048 @@ -98,14 +98,14 @@ mel_loss_params: fmax: 12000 log_base: null generator_adv_loss_params: - average_by_discriminators: false # Whether to average loss by #discriminators. + average_by_discriminators: False # Whether to average loss by #discriminators. discriminator_adv_loss_params: - average_by_discriminators: false # Whether to average loss by #discriminators. -use_feat_match_loss: true + average_by_discriminators: False # Whether to average loss by #discriminators. +use_feat_match_loss: True feat_match_loss_params: - average_by_discriminators: false # Whether to average loss by #discriminators. - average_by_layers: false # Whether to average loss by #layers in each discriminator. - include_final_outputs: false # Whether to include final outputs in feat match loss calculation. + average_by_discriminators: False # Whether to average loss by #discriminators. + average_by_layers: False # Whether to average loss by #layers in each discriminator. + include_final_outputs: False # Whether to include final outputs in feat match loss calculation. ########################################################### # ADVERSARIAL LOSS SETTING # diff --git a/examples/csmsc/voc5/conf/finetune.yaml b/examples/csmsc/voc5/conf/finetune.yaml index 73420625..110ae052 100644 --- a/examples/csmsc/voc5/conf/finetune.yaml +++ b/examples/csmsc/voc5/conf/finetune.yaml @@ -35,12 +35,12 @@ generator_params: - [1, 3, 5] - [1, 3, 5] - [1, 3, 5] - use_additional_convs: true # Whether to use additional conv layer in residual blocks. - bias: true # Whether to use bias parameter in conv. + use_additional_convs: True # Whether to use additional conv layer in residual blocks. + bias: True # Whether to use bias parameter in conv. nonlinear_activation: "leakyrelu" # Nonlinear activation type. nonlinear_activation_params: # Nonlinear activation paramters. negative_slope: 0.1 - use_weight_norm: true # Whether to apply weight normalization. + use_weight_norm: True # Whether to apply weight normalization. ########################################################### @@ -60,12 +60,12 @@ discriminator_params: channels: 128 # Initial number of channels. max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. max_groups: 16 # Maximum number of groups in downsampling conv layers. - bias: true + bias: True downsample_scales: [4, 4, 4, 4, 1] # Downsampling scales. nonlinear_activation: "leakyrelu" # Nonlinear activation. nonlinear_activation_params: negative_slope: 0.1 - follow_official_norm: true # Whether to follow the official norm setting. + follow_official_norm: True # Whether to follow the official norm setting. periods: [2, 3, 5, 7, 11] # List of period for multi-period discriminator. period_discriminator_params: in_channels: 1 # Number of input channels. @@ -74,19 +74,19 @@ discriminator_params: channels: 32 # Initial number of channels. 
downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales. max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. - bias: true # Whether to use bias parameter in conv layer." + bias: True # Whether to use bias parameter in conv layer." nonlinear_activation: "leakyrelu" # Nonlinear activation. nonlinear_activation_params: # Nonlinear activation paramters. negative_slope: 0.1 - use_weight_norm: true # Whether to apply weight normalization. - use_spectral_norm: false # Whether to apply spectral normalization. + use_weight_norm: True # Whether to apply weight normalization. + use_spectral_norm: False # Whether to apply spectral normalization. ########################################################### # STFT LOSS SETTING # ########################################################### -use_stft_loss: false # Whether to use multi-resolution STFT loss. -use_mel_loss: true # Whether to use Mel-spectrogram loss. +use_stft_loss: False # Whether to use multi-resolution STFT loss. +use_mel_loss: True # Whether to use Mel-spectrogram loss. mel_loss_params: fs: 24000 fft_size: 2048 @@ -98,14 +98,14 @@ mel_loss_params: fmax: 12000 log_base: null generator_adv_loss_params: - average_by_discriminators: false # Whether to average loss by #discriminators. + average_by_discriminators: False # Whether to average loss by #discriminators. discriminator_adv_loss_params: - average_by_discriminators: false # Whether to average loss by #discriminators. -use_feat_match_loss: true + average_by_discriminators: False # Whether to average loss by #discriminators. +use_feat_match_loss: True feat_match_loss_params: - average_by_discriminators: false # Whether to average loss by #discriminators. - average_by_layers: false # Whether to average loss by #layers in each discriminator. - include_final_outputs: false # Whether to include final outputs in feat match loss calculation. + average_by_discriminators: False # Whether to average loss by #discriminators. + average_by_layers: False # Whether to average loss by #layers in each discriminator. + include_final_outputs: False # Whether to include final outputs in feat match loss calculation. 
########################################################### # ADVERSARIAL LOSS SETTING # diff --git a/examples/ljspeech/tts1/conf/default.yaml b/examples/ljspeech/tts1/conf/default.yaml index 6b495eff..456b6a1e 100644 --- a/examples/ljspeech/tts1/conf/default.yaml +++ b/examples/ljspeech/tts1/conf/default.yaml @@ -63,9 +63,9 @@ model: # keyword arguments for the selected model # UPDATER SETTING # ########################################################### updater: - use_masking: true # whether to apply masking for padded part in loss calculation + use_masking: True # whether to apply masking for padded part in loss calculation loss_type: L1 - use_guided_attn_loss: true # whether to use guided attention loss + use_guided_attn_loss: True # whether to use guided attention loss guided_attn_loss_sigma: 0.4 # sigma in guided attention loss guided_attn_loss_lambda: 10.0 # lambda in guided attention loss modules_applied_guided_attn: ["encoder-decoder"] # modules to apply guided attention loss diff --git a/examples/ljspeech/tts3/conf/default.yaml b/examples/ljspeech/tts3/conf/default.yaml index 872dafcb..15cfda2c 100644 --- a/examples/ljspeech/tts3/conf/default.yaml +++ b/examples/ljspeech/tts3/conf/default.yaml @@ -64,14 +64,14 @@ model: pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch - stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder + stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder energy_predictor_layers: 2 # number of conv layers in energy predictor energy_predictor_chans: 256 # number of channels of conv layers in energy predictor energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor energy_predictor_dropout: 0.5 # dropout rate in energy predictor energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy - stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder + stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder diff --git a/examples/ljspeech/voc1/conf/default.yaml b/examples/ljspeech/voc1/conf/default.yaml index 2d39beb7..d30960d6 100644 --- a/examples/ljspeech/voc1/conf/default.yaml +++ b/examples/ljspeech/voc1/conf/default.yaml @@ -33,7 +33,7 @@ generator_params: aux_context_window: 2 # Context window size for auxiliary feature. # If set to 2, previous 2 and future 2 frames will be considered. dropout: 0.0 # Dropout rate. 0.0 means no dropout applied. - use_weight_norm: true # Whether to use weight norm. + use_weight_norm: True # Whether to use weight norm. # If set to true, it will be applied to all of the conv layers. upsample_scales: [4, 4, 4, 4] # Upsampling scales. prod(upsample_scales) == n_shift @@ -46,8 +46,8 @@ discriminator_params: kernel_size: 3 # Number of output channels. layers: 10 # Number of conv layers. conv_channels: 64 # Number of chnn layers. - bias: true # Whether to use bias parameter in conv. - use_weight_norm: true # Whether to use weight norm. + bias: True # Whether to use bias parameter in conv. + use_weight_norm: True # Whether to use weight norm. # If set to true, it will be applied to all of the conv layers. 
nonlinear_activation: "leakyrelu" # Nonlinear function after each conv. nonlinear_activation_params: # Nonlinear function parameters diff --git a/examples/vctk/tts3/conf/default.yaml b/examples/vctk/tts3/conf/default.yaml index 2738e7c2..86d4a0d5 100644 --- a/examples/vctk/tts3/conf/default.yaml +++ b/examples/vctk/tts3/conf/default.yaml @@ -64,14 +64,14 @@ model: pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch - stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder + stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder energy_predictor_layers: 2 # number of conv layers in energy predictor energy_predictor_chans: 256 # number of channels of conv layers in energy predictor energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor energy_predictor_dropout: 0.5 # dropout rate in energy predictor energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy - stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder + stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder spk_embed_dim: 256 # speaker embedding dimension spk_embed_integration_type: concat # speaker embedding integration type diff --git a/examples/vctk/voc1/conf/default.yaml b/examples/vctk/voc1/conf/default.yaml index 59ce3825..af859d4c 100644 --- a/examples/vctk/voc1/conf/default.yaml +++ b/examples/vctk/voc1/conf/default.yaml @@ -33,7 +33,7 @@ generator_params: aux_context_window: 2 # Context window size for auxiliary feature. # If set to 2, previous 2 and future 2 frames will be considered. dropout: 0.0 # Dropout rate. 0.0 means no dropout applied. - use_weight_norm: true # Whether to use weight norm. + use_weight_norm: True # Whether to use weight norm. # If set to true, it will be applied to all of the conv layers. upsample_scales: [4, 5, 3, 5] # Upsampling scales. prod(upsample_scales) == n_shift @@ -46,8 +46,8 @@ discriminator_params: kernel_size: 3 # Number of output channels. layers: 10 # Number of conv layers. conv_channels: 64 # Number of chnn layers. - bias: true # Whether to use bias parameter in conv. - use_weight_norm: true # Whether to use weight norm. + bias: True # Whether to use bias parameter in conv. + use_weight_norm: True # Whether to use weight norm. # If set to true, it will be applied to all of the conv layers. nonlinear_activation: "leakyrelu" # Nonlinear function after each conv. 
nonlinear_activation_params: # Nonlinear function parameters diff --git a/paddlespeech/t2s/datasets/am_batch_fn.py b/paddlespeech/t2s/datasets/am_batch_fn.py index 526871a2..2fcb46d9 100644 --- a/paddlespeech/t2s/datasets/am_batch_fn.py +++ b/paddlespeech/t2s/datasets/am_batch_fn.py @@ -17,6 +17,35 @@ import paddle from paddlespeech.t2s.data.batch import batch_sequences +def tacotron2_single_spk_batch_fn(examples): + # fields = ["text", "text_lengths", "speech", "speech_lengths"] + text = [np.array(item["text"], dtype=np.int64) for item in examples] + speech = [np.array(item["speech"], dtype=np.float32) for item in examples] + text_lengths = [ + np.array(item["text_lengths"], dtype=np.int64) for item in examples + ] + speech_lengths = [ + np.array(item["speech_lengths"], dtype=np.int64) for item in examples + ] + + text = batch_sequences(text) + speech = batch_sequences(speech) + + # convert each batch to paddle.Tensor + text = paddle.to_tensor(text) + speech = paddle.to_tensor(speech) + text_lengths = paddle.to_tensor(text_lengths) + speech_lengths = paddle.to_tensor(speech_lengths) + + batch = { + "text": text, + "text_lengths": text_lengths, + "speech": speech, + "speech_lengths": speech_lengths, + } + return batch + + def speedyspeech_single_spk_batch_fn(examples): # fields = ["phones", "tones", "num_phones", "num_frames", "feats", "durations"] phones = [np.array(item["phones"], dtype=np.int64) for item in examples] @@ -56,7 +85,7 @@ def speedyspeech_single_spk_batch_fn(examples): def speedyspeech_multi_spk_batch_fn(examples): - # fields = ["phones", "tones", "num_phones", "num_frames", "feats", "durations"] + # fields = ["phones", "tones", "num_phones", "num_frames", "feats", "durations", "spk_id"] phones = [np.array(item["phones"], dtype=np.int64) for item in examples] tones = [np.array(item["tones"], dtype=np.int64) for item in examples] feats = [np.array(item["feats"], dtype=np.float32) for item in examples] diff --git a/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py b/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py index 4ddd19f7..13569b99 100644 --- a/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py +++ b/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py @@ -15,14 +15,14 @@ # for mb melgan finetune # 长度和原本的 mel 不一致怎么办? 
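For reference, a minimal sketch of what the tacotron2_single_spk_batch_fn added to am_batch_fn.py above produces. The toy shapes and values here are made up; only the field names come from the function itself:

import numpy as np
from paddlespeech.t2s.datasets.am_batch_fn import tacotron2_single_spk_batch_fn

# two toy examples shaped like the rows the training DataTable yields
examples = [
    {"text": [3, 7, 2], "text_lengths": 3,
     "speech": np.random.randn(120, 80).astype("float32"), "speech_lengths": 120},
    {"text": [5, 1], "text_lengths": 2,
     "speech": np.random.randn(90, 80).astype("float32"), "speech_lengths": 90},
]
batch = tacotron2_single_spk_batch_fn(examples)
# text is zero-padded to [2, 3] and speech to [2, 120, 80];
# the *_lengths entries keep the true, unpadded sizes
print(batch["text"].shape, batch["speech"].shape, batch["text_lengths"].numpy())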
import argparse +import os from pathlib import Path import numpy as np import paddle import yaml -from yacs.config import CfgNode from tqdm import tqdm -import os +from yacs.config import CfgNode from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur from paddlespeech.t2s.datasets.preprocess_utils import merge_silence @@ -50,11 +50,14 @@ def evaluate(args, fastspeech2_config): spk_id_list = [line.strip().split() for line in f.readlines()] spk_num = len(spk_id_list) else: - spk_num=None + spk_num = None odim = fastspeech2_config.n_mels model = FastSpeech2( - idim=vocab_size, odim=odim, **fastspeech2_config["model"], spk_num=spk_num) + idim=vocab_size, + odim=odim, + **fastspeech2_config["model"], + spk_num=spk_num) model.set_state_dict( paddle.load(args.fastspeech2_checkpoint)["main_params"]) @@ -99,9 +102,15 @@ def evaluate(args, fastspeech2_config): else: train_wav_files += wav_files - train_wav_files = [os.path.basename(str(str_path)) for str_path in train_wav_files] - dev_wav_files = [os.path.basename(str(str_path)) for str_path in dev_wav_files] - test_wav_files = [os.path.basename(str(str_path)) for str_path in test_wav_files] + train_wav_files = [ + os.path.basename(str(str_path)) for str_path in train_wav_files + ] + dev_wav_files = [ + os.path.basename(str(str_path)) for str_path in dev_wav_files + ] + test_wav_files = [ + os.path.basename(str(str_path)) for str_path in test_wav_files + ] for i, utt_id in enumerate(tqdm(sentences)): phones = sentences[utt_id][0] @@ -122,7 +131,8 @@ def evaluate(args, fastspeech2_config): phone_ids = paddle.to_tensor(np.array(phone_ids)) if args.speaker_dict: - speaker_id = int([item[1] for item in spk_id_list if speaker == item[0]][0]) + speaker_id = int( + [item[1] for item in spk_id_list if speaker == item[0]][0]) speaker_id = paddle.to_tensor(speaker_id) else: speaker_id = None @@ -143,7 +153,8 @@ def evaluate(args, fastspeech2_config): sub_output_dir.mkdir(parents=True, exist_ok=True) with paddle.no_grad(): - mel = fastspeech2_inference(phone_ids, durations=durations, spk_id=speaker_id) + mel = fastspeech2_inference( + phone_ids, durations=durations, spk_id=speaker_id) np.save(sub_output_dir / (utt_id + "_feats.npy"), mel) @@ -175,12 +186,9 @@ def main(): type=str, default="phone_id_map.txt", help="phone vocabulary file.") - + parser.add_argument( - "--speaker-dict", - type=str, - default=None, - help="speaker id map file.") + "--speaker-dict", type=str, default=None, help="speaker id map file.") parser.add_argument( "--dur-file", default=None, type=str, help="path to durations.txt.") diff --git a/paddlespeech/t2s/exps/new_tacotron2/normalize.py b/paddlespeech/t2s/exps/new_tacotron2/normalize.py new file mode 120000 index 00000000..64848f89 --- /dev/null +++ b/paddlespeech/t2s/exps/new_tacotron2/normalize.py @@ -0,0 +1 @@ +../transformer_tts/normalize.py \ No newline at end of file diff --git a/paddlespeech/t2s/exps/new_tacotron2/preprocess.py b/paddlespeech/t2s/exps/new_tacotron2/preprocess.py new file mode 100644 index 00000000..0b61912c --- /dev/null +++ b/paddlespeech/t2s/exps/new_tacotron2/preprocess.py @@ -0,0 +1,353 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import os +from concurrent.futures import ThreadPoolExecutor +from operator import itemgetter +from pathlib import Path +from typing import Any +from typing import Dict +from typing import List + +import jsonlines +import librosa +import numpy as np +import tqdm +import yaml +from yacs.config import CfgNode + +from paddlespeech.t2s.data.get_feats import Energy +from paddlespeech.t2s.data.get_feats import LogMelFBank +from paddlespeech.t2s.data.get_feats import Pitch +from paddlespeech.t2s.datasets.preprocess_utils import compare_duration_and_mel_length +from paddlespeech.t2s.datasets.preprocess_utils import get_input_token +from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur +from paddlespeech.t2s.datasets.preprocess_utils import get_spk_id_map +from paddlespeech.t2s.datasets.preprocess_utils import merge_silence + + +def process_sentence(config: Dict[str, Any], + fp: Path, + sentences: Dict, + output_dir: Path, + mel_extractor=None, + pitch_extractor=None, + energy_extractor=None, + cut_sil: bool=True, + spk_emb_dir: Path=None): + utt_id = fp.stem + # for vctk + if utt_id.endswith("_mic2"): + utt_id = utt_id[:-5] + record = None + if utt_id in sentences: + # reading, resampling may occur + wav, _ = librosa.load(str(fp), sr=config.fs) + if len(wav.shape) != 1 or np.abs(wav).max() > 1.0: + return record + assert len(wav.shape) == 1, f"{utt_id} is not a mono-channel audio." + assert np.abs(wav).max( + ) <= 1.0, f"{utt_id} is seems to be different that 16 bit PCM." 
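# durations (from durations.txt) are per-phone frame counts: the cumulative sum
# computed below gives frame boundaries, librosa.frames_to_time() converts them
# to seconds, and librosa.time_to_samples() later maps the kept [start, end]
# span back to sample indices, so leading/trailing "sil" can be trimmed from
# the waveform before mel extraction.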
+ phones = sentences[utt_id][0] + durations = sentences[utt_id][1] + speaker = sentences[utt_id][2] + d_cumsum = np.pad(np.array(durations).cumsum(0), (1, 0), 'constant') + # little imprecise than use *.TextGrid directly + times = librosa.frames_to_time( + d_cumsum, sr=config.fs, hop_length=config.n_shift) + if cut_sil: + start = 0 + end = d_cumsum[-1] + if phones[0] == "sil" and len(durations) > 1: + start = times[1] + durations = durations[1:] + phones = phones[1:] + if phones[-1] == 'sil' and len(durations) > 1: + end = times[-2] + durations = durations[:-1] + phones = phones[:-1] + sentences[utt_id][0] = phones + sentences[utt_id][1] = durations + start, end = librosa.time_to_samples([start, end], sr=config.fs) + wav = wav[start:end] + # extract mel feats + logmel = mel_extractor.get_log_mel_fbank(wav) + # change duration according to mel_length + compare_duration_and_mel_length(sentences, utt_id, logmel) + phones = sentences[utt_id][0] + durations = sentences[utt_id][1] + num_frames = logmel.shape[0] + assert sum(durations) == num_frames + mel_dir = output_dir / "data_speech" + mel_dir.mkdir(parents=True, exist_ok=True) + mel_path = mel_dir / (utt_id + "_speech.npy") + np.save(mel_path, logmel) + record = { + "utt_id": utt_id, + "phones": phones, + "text_lengths": len(phones), + "speech_lengths": num_frames, + "speech": str(mel_path), + "speaker": speaker + } + if spk_emb_dir: + if speaker in os.listdir(spk_emb_dir): + embed_name = utt_id + ".npy" + embed_path = spk_emb_dir / speaker / embed_name + if embed_path.is_file(): + record["spk_emb"] = str(embed_path) + else: + return None + return record + + +def process_sentences(config, + fps: List[Path], + sentences: Dict, + output_dir: Path, + mel_extractor=None, + pitch_extractor=None, + energy_extractor=None, + nprocs: int=1, + cut_sil: bool=True, + spk_emb_dir: Path=None): + if nprocs == 1: + results = [] + for fp in fps: + record = process_sentence(config, fp, sentences, output_dir, + mel_extractor, pitch_extractor, + energy_extractor, cut_sil, spk_emb_dir) + if record: + results.append(record) + else: + with ThreadPoolExecutor(nprocs) as pool: + futures = [] + with tqdm.tqdm(total=len(fps)) as progress: + for fp in fps: + future = pool.submit(process_sentence, config, fp, + sentences, output_dir, mel_extractor, + pitch_extractor, energy_extractor, + cut_sil, spk_emb_dir) + future.add_done_callback(lambda p: progress.update()) + futures.append(future) + + results = [] + for ft in futures: + record = ft.result() + if record: + results.append(record) + + results.sort(key=itemgetter("utt_id")) + with jsonlines.open(output_dir / "metadata.jsonl", 'w') as writer: + for item in results: + writer.write(item) + print("Done") + + +def main(): + # parse config and args + parser = argparse.ArgumentParser( + description="Preprocess audio and then extract features.") + + parser.add_argument( + "--dataset", + default="baker", + type=str, + help="name of dataset, should in {baker, aishell3, ljspeech, vctk} now") + + parser.add_argument( + "--rootdir", default=None, type=str, help="directory to dataset.") + + parser.add_argument( + "--dumpdir", + type=str, + required=True, + help="directory to dump feature files.") + parser.add_argument( + "--dur-file", default=None, type=str, help="path to durations.txt.") + + parser.add_argument("--config", type=str, help="fastspeech2 config file.") + + parser.add_argument( + "--verbose", + type=int, + default=1, + help="logging level. higher is more logging. 
(default=1)") + parser.add_argument( + "--num-cpu", type=int, default=1, help="number of process.") + + def str2bool(str): + return True if str.lower() == 'true' else False + + parser.add_argument( + "--cut-sil", + type=str2bool, + default=True, + help="whether cut sil in the edge of audio") + + parser.add_argument( + "--spk_emb_dir", + default=None, + type=str, + help="directory to speaker embedding files.") + args = parser.parse_args() + + rootdir = Path(args.rootdir).expanduser() + dumpdir = Path(args.dumpdir).expanduser() + # use absolute path + dumpdir = dumpdir.resolve() + dumpdir.mkdir(parents=True, exist_ok=True) + dur_file = Path(args.dur_file).expanduser() + + if args.spk_emb_dir: + spk_emb_dir = Path(args.spk_emb_dir).expanduser().resolve() + else: + spk_emb_dir = None + + assert rootdir.is_dir() + assert dur_file.is_file() + + with open(args.config, 'rt') as f: + config = CfgNode(yaml.safe_load(f)) + + if args.verbose > 1: + print(vars(args)) + print(config) + + sentences, speaker_set = get_phn_dur(dur_file) + + merge_silence(sentences) + phone_id_map_path = dumpdir / "phone_id_map.txt" + speaker_id_map_path = dumpdir / "speaker_id_map.txt" + get_input_token(sentences, phone_id_map_path, args.dataset) + get_spk_id_map(speaker_set, speaker_id_map_path) + + if args.dataset == "baker": + wav_files = sorted(list((rootdir / "Wave").rglob("*.wav"))) + # split data into 3 sections + num_train = 9800 + num_dev = 100 + train_wav_files = wav_files[:num_train] + dev_wav_files = wav_files[num_train:num_train + num_dev] + test_wav_files = wav_files[num_train + num_dev:] + elif args.dataset == "aishell3": + sub_num_dev = 5 + wav_dir = rootdir / "train" / "wav" + train_wav_files = [] + dev_wav_files = [] + test_wav_files = [] + for speaker in os.listdir(wav_dir): + wav_files = sorted(list((wav_dir / speaker).rglob("*.wav"))) + if len(wav_files) > 100: + train_wav_files += wav_files[:-sub_num_dev * 2] + dev_wav_files += wav_files[-sub_num_dev * 2:-sub_num_dev] + test_wav_files += wav_files[-sub_num_dev:] + else: + train_wav_files += wav_files + + elif args.dataset == "ljspeech": + wav_files = sorted(list((rootdir / "wavs").rglob("*.wav"))) + # split data into 3 sections + num_train = 12900 + num_dev = 100 + train_wav_files = wav_files[:num_train] + dev_wav_files = wav_files[num_train:num_train + num_dev] + test_wav_files = wav_files[num_train + num_dev:] + elif args.dataset == "vctk": + sub_num_dev = 5 + wav_dir = rootdir / "wav48_silence_trimmed" + train_wav_files = [] + dev_wav_files = [] + test_wav_files = [] + for speaker in os.listdir(wav_dir): + wav_files = sorted(list((wav_dir / speaker).rglob("*_mic2.flac"))) + if len(wav_files) > 100: + train_wav_files += wav_files[:-sub_num_dev * 2] + dev_wav_files += wav_files[-sub_num_dev * 2:-sub_num_dev] + test_wav_files += wav_files[-sub_num_dev:] + else: + train_wav_files += wav_files + + else: + print("dataset should in {baker, aishell3, ljspeech, vctk} now!") + + train_dump_dir = dumpdir / "train" / "raw" + train_dump_dir.mkdir(parents=True, exist_ok=True) + dev_dump_dir = dumpdir / "dev" / "raw" + dev_dump_dir.mkdir(parents=True, exist_ok=True) + test_dump_dir = dumpdir / "test" / "raw" + test_dump_dir.mkdir(parents=True, exist_ok=True) + + # Extractor + mel_extractor = LogMelFBank( + sr=config.fs, + n_fft=config.n_fft, + hop_length=config.n_shift, + win_length=config.win_length, + window=config.window, + n_mels=config.n_mels, + fmin=config.fmin, + fmax=config.fmax) + pitch_extractor = Pitch( + sr=config.fs, + hop_length=config.n_shift, + 
f0min=config.f0min, + f0max=config.f0max) + energy_extractor = Energy( + sr=config.fs, + n_fft=config.n_fft, + hop_length=config.n_shift, + win_length=config.win_length, + window=config.window) + + # process for the 3 sections + if train_wav_files: + process_sentences( + config, + train_wav_files, + sentences, + train_dump_dir, + mel_extractor, + pitch_extractor, + energy_extractor, + nprocs=args.num_cpu, + cut_sil=args.cut_sil, + spk_emb_dir=spk_emb_dir) + if dev_wav_files: + process_sentences( + config, + dev_wav_files, + sentences, + dev_dump_dir, + mel_extractor, + pitch_extractor, + energy_extractor, + cut_sil=args.cut_sil, + spk_emb_dir=spk_emb_dir) + if test_wav_files: + process_sentences( + config, + test_wav_files, + sentences, + test_dump_dir, + mel_extractor, + pitch_extractor, + energy_extractor, + nprocs=args.num_cpu, + cut_sil=args.cut_sil, + spk_emb_dir=spk_emb_dir) + + +if __name__ == "__main__": + main() diff --git a/paddlespeech/t2s/exps/new_tacotron2/train.py b/paddlespeech/t2s/exps/new_tacotron2/train.py new file mode 100644 index 00000000..20f73f0c --- /dev/null +++ b/paddlespeech/t2s/exps/new_tacotron2/train.py @@ -0,0 +1,190 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
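Before the training entry point below, it may help to see the shape of the data it consumes. One raw record written to <dumpdir>/train/raw/metadata.jsonl by process_sentence() looks roughly like the sketch that follows; all values are invented, only the keys come from the record dict above. The normalize.py step (symlinked from the transformer_tts recipe) is then expected to turn the phone strings into the integer "text" ids and normalized "speech" paths that train.py reads.

# illustrative only: one raw metadata.jsonl record (values are made up)
record = {
    "utt_id": "009901",
    "phones": ["b", "ao2", "y", "u3"],   # hypothetical phone sequence
    "text_lengths": 4,
    "speech_lengths": 120,
    "speech": "dump/train/raw/data_speech/009901_speech.npy",
    "speaker": "baker",
}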
+import argparse +import logging +import os +import shutil +from pathlib import Path + +import jsonlines +import numpy as np +import paddle +import yaml +from paddle import DataParallel +from paddle import distributed as dist +from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler +from yacs.config import CfgNode + +from paddlespeech.t2s.datasets.am_batch_fn import tacotron2_single_spk_batch_fn +from paddlespeech.t2s.datasets.data_table import DataTable +from paddlespeech.t2s.models.new_tacotron2 import Tacotron2 +from paddlespeech.t2s.models.new_tacotron2 import Tacotron2Evaluator +from paddlespeech.t2s.models.new_tacotron2 import Tacotron2Updater +from paddlespeech.t2s.training.extensions.snapshot import Snapshot +from paddlespeech.t2s.training.extensions.visualizer import VisualDL +from paddlespeech.t2s.training.optimizer import build_optimizers +from paddlespeech.t2s.training.seeding import seed_everything +from paddlespeech.t2s.training.trainer import Trainer + + +def train_sp(args, config): + # decides device type and whether to run in parallel + # setup running environment correctly + if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0: + paddle.set_device("cpu") + else: + paddle.set_device("gpu") + world_size = paddle.distributed.get_world_size() + if world_size > 1: + paddle.distributed.init_parallel_env() + + # set the random seed, it is a must for multiprocess training + seed_everything(config.seed) + + print( + f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}", + ) + + # dataloader has been too verbose + logging.getLogger("DataLoader").disabled = True + + # construct dataset for training and validation + with jsonlines.open(args.train_metadata, 'r') as reader: + train_metadata = list(reader) + train_dataset = DataTable( + data=train_metadata, + fields=[ + "text", + "text_lengths", + "speech", + "speech_lengths", + ], + converters={ + "speech": np.load, + }, ) + with jsonlines.open(args.dev_metadata, 'r') as reader: + dev_metadata = list(reader) + dev_dataset = DataTable( + data=dev_metadata, + fields=[ + "text", + "text_lengths", + "speech", + "speech_lengths", + ], + converters={ + "speech": np.load, + }, ) + + # collate function and dataloader + train_sampler = DistributedBatchSampler( + train_dataset, + batch_size=config.batch_size, + shuffle=True, + drop_last=True) + + print("samplers done!") + + train_dataloader = DataLoader( + train_dataset, + batch_sampler=train_sampler, + collate_fn=tacotron2_single_spk_batch_fn, + num_workers=config.num_workers) + + dev_dataloader = DataLoader( + dev_dataset, + shuffle=False, + drop_last=False, + batch_size=config.batch_size, + collate_fn=tacotron2_single_spk_batch_fn, + num_workers=config.num_workers) + print("dataloaders done!") + + with open(args.phones_dict, "r") as f: + phn_id = [line.strip().split() for line in f.readlines()] + vocab_size = len(phn_id) + print("vocab_size:", vocab_size) + + odim = config.n_mels + model = Tacotron2(idim=vocab_size, odim=odim, **config["model"]) + if world_size > 1: + model = DataParallel(model) + print("model done!") + + optimizer = build_optimizers(model, **config["optimizer"]) + print("optimizer done!") + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + if dist.get_rank() == 0: + config_name = args.config.split("/")[-1] + # copy conf to output_dir + shutil.copyfile(args.config, output_dir / config_name) + + updater = Tacotron2Updater( + model=model, + optimizer=optimizer, + dataloader=train_dataloader, 
+ output_dir=output_dir, + **config["updater"]) + + trainer = Trainer(updater, (config.max_epoch, 'epoch'), output_dir) + + evaluator = Tacotron2Evaluator( + model, dev_dataloader, output_dir=output_dir, **config["updater"]) + + if dist.get_rank() == 0: + trainer.extend(evaluator, trigger=(1, "epoch")) + trainer.extend(VisualDL(output_dir), trigger=(1, "iteration")) + trainer.extend( + Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch')) + # print(trainer.extensions) + trainer.run() + + +def main(): + # parse args and config and redirect to train_sp + parser = argparse.ArgumentParser(description="Train a Tacotron2 model.") + parser.add_argument("--config", type=str, help="tacotron2 config file.") + parser.add_argument("--train-metadata", type=str, help="training data.") + parser.add_argument("--dev-metadata", type=str, help="dev data.") + parser.add_argument("--output-dir", type=str, help="output dir.") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") + parser.add_argument( + "--phones-dict", type=str, default=None, help="phone vocabulary file.") + + args = parser.parse_args() + + with open(args.config) as f: + config = CfgNode(yaml.safe_load(f)) + + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(config) + print( + f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}" + ) + + # dispatch + if args.ngpu > 1: + dist.spawn(train_sp, (args, config), nprocs=args.ngpu) + else: + train_sp(args, config) + + +if __name__ == "__main__": + main() diff --git a/paddlespeech/t2s/exps/synthesize.py b/paddlespeech/t2s/exps/synthesize.py index f5477470..02bfcb15 100644 --- a/paddlespeech/t2s/exps/synthesize.py +++ b/paddlespeech/t2s/exps/synthesize.py @@ -36,6 +36,10 @@ model_alias = { "paddlespeech.t2s.models.fastspeech2:FastSpeech2", "fastspeech2_inference": "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference", + "tacotron2": + "paddlespeech.t2s.models.new_tacotron2:Tacotron2", + "tacotron2_inference": + "paddlespeech.t2s.models.new_tacotron2:Tacotron2Inference", # voc "pwgan": "paddlespeech.t2s.models.parallel_wavegan:PWGGenerator", @@ -91,6 +95,8 @@ def evaluate(args): print("spk_num:", spk_num) elif am_name == 'speedyspeech': fields = ["utt_id", "phones", "tones"] + elif am_name == 'tacotron2': + fields = ["utt_id", "text"] test_dataset = DataTable(data=test_metadata, fields=fields) @@ -117,6 +123,8 @@ def evaluate(args): elif am_name == 'speedyspeech': am = am_class( vocab_size=vocab_size, tone_size=tone_size, **am_config["model"]) + elif am_name == 'tacotron2': + am = am_class(idim=vocab_size, odim=odim, **am_config["model"]) am.set_state_dict(paddle.load(args.am_ckpt)["main_params"]) am.eval() @@ -168,6 +176,9 @@ def evaluate(args): phone_ids = paddle.to_tensor(datum["phones"]) tone_ids = paddle.to_tensor(datum["tones"]) mel = am_inference(phone_ids, tone_ids) + elif am_name == 'tacotron2': + phone_ids = paddle.to_tensor(datum["text"]) + mel = am_inference(phone_ids) # vocoder wav = voc_inference(mel) sf.write( @@ -188,7 +199,7 @@ def main(): default='fastspeech2_csmsc', choices=[ 'speedyspeech_csmsc', 'fastspeech2_csmsc', 'fastspeech2_ljspeech', - 'fastspeech2_aishell3', 'fastspeech2_vctk' + 'fastspeech2_aishell3', 'fastspeech2_vctk', 'tacotron2_csmsc' ], help='Choose acoustic model type of tts task.') parser.add_argument( diff --git a/paddlespeech/t2s/exps/synthesize_e2e.py b/paddlespeech/t2s/exps/synthesize_e2e.py index 15ed1e4d..9aeff638 100644 --- 
a/paddlespeech/t2s/exps/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/synthesize_e2e.py @@ -38,6 +38,10 @@ model_alias = { "paddlespeech.t2s.models.fastspeech2:FastSpeech2", "fastspeech2_inference": "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference", + "tacotron2": + "paddlespeech.t2s.models.new_tacotron2:Tacotron2", + "tacotron2_inference": + "paddlespeech.t2s.models.new_tacotron2:Tacotron2Inference", # voc "pwgan": "paddlespeech.t2s.models.parallel_wavegan:PWGGenerator", @@ -126,6 +130,8 @@ def evaluate(args): elif am_name == 'speedyspeech': am = am_class( vocab_size=vocab_size, tone_size=tone_size, **am_config["model"]) + elif am_name == 'tacotron2': + am = am_class(idim=vocab_size, odim=odim, **am_config["model"]) am.set_state_dict(paddle.load(args.am_ckpt)["main_params"]) am.eval() @@ -230,6 +236,8 @@ def evaluate(args): elif am_name == 'speedyspeech': part_tone_ids = tone_ids[i] mel = am_inference(part_phone_ids, part_tone_ids) + elif am_name == 'tacotron2': + mel = am_inference(part_phone_ids) # vocoder wav = voc_inference(mel) if flags == 0: @@ -255,7 +263,7 @@ def main(): default='fastspeech2_csmsc', choices=[ 'speedyspeech_csmsc', 'fastspeech2_csmsc', 'fastspeech2_ljspeech', - 'fastspeech2_aishell3', 'fastspeech2_vctk' + 'fastspeech2_aishell3', 'fastspeech2_vctk', 'tacotron2_csmsc' ], help='Choose acoustic model type of tts task.') parser.add_argument( diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py index 405ad957..fe25351c 100644 --- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py +++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py @@ -556,8 +556,7 @@ class FastSpeech2(nn.Layer): tone_id=tone_id) # modify mod part of groundtruth if self.reduction_factor > 1: - olens = paddle.to_tensor( - [olen - olen % self.reduction_factor for olen in olens.numpy()]) + olens = olens - olens % self.reduction_factor max_olen = max(olens) ys = ys[:, :max_olen] diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py index 0dabf934..3f5e1b56 100644 --- a/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py +++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py @@ -36,12 +36,9 @@ class FastSpeech2Updater(StandardUpdater): use_weighted_masking=False, output_dir=None): super().__init__(model, optimizer, dataloader, init_state=None) - self.use_masking = use_masking - self.use_weighted_masking = use_weighted_masking self.criterion = FastSpeech2Loss( - use_masking=self.use_masking, - use_weighted_masking=self.use_weighted_masking) + use_masking=use_masking, use_weighted_masking=use_weighted_masking) log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) self.filehandler = logging.FileHandler(str(log_file)) @@ -113,8 +110,6 @@ class FastSpeech2Evaluator(StandardEvaluator): use_weighted_masking=False, output_dir=None): super().__init__(model, dataloader) - self.use_masking = use_masking - self.use_weighted_masking = use_weighted_masking log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) self.filehandler = logging.FileHandler(str(log_file)) @@ -123,8 +118,7 @@ class FastSpeech2Evaluator(StandardEvaluator): self.msg = "" self.criterion = FastSpeech2Loss( - use_masking=self.use_masking, - use_weighted_masking=self.use_weighted_masking) + use_masking=use_masking, use_weighted_masking=use_weighted_masking) def evaluate_core(self, batch): self.msg = "Evaluate: " diff --git 
a/paddlespeech/t2s/models/new_tacotron2/__init__.py b/paddlespeech/t2s/models/new_tacotron2/__init__.py new file mode 100644 index 00000000..ea63257c --- /dev/null +++ b/paddlespeech/t2s/models/new_tacotron2/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .tacotron2 import * +from .tacotron2_updater import * diff --git a/paddlespeech/t2s/models/new_tacotron2/tacotron2.py b/paddlespeech/t2s/models/new_tacotron2/tacotron2.py new file mode 100644 index 00000000..747c74f9 --- /dev/null +++ b/paddlespeech/t2s/models/new_tacotron2/tacotron2.py @@ -0,0 +1,496 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tacotron 2 related modules for paddle""" +import logging +from typing import Dict +from typing import Optional +from typing import Tuple + +import paddle +import paddle.nn.functional as F +from paddle import nn +from typeguard import check_argument_types + +from paddlespeech.t2s.modules.nets_utils import initialize +from paddlespeech.t2s.modules.nets_utils import make_pad_mask +from paddlespeech.t2s.modules.tacotron2.attentions import AttForward +from paddlespeech.t2s.modules.tacotron2.attentions import AttForwardTA +from paddlespeech.t2s.modules.tacotron2.attentions import AttLoc +from paddlespeech.t2s.modules.tacotron2.decoder import Decoder +from paddlespeech.t2s.modules.tacotron2.encoder import Encoder + + +class Tacotron2(nn.Layer): + """Tacotron2 module for end-to-end text-to-speech. + + This is a module of Spectrogram prediction network in Tacotron2 described + in `Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`_, + which converts the sequence of characters into the sequence of Mel-filterbanks. + + .. 
_`Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`: + https://arxiv.org/abs/1712.05884 + + """ + + def __init__( + self, + # network structure related + idim: int, + odim: int, + embed_dim: int=512, + elayers: int=1, + eunits: int=512, + econv_layers: int=3, + econv_chans: int=512, + econv_filts: int=5, + atype: str="location", + adim: int=512, + aconv_chans: int=32, + aconv_filts: int=15, + cumulate_att_w: bool=True, + dlayers: int=2, + dunits: int=1024, + prenet_layers: int=2, + prenet_units: int=256, + postnet_layers: int=5, + postnet_chans: int=512, + postnet_filts: int=5, + output_activation: str=None, + use_batch_norm: bool=True, + use_concate: bool=True, + use_residual: bool=False, + reduction_factor: int=1, + # extra embedding related + spk_num: Optional[int]=None, + lang_num: Optional[int]=None, + spk_embed_dim: Optional[int]=None, + spk_embed_integration_type: str="concat", + dropout_rate: float=0.5, + zoneout_rate: float=0.1, + # training related + init_type: str="xavier_uniform",): + """Initialize Tacotron2 module. + Parameters + ---------- + idim : int + Dimension of the inputs. + odim : int + Dimension of the outputs. + embed_dim : int + Dimension of the token embedding. + elayers : int + Number of encoder blstm layers. + eunits : int + Number of encoder blstm units. + econv_layers : int + Number of encoder conv layers. + econv_filts : int + Number of encoder conv filter size. + econv_chans : int + Number of encoder conv filter channels. + dlayers : int + Number of decoder lstm layers. + dunits : int + Number of decoder lstm units. + prenet_layers : int + Number of prenet layers. + prenet_units : int + Number of prenet units. + postnet_layers : int + Number of postnet layers. + postnet_filts : int + Number of postnet filter size. + postnet_chans : int + Number of postnet filter channels. + output_activation : str + Name of activation function for outputs. + adim : int + Number of dimension of mlp in attention. + aconv_chans : int + Number of attention conv filter channels. + aconv_filts : int + Number of attention conv filter size. + cumulate_att_w : bool + Whether to cumulate previous attention weight. + use_batch_norm : bool + Whether to use batch normalization. + use_concate : bool + Whether to concat enc outputs w/ dec lstm outputs. + reduction_factor : int + Reduction factor. + spk_num : Optional[int] + Number of speakers. If set to > 1, assume that the + sids will be provided as the input and use sid embedding layer. + lang_num : Optional[int] + Number of languages. If set to > 1, assume that the + lids will be provided as the input and use sid embedding layer. + spk_embed_dim : Optional[int] + Speaker embedding dimension. If set to > 0, + assume that spk_emb will be provided as the input. + spk_embed_integration_type : str + How to integrate speaker embedding. + dropout_rate : float + Dropout rate. + zoneout_rate : float + Zoneout rate. + """ + assert check_argument_types() + super().__init__() + + # store hyperparameters + self.idim = idim + self.odim = odim + self.eos = idim - 1 + self.cumulate_att_w = cumulate_att_w + self.reduction_factor = reduction_factor + + # define activation function for the final output + if output_activation is None: + self.output_activation_fn = None + elif hasattr(F, output_activation): + self.output_activation_fn = getattr(F, output_activation) + else: + raise ValueError(f"there is no such an activation function. 
" + f"({output_activation})") + + # set padding idx + padding_idx = 0 + self.padding_idx = padding_idx + + # initialize parameters + initialize(self, init_type) + + # define network modules + self.enc = Encoder( + idim=idim, + embed_dim=embed_dim, + elayers=elayers, + eunits=eunits, + econv_layers=econv_layers, + econv_chans=econv_chans, + econv_filts=econv_filts, + use_batch_norm=use_batch_norm, + use_residual=use_residual, + dropout_rate=dropout_rate, + padding_idx=padding_idx, ) + + self.spk_num = None + if spk_num is not None and spk_num > 1: + self.spk_num = spk_num + self.sid_emb = nn.Embedding(spk_num, eunits) + self.lang_num = None + if lang_num is not None and lang_num > 1: + self.lang_num = lang_num + self.lid_emb = nn.Embedding(lang_num, eunits) + + self.spk_embed_dim = None + if spk_embed_dim is not None and spk_embed_dim > 0: + self.spk_embed_dim = spk_embed_dim + self.spk_embed_integration_type = spk_embed_integration_type + if self.spk_embed_dim is None: + dec_idim = eunits + elif self.spk_embed_integration_type == "concat": + dec_idim = eunits + spk_embed_dim + elif self.spk_embed_integration_type == "add": + dec_idim = eunits + self.projection = nn.Linear(self.spk_embed_dim, eunits) + else: + raise ValueError(f"{spk_embed_integration_type} is not supported.") + + if atype == "location": + att = AttLoc(dec_idim, dunits, adim, aconv_chans, aconv_filts) + elif atype == "forward": + att = AttForward(dec_idim, dunits, adim, aconv_chans, aconv_filts) + if self.cumulate_att_w: + logging.warning("cumulation of attention weights is disabled " + "in forward attention.") + self.cumulate_att_w = False + elif atype == "forward_ta": + att = AttForwardTA(dec_idim, dunits, adim, aconv_chans, aconv_filts, + odim) + if self.cumulate_att_w: + logging.warning("cumulation of attention weights is disabled " + "in forward attention.") + self.cumulate_att_w = False + else: + raise NotImplementedError("Support only location or forward") + self.dec = Decoder( + idim=dec_idim, + odim=odim, + att=att, + dlayers=dlayers, + dunits=dunits, + prenet_layers=prenet_layers, + prenet_units=prenet_units, + postnet_layers=postnet_layers, + postnet_chans=postnet_chans, + postnet_filts=postnet_filts, + output_activation_fn=self.output_activation_fn, + cumulate_att_w=self.cumulate_att_w, + use_batch_norm=use_batch_norm, + use_concate=use_concate, + dropout_rate=dropout_rate, + zoneout_rate=zoneout_rate, + reduction_factor=reduction_factor, ) + + nn.initializer.set_global_initializer(None) + + def forward( + self, + text: paddle.Tensor, + text_lengths: paddle.Tensor, + speech: paddle.Tensor, + speech_lengths: paddle.Tensor, + spk_emb: Optional[paddle.Tensor]=None, + spk_id: Optional[paddle.Tensor]=None, + lang_id: Optional[paddle.Tensor]=None + ) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]: + """Calculate forward propagation. + + Parameters + ---------- + text : Tensor(int64) + Batch of padded character ids (B, T_text). + text_lengths : Tensor(int64) + Batch of lengths of each input batch (B,). + speech : Tensor + Batch of padded target features (B, T_feats, odim). + speech_lengths : Tensor(int64) + Batch of the lengths of each target (B,). + spk_emb : Optional[Tensor] + Batch of speaker embeddings (B, spk_embed_dim). + spk_id : Optional[Tensor] + Batch of speaker IDs (B, 1). + lang_id : Optional[Tensor] + Batch of language IDs (B, 1). + + Returns + ---------- + Tensor + Loss scalar value. + Dict + Statistics to be monitored. + Tensor + Weight value if not joint training else model outputs. 
+ + """ + text = text[:, :text_lengths.max()] + speech = speech[:, :speech_lengths.max()] + + batch_size = paddle.shape(text)[0] + + # Add eos at the last of sequence + xs = F.pad(text, [0, 0, 0, 1], "constant", self.padding_idx) + for i, l in enumerate(text_lengths): + xs[i, l] = self.eos + ilens = text_lengths + 1 + + ys = speech + olens = speech_lengths + + # make labels for stop prediction + labels = make_pad_mask(olens - 1) + # bool 类型无法切片 + labels = paddle.cast(labels, dtype='float32') + labels = F.pad(labels, [0, 0, 0, 1], "constant", 1.0) + + # calculate tacotron2 outputs + after_outs, before_outs, logits, att_ws = self._forward( + xs=xs, + ilens=ilens, + ys=ys, + olens=olens, + spk_emb=spk_emb, + spk_id=spk_id, + lang_id=lang_id, ) + + # modify mod part of groundtruth + if self.reduction_factor > 1: + assert olens.ge(self.reduction_factor).all( + ), "Output length must be greater than or equal to reduction factor." + olens = olens - olens % self.reduction_factor + max_out = max(olens) + ys = ys[:, :max_out] + labels = labels[:, :max_out] + labels = paddle.scatter(labels, 1, (olens - 1).unsqueeze(1), 1.0) + return after_outs, before_outs, logits, ys, labels, olens, att_ws, ilens + + def _forward( + self, + xs: paddle.Tensor, + ilens: paddle.Tensor, + ys: paddle.Tensor, + olens: paddle.Tensor, + spk_emb: paddle.Tensor, + spk_id: paddle.Tensor, + lang_id: paddle.Tensor, + ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: + + hs, hlens = self.enc(xs, ilens) + if self.spk_num is not None: + sid_embs = self.sid_emb(spk_id.reshape([-1])) + hs = hs + sid_embs.unsqueeze(1) + if self.lang_num is not None: + lid_embs = self.lid_emb(lang_id.reshape([-1])) + hs = hs + lid_embs.unsqueeze(1) + if self.spk_embed_dim is not None: + hs = self._integrate_with_spk_embed(hs, spk_emb) + + return self.dec(hs, hlens, ys) + + def inference( + self, + text: paddle.Tensor, + speech: Optional[paddle.Tensor]=None, + spk_emb: Optional[paddle.Tensor]=None, + spk_id: Optional[paddle.Tensor]=None, + lang_id: Optional[paddle.Tensor]=None, + threshold: float=0.5, + minlenratio: float=0.0, + maxlenratio: float=10.0, + use_att_constraint: bool=False, + backward_window: int=1, + forward_window: int=3, + use_teacher_forcing: bool=False, ) -> Dict[str, paddle.Tensor]: + """Generate the sequence of features given the sequences of characters. + + Parameters + ---------- + text Tensor(int64) + Input sequence of characters (T_text,). + speech : Optional[Tensor] + Feature sequence to extract style (N, idim). + spk_emb : ptional[Tensor] + Speaker embedding (spk_embed_dim,). + spk_id : Optional[Tensor] + Speaker ID (1,). + lang_id : Optional[Tensor] + Language ID (1,). + threshold : float + Threshold in inference. + minlenratio : float + Minimum length ratio in inference. + maxlenratio : float + Maximum length ratio in inference. + use_att_constraint : bool + Whether to apply attention constraint. + backward_window : int + Backward window in attention constraint. + forward_window : int + Forward window in attention constraint. + use_teacher_forcing : bool + Whether to use teacher forcing. + + Return + ---------- + Dict[str, Tensor] + Output dict including the following items: + * feat_gen (Tensor): Output sequence of features (T_feats, odim). + * prob (Tensor): Output sequence of stop probabilities (T_feats,). + * att_w (Tensor): Attention weights (T_feats, T). 
+ + """ + x = text + y = speech + + # add eos at the last of sequence + x = F.pad(x, [0, 1], "constant", self.eos) + + # inference with teacher forcing + if use_teacher_forcing: + assert speech is not None, "speech must be provided with teacher forcing." + + xs, ys = x.unsqueeze(0), y.unsqueeze(0) + spk_emb = None if spk_emb is None else spk_emb.unsqueeze(0) + ilens = paddle.shape(xs)[1] + olens = paddle.shape(ys)[1] + outs, _, _, att_ws = self._forward( + xs=xs, + ilens=ilens, + ys=ys, + olens=olens, + spk_emb=spk_emb, + spk_id=spk_id, + lang_id=lang_id, ) + + return dict(feat_gen=outs[0], att_w=att_ws[0]) + + # inference + h = self.enc.inference(x) + if self.spk_num is not None: + sid_emb = self.sid_emb(spk_id.reshape([-1])) + h = h + sid_emb + if self.lang_num is not None: + lid_emb = self.lid_emb(lang_id.reshape([-1])) + h = h + lid_emb + if self.spk_embed_dim is not None: + hs, spk_emb = h.unsqueeze(0), spk_emb.unsqueeze(0) + h = self._integrate_with_spk_embed(hs, spk_emb)[0] + out, prob, att_w = self.dec.inference( + h, + threshold=threshold, + minlenratio=minlenratio, + maxlenratio=maxlenratio, + use_att_constraint=use_att_constraint, + backward_window=backward_window, + forward_window=forward_window, ) + + return dict(feat_gen=out, prob=prob, att_w=att_w) + + def _integrate_with_spk_embed(self, + hs: paddle.Tensor, + spk_emb: paddle.Tensor) -> paddle.Tensor: + """Integrate speaker embedding with hidden states. + + Parameters + ---------- + hs : Tensor + Batch of hidden state sequences (B, Tmax, eunits). + spk_emb : Tensor + Batch of speaker embeddings (B, spk_embed_dim). + + Returns + ---------- + Tensor + Batch of integrated hidden state sequences (B, Tmax, eunits) if + integration_type is "add" else (B, Tmax, eunits + spk_embed_dim). + + """ + if self.spk_embed_integration_type == "add": + # apply projection and then add to hidden states + spk_emb = self.projection(F.normalize(spk_emb)) + hs = hs + spk_emb.unsqueeze(1) + elif self.spk_embed_integration_type == "concat": + # concat hidden states with spk embeds + spk_emb = F.normalize(spk_emb).unsqueeze(1).expand( + -1, paddle.shape(hs)[1], -1) + hs = paddle.concat([hs, spk_emb], axis=-1) + else: + raise NotImplementedError("support only add or concat.") + + return hs + + +class Tacotron2Inference(nn.Layer): + def __init__(self, normalizer, model): + super().__init__() + self.normalizer = normalizer + self.acoustic_model = model + + def forward(self, text, spk_id=None, spk_emb=None): + out = self.acoustic_model.inference( + text, spk_id=spk_id, spk_emb=spk_emb) + normalized_mel = out["feat_gen"] + logmel = self.normalizer.inverse(normalized_mel) + return logmel diff --git a/paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py b/paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py new file mode 100644 index 00000000..f1a2a50e --- /dev/null +++ b/paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py @@ -0,0 +1,217 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import logging +from pathlib import Path +from typing import Dict + +from paddle import distributed as dist +from paddle.io import DataLoader +from paddle.nn import Layer +from paddle.optimizer import Optimizer + +from paddlespeech.t2s.modules.losses import GuidedAttentionLoss +from paddlespeech.t2s.modules.losses import Tacotron2Loss +from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator +from paddlespeech.t2s.training.reporter import report +from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater +logging.basicConfig( + format='%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s', + datefmt='[%Y-%m-%d %H:%M:%S]') +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +class Tacotron2Updater(StandardUpdater): + def __init__(self, + model: Dict[str, Layer], + optimizer: Dict[str, Optimizer], + dataloader: DataLoader, + init_state=None, + use_masking: bool=True, + use_weighted_masking: bool=False, + bce_pos_weight: float=5.0, + loss_type: str="L1+L2", + use_guided_attn_loss: bool=True, + guided_attn_loss_sigma: float=0.4, + guided_attn_loss_lambda: float=1.0, + output_dir: Path=None): + super().__init__(model, optimizer, dataloader, init_state=None) + + self.loss_type = loss_type + self.use_guided_attn_loss = use_guided_attn_loss + + self.taco2_loss = Tacotron2Loss( + use_masking=use_masking, + use_weighted_masking=use_weighted_masking, + bce_pos_weight=bce_pos_weight, ) + if self.use_guided_attn_loss: + self.attn_loss = GuidedAttentionLoss( + sigma=guided_attn_loss_sigma, + alpha=guided_attn_loss_lambda, ) + + log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) + self.filehandler = logging.FileHandler(str(log_file)) + logger.addHandler(self.filehandler) + self.logger = logger + self.msg = "" + + def update_core(self, batch): + self.msg = "Rank: {}, ".format(dist.get_rank()) + losses_dict = {} + # spk_id!=None in multiple spk fastspeech2 + spk_id = batch["spk_id"] if "spk_id" in batch else None + spk_emb = batch["spk_emb"] if "spk_emb" in batch else None + # No explicit speaker identifier labels are used during voice cloning training. 
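# in the single-speaker CSMSC recipe the batch built by
# tacotron2_single_spk_batch_fn carries neither "spk_id" nor "spk_emb", so
# both stay None here; when a speaker embedding is present (voice-cloning
# style data) it takes precedence and spk_id is dropped below.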
+ if spk_emb is not None: + spk_id = None + + after_outs, before_outs, logits, ys, labels, olens, att_ws, ilens = self.model( + text=batch["text"], + text_lengths=batch["text_lengths"], + speech=batch["speech"], + speech_lengths=batch["speech_lengths"], + spk_id=spk_id, + spk_emb=spk_emb) + + # calculate taco2 loss + l1_loss, mse_loss, bce_loss = self.taco2_loss(after_outs, before_outs, + logits, ys, labels, olens) + + if self.loss_type == "L1+L2": + loss = l1_loss + mse_loss + bce_loss + elif self.loss_type == "L1": + loss = l1_loss + bce_loss + elif self.loss_type == "L2": + loss = mse_loss + bce_loss + else: + raise ValueError(f"unknown --loss-type {self.loss_type}") + + # calculate attention loss + if self.use_guided_attn_loss: + # NOTE: length of output for auto-regressive + # input will be changed when r > 1 + if self.model.reduction_factor > 1: + olens_in = olens // self.model.reduction_factor + else: + olens_in = olens + attn_loss = self.attn_loss(att_ws, ilens, olens_in) + loss = loss + attn_loss + + optimizer = self.optimizer + optimizer.clear_grad() + loss.backward() + optimizer.step() + + report("train/l1_loss", float(l1_loss)) + report("train/mse_loss", float(mse_loss)) + report("train/bce_loss", float(bce_loss)) + report("train/attn_loss", float(attn_loss)) + report("train/loss", float(loss)) + + losses_dict["l1_loss"] = float(l1_loss) + losses_dict["mse_loss"] = float(mse_loss) + losses_dict["bce_loss"] = float(bce_loss) + losses_dict["attn_loss"] = float(attn_loss) + losses_dict["loss"] = float(loss) + self.msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_dict.items()) + + +class Tacotron2Evaluator(StandardEvaluator): + def __init__(self, + model, + dataloader, + use_masking: bool=True, + use_weighted_masking: bool=False, + bce_pos_weight: float=5.0, + loss_type: str="L1+L2", + use_guided_attn_loss: bool=True, + guided_attn_loss_sigma: float=0.4, + guided_attn_loss_lambda: float=1.0, + output_dir=None): + super().__init__(model, dataloader) + + self.loss_type = loss_type + self.use_guided_attn_loss = use_guided_attn_loss + + self.taco2_loss = Tacotron2Loss( + use_masking=use_masking, + use_weighted_masking=use_weighted_masking, + bce_pos_weight=bce_pos_weight, ) + if self.use_guided_attn_loss: + self.attn_loss = GuidedAttentionLoss( + sigma=guided_attn_loss_sigma, + alpha=guided_attn_loss_lambda, ) + + log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) + self.filehandler = logging.FileHandler(str(log_file)) + logger.addHandler(self.filehandler) + self.logger = logger + self.msg = "" + + def evaluate_core(self, batch): + self.msg = "Evaluate: " + losses_dict = {} + # spk_id!=None in multiple spk fastspeech2 + spk_id = batch["spk_id"] if "spk_id" in batch else None + spk_emb = batch["spk_emb"] if "spk_emb" in batch else None + if spk_emb is not None: + spk_id = None + + after_outs, before_outs, logits, ys, labels, olens, att_ws, ilens = self.model( + text=batch["text"], + text_lengths=batch["text_lengths"], + speech=batch["speech"], + speech_lengths=batch["speech_lengths"], + spk_id=spk_id, + spk_emb=spk_emb) + + # calculate taco2 loss + l1_loss, mse_loss, bce_loss = self.taco2_loss(after_outs, before_outs, + logits, ys, labels, olens) + + if self.loss_type == "L1+L2": + loss = l1_loss + mse_loss + bce_loss + elif self.loss_type == "L1": + loss = l1_loss + bce_loss + elif self.loss_type == "L2": + loss = mse_loss + bce_loss + else: + raise ValueError(f"unknown --loss-type {self.loss_type}") + + # calculate attention loss + if 
self.use_guided_attn_loss: + # NOTE: length of output for auto-regressive + # input will be changed when r > 1 + if self.model.reduction_factor > 1: + olens_in = olens // self.model.reduction_factor + else: + olens_in = olens + attn_loss = self.attn_loss(att_ws, ilens, olens_in) + loss = loss + attn_loss + + report("eval/l1_loss", float(l1_loss)) + report("eval/mse_loss", float(mse_loss)) + report("eval/bce_loss", float(bce_loss)) + report("eval/attn_loss", float(attn_loss)) + report("eval/loss", float(loss)) + + losses_dict["l1_loss"] = float(l1_loss) + losses_dict["mse_loss"] = float(mse_loss) + losses_dict["bce_loss"] = float(bce_loss) + losses_dict["attn_loss"] = float(attn_loss) + losses_dict["loss"] = float(loss) + self.msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_dict.items()) + self.logger.info(self.msg) diff --git a/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py b/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py index f16cf4dd..6022567e 100644 --- a/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py +++ b/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py @@ -46,24 +46,20 @@ class TransformerTTSUpdater(StandardUpdater): guided_attn_loss_sigma: float=0.4, guided_attn_loss_lambda: float=1.0, ): super().__init__(model, optimizer, dataloader, init_state=None) - self.use_masking = use_masking - self.use_weighted_masking = use_weighted_masking - self.bce_pos_weight = bce_pos_weight + self.loss_type = loss_type self.use_guided_attn_loss = use_guided_attn_loss - self.guided_attn_loss_sigma = guided_attn_loss_sigma - self.guided_attn_loss_lambda = guided_attn_loss_lambda self.modules_applied_guided_attn = modules_applied_guided_attn self.criterion = TransformerTTSLoss( - use_masking=self.use_masking, - use_weighted_masking=self.use_weighted_masking, - bce_pos_weight=self.bce_pos_weight) + use_masking=use_masking, + use_weighted_masking=use_weighted_masking, + bce_pos_weight=bce_pos_weight) if self.use_guided_attn_loss: self.attn_criterion = GuidedMultiHeadAttentionLoss( - sigma=self.guided_attn_loss_sigma, - alpha=self.guided_attn_loss_lambda, ) + sigma=guided_attn_loss_sigma, + alpha=guided_attn_loss_lambda, ) log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) self.filehandler = logging.FileHandler(str(log_file)) @@ -195,24 +191,20 @@ class TransformerTTSEvaluator(StandardEvaluator): guided_attn_loss_sigma: float=0.4, guided_attn_loss_lambda: float=1.0, ): super().__init__(model, dataloader) - self.use_masking = use_masking - self.use_weighted_masking = use_weighted_masking - self.bce_pos_weight = bce_pos_weight + self.loss_type = loss_type self.use_guided_attn_loss = use_guided_attn_loss - self.guided_attn_loss_sigma = guided_attn_loss_sigma - self.guided_attn_loss_lambda = guided_attn_loss_lambda self.modules_applied_guided_attn = modules_applied_guided_attn self.criterion = TransformerTTSLoss( - use_masking=self.use_masking, - use_weighted_masking=self.use_weighted_masking, - bce_pos_weight=self.bce_pos_weight) + use_masking=use_masking, + use_weighted_masking=use_weighted_masking, + bce_pos_weight=bce_pos_weight) if self.use_guided_attn_loss: self.attn_criterion = GuidedMultiHeadAttentionLoss( - sigma=self.guided_attn_loss_sigma, - alpha=self.guided_attn_loss_lambda, ) + sigma=guided_attn_loss_sigma, + alpha=guided_attn_loss_lambda, ) log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) self.filehandler = logging.FileHandler(str(log_file)) diff --git 
a/paddlespeech/t2s/modules/losses.py b/paddlespeech/t2s/modules/losses.py index 569e96ad..0cb0c6fd 100644 --- a/paddlespeech/t2s/modules/losses.py +++ b/paddlespeech/t2s/modules/losses.py @@ -20,6 +20,250 @@ from paddle.fluid.layers import sequence_mask from paddle.nn import functional as F from scipy import signal +from paddlespeech.s2t.modules.mask import make_non_pad_mask + + +# Loss for new Tacotron2 +class GuidedAttentionLoss(nn.Layer): + """Guided attention loss function module. + This module calculates the guided attention loss described + in `Efficiently Trainable Text-to-Speech System Based + on Deep Convolutional Networks with Guided Attention`_, + which forces the attention to be diagonal. + .. _`Efficiently Trainable Text-to-Speech System + Based on Deep Convolutional Networks with Guided Attention`: + https://arxiv.org/abs/1710.08969 + """ + + def __init__(self, sigma=0.4, alpha=1.0, reset_always=True): + """Initialize guided attention loss module. + Parameters + ---------- + sigma : float, optional + Standard deviation to control + how close attention to a diagonal. + alpha : float, optional + Scaling coefficient (lambda). + reset_always : bool, optional + Whether to always reset masks. + """ + super().__init__() + self.sigma = sigma + self.alpha = alpha + self.reset_always = reset_always + self.guided_attn_masks = None + self.masks = None + + def _reset_masks(self): + self.guided_attn_masks = None + self.masks = None + + def forward(self, att_ws, ilens, olens): + """Calculate forward propagation. + Parameters + ---------- + att_ws : Tensor + Batch of attention weights (B, T_max_out, T_max_in). + ilens : Tensor(int64) + Batch of input lengths (B,). + olens : Tensor(int64) + Batch of output lengths (B,). + Returns + ---------- + Tensor + Guided attention loss value. + """ + if self.guided_attn_masks is None: + self.guided_attn_masks = self._make_guided_attention_masks(ilens, + olens) + if self.masks is None: + self.masks = self._make_masks(ilens, olens) + losses = self.guided_attn_masks * att_ws + loss = paddle.mean(losses.masked_select(self.masks)) + if self.reset_always: + self._reset_masks() + return self.alpha * loss + + def _make_guided_attention_masks(self, ilens, olens): + n_batches = len(ilens) + max_ilen = max(ilens) + max_olen = max(olens) + guided_attn_masks = paddle.zeros((n_batches, max_olen, max_ilen)) + for idx, (ilen, olen) in enumerate(zip(ilens, olens)): + guided_attn_masks[idx, :olen, : + ilen] = self._make_guided_attention_mask( + ilen, olen, self.sigma) + return guided_attn_masks + + @staticmethod + def _make_guided_attention_mask(ilen, olen, sigma): + """Make guided attention mask. 
+ Parameters + ---------- + >>> guided_attn_mask =_make_guided_attention(5, 5, 0.4) + >>> guided_attn_mask.shape + Size([5, 5]) + >>> guided_attn_mask + tensor([[0.0000, 0.1175, 0.3935, 0.6753, 0.8647], + [0.1175, 0.0000, 0.1175, 0.3935, 0.6753], + [0.3935, 0.1175, 0.0000, 0.1175, 0.3935], + [0.6753, 0.3935, 0.1175, 0.0000, 0.1175], + [0.8647, 0.6753, 0.3935, 0.1175, 0.0000]]) + >>> guided_attn_mask =_make_guided_attention(3, 6, 0.4) + >>> guided_attn_mask.shape + Size([6, 3]) + >>> guided_attn_mask + tensor([[0.0000, 0.2934, 0.7506], + [0.0831, 0.0831, 0.5422], + [0.2934, 0.0000, 0.2934], + [0.5422, 0.0831, 0.0831], + [0.7506, 0.2934, 0.0000], + [0.8858, 0.5422, 0.0831]]) + """ + grid_x, grid_y = paddle.meshgrid( + paddle.arange(olen), paddle.arange(ilen)) + grid_x = paddle.cast(grid_x, dtype='float32') + grid_y = paddle.cast(grid_y, dtype='float32') + + return 1.0 - paddle.exp(-( + (grid_y / ilen - grid_x / olen)**2) / (2 * (sigma**2))) + + @staticmethod + def _make_masks(ilens, olens): + """Make masks indicating non-padded part. + Examples + ---------- + ilens : Tensor(int64) or List + Batch of lengths (B,). + olens : Tensor(int64) or List + Batch of lengths (B,). + Returns + ---------- + Tensor + Mask tensor indicating non-padded part. + Examples + ---------- + >>> ilens, olens = [5, 2], [8, 5] + >>> _make_mask(ilens, olens) + tensor([[[1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1]], + [[1, 1, 0, 0, 0], + [1, 1, 0, 0, 0], + [1, 1, 0, 0, 0], + [1, 1, 0, 0, 0], + [1, 1, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0]]],) + """ + # (B, T_in) + in_masks = make_non_pad_mask(ilens) + # (B, T_out) + out_masks = make_non_pad_mask(olens) + # (B, T_out, T_in) + return out_masks.unsqueeze(-1) & in_masks.unsqueeze(-2) + + +class Tacotron2Loss(nn.Layer): + """Loss function module for Tacotron2.""" + + def __init__(self, + use_masking=True, + use_weighted_masking=False, + bce_pos_weight=20.0): + """Initialize Tactoron2 loss module. + Parameters + ---------- + use_masking : bool + Whether to apply masking for padded part in loss calculation. + use_weighted_masking : bool + Whether to apply weighted masking in loss calculation. + bce_pos_weight : float + Weight of positive sample of stop token. + """ + super().__init__() + assert (use_masking != use_weighted_masking) or not use_masking + self.use_masking = use_masking + self.use_weighted_masking = use_weighted_masking + + # define criterions + reduction = "none" if self.use_weighted_masking else "mean" + self.l1_criterion = nn.L1Loss(reduction=reduction) + self.mse_criterion = nn.MSELoss(reduction=reduction) + self.bce_criterion = nn.BCEWithLogitsLoss( + reduction=reduction, pos_weight=paddle.to_tensor(bce_pos_weight)) + + def forward(self, after_outs, before_outs, logits, ys, labels, olens): + """Calculate forward propagation. + Parameters + ---------- + after_outs : Tensor + Batch of outputs after postnets (B, Lmax, odim). + before_outs : Tensor + Batch of outputs before postnets (B, Lmax, odim). + logits : Tensor + Batch of stop logits (B, Lmax). + ys : Tensor + Batch of padded target features (B, Lmax, odim). + labels : Tensor(int64) + Batch of the sequences of stop token labels (B, Lmax). + olens : Tensor(int64) + Batch of the lengths of each target (B,). + Returns + ---------- + Tensor + L1 loss value. + Tensor + Mean square error loss value. + Tensor + Binary cross entropy loss value. 
+ """ + # make mask and apply it + if self.use_masking: + masks = make_non_pad_mask(olens).unsqueeze(-1) + ys = ys.masked_select(masks.broadcast_to(ys.shape)) + after_outs = after_outs.masked_select( + masks.broadcast_to(after_outs.shape)) + before_outs = before_outs.masked_select( + masks.broadcast_to(before_outs.shape)) + labels = labels.masked_select( + masks[:, :, 0].broadcast_to(labels.shape)) + logits = logits.masked_select( + masks[:, :, 0].broadcast_to(logits.shape)) + + # calculate loss + l1_loss = self.l1_criterion(after_outs, ys) + self.l1_criterion( + before_outs, ys) + mse_loss = self.mse_criterion(after_outs, ys) + self.mse_criterion( + before_outs, ys) + bce_loss = self.bce_criterion(logits, labels) + + # make weighted mask and apply it + if self.use_weighted_masking: + masks = make_non_pad_mask(olens).unsqueeze(-1) + weights = masks.float() / masks.sum(axis=1, keepdim=True).float() + out_weights = weights.divide( + paddle.shape(ys)[0] * paddle.shape(ys)[2]) + logit_weights = weights.divide(paddle.shape(ys)[0]) + + # apply weight + l1_loss = l1_loss.multiply(out_weights) + l1_loss = l1_loss.masked_select(masks.broadcast_to(l1_loss)).sum() + mse_loss = mse_loss.multiply(out_weights) + mse_loss = mse_loss.masked_select( + masks.broadcast_to(mse_loss)).sum() + bce_loss = bce_loss.multiply(logit_weights.squeeze(-1)) + bce_loss = bce_loss.masked_select( + masks.squeeze(-1).broadcast_to(bce_loss)).sum() + + return l1_loss, mse_loss, bce_loss + # Loss for Tacotron2 def attention_guide(dec_lens, enc_lens, N, T, g, dtype=None): diff --git a/paddlespeech/t2s/modules/tacotron2/attentions.py b/paddlespeech/t2s/modules/tacotron2/attentions.py new file mode 100644 index 00000000..2b912db3 --- /dev/null +++ b/paddlespeech/t2s/modules/tacotron2/attentions.py @@ -0,0 +1,519 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Attention modules for RNN.""" +import paddle +import paddle.nn.functional as F +from paddle import nn + +from paddlespeech.t2s.modules.masked_fill import masked_fill +from paddlespeech.t2s.modules.nets_utils import make_pad_mask + + +def _apply_attention_constraint(e, + last_attended_idx, + backward_window=1, + forward_window=3): + """Apply monotonic attention constraint. + + This function apply the monotonic attention constraint + introduced in `Deep Voice 3: Scaling + Text-to-Speech with Convolutional Sequence Learning`_. + + Parameters + ---------- + e : Tensor + Attention energy before applying softmax (1, T). + last_attended_idx : int + The index of the inputs of the last attended [0, T]. + backward_window : int, optional + Backward window size in attention constraint. + forward_window : int, optional + Forward window size in attetion constraint. + + Returns + ---------- + Tensor + Monotonic constrained attention energy (1, T). + + .. 
_`Deep Voice 3: Scaling Text-to-Speech with Convolutional Sequence Learning`: + https://arxiv.org/abs/1710.07654 + + """ + if paddle.shape(e)[0] != 1: + raise NotImplementedError( + "Batch attention constraining is not yet supported.") + backward_idx = last_attended_idx - backward_window + forward_idx = last_attended_idx + forward_window + if backward_idx > 0: + e[:, :backward_idx] = -float("inf") + if forward_idx < paddle.shape(e)[1]: + e[:, forward_idx:] = -float("inf") + return e + + +class AttLoc(nn.Layer): + """location-aware attention module. + + Reference: Attention-Based Models for Speech Recognition + (https://arxiv.org/pdf/1506.07503.pdf) + Parameters + ---------- + eprojs : int + projection-units of encoder + dunits : int + units of decoder + att_dim : int + att_dim: attention dimension + aconv_chans : int + channels of attention convolution + aconv_filts : int + filter size of attention convolution + han_mode : bool + flag to swith on mode of hierarchical attention and not store pre_compute_enc_h + """ + + def __init__(self, + eprojs, + dunits, + att_dim, + aconv_chans, + aconv_filts, + han_mode=False): + super().__init__() + self.mlp_enc = nn.Linear(eprojs, att_dim) + self.mlp_dec = nn.Linear(dunits, att_dim, bias_attr=False) + self.mlp_att = nn.Linear(aconv_chans, att_dim, bias_attr=False) + self.loc_conv = nn.Conv2D( + 1, + aconv_chans, + (1, 2 * aconv_filts + 1), + padding=(0, aconv_filts), + bias_attr=False, ) + self.gvec = nn.Linear(att_dim, 1) + + self.dunits = dunits + self.eprojs = eprojs + self.att_dim = att_dim + self.h_length = None + self.enc_h = None + self.pre_compute_enc_h = None + self.mask = None + self.han_mode = han_mode + + def reset(self): + """reset states""" + self.h_length = None + self.enc_h = None + self.pre_compute_enc_h = None + self.mask = None + + def forward( + self, + enc_hs_pad, + enc_hs_len, + dec_z, + att_prev, + scaling=2.0, + last_attended_idx=None, + backward_window=1, + forward_window=3, ): + """Calculate AttLoc forward propagation. + Parameters + ---------- + enc_hs_pad : paddle.Tensor + padded encoder hidden state (B, T_max, D_enc) + enc_hs_len : paddle.Tensor + padded encoder hidden state length (B) + dec_z : paddle.Tensor dec_z + decoder hidden state (B, D_dec) + att_prev : paddle.Tensor + previous attention weight (B, T_max) + scaling : float + scaling parameter before applying softmax + forward_window : paddle.Tensor + forward window size when constraining attention + last_attended_idx : int + index of the inputs of the last attended + backward_window : int + backward window size in attention constraint + forward_window : int + forward window size in attetion constraint + + Returns + ---------- + paddle.Tensor + attention weighted encoder state (B, D_enc) + paddle.Tensor + previous attention weights (B, T_max) + """ + batch = len(enc_hs_pad) + # pre-compute all h outside the decoder loop + if self.pre_compute_enc_h is None or self.han_mode: + # (utt, frame, hdim) + self.enc_h = enc_hs_pad + self.h_length = paddle.shape(self.enc_h)[1] + # (utt, frame, att_dim) + self.pre_compute_enc_h = self.mlp_enc(self.enc_h) + + if dec_z is None: + dec_z = paddle.zeros([batch, self.dunits]) + else: + dec_z = dec_z.reshape([batch, self.dunits]) + + # initialize attention weight with uniform dist. 
+ if att_prev is None: + # if no bias, 0 0-pad goes 0 + + att_prev = 1.0 - make_pad_mask(enc_hs_len) + att_prev = att_prev / enc_hs_len.unsqueeze(-1) + + # att_prev: (utt, frame) -> (utt, 1, 1, frame) + # -> (utt, att_conv_chans, 1, frame) + + att_conv = self.loc_conv(att_prev.reshape([batch, 1, 1, self.h_length])) + # att_conv: (utt, att_conv_chans, 1, frame) -> (utt, frame, att_conv_chans) + att_conv = att_conv.squeeze(2).transpose([0, 2, 1]) + # att_conv: (utt, frame, att_conv_chans) -> (utt, frame, att_dim) + att_conv = self.mlp_att(att_conv) + + # dec_z_tiled: (utt, frame, att_dim) + dec_z_tiled = self.mlp_dec(dec_z).reshape([batch, 1, self.att_dim]) + + # dot with gvec + # (utt, frame, att_dim) -> (utt, frame) + e = self.gvec( + paddle.tanh(att_conv + self.pre_compute_enc_h + + dec_z_tiled)).squeeze(2) + + # NOTE: consider zero padding when compute w. + if self.mask is None: + self.mask = make_pad_mask(enc_hs_len) + e = masked_fill(e, self.mask, -float("inf")) + # apply monotonic attention constraint (mainly for TTS) + if last_attended_idx is not None: + e = _apply_attention_constraint(e, last_attended_idx, + backward_window, forward_window) + + w = F.softmax(scaling * e, axis=1) + + # weighted sum over flames + # utt x hdim + c = paddle.sum( + self.enc_h * w.reshape([batch, self.h_length, 1]), axis=1) + + return c, w + + +class AttForward(nn.Layer): + """Forward attention module. + Reference + ---------- + Forward attention in sequence-to-sequence acoustic modeling for speech synthesis + (https://arxiv.org/pdf/1807.06736.pdf) + + Parameters + ---------- + eprojs : int + projection-units of encoder + dunits : int + units of decoder + att_dim : int + attention dimension + aconv_chans : int + channels of attention convolution + aconv_filts : int + filter size of attention convolution + """ + + def __init__(self, eprojs, dunits, att_dim, aconv_chans, aconv_filts): + super().__init__() + self.mlp_enc = nn.Linear(eprojs, att_dim) + self.mlp_dec = nn.Linear(dunits, att_dim, bias_attr=False) + self.mlp_att = nn.Linear(aconv_chans, att_dim, bias_attr=False) + self.loc_conv = nn.Conv2D( + 1, + aconv_chans, + (1, 2 * aconv_filts + 1), + padding=(0, aconv_filts), + bias_attr=False, ) + self.gvec = nn.Linear(att_dim, 1) + self.dunits = dunits + self.eprojs = eprojs + self.att_dim = att_dim + self.h_length = None + self.enc_h = None + self.pre_compute_enc_h = None + self.mask = None + + def reset(self): + """reset states""" + self.h_length = None + self.enc_h = None + self.pre_compute_enc_h = None + self.mask = None + + def forward( + self, + enc_hs_pad, + enc_hs_len, + dec_z, + att_prev, + scaling=1.0, + last_attended_idx=None, + backward_window=1, + forward_window=3, ): + """Calculate AttForward forward propagation. 
+ Parameters + ---------- + enc_hs_pad : paddle.Tensor + padded encoder hidden state (B, T_max, D_enc) + enc_hs_len : list + padded encoder hidden state length (B,) + dec_z : paddle.Tensor + decoder hidden state (B, D_dec) + att_prev : paddle.Tensor + attention weights of previous step (B, T_max) + scaling : float + scaling parameter before applying softmax + last_attended_idx : int + index of the inputs of the last attended + backward_window : int + backward window size in attention constraint + forward_window : int + forward window size in attetion constraint + Returns + ---------- + paddle.Tensor + attention weighted encoder state (B, D_enc) + paddle.Tensor + previous attention weights (B, T_max) + """ + batch = len(enc_hs_pad) + # pre-compute all h outside the decoder loop + if self.pre_compute_enc_h is None: + self.enc_h = enc_hs_pad # utt x frame x hdim + self.h_length = paddle.shape(self.enc_h)[1] + # utt x frame x att_dim + self.pre_compute_enc_h = self.mlp_enc(self.enc_h) + + if dec_z is None: + dec_z = paddle.zeros([batch, self.dunits]) + else: + dec_z = dec_z.reshape([batch, self.dunits]) + + if att_prev is None: + # initial attention will be [1, 0, 0, ...] + att_prev = paddle.zeros([*paddle.shape(enc_hs_pad)[:2]]) + att_prev[:, 0] = 1.0 + + # att_prev: utt x frame -> utt x 1 x 1 x frame + # -> utt x att_conv_chans x 1 x frame + att_conv = self.loc_conv(att_prev.reshape([batch, 1, 1, self.h_length])) + # att_conv: utt x att_conv_chans x 1 x frame -> utt x frame x att_conv_chans + att_conv = att_conv.squeeze(2).transpose([0, 2, 1]) + # att_conv: utt x frame x att_conv_chans -> utt x frame x att_dim + att_conv = self.mlp_att(att_conv) + + # dec_z_tiled: utt x frame x att_dim + dec_z_tiled = self.mlp_dec(dec_z).unsqueeze(1) + + # dot with gvec + # utt x frame x att_dim -> utt x frame + e = self.gvec( + paddle.tanh(self.pre_compute_enc_h + dec_z_tiled + + att_conv)).squeeze(2) + + # NOTE: consider zero padding when compute w. + if self.mask is None: + self.mask = make_pad_mask(enc_hs_len) + e = masked_fill(e, self.mask, -float("inf")) + + # apply monotonic attention constraint (mainly for TTS) + if last_attended_idx is not None: + e = _apply_attention_constraint(e, last_attended_idx, + backward_window, forward_window) + + w = F.softmax(scaling * e, axis=1) + + # forward attention + att_prev_shift = F.pad(att_prev, (0, 0, 1, 0))[:, :-1] + + w = (att_prev + att_prev_shift) * w + # NOTE: clip is needed to avoid nan gradient + w = F.normalize(paddle.clip(w, 1e-6), p=1, axis=1) + + # weighted sum over flames + # utt x hdim + # NOTE use bmm instead of sum(*) + c = paddle.sum(self.enc_h * w.unsqueeze(-1), axis=1) + + return c, w + + +class AttForwardTA(nn.Layer): + """Forward attention with transition agent module. 
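All of the attention modules in this file accept `last_attended_idx`, `backward_window` and `forward_window`, which funnel into `_apply_attention_constraint` above. A small NumPy sketch (an illustrative re-implementation of the same windowing rule, not the patch code itself) shows which energies survive with the default windows:

```python
import numpy as np

# hypothetical single-utterance energy row (1, T) with T = 20
e = np.zeros((1, 20), dtype=np.float32)
last_attended_idx, backward_window, forward_window = 10, 1, 3

backward_idx = last_attended_idx - backward_window   # 9
forward_idx = last_attended_idx + forward_window     # 13
e[:, :backward_idx] = -np.inf                        # mask everything before the window
e[:, forward_idx:] = -np.inf                         # mask everything at/after forward_idx
print(np.isfinite(e[0]).nonzero()[0])                # [ 9 10 11 12]
```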
+ Reference + ---------- + Forward attention in sequence-to-sequence acoustic modeling for speech synthesis + (https://arxiv.org/pdf/1807.06736.pdf) + Parameters + ---------- + eunits : int + units of encoder + dunits : int + units of decoder + att_dim : int + attention dimension + aconv_chans : int + channels of attention convolution + aconv_filts : int + filter size of attention convolution + odim : int + output dimension + """ + + def __init__(self, eunits, dunits, att_dim, aconv_chans, aconv_filts, odim): + super().__init__() + self.mlp_enc = nn.Linear(eunits, att_dim) + self.mlp_dec = nn.Linear(dunits, att_dim, bias_attr=False) + self.mlp_ta = nn.Linear(eunits + dunits + odim, 1) + self.mlp_att = nn.Linear(aconv_chans, att_dim, bias_attr=False) + self.loc_conv = nn.Conv2D( + 1, + aconv_chans, + (1, 2 * aconv_filts + 1), + padding=(0, aconv_filts), + bias_attr=False, ) + self.gvec = nn.Linear(att_dim, 1) + self.dunits = dunits + self.eunits = eunits + self.att_dim = att_dim + self.h_length = None + self.enc_h = None + self.pre_compute_enc_h = None + self.mask = None + self.trans_agent_prob = 0.5 + + def reset(self): + self.h_length = None + self.enc_h = None + self.pre_compute_enc_h = None + self.mask = None + self.trans_agent_prob = 0.5 + + def forward( + self, + enc_hs_pad, + enc_hs_len, + dec_z, + att_prev, + out_prev, + scaling=1.0, + last_attended_idx=None, + backward_window=1, + forward_window=3, ): + """Calculate AttForwardTA forward propagation. + Parameters + ---------- + enc_hs_pad : paddle.Tensor + padded encoder hidden state (B, Tmax, eunits) + enc_hs_len : list paddle.Tensor + padded encoder hidden state length (B,) + dec_z : paddle.Tensor + decoder hidden state (B, dunits) + att_prev : paddle.Tensor + attention weights of previous step (B, T_max) + out_prev : paddle.Tensor + decoder outputs of previous step (B, odim) + scaling : float + scaling parameter before applying softmax + last_attended_idx : int + index of the inputs of the last attended + backward_window : int + backward window size in attention constraint + forward_window : int + forward window size in attetion constraint + Returns + ---------- + paddle.Tensor + attention weighted encoder state (B, dunits) + paddle.Tensor + previous attention weights (B, Tmax) + """ + batch = len(enc_hs_pad) + # pre-compute all h outside the decoder loop + if self.pre_compute_enc_h is None: + self.enc_h = enc_hs_pad # utt x frame x hdim + self.h_length = paddle.shape(self.enc_h)[1] + # utt x frame x att_dim + self.pre_compute_enc_h = self.mlp_enc(self.enc_h) + + if dec_z is None: + dec_z = paddle.zeros([batch, self.dunits]) + else: + dec_z = dec_z.reshape([batch, self.dunits]) + + if att_prev is None: + # initial attention will be [1, 0, 0, ...] 
+ att_prev = paddle.zeros([*paddle.shape(enc_hs_pad)[:2]]) + att_prev[:, 0] = 1.0 + + # att_prev: utt x frame -> utt x 1 x 1 x frame + # -> utt x att_conv_chans x 1 x frame + att_conv = self.loc_conv(att_prev.reshape([batch, 1, 1, self.h_length])) + # att_conv: utt x att_conv_chans x 1 x frame -> utt x frame x att_conv_chans + att_conv = att_conv.squeeze(2).transpose([0, 2, 1]) + # att_conv: utt x frame x att_conv_chans -> utt x frame x att_dim + att_conv = self.mlp_att(att_conv) + + # dec_z_tiled: utt x frame x att_dim + dec_z_tiled = self.mlp_dec(dec_z).reshape([batch, 1, self.att_dim]) + + # dot with gvec + # utt x frame x att_dim -> utt x frame + e = self.gvec( + paddle.tanh(att_conv + self.pre_compute_enc_h + + dec_z_tiled)).squeeze(2) + + # NOTE consider zero padding when compute w. + if self.mask is None: + self.mask = make_pad_mask(enc_hs_len) + e = masked_fill(e, self.mask, -float("inf")) + + # apply monotonic attention constraint (mainly for TTS) + if last_attended_idx is not None: + e = _apply_attention_constraint(e, last_attended_idx, + backward_window, forward_window) + + w = F.softmax(scaling * e, axis=1) + + # forward attention + # att_prev_shift = F.pad(att_prev.unsqueeze(0), (1, 0), data_format='NCL').squeeze(0)[:, :-1] + att_prev_shift = F.pad(att_prev, (0, 0, 1, 0))[:, :-1] + w = (self.trans_agent_prob * att_prev + + (1 - self.trans_agent_prob) * att_prev_shift) * w + # NOTE: clip is needed to avoid nan gradient + w = F.normalize(paddle.clip(w, 1e-6), p=1, axis=1) + + # weighted sum over flames + # utt x hdim + # NOTE use bmm instead of sum(*) + c = paddle.sum( + self.enc_h * w.reshape([batch, self.h_length, 1]), axis=1) + + # update transition agent prob + self.trans_agent_prob = F.sigmoid( + self.mlp_ta(paddle.concat([c, out_prev, dec_z], axis=1))) + + return c, w diff --git a/paddlespeech/t2s/modules/tacotron2/decoder.py b/paddlespeech/t2s/modules/tacotron2/decoder.py index 691bb3ee..fc15adfd 100644 --- a/paddlespeech/t2s/modules/tacotron2/decoder.py +++ b/paddlespeech/t2s/modules/tacotron2/decoder.py @@ -13,10 +13,13 @@ # limitations under the License. # Modified from espnet(https://github.com/espnet/espnet) """Tacotron2 decoder related modules.""" +import paddle import paddle.nn.functional as F import six from paddle import nn +from paddlespeech.t2s.modules.tacotron2.attentions import AttForwardTA + class Prenet(nn.Layer): """Prenet module for decoder of Spectrogram prediction network. @@ -196,3 +199,527 @@ class Postnet(nn.Layer): for i in six.moves.range(len(self.postnet)): xs = self.postnet[i](xs) return xs + + +class ZoneOutCell(nn.Layer): + """ZoneOut Cell module. + This is a module of zoneout described in + `Zoneout: Regularizing RNNs by Randomly Preserving Hidden Activations`_. + This code is modified from `eladhoffer/seq2seq.pytorch`_. + Examples + ---------- + >>> lstm = paddle.nn.LSTMCell(16, 32) + >>> lstm = ZoneOutCell(lstm, 0.5) + .. _`Zoneout: Regularizing RNNs by Randomly Preserving Hidden Activations`: + https://arxiv.org/abs/1606.01305 + .. _`eladhoffer/seq2seq.pytorch`: + https://github.com/eladhoffer/seq2seq.pytorch + """ + + def __init__(self, cell, zoneout_rate=0.1): + """Initialize zone out cell module. + Parameters + ---------- + cell : nn.Layer: + Paddle recurrent cell module + e.g. `paddle.nn.LSTMCell`. + zoneout_rate : float, optional + Probability of zoneout from 0.0 to 1.0. 
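A minimal usage sketch for `ZoneOutCell`, assuming the module path introduced by this patch; the tensor sizes are arbitrary and chosen only for illustration:

```python
import paddle
from paddlespeech.t2s.modules.tacotron2.decoder import ZoneOutCell

cell = ZoneOutCell(paddle.nn.LSTMCell(16, 32), zoneout_rate=0.1)
cell.eval()                              # in eval mode states are mixed deterministically
x = paddle.randn([4, 16])                # (B, input_size)
h0 = paddle.zeros([4, 32])               # (B, hidden_size)
c0 = paddle.zeros([4, 32])
y, (h1, c1) = cell(x, (h0, c0))          # same output format as paddle.nn.LSTMCell
print(y.shape, h1.shape, c1.shape)       # [4, 32] for each
```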
+ """ + super().__init__() + self.cell = cell + self.hidden_size = cell.hidden_size + self.zoneout_rate = zoneout_rate + if zoneout_rate > 1.0 or zoneout_rate < 0.0: + raise ValueError( + "zoneout probability must be in the range from 0.0 to 1.0.") + + def forward(self, inputs, hidden): + """Calculate forward propagation. + Parameters + ---------- + inputs : Tensor + Batch of input tensor (B, input_size). + hidden : tuple + - Tensor: Batch of initial hidden states (B, hidden_size). + - Tensor: Batch of initial cell states (B, hidden_size). + Returns + ---------- + Tensor + Batch of next hidden states (B, hidden_size). + tuple: + - Tensor: Batch of next hidden states (B, hidden_size). + - Tensor: Batch of next cell states (B, hidden_size). + """ + # we only use the second output of LSTMCell in paddle + _, next_hidden = self.cell(inputs, hidden) + next_hidden = self._zoneout(hidden, next_hidden, self.zoneout_rate) + # to have the same output format with LSTMCell in paddle + return next_hidden[0], next_hidden + + def _zoneout(self, h, next_h, prob): + # apply recursively + if isinstance(h, tuple): + num_h = len(h) + if not isinstance(prob, tuple): + prob = tuple([prob] * num_h) + return tuple( + [self._zoneout(h[i], next_h[i], prob[i]) for i in range(num_h)]) + if self.training: + mask = paddle.bernoulli(paddle.ones([*paddle.shape(h)]) * prob) + return mask * h + (1 - mask) * next_h + else: + return prob * h + (1 - prob) * next_h + + +class Decoder(nn.Layer): + """Decoder module of Spectrogram prediction network. + This is a module of decoder of Spectrogram prediction network in Tacotron2, + which described in `Natural TTS + Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`_. + The decoder generates the sequence of + features from the sequence of the hidden states. + .. _`Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`: + https://arxiv.org/abs/1712.05884 + """ + + def __init__( + self, + idim, + odim, + att, + dlayers=2, + dunits=1024, + prenet_layers=2, + prenet_units=256, + postnet_layers=5, + postnet_chans=512, + postnet_filts=5, + output_activation_fn=None, + cumulate_att_w=True, + use_batch_norm=True, + use_concate=True, + dropout_rate=0.5, + zoneout_rate=0.1, + reduction_factor=1, ): + """Initialize Tacotron2 decoder module. + Parameters + ---------- + idim : int + Dimension of the inputs. + odim : int + Dimension of the outputs. + att nn.Layer + Instance of attention class. + dlayers int, optional + The number of decoder lstm layers. + dunits : int, optional + The number of decoder lstm units. + prenet_layers : int, optional + The number of prenet layers. + prenet_units : int, optional + The number of prenet units. + postnet_layers : int, optional + The number of postnet layers. + postnet_filts : int, optional + The number of postnet filter size. + postnet_chans : int, optional + The number of postnet filter channels. + output_activation_fn : nn.Layer, optional + Activation function for outputs. + cumulate_att_w : bool, optional + Whether to cumulate previous attention weight. + use_batch_norm : bool, optional + Whether to use batch normalization. + use_concate : bool, optional + Whether to concatenate encoder embedding with decoder lstm outputs. + dropout_rate : float, optional + Dropout rate. + zoneout_rate : float, optional + Zoneout rate. + reduction_factor : int, optional + Reduction factor. 
+ """ + super().__init__() + + # store the hyperparameters + self.idim = idim + self.odim = odim + self.att = att + self.output_activation_fn = output_activation_fn + self.cumulate_att_w = cumulate_att_w + self.use_concate = use_concate + self.reduction_factor = reduction_factor + + # check attention type + if isinstance(self.att, AttForwardTA): + self.use_att_extra_inputs = True + else: + self.use_att_extra_inputs = False + + # define lstm network + prenet_units = prenet_units if prenet_layers != 0 else odim + self.lstm = nn.LayerList() + for layer in six.moves.range(dlayers): + iunits = idim + prenet_units if layer == 0 else dunits + lstm = nn.LSTMCell(iunits, dunits) + if zoneout_rate > 0.0: + lstm = ZoneOutCell(lstm, zoneout_rate) + self.lstm.append(lstm) + + # define prenet + if prenet_layers > 0: + self.prenet = Prenet( + idim=odim, + n_layers=prenet_layers, + n_units=prenet_units, + dropout_rate=dropout_rate, ) + else: + self.prenet = None + + # define postnet + if postnet_layers > 0: + self.postnet = Postnet( + idim=idim, + odim=odim, + n_layers=postnet_layers, + n_chans=postnet_chans, + n_filts=postnet_filts, + use_batch_norm=use_batch_norm, + dropout_rate=dropout_rate, ) + else: + self.postnet = None + + # define projection layers + iunits = idim + dunits if use_concate else dunits + self.feat_out = nn.Linear( + iunits, odim * reduction_factor, bias_attr=False) + self.prob_out = nn.Linear(iunits, reduction_factor) + + # initialize + # self.apply(decoder_init) + + def _zero_state(self, hs): + init_hs = paddle.zeros([paddle.shape(hs)[0], self.lstm[0].hidden_size]) + return init_hs + + def forward(self, hs, hlens, ys): + """Calculate forward propagation. + Parameters + ---------- + hs : Tensor + Batch of the sequences of padded hidden states (B, Tmax, idim). + hlens : Tensor(int64) padded + Batch of lengths of each input batch (B,). + ys : Tensor + Batch of the sequences of padded target features (B, Lmax, odim). + Returns + ---------- + Tensor + Batch of output tensors after postnet (B, Lmax, odim). + Tensor + Batch of output tensors before postnet (B, Lmax, odim). + Tensor + Batch of logits of stop prediction (B, Lmax). + Tensor + Batch of attention weights (B, Lmax, Tmax). + Note + ---------- + This computation is performed in teacher-forcing manner. 
+ """ + # thin out frames (B, Lmax, odim) -> (B, Lmax/r, odim) + if self.reduction_factor > 1: + ys = ys[:, self.reduction_factor - 1::self.reduction_factor] + + # length list should be list of int + # hlens = list(map(int, hlens)) + + # initialize hidden states of decoder + c_list = [self._zero_state(hs)] + z_list = [self._zero_state(hs)] + for _ in six.moves.range(1, len(self.lstm)): + c_list += [self._zero_state(hs)] + z_list += [self._zero_state(hs)] + prev_out = paddle.zeros([paddle.shape(hs)[0], self.odim]) + + # initialize attention + prev_att_w = None + self.att.reset() + + # loop for an output sequence + outs, logits, att_ws = [], [], [] + for y in ys.transpose([1, 0, 2]): + if self.use_att_extra_inputs: + att_c, att_w = self.att(hs, hlens, z_list[0], prev_att_w, + prev_out) + else: + att_c, att_w = self.att(hs, hlens, z_list[0], prev_att_w) + prenet_out = self.prenet( + prev_out) if self.prenet is not None else prev_out + xs = paddle.concat([att_c, prenet_out], axis=1) + # we only use the second output of LSTMCell in paddle + _, next_hidden = self.lstm[0](xs, (z_list[0], c_list[0])) + z_list[0], c_list[0] = next_hidden + for i in six.moves.range(1, len(self.lstm)): + # we only use the second output of LSTMCell in paddle + _, next_hidden = self.lstm[i](z_list[i - 1], + (z_list[i], c_list[i])) + z_list[i], c_list[i] = next_hidden + zcs = (paddle.concat([z_list[-1], att_c], axis=1) + if self.use_concate else z_list[-1]) + outs += [ + self.feat_out(zcs).reshape([paddle.shape(hs)[0], self.odim, -1]) + ] + logits += [self.prob_out(zcs)] + att_ws += [att_w] + # teacher forcing + prev_out = y + if self.cumulate_att_w and prev_att_w is not None: + prev_att_w = prev_att_w + att_w # Note: error when use += + else: + prev_att_w = att_w + # (B, Lmax) + logits = paddle.concat(logits, axis=1) + # (B, odim, Lmax) + before_outs = paddle.concat(outs, axis=2) + # (B, Lmax, Tmax) + att_ws = paddle.stack(att_ws, axis=1) + + if self.reduction_factor > 1: + # (B, odim, Lmax) + before_outs = before_outs.reshape( + [paddle.shape(before_outs)[0], self.odim, -1]) + + if self.postnet is not None: + # (B, odim, Lmax) + after_outs = before_outs + self.postnet(before_outs) + else: + after_outs = before_outs + # (B, Lmax, odim) + before_outs = before_outs.transpose([0, 2, 1]) + # (B, Lmax, odim) + after_outs = after_outs.transpose([0, 2, 1]) + logits = logits + + # apply activation function for scaling + if self.output_activation_fn is not None: + before_outs = self.output_activation_fn(before_outs) + after_outs = self.output_activation_fn(after_outs) + + return after_outs, before_outs, logits, att_ws + + def inference( + self, + h, + threshold=0.5, + minlenratio=0.0, + maxlenratio=10.0, + use_att_constraint=False, + backward_window=None, + forward_window=None, ): + """Generate the sequence of features given the sequences of characters. + Parameters + ---------- + h : Tensor + Input sequence of encoder hidden states (T, C). + threshold : float, optional + Threshold to stop generation. + minlenratio : float, optional + Minimum length ratio. + If set to 1.0 and the length of input is 10, + the minimum length of outputs will be 10 * 1 = 10. + minlenratio : float, optional + Minimum length ratio. + If set to 10 and the length of input is 10, + the maximum length of outputs will be 10 * 10 = 100. + use_att_constraint : bool + Whether to apply attention constraint introduced in `Deep Voice 3`_. + backward_window : int + Backward window size in attention constraint. 
+ forward_window : int + Forward window size in attention constraint. + Returns + ---------- + Tensor + Output sequence of features (L, odim). + Tensor + Output sequence of stop probabilities (L,). + Tensor + Attention weights (L, T). + Note + ---------- + This computation is performed in auto-regressive manner. + .. _`Deep Voice 3`: https://arxiv.org/abs/1710.07654 + """ + # setup + assert len(paddle.shape(h)) == 2 + hs = h.unsqueeze(0) + ilens = paddle.shape(h)[0] + maxlen = int(paddle.shape(h)[0] * maxlenratio) + minlen = int(paddle.shape(h)[0] * minlenratio) + + # initialize hidden states of decoder + c_list = [self._zero_state(hs)] + z_list = [self._zero_state(hs)] + for _ in six.moves.range(1, len(self.lstm)): + c_list += [self._zero_state(hs)] + z_list += [self._zero_state(hs)] + prev_out = paddle.zeros([1, self.odim]) + + # initialize attention + prev_att_w = None + self.att.reset() + + # setup for attention constraint + if use_att_constraint: + last_attended_idx = 0 + else: + last_attended_idx = None + + # loop for an output sequence + idx = 0 + outs, att_ws, probs = [], [], [] + while True: + # updated index + idx += self.reduction_factor + + # decoder calculation + if self.use_att_extra_inputs: + att_c, att_w = self.att( + hs, + ilens, + z_list[0], + prev_att_w, + prev_out, + last_attended_idx=last_attended_idx, + backward_window=backward_window, + forward_window=forward_window, ) + else: + att_c, att_w = self.att( + hs, + ilens, + z_list[0], + prev_att_w, + last_attended_idx=last_attended_idx, + backward_window=backward_window, + forward_window=forward_window, ) + + att_ws += [att_w] + prenet_out = self.prenet( + prev_out) if self.prenet is not None else prev_out + xs = paddle.concat([att_c, prenet_out], axis=1) + # we only use the second output of LSTMCell in paddle + _, next_hidden = self.lstm[0](xs, (z_list[0], c_list[0])) + z_list[0], c_list[0] = next_hidden + for i in six.moves.range(1, len(self.lstm)): + # we only use the second output of LSTMCell in paddle + _, next_hidden = self.lstm[i](z_list[i - 1], + (z_list[i], c_list[i])) + z_list[i], c_list[i] = next_hidden + zcs = (paddle.concat([z_list[-1], att_c], axis=1) + if self.use_concate else z_list[-1]) + # [(1, odim, r), ...] + outs += [self.feat_out(zcs).reshape([1, self.odim, -1])] + + # [(r), ...] + probs += [F.sigmoid(self.prob_out(zcs))[0]] + if self.output_activation_fn is not None: + prev_out = self.output_activation_fn( + outs[-1][:, :, -1]) # (1, odim) + else: + prev_out = outs[-1][:, :, -1] # (1, odim) + if self.cumulate_att_w and prev_att_w is not None: + prev_att_w = prev_att_w + att_w # Note: error when use += + else: + prev_att_w = att_w + if use_att_constraint: + last_attended_idx = int(att_w.argmax()) + + # check whether to finish generation + if sum(paddle.cast(probs[-1] >= threshold, + 'int64')) > 0 or idx >= maxlen: + # check mininum length + if idx < minlen: + continue + # (1, odim, L) + outs = paddle.concat(outs, axis=2) + if self.postnet is not None: + # (1, odim, L) + outs = outs + self.postnet(outs) + # (L, odim) + outs = outs.transpose([0, 2, 1]).squeeze(0) + probs = paddle.concat(probs, axis=0) + att_ws = paddle.concat(att_ws, axis=0) + break + + if self.output_activation_fn is not None: + outs = self.output_activation_fn(outs) + + return outs, probs, att_ws + + def calculate_all_attentions(self, hs, hlens, ys): + """Calculate all of the attention weights. + Parameters + ---------- + hs : Tensor + Batch of the sequences of padded hidden states (B, Tmax, idim). 
+ hlens : Tensor(int64) + Batch of lengths of each input batch (B,). + ys : Tensor + Batch of the sequences of padded target features (B, Lmax, odim). + Returns + ---------- + numpy.ndarray + Batch of attention weights (B, Lmax, Tmax). + Note + ---------- + This computation is performed in teacher-forcing manner. + """ + # thin out frames (B, Lmax, odim) -> (B, Lmax/r, odim) + if self.reduction_factor > 1: + ys = ys[:, self.reduction_factor - 1::self.reduction_factor] + + # length list should be list of int + hlens = list(map(int, hlens)) + + # initialize hidden states of decoder + c_list = [self._zero_state(hs)] + z_list = [self._zero_state(hs)] + for _ in six.moves.range(1, len(self.lstm)): + c_list += [self._zero_state(hs)] + z_list += [self._zero_state(hs)] + prev_out = paddle.zeros([paddle.shape(hs)[0], self.odim]) + + # initialize attention + prev_att_w = None + self.att.reset() + + # loop for an output sequence + att_ws = [] + for y in ys.transpose([1, 0, 2]): + if self.use_att_extra_inputs: + att_c, att_w = self.att(hs, hlens, z_list[0], prev_att_w, + prev_out) + else: + att_c, att_w = self.att(hs, hlens, z_list[0], prev_att_w) + att_ws += [att_w] + prenet_out = self.prenet( + prev_out) if self.prenet is not None else prev_out + xs = paddle.concat([att_c, prenet_out], axis=1) + # we only use the second output of LSTMCell in paddle + _, next_hidden = self.lstm[0](xs, (z_list[0], c_list[0])) + z_list[0], c_list[0] = next_hidden + for i in six.moves.range(1, len(self.lstm)): + z_list[i], c_list[i] = self.lstm[i](z_list[i - 1], + (z_list[i], c_list[i])) + # teacher forcing + prev_out = y + if self.cumulate_att_w and prev_att_w is not None: + # Note: error when use += + prev_att_w = prev_att_w + att_w + else: + prev_att_w = att_w + # (B, Lmax, Tmax) + att_ws = paddle.stack(att_ws, axis=1) + + return att_ws diff --git a/paddlespeech/t2s/modules/tacotron2/encoder.py b/paddlespeech/t2s/modules/tacotron2/encoder.py index f1889061..2f88d307 100644 --- a/paddlespeech/t2s/modules/tacotron2/encoder.py +++ b/paddlespeech/t2s/modules/tacotron2/encoder.py @@ -145,16 +145,15 @@ class Encoder(nn.Layer): Batch of the padded sequence. Either character ids (B, Tmax) or acoustic feature (B, Tmax, idim * encoder_reduction_factor). Padded value should be 0. - ilens : LongTensor + ilens : Tensor(int64) Batch of lengths of each input batch (B,). Returns ---------- Tensor Batch of the sequences of encoder states(B, Tmax, eunits). 
- LongTensor + Tensor(int64) Batch of lengths of each sequence (B,) - """ xs = self.embed(xs).transpose([0, 2, 1]) if self.convs is not None: @@ -170,7 +169,8 @@ class Encoder(nn.Layer): xs = xs.transpose([0, 2, 1]) self.blstm.flatten_parameters() # (B, Tmax, C) - xs, _ = self.blstm(xs) + # see https://www.paddlepaddle.org.cn/documentation/docs/zh/faq/train_cn.html#paddletorch-nn-utils-rnn-pack-padded-sequencetorch-nn-utils-rnn-pad-packed-sequenceapi + xs, _ = self.blstm(xs, sequence_length=ilens) # hlens 是什么 hlens = ilens diff --git a/paddlespeech/t2s/training/optimizer.py b/paddlespeech/t2s/training/optimizer.py index 907e3daf..64274d53 100644 --- a/paddlespeech/t2s/training/optimizer.py +++ b/paddlespeech/t2s/training/optimizer.py @@ -26,10 +26,13 @@ optim_classes = dict( sgd=paddle.optimizer.SGD, ) -def build_optimizers(model: nn.Layer, - optim='adadelta', - max_grad_norm=None, - learning_rate=0.01) -> paddle.optimizer: +def build_optimizers( + model: nn.Layer, + optim='adadelta', + max_grad_norm=None, + learning_rate=0.01, + weight_decay=None, + epsilon=1.0e-6, ) -> paddle.optimizer: optim_class = optim_classes.get(optim) if optim_class is None: raise ValueError(f"must be one of {list(optim_classes)}: {optim}") @@ -37,10 +40,13 @@ def build_optimizers(model: nn.Layer, grad_clip = None if max_grad_norm: grad_clip = paddle.nn.ClipGradByGlobalNorm(max_grad_norm) - optim = optim_class( - parameters=model.parameters(), - learning_rate=learning_rate, - grad_clip=grad_clip) + optim_dict = {} + optim_dict['parameters'] = model.parameters() + optim_dict['learning_rate'] = learning_rate + optim_dict['grad_clip'] = grad_clip + optim_dict['weight_decay'] = weight_decay + if optim_class not in {'momentum', 'sgd'}: + optim_dict['epsilon'] = epsilon + optimizers = optim_class(**optim_dict) - optimizers = optim return optimizers From 89e988a69e748306c1eb471682f0226ae0d8e97f Mon Sep 17 00:00:00 2001 From: TianYuan Date: Wed, 12 Jan 2022 05:01:01 +0000 Subject: [PATCH 002/107] add csmsc tacotron2, test=tts --- examples/csmsc/tts0/README.md | 264 ------------------ .../t2s/exps/new_tacotron2/__init__.py | 13 + paddlespeech/t2s/models/__init__.py | 1 + .../t2s/models/new_tacotron2/tacotron2.py | 6 +- paddlespeech/t2s/modules/losses.py | 7 +- paddlespeech/t2s/modules/tacotron2/encoder.py | 1 - 6 files changed, 21 insertions(+), 271 deletions(-) delete mode 100644 examples/csmsc/tts0/README.md create mode 100644 paddlespeech/t2s/exps/new_tacotron2/__init__.py diff --git a/examples/csmsc/tts0/README.md b/examples/csmsc/tts0/README.md deleted file mode 100644 index 13d291b5..00000000 --- a/examples/csmsc/tts0/README.md +++ /dev/null @@ -1,264 +0,0 @@ -# FastSpeech2 with CSMSC -This example contains code used to train a [Fastspeech2](https://arxiv.org/abs/2006.04558) model with [Chinese Standard Mandarin Speech Copus](https://www.data-baker.com/open_source.html). - -## Dataset -### Download and Extract -Download CSMSC from it's [Official Website](https://test.data-baker.com/data/index/source). - -### Get MFA Result and Extract -We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for fastspeech2. -You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. - -## Get Started -Assume the path to the dataset is `~/datasets/BZNSYP`. 
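As a usage note for the extended `build_optimizers` above, a minimal sketch; the layer and hyper-parameter values are placeholders rather than values taken from this patch:

```python
from paddle import nn
from paddlespeech.t2s.training.optimizer import build_optimizers

model = nn.Linear(8, 8)                  # stand-in for a real acoustic model
optimizer = build_optimizers(
    model,
    optim='adadelta',
    max_grad_norm=1.0,                   # becomes a ClipGradByGlobalNorm
    learning_rate=0.01,
    weight_decay=1.0e-6,
    epsilon=1.0e-6)
```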
-Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`. -Run the command below to -1. **source path**. -2. preprocess the dataset. -3. train the model. -4. synthesize wavs. - - synthesize waveform from `metadata.jsonl`. - - synthesize waveform from a text file. -5. inference using the static model. -```bash -./run.sh -``` -You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage, for example, running the following command will only preprocess the dataset. -```bash -./run.sh --stage 0 --stop-stage 0 -``` -### Data Preprocessing -```bash -./local/preprocess.sh ${conf_path} -``` -When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below. - -```text -dump -├── dev -│ ├── norm -│ └── raw -├── phone_id_map.txt -├── speaker_id_map.txt -├── test -│ ├── norm -│ └── raw -└── train - ├── energy_stats.npy - ├── norm - ├── pitch_stats.npy - ├── raw - └── speech_stats.npy -``` -The dataset is split into 3 parts, namely `train`, `dev`, and` test`, each of which contains a `norm` and `raw` subfolder. The raw folder contains speech、pitch and energy features of each utterance, while the norm folder contains normalized ones. The statistics used to normalize features are computed from the training set, which is located in `dump/train/*_stats.npy`. - -Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains phones, text_lengths, speech_lengths, durations, the path of speech features, the path of pitch features, the path of energy features, speaker, and the id of each utterance. - -### Model Training -```bash -CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} -``` -`./local/train.sh` calls `${BIN_DIR}/train.py`. -Here's the complete help message. -```text -usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] - [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--ngpu NGPU] [--phones-dict PHONES_DICT] - [--speaker-dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] - -Train a FastSpeech2 model. - -optional arguments: - -h, --help show this help message and exit - --config CONFIG fastspeech2 config file. - --train-metadata TRAIN_METADATA - training data. - --dev-metadata DEV_METADATA - dev data. - --output-dir OUTPUT_DIR - output dir. - --ngpu NGPU if ngpu=0, use cpu. - --phones-dict PHONES_DICT - phone vocabulary file. - --speaker-dict SPEAKER_DICT - speaker id map file for multiple speaker model. - --voice-cloning VOICE_CLONING - whether training voice cloning model. -``` -1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. -2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. -3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory. -4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. -5. `--phones-dict` is the path of the phone vocabulary file. - -### Synthesizing -We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder. -Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip) and unzip it. 
-```bash -unzip pwg_baker_ckpt_0.4.zip -``` -Parallel WaveGAN checkpoint contains files listed below. -```text -pwg_baker_ckpt_0.4 -├── pwg_default.yaml # default config used to train parallel wavegan -├── pwg_snapshot_iter_400000.pdz # model parameters of parallel wavegan -└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan -``` -`./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`. -```bash -CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} -``` -```text -usage: synthesize.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] - [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] - [--am_stat AM_STAT] [--phones_dict PHONES_DICT] - [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] - [--voice-cloning VOICE_CLONING] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] - [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] - [--voc_stat VOC_STAT] [--ngpu NGPU] - [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] - -Synthesize with acoustic model & vocoder - -optional arguments: - -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} - Choose acoustic model type of tts task. - --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. - --am_ckpt AM_CKPT Checkpoint file of acoustic model. - --am_stat AM_STAT mean and standard deviation used to normalize - spectrogram when training acoustic model. - --phones_dict PHONES_DICT - phone vocabulary file. - --tones_dict TONES_DICT - tone vocabulary file. - --speaker_dict SPEAKER_DICT - speaker id map file. - --voice-cloning VOICE_CLONING - whether training voice cloning model. - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} - Choose vocoder type of tts task. - --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. - --voc_ckpt VOC_CKPT Checkpoint file of voc. - --voc_stat VOC_STAT mean and standard deviation used to normalize - spectrogram when training voc. - --ngpu NGPU if ngpu == 0, use cpu. - --test_metadata TEST_METADATA - test metadata. - --output_dir OUTPUT_DIR - output dir. -``` -`./local/synthesize_e2e.sh` calls `${BIN_DIR}/../synthesize_e2e.py`, which can synthesize waveform from text file. -```bash -CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} -``` -```text -usage: synthesize_e2e.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] - [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] - [--am_stat AM_STAT] [--phones_dict PHONES_DICT] - [--tones_dict TONES_DICT] - [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] - [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] - [--voc_stat VOC_STAT] [--lang LANG] - [--inference_dir INFERENCE_DIR] [--ngpu NGPU] - [--text TEXT] [--output_dir OUTPUT_DIR] - -Synthesize with acoustic model & vocoder - -optional arguments: - -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} - Choose acoustic model type of tts task. - --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. 
- --am_ckpt AM_CKPT Checkpoint file of acoustic model. - --am_stat AM_STAT mean and standard deviation used to normalize - spectrogram when training acoustic model. - --phones_dict PHONES_DICT - phone vocabulary file. - --tones_dict TONES_DICT - tone vocabulary file. - --speaker_dict SPEAKER_DICT - speaker id map file. - --spk_id SPK_ID spk id for multi speaker acoustic model - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} - Choose vocoder type of tts task. - --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. - --voc_ckpt VOC_CKPT Checkpoint file of voc. - --voc_stat VOC_STAT mean and standard deviation used to normalize - spectrogram when training voc. - --lang LANG Choose model language. zh or en - --inference_dir INFERENCE_DIR - dir to save inference models - --ngpu NGPU if ngpu == 0, use cpu. - --text TEXT text to synthesize, a 'utt_id sentence' pair per line. - --output_dir OUTPUT_DIR - output dir. -``` -1. `--am` is acoustic model type with the format {model_name}_{dataset} -2. `--am_config`, `--am_checkpoint`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the fastspeech2 pretrained model. -3. `--voc` is vocoder type with the format {model_name}_{dataset} -4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. -5. `--lang` is the model language, which can be `zh` or `en`. -6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. -7. `--text` is the text file, which contains sentences to synthesize. -8. `--output_dir` is the directory to save synthesized audio files. -9. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. - -### Inferencing -After synthesizing, we will get static models of fastspeech2 and pwgan in `${train_output_path}/inference`. -`./local/inference.sh` calls `${BIN_DIR}/inference.py`, which provides a paddle static model inference example for fastspeech2 + pwgan synthesize. -```bash -CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} -``` - -## Pretrained Model -Pretrained FastSpeech2 model with no silence in the edge of audios: -- [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip) -- [fastspeech2_conformer_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_conformer_baker_ckpt_0.5.zip) - -The static model can be downloaded here [fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip). - -Model | Step | eval/loss | eval/l1_loss | eval/duration_loss | eval/pitch_loss| eval/energy_loss -:-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------: -default| 2(gpu) x 76000|1.0991|0.59132|0.035815|0.31915|0.15287| -conformer| 2(gpu) x 76000|1.0675|0.56103|0.035869|0.31553|0.15509| - -FastSpeech2 checkpoint contains files listed below. 
-```text -fastspeech2_nosil_baker_ckpt_0.4 -├── default.yaml # default config used to train fastspeech2 -├── phone_id_map.txt # phone vocabulary file when training fastspeech2 -├── snapshot_iter_76000.pdz # model parameters and optimizer states -└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2 -``` -You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained fastspeech2 and parallel wavegan models. -```bash -source path.sh - -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize_e2e.py \ - --am=fastspeech2_csmsc \ - --am_config=fastspeech2_nosil_baker_ckpt_0.4/default.yaml \ - --am_ckpt=fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \ - --am_stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \ - --voc=pwgan_csmsc \ - --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \ - --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ - --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ - --lang=zh \ - --text=${BIN_DIR}/../sentences.txt \ - --output_dir=exp/default/test_e2e \ - --inference_dir=exp/default/inference \ - --phones_dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt -``` diff --git a/paddlespeech/t2s/exps/new_tacotron2/__init__.py b/paddlespeech/t2s/exps/new_tacotron2/__init__.py new file mode 100644 index 00000000..abf198b9 --- /dev/null +++ b/paddlespeech/t2s/exps/new_tacotron2/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlespeech/t2s/models/__init__.py b/paddlespeech/t2s/models/__init__.py index f268a4e3..65227374 100644 --- a/paddlespeech/t2s/models/__init__.py +++ b/paddlespeech/t2s/models/__init__.py @@ -14,6 +14,7 @@ from .fastspeech2 import * from .hifigan import * from .melgan import * +from .new_tacotron2 import * from .parallel_wavegan import * from .speedyspeech import * from .tacotron2 import * diff --git a/paddlespeech/t2s/models/new_tacotron2/tacotron2.py b/paddlespeech/t2s/models/new_tacotron2/tacotron2.py index 747c74f9..c8ef956c 100644 --- a/paddlespeech/t2s/models/new_tacotron2/tacotron2.py +++ b/paddlespeech/t2s/models/new_tacotron2/tacotron2.py @@ -77,9 +77,9 @@ class Tacotron2(nn.Layer): spk_embed_dim: Optional[int]=None, spk_embed_integration_type: str="concat", dropout_rate: float=0.5, - zoneout_rate: float=0.1, + zoneout_rate: float=0.1, # training related - init_type: str="xavier_uniform",): + init_type: str="xavier_uniform", ): """Initialize Tacotron2 module. 
Parameters ---------- @@ -243,7 +243,7 @@ class Tacotron2(nn.Layer): dropout_rate=dropout_rate, zoneout_rate=zoneout_rate, reduction_factor=reduction_factor, ) - + nn.initializer.set_global_initializer(None) def forward( diff --git a/paddlespeech/t2s/modules/losses.py b/paddlespeech/t2s/modules/losses.py index 0cb0c6fd..781ac792 100644 --- a/paddlespeech/t2s/modules/losses.py +++ b/paddlespeech/t2s/modules/losses.py @@ -20,7 +20,7 @@ from paddle.fluid.layers import sequence_mask from paddle.nn import functional as F from scipy import signal -from paddlespeech.s2t.modules.mask import make_non_pad_mask +from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask # Loss for new Tacotron2 @@ -324,7 +324,7 @@ def stft(x, details. Defaults to "hann". center : bool, optional center (bool, optional): Whether to pad `x` to make that the - :math:`t \times hop\_length` at the center of :math:`t`-th frame. Default: `True`. + :math:`t \times hop\\_length` at the center of :math:`t`-th frame. Default: `True`. pad_mode : str, optional Choose padding pattern when `center` is `True`. Returns @@ -677,7 +677,8 @@ def weighted_mean(input, weight): Weighted mean tensor with the same dtype as input. """ weight = paddle.cast(weight, input.dtype) - broadcast_ratio = input.size / weight.size + # paddle.Tensor.size is different with torch.size() and has been overrided in s2t.__init__ + broadcast_ratio = input.numel() / weight.numel() return paddle.sum(input * weight) / (paddle.sum(weight) * broadcast_ratio) diff --git a/paddlespeech/t2s/modules/tacotron2/encoder.py b/paddlespeech/t2s/modules/tacotron2/encoder.py index 2f88d307..b2ed30d1 100644 --- a/paddlespeech/t2s/modules/tacotron2/encoder.py +++ b/paddlespeech/t2s/modules/tacotron2/encoder.py @@ -171,7 +171,6 @@ class Encoder(nn.Layer): # (B, Tmax, C) # see https://www.paddlepaddle.org.cn/documentation/docs/zh/faq/train_cn.html#paddletorch-nn-utils-rnn-pack-padded-sequencetorch-nn-utils-rnn-pad-packed-sequenceapi xs, _ = self.blstm(xs, sequence_length=ilens) - # hlens 是什么 hlens = ilens return xs, hlens From 98aaa3810be6eb0b83613e86c32fc6f8f55a9b9c Mon Sep 17 00:00:00 2001 From: qingen Date: Wed, 12 Jan 2022 20:07:56 +0800 Subject: [PATCH 003/107] add DER scripts to calculate Diarization Error Rate --- utils/DER.py | 152 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 152 insertions(+) create mode 100755 utils/DER.py diff --git a/utils/DER.py b/utils/DER.py new file mode 100755 index 00000000..4c67c4c7 --- /dev/null +++ b/utils/DER.py @@ -0,0 +1,152 @@ +"""Calculates Diarization Error Rate (DER) which is the sum of Missed Speaker (MS), +False Alarm (FA), and Speaker Error Rate (SER) using md-eval-22.pl from NIST RT Evaluation. + +Authors + * Neville Ryant 2018 + * Nauman Dawalatabad 2020 + +Credits + This code is adapted from https://github.com/nryant/dscore +""" + +import os +import re +import subprocess +import numpy as np + +FILE_IDS = re.compile(r"(?<=Speaker Diarization for).+(?=\*\*\*)") +SCORED_SPEAKER_TIME = re.compile(r"(?<=SCORED SPEAKER TIME =)[\d.]+") +MISS_SPEAKER_TIME = re.compile(r"(?<=MISSED SPEAKER TIME =)[\d.]+") +FA_SPEAKER_TIME = re.compile(r"(?<=FALARM SPEAKER TIME =)[\d.]+") +ERROR_SPEAKER_TIME = re.compile(r"(?<=SPEAKER ERROR TIME =)[\d.]+") + + +def rectify(arr): + """Corrects corner cases and converts scores into percentage. + """ + + # Numerator and denominator both 0. + arr[np.isnan(arr)] = 0 + + # Numerator > 0, but denominator = 0. 
+ arr[np.isinf(arr)] = 1 + arr *= 100.0 + + return arr + + +def DER( + ref_rttm, + sys_rttm, + ignore_overlap=False, + collar=0.25, + individual_file_scores=False, +): + """Computes Missed Speaker percentage (MS), False Alarm (FA), + Speaker Error Rate (SER), and Diarization Error Rate (DER). + + Arguments + --------- + ref_rttm : str + The path of reference/groundtruth RTTM file. + sys_rttm : str + The path of the system generated RTTM file. + individual_file : bool + If True, returns scores for each file in order. + collar : float + Forgiveness collar. + ignore_overlap : bool + If True, ignores overlapping speech during evaluation. + + Returns + ------- + MS : float array + Missed Speech. + FA : float array + False Alarms. + SER : float array + Speaker Error Rates. + DER : float array + Diarization Error Rates. + + Example + ------- + >>> import pytest + >>> pytest.skip('Skipping because of Perl dependency') + >>> ref_rttm = "../../samples/rttm_samples/ref_rttm/ES2014c.rttm" + >>> sys_rttm = "../../samples/rttm_samples/sys_rttm/ES2014c.rttm" + >>> ignore_overlap = True + >>> collar = 0.25 + >>> individual_file_scores = True + >>> Scores = DER(ref_rttm, sys_rttm, ignore_overlap, collar, individual_file_scores) + >>> print (Scores) + (array([0., 0.]), array([0., 0.]), array([7.16923618, 7.16923618]), array([7.16923618, 7.16923618])) + """ + + curr = os.path.abspath(os.path.dirname(__file__)) + mdEval = os.path.join(curr, "../../tools/der_eval/md-eval.pl") + + cmd = [ + mdEval, + "-af", + "-r", + ref_rttm, + "-s", + sys_rttm, + "-c", + str(collar), + ] + if ignore_overlap: + cmd.append("-1") + + try: + stdout = subprocess.check_output(cmd, stderr=subprocess.STDOUT) + + except subprocess.CalledProcessError as ex: + stdout = ex.output + + else: + stdout = stdout.decode("utf-8") + + # Get all recording IDs + file_ids = [m.strip() for m in FILE_IDS.findall(stdout)] + file_ids = [ + file_id[2:] if file_id.startswith("f=") else file_id + for file_id in file_ids + ] + + scored_speaker_times = np.array( + [float(m) for m in SCORED_SPEAKER_TIME.findall(stdout)] + ) + + miss_speaker_times = np.array( + [float(m) for m in MISS_SPEAKER_TIME.findall(stdout)] + ) + + fa_speaker_times = np.array( + [float(m) for m in FA_SPEAKER_TIME.findall(stdout)] + ) + + error_speaker_times = np.array( + [float(m) for m in ERROR_SPEAKER_TIME.findall(stdout)] + ) + + with np.errstate(invalid="ignore", divide="ignore"): + tot_error_times = ( + miss_speaker_times + fa_speaker_times + error_speaker_times + ) + miss_speaker_frac = miss_speaker_times / scored_speaker_times + fa_speaker_frac = fa_speaker_times / scored_speaker_times + sers_frac = error_speaker_times / scored_speaker_times + ders_frac = tot_error_times / scored_speaker_times + + # Values in percentage of scored_speaker_time + miss_speaker = rectify(miss_speaker_frac) + fa_speaker = rectify(fa_speaker_frac) + sers = rectify(sers_frac) + ders = rectify(ders_frac) + + if individual_file_scores: + return miss_speaker, fa_speaker, sers, ders + else: + return miss_speaker[-1], fa_speaker[-1], sers[-1], ders[-1] From 03a5750276950176d72f0d1a997fb17c19511d48 Mon Sep 17 00:00:00 2001 From: qingen Date: Fri, 14 Jan 2022 16:32:05 +0800 Subject: [PATCH 004/107] [vector] add DER scripts to calculate Diarization Error Rate --- utils/DER.py | 31 +- utils/md-eval.pl | 2938 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 2967 insertions(+), 2 deletions(-) create mode 100755 utils/md-eval.pl diff --git a/utils/DER.py b/utils/DER.py index 4c67c4c7..25003b0a 
--- a/utils/DER.py
+++ b/utils/DER.py
@@ -4,11 +4,13 @@ False Alarm (FA), and Speaker Error Rate (SER) using md-eval-22.pl from NIST RT
 Authors
  * Neville Ryant 2018
  * Nauman Dawalatabad 2020
+ * Qingen Zhao 2021
 
 Credits
  This code is adapted from https://github.com/nryant/dscore
 """
-
+import argparse
+from distutils.util import strtobool
 import os
 import re
 import subprocess
@@ -84,7 +86,7 @@ def DER(
     """
 
     curr = os.path.abspath(os.path.dirname(__file__))
-    mdEval = os.path.join(curr, "../../tools/der_eval/md-eval.pl")
+    mdEval = os.path.join(curr, "./md-eval.pl")
 
     cmd = [
         mdEval,
@@ -150,3 +152,28 @@
         return miss_speaker, fa_speaker, sers, ders
     else:
         return miss_speaker[-1], fa_speaker[-1], sers[-1], ders[-1]
+
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser(description='Compute Diarization Error Rate')
+    parser.add_argument(
+        '--ref_rttm', required=True, help='the path of the reference/groundtruth RTTM file')
+    parser.add_argument(
+        '--sys_rttm', required=True, help='the path of the system generated RTTM file')
+    parser.add_argument(
+        '--individual_file',
+        default=False,
+        type=strtobool,
+        help='if True, returns scores for each file in order')
+    parser.add_argument(
+        '--collar', default=0.25, type=float, help='forgiveness collar')
+    parser.add_argument(
+        '--ignore_overlap',
+        default=False,
+        type=strtobool,
+        help='if True, ignores overlapping speech during evaluation')
+    args = parser.parse_args()
+    print(args)
+
+    der = DER(args.ref_rttm, args.sys_rttm, args.ignore_overlap, args.collar)
+    print("miss_speaker: %.3f%% fa_speaker: %.3f%% sers: %.3f%% ders: %.3f%%" % (der[0], der[1], der[2], der[-1]))
\ No newline at end of file
diff --git a/utils/md-eval.pl b/utils/md-eval.pl
new file mode 100755
index 00000000..0356b927
--- /dev/null
+++ b/utils/md-eval.pl
@@ -0,0 +1,2938 @@
+#!/usr/bin/perl -w
+#################################
+# NIST. (2009). The 2009 (RT-09) Rich Transcription Meeting Recognition Evaluation Plan.
+# https://web.archive.org/web/20100606041157if_/http://www.itl.nist.gov/iad/mig/tests/rt/2009/docs/rt09-meeting-eval-plan-v2.pdf
+# Source (dscore): https://github.com/nryant/dscore/blob/master/scorelib/md-eval-22.pl
+#################################
+# BSD 2-Clause License
+#
+# Copyright (c) 2018, Neville Ryant
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +################################# + +use strict; + +my $version = "22"; + +################################# +# History: +# +# version 22: * JGF: added an option '-m FILE' to hold a CSV speaker map file. +# +# version 21: * JGF: added a flag '-n' to not remove the directory paths from the source +# files in the UEM file. +# +# version 20: * change metadata discard rule: rather than discard if the midpoint +# (or endpoint) of the metadata object lies in a no-eval zone, discard +# if there is ANY overlap whatsoever between the metadata object and +# a no-eval zone. This holds for system output objects only if the +# system output metadata object is not mapped to a ref object. +# * optimize IP and SU mapping by giving a secondary bonus mapping score +# to candidate ref-sys MD map pairs if the end-words of both coincide. +# +# version 19: * bug fix in subroutine speakers_match +# * bug fix in tag_ref_words_with_metadata_info +# +# version 18: * cosmetic fix to error message in eval_condition +# * added conditional output options for word coverage performance +# * added secondary MD word coverage optimization to word alignment +# * further optimize word alignment by considering MD subtypes +# * further optimize MD alignment by considering MD subtypes +# * add a new SU discard rule: discard if TEND in no-eval zone +# * enforce legal values for su_extent_limit +# +# version 17: create_speaker_segs modified to accommodate the same speaker +# having multiple overlapping speaker segments. (This is an +# error and pathological condition, but the system must either +# disallow (abort on) the condition, or perform properly under +# the pathological condition. The second option is chosen.) +# +# version 16: * If neither -w nor -W is specified, suppress warnings about +# ref SPEAKER records subsuming no lexemes. +# * Output the overall speaker diarization stats after the +# stats for the individual files +# * Do not alter the case of alphabetic characters in the filename +# field from the ref rttm file +# * Made the format of the overall speaker error line more similar to +# the corresponding line of output from SpkrSegEval, to facilitate +# use of existing "grep" commands in existing scripts. +# +# version 15: * bug fix in create_speaker_segs to accommodate +# contiguous same-speaker segments +# * added conditional file/channel scoring to +# speaker diarization evaluation +# +# version 14: bug fix in md_score +# +# version 13: add DISCOURSE_RESPONSE as a FILLER subtype +# +# version 12: make REF LEXEMES optional if they aren't required +# +# version 11: change default for noscore MD regions +# +# version 10: bug fix +# +# version 09: +# * avoid crash when metadata discard yields no metadata +# * make evaluated ref_wds sensitive to metadata type +# * defer discarding of system output metadata until after +# metadata mapping, then discard only unmapped events. 
+# * extend 1-speaker scoring inhibition to metadata +# * eliminate demand for SPKR-INFO subtype for speakers +# * correct ref count of IP and SU exact boundary words +# * add official RT-04F scores +# * add conditional analyses for file/chnl/spkr/gender +# +# version 08: +# * bug fixes speaker diarization scoring +# - count of EVAL_WORDS corrected +# - no-score extended to nearest SPEAKER boundary +# +# version 07: +# * warning issued when discarding metadata events +# that cover LEXEMEs in the evaluation region +# +# version 06: +# * eliminated unused speakers from speaker scoring +# * changed discard algorithm for unannotated SU's and +# complex EDIT's to discard sys SU's and EDIT's when +# their midpoints overlap (rather than ANY overlap). +# * fixed display_metadata_mapping +# +# version 05: +# * upgraded display_metadata_mapping +# +# version 04: +# * diagnostic metadata mapping output added +# * uem_from_rttm bug fix +# +# version 03: +# * adjusted times used for speaker diarization +# * changed usage of max_extend to agree with cookbook +# +# version 02: speaker diarization evaluation added +# +# version 01: a merged version of df-eval-v14 and su-eval-v16 +# +################################# + +#global data +my $epsilon = 1E-8; +my $miss_name = " MISS"; +my $fa_name = " FALSE ALARM"; +my %rttm_datatypes = (SEGMENT => {eval => 1, "" => 1}, + NOSCORE => {"" => 1}, + NO_RT_METADATA => {"" => 1}, + LEXEME => {lex => 1, fp => 1, frag => 1, "un-lex" => 1, + "for-lex" => 1, alpha => 1, acronym => 1, + interjection => 1, propernoun => 1, other => 1}, + "NON-LEX" => {laugh => 1, breath => 1, lipsmack => 1, + cough => 1, sneeze => 1, other => 1}, + "NON-SPEECH" => {noise => 1, music => 1, other => 1}, + FILLER => {filled_pause => 1, discourse_marker => 1, + discourse_response => 1, explicit_editing_term => 1, + other => 1}, + EDIT => {repetition => 1, restart => 1, revision => 1, + simple => 1, complex => 1, other => 1}, + IP => {edit => 1, filler => 1, "edit&filler" => 1, + other => 1}, + SU => {statement => 1, backchannel => 1, question => 1, + incomplete => 1, unannotated => 1, other => 1}, + CB => {coordinating => 1, clausal => 1, other => 1}, + "A/P" => {"" => 1}, + SPEAKER => {"" => 1}, + "SPKR-INFO" => {adult_male => 1, adult_female => 1, child => 1, unknown => 1}); +my %md_subtypes = (FILLER => $rttm_datatypes{FILLER}, + EDIT => $rttm_datatypes{EDIT}, + IP => $rttm_datatypes{IP}, + SU => $rttm_datatypes{SU}); +my %spkr_subtypes = (adult_male => 1, adult_female => 1, child => 1, unknown => 1); + +my $noeval_mds = { + DEFAULT => { + NOSCORE => {"" => 1}, + NO_RT_METADATA => {"" => 1}, + }, +}; +my $noscore_mds = { + DEFAULT => { + NOSCORE => {"" => 1}, + LEXEME => {"un-lex" => 1}, + SU => {unannotated => 1}, + }, + MIN => { + NOSCORE => {"" => 1}, + SU => {unannotated => 1}, + }, + FRAG_UNLEX => { + NOSCORE => {"" => 1}, + LEXEME => {frag => 1, "un-lex" => 1}, + SU => {unannotated => 1}, + }, + FRAG => { + NOSCORE => {"" => 1}, + LEXEME => {frag => 1}, + SU => {unannotated => 1}, + }, + NONE => { + }, +}; +my $noeval_sds = { + DEFAULT => { + NOSCORE => {"" => 1}, + }, +}; +my $noscore_sds = { + DEFAULT => { + NOSCORE => {"" => 1}, + "NON-LEX" => {laugh => 1, breath => 1, lipsmack => 1, + cough => 1, sneeze => 1, other => 1}, + }, +}; + +my %speaker_map; + +my $default_extend = 0.50; #the maximum time (in seconds) to extend a no-score zone +my $default_collar = 0.00; #the no-score collar (in +/- seconds) to attach to SPEAKER boundaries +my $default_tgap = 1.00; #the max gap (in 
seconds) between matching ref/sys words +my $default_Tgap = 1.00; #the max gap (in seconds) between matching ref/sys metadata events +my $default_Wgap = 0.10; #the max gap (in words) between matching ref/sys metadata events +my $default_su_time_limit = 0.50; #the max extent (in seconds) to match for SU's +my $default_su_word_limit = 2.00; #the max extent (in words) to match for SU's +my $default_word_delta_score = 10.0; #the max delta score for word-based DP alignment of ref/sys words +my $default_time_delta_score = 1.00; #the max delta score for time-based DP alignment of ref/sys words + +my $usage = "\n\nUsage: $0 [-h] -r -s \n\n". + "Description: md-eval evaluates EARS metadata detection performance\n". + " by comparing system metadata output data with reference data\n". + "INPUT:\n". + " -R A file containing a list of the reference metadata files\n". + " being evaluated, in RTTM format. If the word-mediated alignment\n". + " option is used then this data must include reference STT data\n". + " in addition to the metadata being evaluated.\n". + " OR\n". + " -r A file containing reference metadata, in RTTM format\n\n". + " -S A file containing a list of the system output metadata\n". + " files to be evaluated, in RTTM format. If the word-mediated\n". + " alignment option is used then this data must include system STT\n". + " output data in addition to the metadata to be evaluated.\n". + " OR\n". + " -s A file containing system output metadata, in RTTM format\n\n". + " input options:\n". + " -x to include complex edits in the analysis and scoring.\n". + " -w for word-mediated alignment.\n". + " * The default (time-mediated) alignment aligns ref and sys metadata\n". + " according to the time overlap of the original ref and sys metadata\n". + " time intervals.\n". + " * Word-mediated alignment aligns ref and sys metadata according to\n". + " the alignment of the words that are subsumed within the metadata\n". + " time intervals.\n". + " -W for word-optimized mapping.\n". + " * The default (time-optimized) mapping maps ref and sys metadata\n". + " so as to maximize the time overlap of mapped metadata events.\n". + " * Word-optimized mapping maps ref and sys metadata so as to\n". + " maximize the overlap in terms of the number of reference words\n". + " that are subsumed within the overlapping time interval.\n". + " -a Conditional analysis options for metadata detection performance:\n". + " c for performance versus channel,\n". + " f for performance versus file,\n". + " g for performance versus gender, and\n". + " s for performance versus speaker.\n". + " -A Conditional analysis options for word coverage performance:\n". + " c for performance versus channel,\n". + " f for performance versus file,\n". + " -t