diff --git a/README.md b/README.md index 3c60db650..f71d0562a 100644 --- a/README.md +++ b/README.md @@ -179,7 +179,7 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision ### Recent Update - 👑 2023.03.09: Add [Wav2vec2ASR-zh](./examples/aishell/asr3). -- 🎉 2023.03.07: Add [TTS ARM Linux C++ Demo](./demos/TTSArmLinux). +- 🎉 2023.03.07: Add [TTS ARM Linux C++ Demo (with C++ Chinese Text Frontend)](./demos/TTSArmLinux). - 🔥 2023.03.03 Add Voice Conversion [StarGANv2-VC synthesize pipeline](./examples/vctk/vc3). - 🎉 2023.02.16: Add [Cantonese TTS](./examples/canton/tts3). - 🔥 2023.01.10: Add [code-switch asr CLI and Demos](./demos/speech_recognition). diff --git a/README_cn.md b/README_cn.md index 29ee387c0..5771d766b 100644 --- a/README_cn.md +++ b/README_cn.md @@ -184,7 +184,7 @@ ### 近期更新 - 👑 2023.03.09: 新增 [Wav2vec2ASR-zh](./examples/aishell/asr3)。 -- 🎉 2023.03.07: 新增 [TTS ARM Linux C++ 部署示例](./demos/TTSArmLinux)。 +- 🎉 2023.03.07: 新增 [TTS ARM Linux C++ 部署示例 (包含 C++ 中文文本前端模块)](./demos/TTSArmLinux)。 - 🔥 2023.03.03: 新增声音转换模型 [StarGANv2-VC 合成流程](./examples/vctk/vc3)。 - 🎉 2023.02.16: 新增[粤语语音合成](./examples/canton/tts3)。 - 🔥 2023.01.10: 新增[中英混合 ASR CLI 和 Demos](./demos/speech_recognition)。 diff --git a/demos/TTSCppFrontend/README.md b/demos/TTSCppFrontend/README.md index 5ea1bda49..98499364d 100644 --- a/demos/TTSCppFrontend/README.md +++ b/demos/TTSCppFrontend/README.md @@ -38,6 +38,7 @@ If the download speed is too slow, you can open [third-party/CMakeLists.txt](thi ``` ## Run +You can change `--phone2id_path` in `./front_demo/front.conf` to the `phone_id_map.txt` of your own acoustic model. You can change `--phone2id_path` in `./front_demo/front.conf` to the `phone_id_map.txt` of your own acoustic model. diff --git a/examples/aishell/asr3/README.md b/examples/aishell/asr3/README.md index f6fa60d7f..6b587e12f 100644 --- a/examples/aishell/asr3/README.md +++ b/examples/aishell/asr3/README.md @@ -190,9 +190,9 @@ tar xzvf wav2vec2ASR-large-aishell1_ckpt_1.4.0.model.tar.gz ``` You can download the audio demo: ```bash -wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/ +wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/ ``` You need to prepare an audio file or use the audio demo above, please confirm the sample rate of the audio is 16K. You can get the result of the audio demo by running the script below. ```bash -CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/wav2vec2ASR.yaml conf/tuning/decode.yaml exp/wav2vec2ASR/checkpoints/avg_1 data/demo_002_en.wav +CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/wav2vec2ASR.yaml conf/tuning/decode.yaml exp/wav2vec2ASR/checkpoints/avg_1 data/demo_01_03.wav ``` diff --git a/examples/aishell/asr3/RESULT.md b/examples/aishell/asr3/RESULT.md index 1291ef15c..42edeac11 100644 --- a/examples/aishell/asr3/RESULT.md +++ b/examples/aishell/asr3/RESULT.md @@ -4,6 +4,7 @@ * paddle version: develop (commit id: daea892c67e85da91906864de40ce9f6f1b893ae) * paddlespeech version: develop (commit id: c14b4238b256693281e59605abff7c9435b3e2b2) +* paddlenlp version: 2.5.2 ## Device * python: 3.7 diff --git a/examples/aishell/asr3/conf/train_with_wav2vec.yaml b/examples/aishell/asr3/conf/train_with_wav2vec.yaml index 77b3762ef..273175d27 100755 --- a/examples/aishell/asr3/conf/train_with_wav2vec.yaml +++ b/examples/aishell/asr3/conf/train_with_wav2vec.yaml @@ -83,7 +83,7 @@ dnn_neurons: 1024 freeze_wav2vec: False dropout: 0.15 -tokenizer: !apply:transformers.BertTokenizer.from_pretrained +tokenizer: !apply:paddlenlp.transformers.AutoTokenizer.from_pretrained pretrained_model_name_or_path: bert-base-chinese # bert-base-chinese tokens length output_neurons: 21128 diff --git a/examples/aishell/asr3/conf/wav2vec2ASR.yaml b/examples/aishell/asr3/conf/wav2vec2ASR.yaml old mode 100755 new mode 100644 index cdb04f8c1..4a1274688 --- a/examples/aishell/asr3/conf/wav2vec2ASR.yaml +++ b/examples/aishell/asr3/conf/wav2vec2ASR.yaml @@ -107,6 +107,7 @@ vocab_filepath: data/lang_char/vocab.txt ########################################### unit_type: 'char' +tokenizer: bert-base-chinese mean_std_filepath: preprocess_config: conf/preprocess.yaml sortagrad: -1 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs @@ -139,12 +140,10 @@ n_epoch: 80 accum_grad: 1 global_grad_clip: 5.0 -model_optim: adadelta +model_optim: sgd model_optim_conf: lr: 1.0 weight_decay: 0.0 - rho: 0.95 - epsilon: 1.0e-8 wav2vec2_optim: adam wav2vec2_optim_conf: @@ -165,3 +164,4 @@ log_interval: 1 checkpoint: kbest_n: 50 latest_n: 5 + diff --git a/examples/aishell/asr3/conf/wav2vec2ASR_adadelta.yaml b/examples/aishell/asr3/conf/wav2vec2ASR_adadelta.yaml new file mode 100755 index 000000000..ec287f0c6 --- /dev/null +++ b/examples/aishell/asr3/conf/wav2vec2ASR_adadelta.yaml @@ -0,0 +1,168 @@ +############################################ +# Network Architecture # +############################################ +freeze_wav2vec2: False +normalize_wav: True +output_norm: True +init_type: 'kaiming_uniform' # !Warning: need to convergence +enc: + input_shape: 1024 + dnn_blocks: 3 + dnn_neurons: 1024 + activation: True + normalization: True + dropout_rate: [0.15, 0.15, 0.0] +ctc: + enc_n_units: 1024 + blank_id: 0 + dropout_rate: 0.0 + +audio_augment: + speeds: [90, 100, 110] + +spec_augment: + time_warp: True + time_warp_window: 5 + time_warp_mode: bicubic + freq_mask: True + n_freq_mask: 2 + time_mask: True + n_time_mask: 2 + replace_with_zero: False + freq_mask_width: 30 + time_mask_width: 40 +wav2vec2_params_path: exp/wav2vec2/chinese-wav2vec2-large.pdparams + + +############################################ +# Wav2Vec2.0 # +############################################ +# vocab_size: 1000000 +hidden_size: 1024 +num_hidden_layers: 24 +num_attention_heads: 16 +intermediate_size: 4096 +hidden_act: gelu +hidden_dropout: 0.1 +activation_dropout: 0.0 +attention_dropout: 0.1 +feat_proj_dropout: 0.1 +feat_quantizer_dropout: 0.0 +final_dropout: 0.0 +layerdrop: 0.1 +initializer_range: 0.02 +layer_norm_eps: 1e-5 +feat_extract_norm: layer +feat_extract_activation: gelu +conv_dim: [512, 512, 512, 512, 512, 512, 512] +conv_stride: [5, 2, 2, 2, 2, 2, 2] +conv_kernel: [10, 3, 3, 3, 3, 2, 2] +conv_bias: True +num_conv_pos_embeddings: 128 +num_conv_pos_embedding_groups: 16 +do_stable_layer_norm: True +apply_spec_augment: False +mask_channel_length: 10 +mask_channel_min_space: 1 +mask_channel_other: 0.0 +mask_channel_prob: 0.0 +mask_channel_selection: static +mask_feature_length: 10 +mask_feature_min_masks: 0 +mask_feature_prob: 0.0 +mask_time_length: 10 +mask_time_min_masks: 2 +mask_time_min_space: 1 +mask_time_other: 0.0 +mask_time_prob: 0.075 +mask_time_selection: static +num_codevectors_per_group: 320 +num_codevector_groups: 2 +contrastive_logits_temperature: 0.1 +num_negatives: 100 +codevector_dim: 256 +proj_codevector_dim: 256 +diversity_loss_weight: 0.1 +use_weighted_layer_sum: False +# pad_token_id: 0 +# bos_token_id: 1 +# eos_token_id: 2 +add_adapter: False +adapter_kernel_size: 3 +adapter_stride: 2 +num_adapter_layers: 3 +output_hidden_size: None + +########################################### +# Data # +########################################### + +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test +vocab_filepath: data/lang_char/vocab.txt + +########################################### +# Dataloader # +########################################### + +unit_type: 'char' +tokenizer: bert-base-chinese +mean_std_filepath: +preprocess_config: conf/preprocess.yaml +sortagrad: -1 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 5 # Different batch_size may cause large differences in results +maxlen_in: 51200000000 # if input length > maxlen-in batchsize is automatically reduced +maxlen_out: 1500000 # if output length > maxlen-out batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 6 +subsampling_factor: 1 +num_encs: 1 +dist_sampler: True +shortest_first: True +return_lens_rate: True + +########################################### +# use speechbrain dataloader # +########################################### +use_sb_pipeline: True # whether use speechbrain pipeline. Default is True. +sb_pipeline_conf: conf/train_with_wav2vec.yaml + +########################################### +# Training # +########################################### +n_epoch: 80 +accum_grad: 1 +global_grad_clip: 5.0 + +model_optim: adadelta +model_optim_conf: + lr: 1.0 + weight_decay: 0.0 + rho: 0.95 + epsilon: 1.0e-8 + +wav2vec2_optim: adam +wav2vec2_optim_conf: + lr: 0.0001 + weight_decay: 0.0 + +model_scheduler: newbobscheduler +model_scheduler_conf: + improvement_threshold: 0.0025 + annealing_factor: 0.8 + patient: 0 +wav2vec2_scheduler: newbobscheduler +wav2vec2_scheduler_conf: + improvement_threshold: 0.0025 + annealing_factor: 0.9 + patient: 0 +log_interval: 1 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/aishell/asr3/local/aishell_prepare.py b/examples/aishell/asr3/local/aishell_prepare.py index a25735791..2a7ba5c6c 100644 --- a/examples/aishell/asr3/local/aishell_prepare.py +++ b/examples/aishell/asr3/local/aishell_prepare.py @@ -21,7 +21,7 @@ import glob import logging import os -from paddlespeech.s2t.models.wav2vec2.io.dataio import read_audio +from paddlespeech.s2t.io.speechbrain.dataio import read_audio logger = logging.getLogger(__name__) diff --git a/examples/aishell/asr3/local/data.sh b/examples/aishell/asr3/local/data.sh index 1a468f546..bd26c1e78 100755 --- a/examples/aishell/asr3/local/data.sh +++ b/examples/aishell/asr3/local/data.sh @@ -1,7 +1,7 @@ #!/bin/bash stage=-1 -stop_stage=-1 +stop_stage=3 dict_dir=data/lang_char . ${MAIN_ROOT}/utils/parse_options.sh || exit -1; diff --git a/examples/aishell/asr3/local/test.sh b/examples/aishell/asr3/local/test.sh index 9d4b84291..91e1c5457 100755 --- a/examples/aishell/asr3/local/test.sh +++ b/examples/aishell/asr3/local/test.sh @@ -8,9 +8,7 @@ echo "using $ngpu gpus..." expdir=exp datadir=data -train_set=train_960 -recog_set="test-clean test-other dev-clean dev-other" -recog_set="test-clean" +train_set=train config_path=$1 decode_config_path=$2 @@ -75,7 +73,7 @@ for type in ctc_prefix_beam_search; do --trans_hyp ${ckpt_prefix}.${type}.rsl.text python3 utils/compute-wer.py --char=1 --v=1 \ - data/manifest.test-clean.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error + data/manifest.test.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error echo "decoding ${type} done." done diff --git a/examples/aishell/asr3/local/test_wav.sh b/examples/aishell/asr3/local/test_wav.sh index fdf3589f4..7ccef6945 100755 --- a/examples/aishell/asr3/local/test_wav.sh +++ b/examples/aishell/asr3/local/test_wav.sh @@ -14,7 +14,7 @@ ckpt_prefix=$3 audio_file=$4 mkdir -p data -wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/ +wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/ if [ $? -ne 0 ]; then exit 1 fi diff --git a/examples/aishell/asr3/run.sh b/examples/aishell/asr3/run.sh index 9b0a3c472..557ca0fcd 100755 --- a/examples/aishell/asr3/run.sh +++ b/examples/aishell/asr3/run.sh @@ -15,11 +15,11 @@ resume= # xx e.g. 30 export FLAGS_cudnn_deterministic=1 . ${MAIN_ROOT}/utils/parse_options.sh || exit 1; -audio_file=data/demo_002_en.wav +audio_file=data/demo_01_03.wav avg_ckpt=avg_${avg_num} ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}') -echo "checkpoint name ${ckpt}"git revert -v +echo "checkpoint name ${ckpt}" if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # prepare data diff --git a/examples/opencpop/README.md b/examples/opencpop/README.md new file mode 100644 index 000000000..5a574dc80 --- /dev/null +++ b/examples/opencpop/README.md @@ -0,0 +1,6 @@ + +# Opencpop + +* svs1 - DiffSinger +* voc1 - Parallel WaveGAN +* voc5 - HiFiGAN diff --git a/examples/opencpop/voc1/README.md b/examples/opencpop/voc1/README.md new file mode 100644 index 000000000..37570a648 --- /dev/null +++ b/examples/opencpop/voc1/README.md @@ -0,0 +1,139 @@ +# Parallel WaveGAN with Opencpop +This example contains code used to train a [parallel wavegan](http://arxiv.org/abs/1910.11480) model with [Mandarin singing corpus](https://wenet.org.cn/opencpop/). + +## Dataset +### Download and Extract +Download Opencpop from it's [Official Website](https://wenet.org.cn/opencpop/download/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/Opencpop`. + +## Get Started +Assume the path to the dataset is `~/datasets/Opencpop`. +Run the command below to +1. **source path**. +2. preprocess the dataset. +3. train the model. +4. synthesize wavs. + - synthesize waveform from `metadata.jsonl`. +```bash +./run.sh +``` +You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage, for example, running the following command will only preprocess the dataset. +```bash +./run.sh --stage 0 --stop-stage 0 +``` +### Data Preprocessing +```bash +./local/preprocess.sh ${conf_path} +``` +When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below. + +```text +dump +├── dev +│ ├── norm +│ └── raw +├── test +│ ├── norm +│ └── raw +└── train + ├── norm + ├── raw + └── feats_stats.npy +``` +The dataset is split into 3 parts, namely `train`, `dev`, and `test`, each of which contains a `norm` and `raw` subfolder. The `raw` folder contains the log magnitude of the mel spectrogram of each utterance, while the norm folder contains the normalized spectrogram. The statistics used to normalize the spectrogram are computed from the training set, which is located in `dump/train/feats_stats.npy`. + +Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains id and paths to the spectrogram of each utterance. + +### Model Training +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} +``` +`./local/train.sh` calls `${BIN_DIR}/train.py`. +Here's the complete help message. + +```text +usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] + [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] + [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER] + [--run-benchmark RUN_BENCHMARK] + [--profiler_options PROFILER_OPTIONS] + +Train a ParallelWaveGAN model. + +optional arguments: + -h, --help show this help message and exit + --config CONFIG ParallelWaveGAN config file. + --train-metadata TRAIN_METADATA + training data. + --dev-metadata DEV_METADATA + dev data. + --output-dir OUTPUT_DIR + output dir. + --ngpu NGPU if ngpu == 0, use cpu. + +benchmark: + arguments related to benchmark. + + --batch-size BATCH_SIZE + batch size. + --max-iter MAX_ITER train max steps. + --run-benchmark RUN_BENCHMARK + runing benchmark or not, if True, use the --batch-size + and --max-iter. + --profiler_options PROFILER_OPTIONS + The option of profiler, which should be in format + "key1=value1;key2=value2;key3=value3". +``` + +1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. +2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. +3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. + +### Synthesizing +`./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`. +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} +``` +```text +usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG] + [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA] + [--output-dir OUTPUT_DIR] [--ngpu NGPU] + +Synthesize with GANVocoder. + +optional arguments: + -h, --help show this help message and exit + --generator-type GENERATOR_TYPE + type of GANVocoder, should in {pwgan, mb_melgan, + style_melgan, } now + --config CONFIG GANVocoder config file. + --checkpoint CHECKPOINT + snapshot to load. + --test-metadata TEST_METADATA + dev data. + --output-dir OUTPUT_DIR + output dir. + --ngpu NGPU if ngpu == 0, use cpu. +``` + +1. `--config` parallel wavegan config file. You should use the same config with which the model is trained. +2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory. +3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory. +4. `--output-dir` is the directory to save the synthesized audio files. +5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. + +## Pretrained Models +The pretrained model can be downloaded here: +- [pwgan_opencpop_ckpt_1.4.0](https://paddlespeech.bj.bcebos.com/t2s/svs/opencpop/pwgan_opencpop_ckpt_1.4.0.zip) + + +Parallel WaveGAN checkpoint contains files listed below. + +```text +pwgan_opencpop_ckpt_1.4.0 +├── default.yaml # default config used to train parallel wavegan +├── snapshot_iter_100000.pdz # generator parameters of parallel wavegan +└── feats_stats.npy # statistics used to normalize spectrogram when training parallel wavegan +``` +## Acknowledgement +We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN. diff --git a/examples/opencpop/voc1/conf/default.yaml b/examples/opencpop/voc1/conf/default.yaml new file mode 100644 index 000000000..ee99719dc --- /dev/null +++ b/examples/opencpop/voc1/conf/default.yaml @@ -0,0 +1,119 @@ +# This is the hyperparameter configuration file for Parallel WaveGAN. +# Please make sure this is adjusted for the CSMSC dataset. If you want to +# apply to the other dataset, you might need to carefully change some parameters. +# This configuration requires 12 GB GPU memory and takes ~3 days on RTX TITAN. + +########################################################### +# FEATURE EXTRACTION SETTING # +########################################################### +fs: 24000 # Sampling rate. +n_fft: 512 # FFT size (samples). +n_shift: 128 # Hop size (samples). 12.5ms +win_length: 512 # Window length (samples). 50ms + # If set to null, it will be the same as fft_size. +window: "hann" # Window function. +n_mels: 80 # Number of mel basis. +fmin: 30 # Minimum freq in mel basis calculation. (Hz) +fmax: 12000 # Maximum frequency in mel basis calculation. (Hz) + + +########################################################### +# GENERATOR NETWORK ARCHITECTURE SETTING # +########################################################### +generator_params: + in_channels: 1 # Number of input channels. + out_channels: 1 # Number of output channels. + kernel_size: 3 # Kernel size of dilated convolution. + layers: 30 # Number of residual block layers. + stacks: 3 # Number of stacks i.e., dilation cycles. + residual_channels: 64 # Number of channels in residual conv. + gate_channels: 128 # Number of channels in gated conv. + skip_channels: 64 # Number of channels in skip conv. + aux_channels: 80 # Number of channels for auxiliary feature conv. + # Must be the same as num_mels. + aux_context_window: 2 # Context window size for auxiliary feature. + # If set to 2, previous 2 and future 2 frames will be considered. + dropout: 0.0 # Dropout rate. 0.0 means no dropout applied. + bias: True # use bias in residual blocks + use_weight_norm: True # Whether to use weight norm. + # If set to true, it will be applied to all of the conv layers. + use_causal_conv: False # use causal conv in residual blocks and upsample layers + upsample_scales: [8, 4, 2, 2] # Upsampling scales. Prodcut of these must be the same as hop size. + interpolate_mode: "nearest" # upsample net interpolate mode + freq_axis_kernel_size: 1 # upsamling net: convolution kernel size in frequencey axis + nonlinear_activation: null + nonlinear_activation_params: {} + +########################################################### +# DISCRIMINATOR NETWORK ARCHITECTURE SETTING # +########################################################### +discriminator_params: + in_channels: 1 # Number of input channels. + out_channels: 1 # Number of output channels. + kernel_size: 3 # Number of output channels. + layers: 10 # Number of conv layers. + conv_channels: 64 # Number of chnn layers. + bias: True # Whether to use bias parameter in conv. + use_weight_norm: True # Whether to use weight norm. + # If set to true, it will be applied to all of the conv layers. + nonlinear_activation: "leakyrelu" # Nonlinear function after each conv. + nonlinear_activation_params: # Nonlinear function parameters + negative_slope: 0.2 # Alpha in leakyrelu. + +########################################################### +# STFT LOSS SETTING # +########################################################### +stft_loss_params: + fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss. + hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss + win_lengths: [600, 1200, 240] # List of window length for STFT-based loss. + window: "hann" # Window function for STFT-based loss + +########################################################### +# ADVERSARIAL LOSS SETTING # +########################################################### +lambda_adv: 4.0 # Loss balancing coefficient. + +########################################################### +# DATA LOADER SETTING # +########################################################### +batch_size: 8 # Batch size. +batch_max_steps: 25500 # Length of each audio in batch. Make sure dividable by n_shift. +num_workers: 1 # Number of workers in DataLoader. + +########################################################### +# OPTIMIZER & SCHEDULER SETTING # +########################################################### +generator_optimizer_params: + epsilon: 1.0e-6 # Generator's epsilon. + weight_decay: 0.0 # Generator's weight decay coefficient. +generator_scheduler_params: + learning_rate: 0.0001 # Generator's learning rate. + step_size: 200000 # Generator's scheduler step size. + gamma: 0.5 # Generator's scheduler gamma. + # At each step size, lr will be multiplied by this parameter. +generator_grad_norm: 10 # Generator's gradient norm. +discriminator_optimizer_params: + epsilon: 1.0e-6 # Discriminator's epsilon. + weight_decay: 0.0 # Discriminator's weight decay coefficient. +discriminator_scheduler_params: + learning_rate: 0.00005 # Discriminator's learning rate. + step_size: 200000 # Discriminator's scheduler step size. + gamma: 0.5 # Discriminator's scheduler gamma. + # At each step size, lr will be multiplied by this parameter. +discriminator_grad_norm: 1 # Discriminator's gradient norm. + +########################################################### +# INTERVAL SETTING # +########################################################### +discriminator_train_start_steps: 100000 # Number of steps to start to train discriminator. +train_max_steps: 400000 # Number of training steps. +save_interval_steps: 5000 # Interval steps to save checkpoint. +eval_interval_steps: 1000 # Interval steps to evaluate the network. + +########################################################### +# OTHER SETTING # +########################################################### +num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. +num_snapshots: 10 # max number of snapshots to keep while training +seed: 42 # random seed for paddle, random, and np.random diff --git a/examples/opencpop/voc1/local/PTQ_static.sh b/examples/opencpop/voc1/local/PTQ_static.sh new file mode 120000 index 000000000..247ce5c74 --- /dev/null +++ b/examples/opencpop/voc1/local/PTQ_static.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/PTQ_static.sh \ No newline at end of file diff --git a/examples/opencpop/voc1/local/dygraph_to_static.sh b/examples/opencpop/voc1/local/dygraph_to_static.sh new file mode 100755 index 000000000..40a2c51ba --- /dev/null +++ b/examples/opencpop/voc1/local/dygraph_to_static.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/../../dygraph_to_static.py \ + --type=voc \ + --voc=pwgan_opencpop \ + --voc_config=${config_path} \ + --voc_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --voc_stat=dump/train/feats_stats.npy \ + --inference_dir=exp/default/inference/ diff --git a/examples/opencpop/voc1/local/preprocess.sh b/examples/opencpop/voc1/local/preprocess.sh new file mode 100755 index 000000000..edab4d0d5 --- /dev/null +++ b/examples/opencpop/voc1/local/preprocess.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +stage=0 +stop_stage=100 + +config_path=$1 + + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # extract features + echo "Extract features ..." + python3 ${BIN_DIR}/../preprocess.py \ + --rootdir=~/datasets/Opencpop/segments/ \ + --dataset=opencpop \ + --dumpdir=dump \ + --dur-file=~/datasets/Opencpop/segments/transcriptions.txt \ + --config=${config_path} \ + --cut-sil=False \ + --num-cpu=20 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # get features' stats(mean and std) + echo "Get features' stats ..." + python3 ${MAIN_ROOT}/utils/compute_statistics.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --field-name="feats" +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # normalize, dev and test should use train's stats + echo "Normalize ..." + + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --dumpdir=dump/train/norm \ + --stats=dump/train/feats_stats.npy + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump/dev/raw/metadata.jsonl \ + --dumpdir=dump/dev/norm \ + --stats=dump/train/feats_stats.npy + + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump/test/raw/metadata.jsonl \ + --dumpdir=dump/test/norm \ + --stats=dump/train/feats_stats.npy +fi diff --git a/examples/opencpop/voc1/local/synthesize.sh b/examples/opencpop/voc1/local/synthesize.sh new file mode 120000 index 000000000..d6aecd8d1 --- /dev/null +++ b/examples/opencpop/voc1/local/synthesize.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/synthesize.sh \ No newline at end of file diff --git a/examples/opencpop/voc1/local/train.sh b/examples/opencpop/voc1/local/train.sh new file mode 120000 index 000000000..2942893d2 --- /dev/null +++ b/examples/opencpop/voc1/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/train.sh \ No newline at end of file diff --git a/examples/opencpop/voc1/path.sh b/examples/opencpop/voc1/path.sh new file mode 120000 index 000000000..b7ed4fb8f --- /dev/null +++ b/examples/opencpop/voc1/path.sh @@ -0,0 +1 @@ +../../csmsc/voc1/path.sh \ No newline at end of file diff --git a/examples/opencpop/voc1/run.sh b/examples/opencpop/voc1/run.sh new file mode 100755 index 000000000..1f87425f4 --- /dev/null +++ b/examples/opencpop/voc1/run.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +set -e +source path.sh + +gpus=0 +stage=0 +stop_stage=100 + +conf_path=conf/default.yaml +train_output_path=exp/default +ckpt_name=snapshot_iter_100000.pdz + +# with the following command, you can choose the stage range you want to run +# such as `./run.sh --stage 0 --stop-stage 0` +# this can not be mixed use with `$1`, `$2` ... +source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + ./local/preprocess.sh ${conf_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # synthesize + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +# dygraph to static +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + CUDA_VISIBLE_DEVICES=${gpus} ./local/dygraph_to_static.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +# PTQ_static +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} pwgan_opencpop || exit -1 +fi diff --git a/examples/opencpop/voc5/conf/default.yaml b/examples/opencpop/voc5/conf/default.yaml new file mode 100644 index 000000000..10449f860 --- /dev/null +++ b/examples/opencpop/voc5/conf/default.yaml @@ -0,0 +1,167 @@ +# This is the configuration file for CSMSC dataset. +# This configuration is based on HiFiGAN V1, which is an official configuration. +# But I found that the optimizer setting does not work well with my implementation. +# So I changed optimizer settings as follows: +# - AdamW -> Adam +# - betas: [0.8, 0.99] -> betas: [0.5, 0.9] +# - Scheduler: ExponentialLR -> MultiStepLR +# To match the shift size difference, the upsample scales is also modified from the original 256 shift setting. + +########################################################### +# FEATURE EXTRACTION SETTING # +########################################################### +fs: 24000 # Sampling rate. +n_fft: 512 # FFT size (samples). +n_shift: 128 # Hop size (samples). 12.5ms +win_length: 512 # Window length (samples). 50ms + # If set to null, it will be the same as fft_size. +window: "hann" # Window function. +n_mels: 80 # Number of mel basis. +fmin: 80 # Minimum freq in mel basis calculation. (Hz) +fmax: 12000 # Maximum frequency in mel basis calculation. (Hz) + +########################################################### +# GENERATOR NETWORK ARCHITECTURE SETTING # +########################################################### +generator_params: + in_channels: 80 # Number of input channels. + out_channels: 1 # Number of output channels. + channels: 512 # Number of initial channels. + kernel_size: 7 # Kernel size of initial and final conv layers. + upsample_scales: [8, 4, 2, 2] # Upsampling scales. + upsample_kernel_sizes: [16, 8, 4, 4] # Kernel size for upsampling layers. + resblock_kernel_sizes: [3, 7, 11] # Kernel size for residual blocks. + resblock_dilations: # Dilations for residual blocks. + - [1, 3, 5] + - [1, 3, 5] + - [1, 3, 5] + use_additional_convs: True # Whether to use additional conv layer in residual blocks. + bias: True # Whether to use bias parameter in conv. + nonlinear_activation: "leakyrelu" # Nonlinear activation type. + nonlinear_activation_params: # Nonlinear activation paramters. + negative_slope: 0.1 + use_weight_norm: True # Whether to apply weight normalization. + + +########################################################### +# DISCRIMINATOR NETWORK ARCHITECTURE SETTING # +########################################################### +discriminator_params: + scales: 3 # Number of multi-scale discriminator. + scale_downsample_pooling: "AvgPool1D" # Pooling operation for scale discriminator. + scale_downsample_pooling_params: + kernel_size: 4 # Pooling kernel size. + stride: 2 # Pooling stride. + padding: 2 # Padding size. + scale_discriminator_params: + in_channels: 1 # Number of input channels. + out_channels: 1 # Number of output channels. + kernel_sizes: [15, 41, 5, 3] # List of kernel sizes. + channels: 128 # Initial number of channels. + max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. + max_groups: 16 # Maximum number of groups in downsampling conv layers. + bias: True + downsample_scales: [4, 4, 4, 4, 1] # Downsampling scales. + nonlinear_activation: "leakyrelu" # Nonlinear activation. + nonlinear_activation_params: + negative_slope: 0.1 + follow_official_norm: True # Whether to follow the official norm setting. + periods: [2, 3, 5, 7, 11] # List of period for multi-period discriminator. + period_discriminator_params: + in_channels: 1 # Number of input channels. + out_channels: 1 # Number of output channels. + kernel_sizes: [5, 3] # List of kernel sizes. + channels: 32 # Initial number of channels. + downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales. + max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. + bias: True # Whether to use bias parameter in conv layer." + nonlinear_activation: "leakyrelu" # Nonlinear activation. + nonlinear_activation_params: # Nonlinear activation paramters. + negative_slope: 0.1 + use_weight_norm: True # Whether to apply weight normalization. + use_spectral_norm: False # Whether to apply spectral normalization. + + +########################################################### +# STFT LOSS SETTING # +########################################################### +use_stft_loss: False # Whether to use multi-resolution STFT loss. +use_mel_loss: True # Whether to use Mel-spectrogram loss. +mel_loss_params: + fs: 24000 + fft_size: 512 + hop_size: 128 + win_length: 512 + window: "hann" + num_mels: 80 + fmin: 30 + fmax: 12000 + log_base: null +generator_adv_loss_params: + average_by_discriminators: False # Whether to average loss by #discriminators. +discriminator_adv_loss_params: + average_by_discriminators: False # Whether to average loss by #discriminators. +use_feat_match_loss: True +feat_match_loss_params: + average_by_discriminators: False # Whether to average loss by #discriminators. + average_by_layers: False # Whether to average loss by #layers in each discriminator. + include_final_outputs: False # Whether to include final outputs in feat match loss calculation. + +########################################################### +# ADVERSARIAL LOSS SETTING # +########################################################### +lambda_aux: 45.0 # Loss balancing coefficient for STFT loss. +lambda_adv: 1.0 # Loss balancing coefficient for adversarial loss. +lambda_feat_match: 2.0 # Loss balancing coefficient for feat match loss.. + +########################################################### +# DATA LOADER SETTING # +########################################################### +batch_size: 16 # Batch size. +batch_max_steps: 8400 # Length of each audio in batch. Make sure dividable by hop_size. +num_workers: 1 # Number of workers in DataLoader. + +########################################################### +# OPTIMIZER & SCHEDULER SETTING # +########################################################### +generator_optimizer_params: + beta1: 0.5 + beta2: 0.9 + weight_decay: 0.0 # Generator's weight decay coefficient. +generator_scheduler_params: + learning_rate: 2.0e-4 # Generator's learning rate. + gamma: 0.5 # Generator's scheduler gamma. + milestones: # At each milestone, lr will be multiplied by gamma. + - 200000 + - 400000 + - 600000 + - 800000 +generator_grad_norm: -1 # Generator's gradient norm. +discriminator_optimizer_params: + beta1: 0.5 + beta2: 0.9 + weight_decay: 0.0 # Discriminator's weight decay coefficient. +discriminator_scheduler_params: + learning_rate: 2.0e-4 # Discriminator's learning rate. + gamma: 0.5 # Discriminator's scheduler gamma. + milestones: # At each milestone, lr will be multiplied by gamma. + - 200000 + - 400000 + - 600000 + - 800000 +discriminator_grad_norm: -1 # Discriminator's gradient norm. + +########################################################### +# INTERVAL SETTING # +########################################################### +generator_train_start_steps: 1 # Number of steps to start to train discriminator. +discriminator_train_start_steps: 0 # Number of steps to start to train discriminator. +train_max_steps: 2500000 # Number of training steps. +save_interval_steps: 5000 # Interval steps to save checkpoint. +eval_interval_steps: 1000 # Interval steps to evaluate the network. + +########################################################### +# OTHER SETTING # +########################################################### +num_snapshots: 4 # max number of snapshots to keep while training +seed: 42 # random seed for paddle, random, and np.random diff --git a/examples/opencpop/voc5/conf/finetune.yaml b/examples/opencpop/voc5/conf/finetune.yaml new file mode 100644 index 000000000..0022a67aa --- /dev/null +++ b/examples/opencpop/voc5/conf/finetune.yaml @@ -0,0 +1,168 @@ +# This is the configuration file for CSMSC dataset. +# This configuration is based on HiFiGAN V1, which is an official configuration. +# But I found that the optimizer setting does not work well with my implementation. +# So I changed optimizer settings as follows: +# - AdamW -> Adam +# - betas: [0.8, 0.99] -> betas: [0.5, 0.9] +# - Scheduler: ExponentialLR -> MultiStepLR +# To match the shift size difference, the upsample scales is also modified from the original 256 shift setting. + +########################################################### +# FEATURE EXTRACTION SETTING # +########################################################### +fs: 24000 # Sampling rate. +n_fft: 512 # FFT size (samples). +n_shift: 128 # Hop size (samples). 12.5ms +win_length: 512 # Window length (samples). 50ms + # If set to null, it will be the same as fft_size. +window: "hann" # Window function. +n_mels: 80 # Number of mel basis. +fmin: 80 # Minimum freq in mel basis calculation. (Hz) +fmax: 12000 # Maximum frequency in mel basis calculation. (Hz) + +########################################################### +# GENERATOR NETWORK ARCHITECTURE SETTING # +########################################################### +generator_params: + in_channels: 80 # Number of input channels. + out_channels: 1 # Number of output channels. + channels: 512 # Number of initial channels. + kernel_size: 7 # Kernel size of initial and final conv layers. + upsample_scales: [8, 4, 2, 2] # Upsampling scales. + upsample_kernel_sizes: [16, 8, 4, 4] # Kernel size for upsampling layers. + resblock_kernel_sizes: [3, 7, 11] # Kernel size for residual blocks. + resblock_dilations: # Dilations for residual blocks. + - [1, 3, 5] + - [1, 3, 5] + - [1, 3, 5] + use_additional_convs: True # Whether to use additional conv layer in residual blocks. + bias: True # Whether to use bias parameter in conv. + nonlinear_activation: "leakyrelu" # Nonlinear activation type. + nonlinear_activation_params: # Nonlinear activation paramters. + negative_slope: 0.1 + use_weight_norm: True # Whether to apply weight normalization. + + +########################################################### +# DISCRIMINATOR NETWORK ARCHITECTURE SETTING # +########################################################### +discriminator_params: + scales: 3 # Number of multi-scale discriminator. + scale_downsample_pooling: "AvgPool1D" # Pooling operation for scale discriminator. + scale_downsample_pooling_params: + kernel_size: 4 # Pooling kernel size. + stride: 2 # Pooling stride. + padding: 2 # Padding size. + scale_discriminator_params: + in_channels: 1 # Number of input channels. + out_channels: 1 # Number of output channels. + kernel_sizes: [15, 41, 5, 3] # List of kernel sizes. + channels: 128 # Initial number of channels. + max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. + max_groups: 16 # Maximum number of groups in downsampling conv layers. + bias: True + downsample_scales: [4, 4, 4, 4, 1] # Downsampling scales. + nonlinear_activation: "leakyrelu" # Nonlinear activation. + nonlinear_activation_params: + negative_slope: 0.1 + follow_official_norm: True # Whether to follow the official norm setting. + periods: [2, 3, 5, 7, 11] # List of period for multi-period discriminator. + period_discriminator_params: + in_channels: 1 # Number of input channels. + out_channels: 1 # Number of output channels. + kernel_sizes: [5, 3] # List of kernel sizes. + channels: 32 # Initial number of channels. + downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales. + max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. + bias: True # Whether to use bias parameter in conv layer." + nonlinear_activation: "leakyrelu" # Nonlinear activation. + nonlinear_activation_params: # Nonlinear activation paramters. + negative_slope: 0.1 + use_weight_norm: True # Whether to apply weight normalization. + use_spectral_norm: False # Whether to apply spectral normalization. + + +########################################################### +# STFT LOSS SETTING # +########################################################### +use_stft_loss: False # Whether to use multi-resolution STFT loss. +use_mel_loss: True # Whether to use Mel-spectrogram loss. +mel_loss_params: + fs: 24000 + fft_size: 512 + hop_size: 128 + win_length: 512 + window: "hann" + num_mels: 80 + fmin: 30 + fmax: 12000 + log_base: null +generator_adv_loss_params: + average_by_discriminators: False # Whether to average loss by #discriminators. +discriminator_adv_loss_params: + average_by_discriminators: False # Whether to average loss by #discriminators. +use_feat_match_loss: True +feat_match_loss_params: + average_by_discriminators: False # Whether to average loss by #discriminators. + average_by_layers: False # Whether to average loss by #layers in each discriminator. + include_final_outputs: False # Whether to include final outputs in feat match loss calculation. + +########################################################### +# ADVERSARIAL LOSS SETTING # +########################################################### +lambda_aux: 45.0 # Loss balancing coefficient for STFT loss. +lambda_adv: 1.0 # Loss balancing coefficient for adversarial loss. +lambda_feat_match: 2.0 # Loss balancing coefficient for feat match loss.. + +########################################################### +# DATA LOADER SETTING # +########################################################### +#batch_size: 16 # Batch size. +batch_size: 1 # Batch size. +batch_max_steps: 8400 # Length of each audio in batch. Make sure dividable by hop_size. +num_workers: 1 # Number of workers in DataLoader. + +########################################################### +# OPTIMIZER & SCHEDULER SETTING # +########################################################### +generator_optimizer_params: + beta1: 0.5 + beta2: 0.9 + weight_decay: 0.0 # Generator's weight decay coefficient. +generator_scheduler_params: + learning_rate: 2.0e-4 # Generator's learning rate. + gamma: 0.5 # Generator's scheduler gamma. + milestones: # At each milestone, lr will be multiplied by gamma. + - 200000 + - 400000 + - 600000 + - 800000 +generator_grad_norm: -1 # Generator's gradient norm. +discriminator_optimizer_params: + beta1: 0.5 + beta2: 0.9 + weight_decay: 0.0 # Discriminator's weight decay coefficient. +discriminator_scheduler_params: + learning_rate: 2.0e-4 # Discriminator's learning rate. + gamma: 0.5 # Discriminator's scheduler gamma. + milestones: # At each milestone, lr will be multiplied by gamma. + - 200000 + - 400000 + - 600000 + - 800000 +discriminator_grad_norm: -1 # Discriminator's gradient norm. + +########################################################### +# INTERVAL SETTING # +########################################################### +generator_train_start_steps: 1 # Number of steps to start to train discriminator. +discriminator_train_start_steps: 0 # Number of steps to start to train discriminator. +train_max_steps: 2600000 # Number of training steps. +save_interval_steps: 5000 # Interval steps to save checkpoint. +eval_interval_steps: 1000 # Interval steps to evaluate the network. + +########################################################### +# OTHER SETTING # +########################################################### +num_snapshots: 4 # max number of snapshots to keep while training +seed: 42 # random seed for paddle, random, and np.random diff --git a/examples/opencpop/voc5/finetune.sh b/examples/opencpop/voc5/finetune.sh new file mode 100755 index 000000000..76f363295 --- /dev/null +++ b/examples/opencpop/voc5/finetune.sh @@ -0,0 +1,74 @@ +#!/bin/bash + +source path.sh + +gpus=0 +stage=0 +stop_stage=100 + +source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + python3 ${MAIN_ROOT}/paddlespeech/t2s/exps/diffsinger/gen_gta_mel.py \ + --diffsinger-config=diffsinger_opencpop_ckpt_1.4.0/default.yaml \ + --diffsinger-checkpoint=diffsinger_opencpop_ckpt_1.4.0/snapshot_iter_160000.pdz \ + --diffsinger-stat=diffsinger_opencpop_ckpt_1.4.0/speech_stats.npy \ + --diffsinger-stretch=diffsinger_opencpop_ckpt_1.4.0/speech_stretchs.npy \ + --dur-file=~/datasets/Opencpop/segments/transcriptions.txt \ + --output-dir=dump_finetune \ + --phones-dict=diffsinger_opencpop_ckpt_1.4.0/phone_id_map.txt \ + --dataset=opencpop \ + --rootdir=~/datasets/Opencpop/segments/ +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + python3 ${MAIN_ROOT}/utils/link_wav.py \ + --old-dump-dir=dump \ + --dump-dir=dump_finetune +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # get features' stats(mean and std) + echo "Get features' stats ..." + cp dump/train/feats_stats.npy dump_finetune/train/ +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # normalize, dev and test should use train's stats + echo "Normalize ..." + + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump_finetune/train/raw/metadata.jsonl \ + --dumpdir=dump_finetune/train/norm \ + --stats=dump_finetune/train/feats_stats.npy + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump_finetune/dev/raw/metadata.jsonl \ + --dumpdir=dump_finetune/dev/norm \ + --stats=dump_finetune/train/feats_stats.npy + + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump_finetune/test/raw/metadata.jsonl \ + --dumpdir=dump_finetune/test/norm \ + --stats=dump_finetune/train/feats_stats.npy +fi + +# create finetune env +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + echo "create finetune env" + python3 local/prepare_env.py \ + --pretrained_model_dir=exp/default/checkpoints/ \ + --output_dir=exp/finetune/ +fi + +# finetune +if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then + CUDA_VISIBLE_DEVICES=${gpus} \ + FLAGS_cudnn_exhaustive_search=true \ + FLAGS_conv_workspace_size_limit=4000 \ + python ${BIN_DIR}/train.py \ + --train-metadata=dump_finetune/train/norm/metadata.jsonl \ + --dev-metadata=dump_finetune/dev/norm/metadata.jsonl \ + --config=conf/finetune.yaml \ + --output-dir=exp/finetune \ + --ngpu=1 +fi diff --git a/examples/opencpop/voc5/local/PTQ_static.sh b/examples/opencpop/voc5/local/PTQ_static.sh new file mode 120000 index 000000000..247ce5c74 --- /dev/null +++ b/examples/opencpop/voc5/local/PTQ_static.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/PTQ_static.sh \ No newline at end of file diff --git a/examples/opencpop/voc5/local/dygraph_to_static.sh b/examples/opencpop/voc5/local/dygraph_to_static.sh new file mode 100755 index 000000000..65077661a --- /dev/null +++ b/examples/opencpop/voc5/local/dygraph_to_static.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/../../dygraph_to_static.py \ + --type=voc \ + --voc=hifigan_opencpop \ + --voc_config=${config_path} \ + --voc_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --voc_stat=dump/train/feats_stats.npy \ + --inference_dir=exp/default/inference/ diff --git a/examples/opencpop/voc5/local/prepare_env.py b/examples/opencpop/voc5/local/prepare_env.py new file mode 120000 index 000000000..be03c86b3 --- /dev/null +++ b/examples/opencpop/voc5/local/prepare_env.py @@ -0,0 +1 @@ +../../../other/tts_finetune/tts3/local/prepare_env.py \ No newline at end of file diff --git a/examples/opencpop/voc5/local/preprocess.sh b/examples/opencpop/voc5/local/preprocess.sh new file mode 120000 index 000000000..f0cb24de9 --- /dev/null +++ b/examples/opencpop/voc5/local/preprocess.sh @@ -0,0 +1 @@ +../../voc1/local/preprocess.sh \ No newline at end of file diff --git a/examples/opencpop/voc5/local/synthesize.sh b/examples/opencpop/voc5/local/synthesize.sh new file mode 120000 index 000000000..c887112c0 --- /dev/null +++ b/examples/opencpop/voc5/local/synthesize.sh @@ -0,0 +1 @@ +../../../csmsc/voc5/local/synthesize.sh \ No newline at end of file diff --git a/examples/opencpop/voc5/local/train.sh b/examples/opencpop/voc5/local/train.sh new file mode 120000 index 000000000..2942893d2 --- /dev/null +++ b/examples/opencpop/voc5/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/train.sh \ No newline at end of file diff --git a/examples/opencpop/voc5/path.sh b/examples/opencpop/voc5/path.sh new file mode 120000 index 000000000..b67fe2b39 --- /dev/null +++ b/examples/opencpop/voc5/path.sh @@ -0,0 +1 @@ +../../csmsc/voc5/path.sh \ No newline at end of file diff --git a/examples/opencpop/voc5/run.sh b/examples/opencpop/voc5/run.sh new file mode 100755 index 000000000..290c90d25 --- /dev/null +++ b/examples/opencpop/voc5/run.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +set -e +source path.sh + +gpus=0 +stage=0 +stop_stage=100 + +conf_path=conf/default.yaml +train_output_path=exp/default +ckpt_name=snapshot_iter_2500000.pdz + +# with the following command, you can choose the stage range you want to run +# such as `./run.sh --stage 0 --stop-stage 0` +# this can not be mixed use with `$1`, `$2` ... +source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + ./local/preprocess.sh ${conf_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # synthesize + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +# dygraph to static +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + CUDA_VISIBLE_DEVICES=${gpus} ./local/dygraph_to_static.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +# PTQ_static +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} hifigan_opencpop || exit -1 +fi diff --git a/examples/other/tn/data/textnorm_test_cases.txt b/examples/other/tn/data/textnorm_test_cases.txt index 17e90d0b6..ba9e6529a 100644 --- a/examples/other/tn/data/textnorm_test_cases.txt +++ b/examples/other/tn/data/textnorm_test_cases.txt @@ -32,7 +32,7 @@ iPad Pro的秒控键盘这次也推出白色版本。|iPad Pro的秒控键盘这 明天有62%的概率降雨|明天有百分之六十二的概率降雨 这是固话0421-33441122|这是固话零四二一三三四四一一二二 这是手机+86 18544139121|这是手机八六一八五四四一三九一二一 -小王的身高是153.5cm,梦想是打篮球!我觉得有0.1%的可能性。|小王的身高是一百五十三点五cm,梦想是打篮球!我觉得有百分之零点一的可能性。 +小王的身高是153.5cm,梦想是打篮球!我觉得有0.1%的可能性。|小王的身高是一百五十三点五厘米,梦想是打篮球!我觉得有百分之零点一的可能性。 不管三七二十一|不管三七二十一 九九八十一难|九九八十一难 2018年5月23号上午10点10分|二零一八年五月二十三号上午十点十分 @@ -124,4 +124,4 @@ iPad Pro的秒控键盘这次也推出白色版本。|iPad Pro的秒控键盘这 12~23|十二到二十三 12-23|十二到二十三 25cm²|二十五平方厘米 -25m|米 \ No newline at end of file +25m|米 diff --git a/paddlespeech/resource/pretrained_models.py b/paddlespeech/resource/pretrained_models.py index 04df18623..3c5db64bb 100644 --- a/paddlespeech/resource/pretrained_models.py +++ b/paddlespeech/resource/pretrained_models.py @@ -102,13 +102,11 @@ ssl_dynamic_pretrained_models = { 'params': 'exp/wav2vec2ASR/checkpoints/avg_1.pdparams', }, - }, - "wav2vec2ASR_aishell1-zh-16k": { '1.4': { 'url': 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr3/wav2vec2ASR-large-aishell1_ckpt_1.4.0.model.tar.gz', 'md5': - '9f0bc943adb822789bf61e674b229d17', + '150e51b8ea5d255ccce6b395de8d916a', 'cfg_path': 'model.yaml', 'ckpt_path': diff --git a/paddlespeech/s2t/exps/wav2vec2/bin/test_wav.py b/paddlespeech/s2t/exps/wav2vec2/bin/test_wav.py index 0d66ac410..2416db7ee 100644 --- a/paddlespeech/s2t/exps/wav2vec2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/wav2vec2/bin/test_wav.py @@ -18,6 +18,7 @@ from pathlib import Path import paddle import soundfile +from paddlenlp.transformers import AutoTokenizer from yacs.config import CfgNode from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer @@ -34,8 +35,13 @@ class Wav2vec2Infer(): self.config = config self.audio_file = args.audio_file - self.text_feature = TextFeaturizer( - unit_type=config.unit_type, vocab=config.vocab_filepath) + if self.config.tokenizer: + self.text_feature = AutoTokenizer.from_pretrained( + self.config.tokenizer) + else: + self.text_feature = TextFeaturizer( + unit_type=config.unit_type, vocab=config.vocab_filepath) + paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu') # model @@ -59,14 +65,14 @@ class Wav2vec2Infer(): audio, _ = soundfile.read( self.audio_file, dtype="int16", always_2d=True) logger.info(f"audio shape: {audio.shape}") - xs = paddle.to_tensor(audio, dtype='float32').unsqueeze(axis=0) decode_config = self.config.decode result_transcripts, result_tokenids = self.model.decode( xs, text_feature=self.text_feature, decoding_method=decode_config.decoding_method, - beam_size=decode_config.beam_size) + beam_size=decode_config.beam_size, + tokenizer=self.config.tokenizer, ) rsl = result_transcripts[0] utt = Path(self.audio_file).name logger.info(f"hyp: {utt} {rsl}") diff --git a/paddlespeech/t2s/exps/PTQ_static.py b/paddlespeech/t2s/exps/PTQ_static.py index 16b3ae983..a95786450 100644 --- a/paddlespeech/t2s/exps/PTQ_static.py +++ b/paddlespeech/t2s/exps/PTQ_static.py @@ -42,6 +42,8 @@ def parse_args(): 'hifigan_aishell3', 'hifigan_ljspeech', 'hifigan_vctk', + 'pwgan_opencpop', + 'hifigan_opencpop', ], help='Choose model type of tts task.') diff --git a/paddlespeech/t2s/exps/diffsinger/gen_gta_mel.py b/paddlespeech/t2s/exps/diffsinger/gen_gta_mel.py new file mode 100644 index 000000000..519808f2a --- /dev/null +++ b/paddlespeech/t2s/exps/diffsinger/gen_gta_mel.py @@ -0,0 +1,240 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# generate mels using durations.txt +# for mb melgan finetune +import argparse +import os +from pathlib import Path + +import numpy as np +import paddle +import yaml +from tqdm import tqdm +from yacs.config import CfgNode + +from paddlespeech.t2s.datasets.preprocess_utils import get_sentences_svs +from paddlespeech.t2s.models.diffsinger import DiffSinger +from paddlespeech.t2s.models.diffsinger import DiffSingerInference +from paddlespeech.t2s.modules.normalizer import ZScore +from paddlespeech.t2s.utils import str2bool + + +def evaluate(args, diffsinger_config): + rootdir = Path(args.rootdir).expanduser() + assert rootdir.is_dir() + + # construct dataset for evaluation + with open(args.phones_dict, "r") as f: + phn_id = [line.strip().split() for line in f.readlines()] + vocab_size = len(phn_id) + print("vocab_size:", vocab_size) + + phone_dict = {} + for phn, id in phn_id: + phone_dict[phn] = int(id) + + if args.speaker_dict: + with open(args.speaker_dict, 'rt') as f: + spk_id_list = [line.strip().split() for line in f.readlines()] + spk_num = len(spk_id_list) + else: + spk_num = None + + with open(args.diffsinger_stretch, "r") as f: + spec_min = np.load(args.diffsinger_stretch)[0] + spec_max = np.load(args.diffsinger_stretch)[1] + spec_min = paddle.to_tensor(spec_min) + spec_max = paddle.to_tensor(spec_max) + print("min and max spec done!") + + odim = diffsinger_config.n_mels + diffsinger_config["model"]["fastspeech2_params"]["spk_num"] = spk_num + model = DiffSinger( + spec_min=spec_min, + spec_max=spec_max, + idim=vocab_size, + odim=odim, + **diffsinger_config["model"], ) + + model.set_state_dict(paddle.load(args.diffsinger_checkpoint)["main_params"]) + model.eval() + + stat = np.load(args.diffsinger_stat) + mu, std = stat + mu = paddle.to_tensor(mu) + std = paddle.to_tensor(std) + diffsinger_normalizer = ZScore(mu, std) + + diffsinger_inference = DiffSingerInference(diffsinger_normalizer, model) + diffsinger_inference.eval() + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + sentences, speaker_set = get_sentences_svs( + args.dur_file, + dataset=args.dataset, + sample_rate=diffsinger_config.fs, + n_shift=diffsinger_config.n_shift, ) + + if args.dataset == "opencpop": + wavdir = rootdir / "wavs" + # split data into 3 sections + train_file = rootdir / "train.txt" + train_wav_files = [] + with open(train_file, "r") as f_train: + for line in f_train.readlines(): + utt = line.split("|")[0] + wav_name = utt + ".wav" + wav_path = wavdir / wav_name + train_wav_files.append(wav_path) + + test_file = rootdir / "test.txt" + dev_wav_files = [] + test_wav_files = [] + num_dev = 106 + count = 0 + with open(test_file, "r") as f_test: + for line in f_test.readlines(): + count += 1 + utt = line.split("|")[0] + wav_name = utt + ".wav" + wav_path = wavdir / wav_name + if count > num_dev: + test_wav_files.append(wav_path) + else: + dev_wav_files.append(wav_path) + else: + print("dataset should in {opencpop} now!") + + train_wav_files = [ + os.path.basename(str(str_path)) for str_path in train_wav_files + ] + dev_wav_files = [ + os.path.basename(str(str_path)) for str_path in dev_wav_files + ] + test_wav_files = [ + os.path.basename(str(str_path)) for str_path in test_wav_files + ] + + for i, utt_id in enumerate(tqdm(sentences)): + phones = sentences[utt_id][0] + durations = sentences[utt_id][1] + note = sentences[utt_id][2] + note_dur = sentences[utt_id][3] + is_slur = sentences[utt_id][4] + speaker = sentences[utt_id][-1] + + phone_ids = [phone_dict[phn] for phn in phones] + phone_ids = paddle.to_tensor(np.array(phone_ids)) + + if args.speaker_dict: + speaker_id = int( + [item[1] for item in spk_id_list if speaker == item[0]][0]) + speaker_id = paddle.to_tensor(speaker_id) + else: + speaker_id = None + + durations = paddle.to_tensor(np.array(durations)) + note = paddle.to_tensor(np.array(note)) + note_dur = paddle.to_tensor(np.array(note_dur)) + is_slur = paddle.to_tensor(np.array(is_slur)) + # 生成的和真实的可能有 1, 2 帧的差距,但是 batch_fn 会修复 + # split data into 3 sections + + wav_path = utt_id + ".wav" + + if wav_path in train_wav_files: + sub_output_dir = output_dir / ("train/raw") + elif wav_path in dev_wav_files: + sub_output_dir = output_dir / ("dev/raw") + elif wav_path in test_wav_files: + sub_output_dir = output_dir / ("test/raw") + + sub_output_dir.mkdir(parents=True, exist_ok=True) + + with paddle.no_grad(): + mel = diffsinger_inference( + text=phone_ids, + note=note, + note_dur=note_dur, + is_slur=is_slur, + get_mel_fs2=False) + np.save(sub_output_dir / (utt_id + "_feats.npy"), mel) + + +def main(): + # parse args and config and redirect to train_sp + parser = argparse.ArgumentParser( + description="Generate mel with diffsinger.") + parser.add_argument( + "--dataset", + default="opencpop", + type=str, + help="name of dataset, should in {opencpop} now") + parser.add_argument( + "--rootdir", default=None, type=str, help="directory to dataset.") + parser.add_argument( + "--diffsinger-config", type=str, help="diffsinger config file.") + parser.add_argument( + "--diffsinger-checkpoint", + type=str, + help="diffsinger checkpoint to load.") + parser.add_argument( + "--diffsinger-stat", + type=str, + help="mean and standard deviation used to normalize spectrogram when training diffsinger." + ) + parser.add_argument( + "--diffsinger-stretch", + type=str, + help="min and max mel used to stretch before training diffusion.") + + parser.add_argument( + "--phones-dict", + type=str, + default="phone_id_map.txt", + help="phone vocabulary file.") + + parser.add_argument( + "--speaker-dict", type=str, default=None, help="speaker id map file.") + + parser.add_argument( + "--dur-file", default=None, type=str, help="path to durations.txt.") + parser.add_argument("--output-dir", type=str, help="output dir.") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") + + args = parser.parse_args() + + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") + + with open(args.diffsinger_config) as f: + diffsinger_config = CfgNode(yaml.safe_load(f)) + + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(diffsinger_config) + + evaluate(args, diffsinger_config) + + +if __name__ == "__main__": + main() diff --git a/paddlespeech/t2s/exps/dygraph_to_static.py b/paddlespeech/t2s/exps/dygraph_to_static.py new file mode 100644 index 000000000..5e15ca4ca --- /dev/null +++ b/paddlespeech/t2s/exps/dygraph_to_static.py @@ -0,0 +1,170 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse + +import yaml +from yacs.config import CfgNode + +from paddlespeech.t2s.exps.syn_utils import am_to_static +from paddlespeech.t2s.exps.syn_utils import get_am_inference +from paddlespeech.t2s.exps.syn_utils import get_voc_inference +from paddlespeech.t2s.exps.syn_utils import voc_to_static + + +def am_dygraph_to_static(args): + with open(args.am_config) as f: + am_config = CfgNode(yaml.safe_load(f)) + am_inference = get_am_inference( + am=args.am, + am_config=am_config, + am_ckpt=args.am_ckpt, + am_stat=args.am_stat, + phones_dict=args.phones_dict, + tones_dict=args.tones_dict, + speaker_dict=args.speaker_dict) + print("acoustic model done!") + + # dygraph to static + am_inference = am_to_static( + am_inference=am_inference, + am=args.am, + inference_dir=args.inference_dir, + speaker_dict=args.speaker_dict) + print("finish to convert dygraph acoustic model to static!") + + +def voc_dygraph_to_static(args): + with open(args.voc_config) as f: + voc_config = CfgNode(yaml.safe_load(f)) + voc_inference = get_voc_inference( + voc=args.voc, + voc_config=voc_config, + voc_ckpt=args.voc_ckpt, + voc_stat=args.voc_stat) + print("voc done!") + + # dygraph to static + voc_inference = voc_to_static( + voc_inference=voc_inference, + voc=args.voc, + inference_dir=args.inference_dir) + print("finish to convert dygraph vocoder model to static!") + + +def parse_args(): + # parse args and config + parser = argparse.ArgumentParser( + description="Synthesize with acoustic model & vocoder") + parser.add_argument( + '--type', + type=str, + required=True, + choices=["am", "voc"], + help='Choose the model type of dynamic to static, am or voc') + # acoustic model + parser.add_argument( + '--am', + type=str, + default='fastspeech2_csmsc', + choices=[ + 'speedyspeech_csmsc', + 'speedyspeech_aishell3', + 'fastspeech2_csmsc', + 'fastspeech2_ljspeech', + 'fastspeech2_aishell3', + 'fastspeech2_vctk', + 'tacotron2_csmsc', + 'tacotron2_ljspeech', + 'fastspeech2_mix', + 'fastspeech2_canton', + 'fastspeech2_male-zh', + 'fastspeech2_male-en', + 'fastspeech2_male-mix', + ], + help='Choose acoustic model type of tts task.') + parser.add_argument( + '--am_config', type=str, default=None, help='Config of acoustic model.') + parser.add_argument( + '--am_ckpt', + type=str, + default=None, + help='Checkpoint file of acoustic model.') + parser.add_argument( + "--am_stat", + type=str, + default=None, + help="mean and standard deviation used to normalize spectrogram when training acoustic model." + ) + parser.add_argument( + "--phones_dict", type=str, default=None, help="phone vocabulary file.") + parser.add_argument( + "--tones_dict", type=str, default=None, help="tone vocabulary file.") + parser.add_argument( + "--speaker_dict", type=str, default=None, help="speaker id map file.") + # vocoder + parser.add_argument( + '--voc', + type=str, + default='pwgan_csmsc', + choices=[ + 'pwgan_csmsc', + 'pwgan_ljspeech', + 'pwgan_aishell3', + 'pwgan_vctk', + 'mb_melgan_csmsc', + 'style_melgan_csmsc', + 'hifigan_csmsc', + 'hifigan_ljspeech', + 'hifigan_aishell3', + 'hifigan_vctk', + 'wavernn_csmsc', + 'pwgan_male', + 'hifigan_male', + 'pwgan_opencpop', + 'hifigan_opencpop', + ], + help='Choose vocoder type of tts task.') + parser.add_argument( + '--voc_config', type=str, default=None, help='Config of voc.') + parser.add_argument( + '--voc_ckpt', type=str, default=None, help='Checkpoint file of voc.') + parser.add_argument( + "--voc_stat", + type=str, + default=None, + help="mean and standard deviation used to normalize spectrogram when training voc." + ) + # other + parser.add_argument( + "--inference_dir", + type=str, + default=None, + help="dir to save inference models") + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + + if args.type == "am": + am_dygraph_to_static(args) + elif args.type == "voc": + voc_dygraph_to_static(args) + else: + print("type should be in ['am', 'voc'] !") + + +if __name__ == "__main__": + main() diff --git a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py index 05c657682..a2629a900 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py +++ b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py @@ -29,6 +29,7 @@ from yacs.config import CfgNode from paddlespeech.t2s.datasets.get_feats import LogMelFBank from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur +from paddlespeech.t2s.datasets.preprocess_utils import get_sentences_svs from paddlespeech.t2s.datasets.preprocess_utils import merge_silence from paddlespeech.t2s.utils import str2bool @@ -192,8 +193,15 @@ def main(): with open(args.config, 'rt') as f: config = CfgNode(yaml.safe_load(f)) - sentences, speaker_set = get_phn_dur(dur_file) - merge_silence(sentences) + if args.dataset == "opencpop": + sentences, speaker_set = get_sentences_svs( + dur_file, + dataset=args.dataset, + sample_rate=config.fs, + n_shift=config.n_shift, ) + else: + sentences, speaker_set = get_phn_dur(dur_file) + merge_silence(sentences) # split data into 3 sections if args.dataset == "baker": @@ -240,6 +248,33 @@ def main(): test_wav_files += wav_files[-sub_num_dev:] else: train_wav_files += wav_files + elif args.dataset == "opencpop": + wavdir = rootdir / "wavs" + # split data into 3 sections + train_file = rootdir / "train.txt" + train_wav_files = [] + with open(train_file, "r") as f_train: + for line in f_train.readlines(): + utt = line.split("|")[0] + wav_name = utt + ".wav" + wav_path = wavdir / wav_name + train_wav_files.append(wav_path) + + test_file = rootdir / "test.txt" + dev_wav_files = [] + test_wav_files = [] + num_dev = 106 + count = 0 + with open(test_file, "r") as f_test: + for line in f_test.readlines(): + count += 1 + utt = line.split("|")[0] + wav_name = utt + ".wav" + wav_path = wavdir / wav_name + if count > num_dev: + test_wav_files.append(wav_path) + else: + dev_wav_files.append(wav_path) else: print("dataset should in {baker, ljspeech, vctk, aishell3} now!") diff --git a/paddlespeech/t2s/models/vits/duration_predictor.py b/paddlespeech/t2s/models/vits/duration_predictor.py index b0bb68d0f..12177fbc2 100644 --- a/paddlespeech/t2s/models/vits/duration_predictor.py +++ b/paddlespeech/t2s/models/vits/duration_predictor.py @@ -155,12 +155,10 @@ class StochasticDurationPredictor(nn.Layer): z_u, z1 = paddle.split(z_q, [1, 1], 1) u = F.sigmoid(z_u) * x_mask z0 = (w - u) * x_mask - logdet_tot_q += paddle.sum( - (F.log_sigmoid(z_u) + F.log_sigmoid(-z_u)) * x_mask, [1, 2]) - logq = (paddle.sum(-0.5 * - (math.log(2 * math.pi) + - (e_q**2)) * x_mask, [1, 2]) - logdet_tot_q) - + tmp1 = (F.log_sigmoid(z_u) + F.log_sigmoid(-z_u)) * x_mask + logdet_tot_q += paddle.sum(tmp1, [1, 2]) + tmp2 = -0.5 * (math.log(2 * math.pi) + (e_q**2)) * x_mask + logq = (paddle.sum(tmp2, [1, 2]) - logdet_tot_q) logdet_tot = 0 z0, logdet = self.log_flow(z0, x_mask) logdet_tot += logdet @@ -168,8 +166,8 @@ class StochasticDurationPredictor(nn.Layer): for flow in self.flows: z, logdet = flow(z, x_mask, g=x, inverse=inverse) logdet_tot = logdet_tot + logdet - nll = (paddle.sum(0.5 * (math.log(2 * math.pi) + - (z**2)) * x_mask, [1, 2]) - logdet_tot) + tmp3 = 0.5 * (math.log(2 * math.pi) + (z**2)) * x_mask + nll = (paddle.sum(tmp3, [1, 2]) - logdet_tot) # (B,) return nll + logq else: diff --git a/paddlespeech/t2s/models/vits/flow.py b/paddlespeech/t2s/models/vits/flow.py index 7593eb727..94df968a0 100644 --- a/paddlespeech/t2s/models/vits/flow.py +++ b/paddlespeech/t2s/models/vits/flow.py @@ -334,11 +334,12 @@ class ConvFlow(nn.Layer): unnorm_widths = h[..., :self.bins] / denom unnorm_heights = h[..., self.bins:2 * self.bins] / denom unnorm_derivatives = h[..., 2 * self.bins:] + xb, logdet_abs = piecewise_rational_quadratic_transform( - xb, - unnorm_widths, - unnorm_heights, - unnorm_derivatives, + inputs=xb, + unnormalized_widths=unnorm_widths, + unnormalized_heights=unnorm_heights, + unnormalized_derivatives=unnorm_derivatives, inverse=inverse, tails="linear", tail_bound=self.tail_bound, ) diff --git a/paddlespeech/t2s/models/vits/generator.py b/paddlespeech/t2s/models/vits/generator.py index fbd2d6653..44bd78984 100644 --- a/paddlespeech/t2s/models/vits/generator.py +++ b/paddlespeech/t2s/models/vits/generator.py @@ -371,8 +371,9 @@ class VITSGenerator(nn.Layer): # (B, H, T_text) s_p_sq_r = paddle.exp(-2 * logs_p) # (B, 1, T_text) + tmp1 = -0.5 * math.log(2 * math.pi) - logs_p neg_x_ent_1 = paddle.sum( - -0.5 * math.log(2 * math.pi) - logs_p, + tmp1, [1], keepdim=True, ) # (B, T_feats, H) x (B, H, T_text) = (B, T_feats, T_text) @@ -384,8 +385,9 @@ class VITSGenerator(nn.Layer): z_p.transpose([0, 2, 1]), (m_p * s_p_sq_r), ) # (B, 1, T_text) + tmp2 = -0.5 * (m_p**2) * s_p_sq_r neg_x_ent_4 = paddle.sum( - -0.5 * (m_p**2) * s_p_sq_r, + tmp2, [1], keepdim=True, ) # (B, T_feats, T_text) @@ -403,7 +405,6 @@ class VITSGenerator(nn.Layer): w = attn.sum(2) dur_nll = self.duration_predictor(x, x_mask, w=w, g=g) dur_nll = dur_nll / paddle.sum(x_mask) - # expand the length to match with the feature sequence # (B, T_feats, T_text) x (B, T_text, H) -> (B, H, T_feats) m_p = paddle.matmul(attn.squeeze(1), @@ -511,8 +512,9 @@ class VITSGenerator(nn.Layer): # (B, H, T_text) s_p_sq_r = paddle.exp(-2 * logs_p) # (B, 1, T_text) + tmp3 = -0.5 * math.log(2 * math.pi) - logs_p neg_x_ent_1 = paddle.sum( - -0.5 * math.log(2 * math.pi) - logs_p, + tmp3, [1], keepdim=True, ) # (B, T_feats, H) x (B, H, T_text) = (B, T_feats, T_text) @@ -524,8 +526,9 @@ class VITSGenerator(nn.Layer): z_p.transpose([0, 2, 1]), (m_p * s_p_sq_r), ) # (B, 1, T_text) + tmp4 = -0.5 * (m_p**2) * s_p_sq_r neg_x_ent_4 = paddle.sum( - -0.5 * (m_p**2) * s_p_sq_r, + tmp4, [1], keepdim=True, ) # (B, T_feats, T_text) diff --git a/paddlespeech/t2s/models/vits/transform.py b/paddlespeech/t2s/models/vits/transform.py index 61bd5ee2b..917f28430 100644 --- a/paddlespeech/t2s/models/vits/transform.py +++ b/paddlespeech/t2s/models/vits/transform.py @@ -61,8 +61,12 @@ def piecewise_rational_quadratic_transform( def mask_preprocess(x, mask): + # bins.dtype = int32 B, C, T, bins = paddle.shape(x) - new_x = paddle.zeros([mask.sum(), bins]) + mask_int = paddle.cast(mask, dtype='int64') + # paddle.sum 输入是 int32 或 bool 的时候,输出是 int64 + # paddle.zeros (fill_constant) 的 shape 会被强制转成 int32 类型 + new_x = paddle.zeros([paddle.sum(mask_int), bins]) for i in range(bins): new_x[:, i] = x[:, :, :, i][mask] return new_x @@ -240,4 +244,7 @@ def rational_quadratic_spline( def _searchsorted(bin_locations, inputs, eps=1e-6): bin_locations[..., -1] += eps - return paddle.sum(inputs[..., None] >= bin_locations, axis=-1) - 1 + mask = inputs[..., None] >= bin_locations + mask_int = paddle.cast(mask, dtype='int64') + out = paddle.sum(mask_int, axis=-1) - 1 + return out diff --git a/paddlespeech/t2s/modules/nets_utils.py b/paddlespeech/t2s/modules/nets_utils.py index 798e4dee8..99130acca 100644 --- a/paddlespeech/t2s/modules/nets_utils.py +++ b/paddlespeech/t2s/modules/nets_utils.py @@ -145,18 +145,18 @@ def make_pad_mask(lengths, xs=None, length_dim=-1): bs = paddle.shape(lengths)[0] if xs is None: - maxlen = lengths.max() + maxlen = paddle.cast(lengths.max(), dtype=bs.dtype) else: maxlen = paddle.shape(xs)[length_dim] seq_range = paddle.arange(0, maxlen, dtype=paddle.int64) + # VITS 最后一个 expand 的位置 seq_range_expand = seq_range.unsqueeze(0).expand([bs, maxlen]) seq_length_expand = lengths.unsqueeze(-1) mask = seq_range_expand >= seq_length_expand.cast(seq_range_expand.dtype) if xs is not None: assert paddle.shape(xs)[0] == bs, (paddle.shape(xs)[0], bs) - if length_dim < 0: length_dim = len(paddle.shape(xs)) + length_dim # ind = (:, None, ..., None, :, , None, ..., None)