From 35c37ace17c0f0d09c1c53fd25a82c8458d3e1e1 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Thu, 4 Nov 2021 11:31:38 +0000 Subject: [PATCH] change nprocs to ngpu, add aishell3/voc1 --- examples/aishell3/tts3/README.md | 27 +- examples/aishell3/tts3/local/synthesize.sh | 1 - .../aishell3/tts3/local/synthesize_e2e.sh | 1 - examples/aishell3/tts3/local/train.sh | 2 +- examples/aishell3/vc0/README.md | 2 +- examples/aishell3/vc0/local/preprocess.sh | 1 - examples/aishell3/vc0/local/train.sh | 2 +- examples/aishell3/voc1/conf/default.yaml | 115 ++++++ examples/aishell3/voc1/local/preprocess.sh | 55 +++ examples/aishell3/voc1/local/synthesize.sh | 13 + examples/aishell3/voc1/local/train.sh | 13 + examples/aishell3/voc1/path.sh | 13 + examples/aishell3/voc1/run.sh | 32 ++ examples/csmsc/tts2/README.md | 31 +- examples/csmsc/tts2/local/synthesize.sh | 3 +- examples/csmsc/tts2/local/synthesize_e2e.sh | 1 - examples/csmsc/tts2/local/train.sh | 2 +- examples/csmsc/tts3/README.md | 26 +- examples/csmsc/tts3/local/synthesize.sh | 1 - examples/csmsc/tts3/local/synthesize_e2e.sh | 1 - examples/csmsc/tts3/local/train.sh | 2 +- examples/csmsc/voc1/README.md | 17 +- examples/csmsc/voc1/local/train.sh | 2 +- examples/csmsc/voc3/README.md | 32 +- examples/csmsc/voc3/local/train.sh | 2 +- examples/ljspeech/tts0/README.md | 18 +- examples/ljspeech/tts0/local/synthesize.sh | 2 +- examples/ljspeech/tts0/local/train.sh | 2 +- examples/ljspeech/tts1/README.md | 22 +- examples/ljspeech/tts1/local/synthesize.sh | 1 - .../ljspeech/tts1/local/synthesize_e2e.sh | 1 - examples/ljspeech/tts1/local/train.sh | 2 +- examples/ljspeech/tts3/README.md | 40 +-- examples/ljspeech/tts3/local/synthesize.sh | 1 - .../ljspeech/tts3/local/synthesize_e2e.sh | 1 - examples/ljspeech/tts3/local/train.sh | 2 +- examples/ljspeech/voc0/README.md | 7 +- examples/ljspeech/voc0/local/synthesize.sh | 2 +- examples/ljspeech/voc0/local/train.sh | 3 +- examples/ljspeech/voc1/README.md | 21 +- examples/ljspeech/voc1/local/train.sh | 2 +- examples/other/g2p/README.md | 20 ++ .../{text_frontend => g2p}/get_g2p_data.py | 0 examples/other/g2p/run.sh | 16 + .../other/{text_frontend => g2p}/test_g2p.py | 0 examples/other/ge2e/README.md | 5 +- examples/other/ge2e/local/inference.sh | 2 +- examples/other/ge2e/local/train.sh | 3 +- examples/other/text_frontend/README.md | 38 -- examples/other/text_frontend/make_sclite.sh | 13 - examples/other/text_frontend/run.sh | 25 -- examples/other/tn/README.md | 17 + .../data/textnorm_test_cases.txt | 0 .../get_textnorm_data.py | 0 examples/other/tn/run.sh | 17 + .../{text_frontend => tn}/test_textnorm.py | 0 examples/vctk/tts3/README.md | 26 +- examples/vctk/tts3/local/synthesize.sh | 1 - examples/vctk/tts3/local/synthesize_e2e.sh | 1 - examples/vctk/tts3/local/train.sh | 2 +- examples/vctk/voc1/README.md | 18 +- examples/vctk/voc1/local/train.sh | 2 +- .../fastspeech2/multi_spk_synthesize_e2e.py | 9 +- .../multi_spk_synthesize_e2e_en.py | 9 +- .../t2s/exps/fastspeech2/synthesize.py | 10 +- .../t2s/exps/fastspeech2/synthesize_e2e.py | 9 +- .../t2s/exps/fastspeech2/synthesize_e2e_en.py | 9 +- .../exps/fastspeech2/synthesize_e2e_melgan.py | 9 +- paddlespeech/t2s/exps/fastspeech2/train.py | 12 +- .../multi_band_melgan/synthesize.py | 9 +- .../gan_vocoder/multi_band_melgan/train.py | 12 +- .../parallelwave_gan/synthesize.py | 9 +- .../parallelwave_gan/synthesize_from_wav.py | 9 +- .../gan_vocoder/parallelwave_gan/train.py | 12 +- .../t2s/exps/gan_vocoder/preprocess.py | 16 +- paddlespeech/t2s/exps/ge2e/inference.py | 18 +- 
paddlespeech/t2s/exps/ge2e/train.py | 4 +- .../t2s/exps/speedyspeech/inference.py | 8 +- .../t2s/exps/speedyspeech/synthesize.py | 9 +- .../t2s/exps/speedyspeech/synthesize_e2e.py | 11 +- paddlespeech/t2s/exps/speedyspeech/train.py | 13 +- paddlespeech/t2s/exps/tacotron2/ljspeech.py | 9 +- paddlespeech/t2s/exps/tacotron2/synthesize.py | 9 +- paddlespeech/t2s/exps/tacotron2/train.py | 4 +- .../t2s/exps/transformer_tts/synthesize.py | 9 +- .../exps/transformer_tts/synthesize_e2e.py | 9 +- .../t2s/exps/transformer_tts/train.py | 12 +- .../voice_cloning/tacotron2_ge2e/train.py | 4 +- .../tacotron2_ge2e/voice_cloning.py | 10 +- paddlespeech/t2s/exps/waveflow/synthesize.py | 10 +- paddlespeech/t2s/exps/waveflow/train.py | 4 +- paddlespeech/t2s/frontend/pinyin.py | 333 ------------------ paddlespeech/t2s/training/cli.py | 3 +- paddlespeech/t2s/training/experiment.py | 9 +- tests/benchmark/pwgan/run_benchmark.sh | 4 +- tests/chains/speedyspeech/test.sh | 2 +- 96 files changed, 643 insertions(+), 715 deletions(-) create mode 100644 examples/aishell3/voc1/conf/default.yaml create mode 100755 examples/aishell3/voc1/local/preprocess.sh create mode 100755 examples/aishell3/voc1/local/synthesize.sh create mode 100755 examples/aishell3/voc1/local/train.sh create mode 100755 examples/aishell3/voc1/path.sh create mode 100755 examples/aishell3/voc1/run.sh create mode 100644 examples/other/g2p/README.md rename examples/other/{text_frontend => g2p}/get_g2p_data.py (100%) create mode 100755 examples/other/g2p/run.sh rename examples/other/{text_frontend => g2p}/test_g2p.py (100%) delete mode 100644 examples/other/text_frontend/README.md delete mode 100755 examples/other/text_frontend/make_sclite.sh delete mode 100755 examples/other/text_frontend/run.sh create mode 100644 examples/other/tn/README.md rename examples/other/{text_frontend => tn}/data/textnorm_test_cases.txt (100%) rename examples/other/{text_frontend => tn}/get_textnorm_data.py (100%) create mode 100755 examples/other/tn/run.sh rename examples/other/{text_frontend => tn}/test_textnorm.py (100%) delete mode 100644 paddlespeech/t2s/frontend/pinyin.py diff --git a/examples/aishell3/tts3/README.md b/examples/aishell3/tts3/README.md index c313d922..9f01ff45 100644 --- a/examples/aishell3/tts3/README.md +++ b/examples/aishell3/tts3/README.md @@ -67,8 +67,8 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE] - [--phones-dict PHONES_DICT] [--speaker-dict SPEAKER_DICT] + [--ngpu NGPU] [--verbose VERBOSE] [--phones-dict PHONES_DICT] + [--speaker-dict SPEAKER_DICT] Train a FastSpeech2 model. @@ -81,8 +81,7 @@ optional arguments: dev data. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. - --nprocs NPROCS number of processes. + --ngpu NGPU if ngpu=0, use cpu. --verbose VERBOSE verbose. --phones-dict PHONES_DICT phone vocabulary file. @@ -92,10 +91,9 @@ optional arguments: 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. 3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are save in `checkpoints/` inside this directory. -4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported. 
-5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'. -6. `--phones-dict` is the path of the phone vocabulary file. -7. `--speaker-dict`is the path of the speaker id map file when training a multi-speaker FastSpeech2. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. +5. `--phones-dict` is the path of the phone vocabulary file. +6. `--speaker-dict`is the path of the speaker id map file when training a multi-speaker FastSpeech2. ### Synthesize We use [parallel wavegan](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder. @@ -122,7 +120,7 @@ usage: synthesize.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG] [--pwg-checkpoint PWG_CHECKPOINT] [--pwg-stat PWG_STAT] [--phones-dict PHONES_DICT] [--speaker-dict SPEAKER_DICT] [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--verbose VERBOSE] + [--ngpu NGPU] [--verbose VERBOSE] Synthesize with fastspeech2 & parallel wavegan. @@ -149,8 +147,8 @@ optional arguments: test metadata. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. - --verbose VERBOSE verbose. + --ngpu NGPU if ngpu == 0, use cpu. + --verbose VERBOSE verbose ``` `./local/synthesize_e2e.sh` calls `${BIN_DIR}/multi_spk_synthesize_e2e.py`, which can synthesize waveform from text file. ```bash @@ -166,7 +164,7 @@ usage: multi_spk_synthesize_e2e.py [-h] [--pwg-stat PWG_STAT] [--phones-dict PHONES_DICT] [--speaker-dict SPEAKER_DICT] [--text TEXT] - [--output-dir OUTPUT_DIR] [--device DEVICE] + [--output-dir OUTPUT_DIR] [--ngpu NGPU] [--verbose VERBOSE] Synthesize with fastspeech2 & parallel wavegan. @@ -193,7 +191,7 @@ optional arguments: --text TEXT text to synthesize, a 'utt_id sentence' pair per line. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. ``` 1. `--fastspeech2-config`, `--fastspeech2-checkpoint`, `--fastspeech2-stat`, `--phones-dict` and `--speaker-dict` are arguments for fastspeech2, which correspond to the 5 files in the fastspeech2 pretrained model. @@ -201,7 +199,7 @@ optional arguments: 3. `--test-metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 4. `--text` is the text file, which contains sentences to synthesize. 5. `--output-dir` is the directory to save synthesized audio files. -6. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis. +6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model Pretrained FastSpeech2 model with no silence in the edge of audios. 
[fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip) @@ -231,7 +229,6 @@ python3 ${BIN_DIR}/multi_spk_synthesize_e2e.py \ --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ --text=${BIN_DIR}/../sentences.txt \ --output-dir=exp/default/test_e2e \ - --device="gpu" \ --phones-dict=fastspeech2_nosil_aishell3_ckpt_0.4/phone_id_map.txt \ --speaker-dict=fastspeech2_nosil_aishell3_ckpt_0.4/speaker_id_map.txt diff --git a/examples/aishell3/tts3/local/synthesize.sh b/examples/aishell3/tts3/local/synthesize.sh index 64361983..e9b893f8 100755 --- a/examples/aishell3/tts3/local/synthesize.sh +++ b/examples/aishell3/tts3/local/synthesize.sh @@ -15,6 +15,5 @@ python3 ${BIN_DIR}/synthesize.py \ --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ --test-metadata=dump/test/norm/metadata.jsonl \ --output-dir=${train_output_path}/test \ - --device="gpu" \ --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt diff --git a/examples/aishell3/tts3/local/synthesize_e2e.sh b/examples/aishell3/tts3/local/synthesize_e2e.sh index 8a979844..e1d84d21 100755 --- a/examples/aishell3/tts3/local/synthesize_e2e.sh +++ b/examples/aishell3/tts3/local/synthesize_e2e.sh @@ -15,6 +15,5 @@ python3 ${BIN_DIR}/multi_spk_synthesize_e2e.py \ --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ --text=${BIN_DIR}/../sentences.txt \ --output-dir=${train_output_path}/test_e2e \ - --device="gpu" \ --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt diff --git a/examples/aishell3/tts3/local/train.sh b/examples/aishell3/tts3/local/train.sh index be6051c9..1da72f11 100755 --- a/examples/aishell3/tts3/local/train.sh +++ b/examples/aishell3/tts3/local/train.sh @@ -8,6 +8,6 @@ python3 ${BIN_DIR}/train.py \ --dev-metadata=dump/dev/norm/metadata.jsonl \ --config=${config_path} \ --output-dir=${train_output_path} \ - --nprocs=2 \ + --ngpu=2 \ --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt diff --git a/examples/aishell3/vc0/README.md b/examples/aishell3/vc0/README.md index 9a269ed5..c146198c 100644 --- a/examples/aishell3/vc0/README.md +++ b/examples/aishell3/vc0/README.md @@ -28,7 +28,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then python3 ${BIN_DIR}/../ge2e/inference.py \ --input=${input} \ --output=${preprocess_path}/embed \ - --device="gpu" \ + --ngpu=1 \ --checkpoint_path=${ge2e_ckpt_path} fi ``` diff --git a/examples/aishell3/vc0/local/preprocess.sh b/examples/aishell3/vc0/local/preprocess.sh index 87cfab32..e14dda53 100755 --- a/examples/aishell3/vc0/local/preprocess.sh +++ b/examples/aishell3/vc0/local/preprocess.sh @@ -12,7 +12,6 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then python3 ${BIN_DIR}/../../ge2e/inference.py \ --input=${input} \ --output=${preprocess_path}/embed \ - --device="gpu" \ --checkpoint_path=${ge2e_ckpt_path} fi diff --git a/examples/aishell3/vc0/local/train.sh b/examples/aishell3/vc0/local/train.sh index eb968b5f..f062cbbf 100755 --- a/examples/aishell3/vc0/local/train.sh +++ b/examples/aishell3/vc0/local/train.sh @@ -6,4 +6,4 @@ train_output_path=$2 python3 ${BIN_DIR}/train.py \ --data=${preprocess_path} \ --output=${train_output_path} \ - --device="gpu" \ No newline at end of file + --ngpu=1 \ No newline at end of file diff --git a/examples/aishell3/voc1/conf/default.yaml b/examples/aishell3/voc1/conf/default.yaml new file mode 100644 index 00000000..ba2d9f2e --- /dev/null +++ b/examples/aishell3/voc1/conf/default.yaml @@ -0,0 +1,115 @@ +# This is the 
hyperparameter configuration file for Parallel WaveGAN.
+# Please make sure this is adjusted for the AISHELL-3 corpus. If you want to
+# apply it to another dataset, you might need to carefully change some parameters.
+# This configuration requires 12 GB GPU memory and takes ~3 days on RTX TITAN.
+
+###########################################################
+#                FEATURE EXTRACTION SETTING               #
+###########################################################
+fs: 24000                # Sampling rate.
+n_fft: 2048              # FFT size. (in samples)
+n_shift: 300             # Hop size. (in samples)
+win_length: 1200         # Window length. (in samples)
+                         # If set to null, it will be the same as fft_size.
+window: "hann"           # Window function.
+n_mels: 80               # Number of mel basis.
+fmin: 80                 # Minimum freq in mel basis calculation. (Hz)
+fmax: 7600               # Maximum frequency in mel basis calculation. (Hz)
+
+###########################################################
+#         GENERATOR NETWORK ARCHITECTURE SETTING          #
+###########################################################
+generator_params:
+    in_channels: 1        # Number of input channels.
+    out_channels: 1       # Number of output channels.
+    kernel_size: 3        # Kernel size of dilated convolution.
+    layers: 30            # Number of residual block layers.
+    stacks: 3             # Number of stacks i.e., dilation cycles.
+    residual_channels: 64 # Number of channels in residual conv.
+    gate_channels: 128    # Number of channels in gated conv.
+    skip_channels: 64     # Number of channels in skip conv.
+    aux_channels: 80      # Number of channels for auxiliary feature conv.
+                          # Must be the same as num_mels.
+    aux_context_window: 2 # Context window size for auxiliary feature.
+                          # If set to 2, previous 2 and future 2 frames will be considered.
+    dropout: 0.0          # Dropout rate. 0.0 means no dropout applied.
+    use_weight_norm: true # Whether to use weight norm.
+                          # If set to true, it will be applied to all of the conv layers.
+    upsample_scales: [4, 5, 3, 5]     # Upsampling scales. Product of these must be the same as hop size.
+
+###########################################################
+#       DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
+###########################################################
+discriminator_params:
+    in_channels: 1        # Number of input channels.
+    out_channels: 1       # Number of output channels.
+    kernel_size: 3        # Kernel size of conv layers.
+    layers: 10            # Number of conv layers.
+    conv_channels: 64     # Number of conv channels.
+    bias: true            # Whether to use bias parameter in conv.
+    use_weight_norm: true # Whether to use weight norm.
+                          # If set to true, it will be applied to all of the conv layers.
+    nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv.
+    nonlinear_activation_params:      # Nonlinear function parameters.
+        negative_slope: 0.2           # Alpha in LeakyReLU.
+
+###########################################################
+#                   STFT LOSS SETTING                     #
+###########################################################
+stft_loss_params:
+    fft_sizes: [1024, 2048, 512]  # List of FFT sizes for STFT-based loss.
+    hop_sizes: [120, 240, 50]     # List of hop sizes for STFT-based loss.
+    win_lengths: [600, 1200, 240] # List of window lengths for STFT-based loss.
+    window: "hann"                # Window function for STFT-based loss.
+
+###########################################################
+#               ADVERSARIAL LOSS SETTING                  #
+###########################################################
+lambda_adv: 4.0  # Loss balancing coefficient.
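+                 # (Assumed from the standard Parallel WaveGAN objective, not verified
+                 #  against this repo's exact trainer code: the generator loss is roughly
+                 #  stft_loss + lambda_adv * adversarial_loss, with the adversarial term
+                 #  enabled only after discriminator_train_start_steps.)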
+ +########################################################### +# DATA LOADER SETTING # +########################################################### +batch_size: 8 # Batch size. +batch_max_steps: 24000 # Length of each audio in batch. Make sure dividable by hop_size. +pin_memory: true # Whether to pin memory in Pytorch DataLoader. +num_workers: 4 # Number of workers in Pytorch DataLoader. +remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps. +allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory. + +########################################################### +# OPTIMIZER & SCHEDULER SETTING # +########################################################### +generator_optimizer_params: + epsilon: 1.0e-6 # Generator's epsilon. + weight_decay: 0.0 # Generator's weight decay coefficient. +generator_scheduler_params: + learning_rate: 0.0001 # Generator's learning rate. + step_size: 200000 # Generator's scheduler step size. + gamma: 0.5 # Generator's scheduler gamma. + # At each step size, lr will be multiplied by this parameter. +generator_grad_norm: 10 # Generator's gradient norm. +discriminator_optimizer_params: + epsilon: 1.0e-6 # Discriminator's epsilon. + weight_decay: 0.0 # Discriminator's weight decay coefficient. +discriminator_scheduler_params: + learning_rate: 0.00005 # Discriminator's learning rate. + step_size: 200000 # Discriminator's scheduler step size. + gamma: 0.5 # Discriminator's scheduler gamma. + # At each step size, lr will be multiplied by this parameter. +discriminator_grad_norm: 1 # Discriminator's gradient norm. + +########################################################### +# INTERVAL SETTING # +########################################################### +discriminator_train_start_steps: 100000 # Number of steps to start to train discriminator. +train_max_steps: 1000000 # Number of training steps. +save_interval_steps: 5000 # Interval steps to save checkpoint. +eval_interval_steps: 1000 # Interval steps to evaluate the network. + +########################################################### +# OTHER SETTING # +########################################################### +num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. +num_snapshots: 10 # max number of snapshots to keep while training +seed: 42 # random seed for paddle, random, and np.random diff --git a/examples/aishell3/voc1/local/preprocess.sh b/examples/aishell3/voc1/local/preprocess.sh new file mode 100755 index 00000000..44cc3dbe --- /dev/null +++ b/examples/aishell3/voc1/local/preprocess.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +stage=0 +stop_stage=100 + +config_path=$1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # get durations from MFA's result + echo "Generate durations.txt from MFA results ..." + python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ + --inputdir=./aishell3_alignment_tone \ + --output=durations.txt \ + --config=${config_path} +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # extract features + echo "Extract features ..." + python3 ${BIN_DIR}/../preprocess.py \ + --rootdir=~/datasets/data_aishell3/ \ + --dataset=aishell3 \ + --dumpdir=dump \ + --dur-file=durations.txt \ + --config=${config_path} \ + --cut-sil=True \ + --num-cpu=20 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # get features' stats(mean and std) + echo "Get features' stats ..." 
+ python3 ${MAIN_ROOT}/utils/compute_statistics.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --field-name="feats" +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # normalize, dev and test should use train's stats + echo "Normalize ..." + + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --dumpdir=dump/train/norm \ + --stats=dump/train/feats_stats.npy + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump/dev/raw/metadata.jsonl \ + --dumpdir=dump/dev/norm \ + --stats=dump/train/feats_stats.npy + + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump/test/raw/metadata.jsonl \ + --dumpdir=dump/test/norm \ + --stats=dump/train/feats_stats.npy +fi diff --git a/examples/aishell3/voc1/local/synthesize.sh b/examples/aishell3/voc1/local/synthesize.sh new file mode 100755 index 00000000..9f904ac0 --- /dev/null +++ b/examples/aishell3/voc1/local/synthesize.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/synthesize.py \ + --config=${config_path} \ + --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ + --test-metadata=dump/test/norm/metadata.jsonl \ + --output-dir=${train_output_path}/test diff --git a/examples/aishell3/voc1/local/train.sh b/examples/aishell3/voc1/local/train.sh new file mode 100755 index 00000000..9695631e --- /dev/null +++ b/examples/aishell3/voc1/local/train.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 + +FLAGS_cudnn_exhaustive_search=true \ +FLAGS_conv_workspace_size_limit=4000 \ +python ${BIN_DIR}/train.py \ + --train-metadata=dump/train/norm/metadata.jsonl \ + --dev-metadata=dump/dev/norm/metadata.jsonl \ + --config=${config_path} \ + --output-dir=${train_output_path} \ + --ngpu=1 diff --git a/examples/aishell3/voc1/path.sh b/examples/aishell3/voc1/path.sh new file mode 100755 index 00000000..1e6647b8 --- /dev/null +++ b/examples/aishell3/voc1/path.sh @@ -0,0 +1,13 @@ +#!/bin/bash +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +MODEL=parallelwave_gan +export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL} \ No newline at end of file diff --git a/examples/aishell3/voc1/run.sh b/examples/aishell3/voc1/run.sh new file mode 100755 index 00000000..7d0fdb21 --- /dev/null +++ b/examples/aishell3/voc1/run.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +set -e +source path.sh + +gpus=0 +stage=0 +stop_stage=100 + +conf_path=conf/default.yaml +train_output_path=exp/default +ckpt_name=snapshot_iter_5000.pdz + +# with the following command, you can choice the stage range you want to run +# such as `./run.sh --stage 0 --stop-stage 0` +# this can not be mixed use with `$1`, `$2` ... 
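+# illustrative usage (an assumption based on the stages defined below):
+#   ./run.sh --stage 0 --stop-stage 1   # preprocess and train only
+#   ./run.sh --stage 2 --stop-stage 2   # only synthesize from an existing checkpoint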
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + ./local/preprocess.sh ${conf_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # synthesize + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi diff --git a/examples/csmsc/tts2/README.md b/examples/csmsc/tts2/README.md index de9e488c..b3c35e1e 100644 --- a/examples/csmsc/tts2/README.md +++ b/examples/csmsc/tts2/README.md @@ -55,10 +55,10 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] - [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE] - [--use-relative-path USE_RELATIVE_PATH] - [--phones-dict PHONES_DICT] [--tones-dict TONES_DICT] + [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] + [--ngpu NGPU] [--verbose VERBOSE] + [--use-relative-path USE_RELATIVE_PATH] + [--phones-dict PHONES_DICT] [--tones-dict TONES_DICT] Train a Speedyspeech model with sigle speaker dataset. @@ -71,8 +71,7 @@ optional arguments: dev data. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. - --nprocs NPROCS number of processes. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. --use-relative-path USE_RELATIVE_PATH whether use relative path in metadata @@ -85,10 +84,9 @@ optional arguments: 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. 3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are save in `checkpoints/` inside this directory. -4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported. -5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'. -6. `--phones-dict` is the path of the phone vocabulary file. -7. `--tones-dict` is the path of the tone vocabulary file. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. +5. `--phones-dict` is the path of the phone vocabulary file. +6. `--tones-dict` is the path of the tone vocabulary file. ### Synthesize We use [parallel wavegan](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder. @@ -115,7 +113,7 @@ usage: synthesize.py [-h] [--speedyspeech-config SPEEDYSPEECH_CONFIG] [--pwg-checkpoint PWG_CHECKPOINT] [--pwg-stat PWG_STAT] [--phones-dict PHONES_DICT] [--tones-dict TONES_DICT] [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] - [--inference-dir INFERENCE_DIR] [--device DEVICE] + [--inference-dir INFERENCE_DIR] [--ngpu NGPU] [--verbose VERBOSE] Synthesize with speedyspeech & parallel wavegan. @@ -145,7 +143,7 @@ optional arguments: output dir --inference-dir INFERENCE_DIR dir to save inference models - --device DEVICE device type to use + --ngpu NGPU if ngpu == 0, use cpu. 
--verbose VERBOSE verbose ``` `./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e.py`, which can synthesize waveform from text file. @@ -161,8 +159,8 @@ usage: synthesize_e2e.py [-h] [--speedyspeech-config SPEEDYSPEECH_CONFIG] [--pwg-stat PWG_STAT] [--text TEXT] [--phones-dict PHONES_DICT] [--tones-dict TONES_DICT] [--output-dir OUTPUT_DIR] - [--inference-dir INFERENCE_DIR] [--device DEVICE] - [--verbose VERBOSE] + [--inference-dir INFERENCE_DIR] [--verbose VERBOSE] + [--ngpu NGPU] Synthesize with speedyspeech & parallel wavegan. @@ -190,15 +188,15 @@ optional arguments: output dir --inference-dir INFERENCE_DIR dir to save inference models - --device DEVICE device type to use --verbose VERBOSE verbose + --ngpu NGPU if ngpu == 0, use cpu. ``` 1. `--speedyspeech-config`, `--speedyspeech-checkpoint`, `--speedyspeech-stat` are arguments for speedyspeech, which correspond to the 3 files in the speedyspeech pretrained model. 2. `--pwg-config`, `--pwg-checkpoint`, `--pwg-stat` are arguments for parallel wavegan, which correspond to the 3 files in the parallel wavegan pretrained model. 3. `--text` is the text file, which contains sentences to synthesize. 4. `--output-dir` is the directory to save synthesized audio files. 5. `--inference-dir` is the directory to save exported model, which can be used with paddle infernece. -6. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis. +6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. 7. `--phones-dict` is the path of the phone vocabulary file. 8. `--tones-dict` is the path of the tone vocabulary file. @@ -237,7 +235,6 @@ python3 ${BIN_DIR}/synthesize_e2e.py \ --text=${BIN_DIR}/../sentences.txt \ --output-dir=exp/default/test_e2e \ --inference-dir=exp/default/inference \ - --device="gpu" \ --phones-dict=speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt \ --tones-dict=speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt ``` diff --git a/examples/csmsc/tts2/local/synthesize.sh b/examples/csmsc/tts2/local/synthesize.sh index 418ee02e..8be02dfb 100755 --- a/examples/csmsc/tts2/local/synthesize.sh +++ b/examples/csmsc/tts2/local/synthesize.sh @@ -16,5 +16,4 @@ python3 ${BIN_DIR}/synthesize.py \ --output-dir=${train_output_path}/test \ --inference-dir=${train_output_path}/inference \ --phones-dict=dump/phone_id_map.txt \ - --tones-dict=dump/tone_id_map.txt \ - --device="gpu" + --tones-dict=dump/tone_id_map.txt \ No newline at end of file diff --git a/examples/csmsc/tts2/local/synthesize_e2e.sh b/examples/csmsc/tts2/local/synthesize_e2e.sh index c50fa776..3cbc7936 100755 --- a/examples/csmsc/tts2/local/synthesize_e2e.sh +++ b/examples/csmsc/tts2/local/synthesize_e2e.sh @@ -16,6 +16,5 @@ python3 ${BIN_DIR}/synthesize_e2e.py \ --text=${BIN_DIR}/../sentences.txt \ --output-dir=${train_output_path}/test_e2e \ --inference-dir=${train_output_path}/inference \ - --device="gpu" \ --phones-dict=dump/phone_id_map.txt \ --tones-dict=dump/tone_id_map.txt diff --git a/examples/csmsc/tts2/local/train.sh b/examples/csmsc/tts2/local/train.sh index e44c7da5..f0a5a683 100755 --- a/examples/csmsc/tts2/local/train.sh +++ b/examples/csmsc/tts2/local/train.sh @@ -9,7 +9,7 @@ python ${BIN_DIR}/train.py \ --dev-metadata=dump/dev/norm/metadata.jsonl \ --config=${config_path} \ --output-dir=${train_output_path} \ - --nprocs=2 \ + --ngpu=1 \ --phones-dict=dump/phone_id_map.txt \ --tones-dict=dump/tone_id_map.txt \ --use-relative-path=True diff --git a/examples/csmsc/tts3/README.md 
b/examples/csmsc/tts3/README.md index 7eeb14fc..a45eec13 100644 --- a/examples/csmsc/tts3/README.md +++ b/examples/csmsc/tts3/README.md @@ -59,8 +59,8 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE] - [--phones-dict PHONES_DICT] [--speaker-dict SPEAKER_DICT] + [--ngpu NGPU] [--verbose VERBOSE] [--phones-dict PHONES_DICT] + [--speaker-dict SPEAKER_DICT] Train a FastSpeech2 model. @@ -73,8 +73,7 @@ optional arguments: dev data. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. - --nprocs NPROCS number of processes. + --ngpu NGPU if ngpu=0, use cpu. --verbose VERBOSE verbose. --phones-dict PHONES_DICT phone vocabulary file. @@ -84,9 +83,8 @@ optional arguments: 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. 3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are save in `checkpoints/` inside this directory. -4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported. -5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'. -6. `--phones-dict` is the path of the phone vocabulary file. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. +5. `--phones-dict` is the path of the phone vocabulary file. ### Synthesize We use [parallel wavegan](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder. @@ -113,7 +111,7 @@ usage: synthesize.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG] [--pwg-checkpoint PWG_CHECKPOINT] [--pwg-stat PWG_STAT] [--phones-dict PHONES_DICT] [--speaker-dict SPEAKER_DICT] [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--verbose VERBOSE] + [--ngpu NGPU] [--verbose VERBOSE] Synthesize with fastspeech2 & parallel wavegan. @@ -140,7 +138,7 @@ optional arguments: test metadata. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. ``` `./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e.py`, which can synthesize waveform from text file. @@ -155,7 +153,8 @@ usage: synthesize_e2e.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG] [--pwg-checkpoint PWG_CHECKPOINT] [--pwg-stat PWG_STAT] [--phones-dict PHONES_DICT] [--text TEXT] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--verbose VERBOSE] + [--inference-dir INFERENCE_DIR] [--ngpu NGPU] + [--verbose VERBOSE] Synthesize with fastspeech2 & parallel wavegan. @@ -179,7 +178,9 @@ optional arguments: --text TEXT text to synthesize, a 'utt_id sentence' pair per line. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. + --inference-dir INFERENCE_DIR + dir to save inference models + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. ``` @@ -188,7 +189,7 @@ optional arguments: 3. `--test-metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 4. `--text` is the text file, which contains sentences to synthesize. 5. `--output-dir` is the directory to save synthesized audio files. -6. 
`--device is` the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis. +6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ### Inference After Synthesize, we will get static models of fastspeech2 and pwgan in `${train_output_path}/inference`. @@ -224,6 +225,5 @@ python3 ${BIN_DIR}/synthesize_e2e.py \ --text=${BIN_DIR}/../sentences.txt \ --output-dir=exp/default/test_e2e \ --inference-dir=exp/default/inference \ - --device="gpu" \ --phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt ``` diff --git a/examples/csmsc/tts3/local/synthesize.sh b/examples/csmsc/tts3/local/synthesize.sh index 724afb04..e525fc16 100755 --- a/examples/csmsc/tts3/local/synthesize.sh +++ b/examples/csmsc/tts3/local/synthesize.sh @@ -15,5 +15,4 @@ python3 ${BIN_DIR}/synthesize.py \ --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ --test-metadata=dump/test/norm/metadata.jsonl \ --output-dir=${train_output_path}/test \ - --device="gpu" \ --phones-dict=dump/phone_id_map.txt diff --git a/examples/csmsc/tts3/local/synthesize_e2e.sh b/examples/csmsc/tts3/local/synthesize_e2e.sh index b6542743..cc27ffb6 100755 --- a/examples/csmsc/tts3/local/synthesize_e2e.sh +++ b/examples/csmsc/tts3/local/synthesize_e2e.sh @@ -16,5 +16,4 @@ python3 ${BIN_DIR}/synthesize_e2e.py \ --text=${BIN_DIR}/../sentences.txt \ --output-dir=${train_output_path}/test_e2e \ --inference-dir=${train_output_path}/inference \ - --device="gpu" \ --phones-dict=dump/phone_id_map.txt diff --git a/examples/csmsc/tts3/local/train.sh b/examples/csmsc/tts3/local/train.sh index fbbc9a9d..f90db915 100755 --- a/examples/csmsc/tts3/local/train.sh +++ b/examples/csmsc/tts3/local/train.sh @@ -8,5 +8,5 @@ python3 ${BIN_DIR}/train.py \ --dev-metadata=dump/dev/norm/metadata.jsonl \ --config=${config_path} \ --output-dir=${train_output_path} \ - --nprocs=1 \ + --ngpu=1 \ --phones-dict=dump/phone_id_map.txt \ No newline at end of file diff --git a/examples/csmsc/voc1/README.md b/examples/csmsc/voc1/README.md index 4b6b6c42..c3256b78 100644 --- a/examples/csmsc/voc1/README.md +++ b/examples/csmsc/voc1/README.md @@ -53,9 +53,8 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE] - [--batch-size BATCH_SIZE] [--max-iter MAX_ITER] - [--run-benchmark RUN_BENCHMARK] + [--ngpu NGPU] [--verbose VERBOSE] [--batch-size BATCH_SIZE] + [--max-iter MAX_ITER] [--run-benchmark RUN_BENCHMARK] [--profiler_options PROFILER_OPTIONS] Train a ParallelWaveGAN model. @@ -69,8 +68,7 @@ optional arguments: dev data. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. - --nprocs NPROCS number of processes. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. benchmark: @@ -90,8 +88,7 @@ benchmark: 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. 3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are save in `checkpoints/` inside this directory. -4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported. -5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'. +4. 
`--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ### Synthesize `./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`. @@ -101,7 +98,7 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ```text usage: synthesize.py [-h] [--config CONFIG] [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--verbose VERBOSE] + [--ngpu NGPU] [--verbose VERBOSE] Synthesize with parallel wavegan. @@ -114,7 +111,7 @@ optional arguments: dev data. --output-dir OUTPUT_DIR output dir. - --device DEVICE device to run. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. ``` @@ -122,7 +119,7 @@ optional arguments: 2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory. 3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory. 4. `--output-dir` is the directory to save the synthesized audio files. -5. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. +5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Models Pretrained models can be downloaded here [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip). diff --git a/examples/csmsc/voc1/local/train.sh b/examples/csmsc/voc1/local/train.sh index 1ef860c3..9695631e 100755 --- a/examples/csmsc/voc1/local/train.sh +++ b/examples/csmsc/voc1/local/train.sh @@ -10,4 +10,4 @@ python ${BIN_DIR}/train.py \ --dev-metadata=dump/dev/norm/metadata.jsonl \ --config=${config_path} \ --output-dir=${train_output_path} \ - --nprocs=1 + --ngpu=1 diff --git a/examples/csmsc/voc3/README.md b/examples/csmsc/voc3/README.md index 780a8ccd..757d1a36 100644 --- a/examples/csmsc/voc3/README.md +++ b/examples/csmsc/voc3/README.md @@ -53,12 +53,9 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE] - [--batch-size BATCH_SIZE] [--max-iter MAX_ITER] - [--run-benchmark RUN_BENCHMARK] - [--profiler_options PROFILER_OPTIONS] + [--ngpu NGPU] [--verbose VERBOSE] -Train a ParallelWaveGAN model. +Train a Multi-Band MelGAN model. optional arguments: -h, --help show this help message and exit @@ -69,29 +66,14 @@ optional arguments: dev data. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. - --nprocs NPROCS number of processes. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. - -benchmark: - arguments related to benchmark. - - --batch-size BATCH_SIZE - batch size. - --max-iter MAX_ITER train max steps. - --run-benchmark RUN_BENCHMARK - runing benchmark or not, if True, use the --batch-size - and --max-iter. - --profiler_options PROFILER_OPTIONS - The option of profiler, which should be in format - "key1=value1;key2=value2;key3=value3". ``` 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. 3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are save in `checkpoints/` inside this directory. -4. 
`--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported. -5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ### Synthesize `./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`. @@ -101,7 +83,7 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ```text usage: synthesize.py [-h] [--config CONFIG] [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--verbose VERBOSE] + [--ngpu NGPU] [--verbose VERBOSE] Synthesize with parallel wavegan. @@ -114,7 +96,7 @@ optional arguments: dev data. --output-dir OUTPUT_DIR output dir. - --device DEVICE device to run. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. ``` @@ -122,6 +104,6 @@ optional arguments: 2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory. 3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory. 4. `--output-dir` is the directory to save the synthesized audio files. -5. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. +5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Models diff --git a/examples/csmsc/voc3/local/train.sh b/examples/csmsc/voc3/local/train.sh index 1ef860c3..9695631e 100755 --- a/examples/csmsc/voc3/local/train.sh +++ b/examples/csmsc/voc3/local/train.sh @@ -10,4 +10,4 @@ python ${BIN_DIR}/train.py \ --dev-metadata=dump/dev/norm/metadata.jsonl \ --config=${config_path} \ --output-dir=${train_output_path} \ - --nprocs=1 + --ngpu=1 diff --git a/examples/ljspeech/tts0/README.md b/examples/ljspeech/tts0/README.md index e8e3ebff..f33d925d 100644 --- a/examples/ljspeech/tts0/README.md +++ b/examples/ljspeech/tts0/README.md @@ -30,8 +30,7 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} Here's the complete help message. ```text usage: train.py [-h] [--config FILE] [--data DATA_DIR] [--output OUTPUT_DIR] - [--checkpoint_path CHECKPOINT_PATH] [--device {cpu,gpu}] - [--nprocs NPROCS] [--opts ...] + [--checkpoint_path CHECKPOINT_PATH] [--ngpu NGPU] [--opts ...] optional arguments: -h, --help show this help message and exit @@ -41,16 +40,15 @@ optional arguments: --output OUTPUT_DIR path to save checkpoint and logs. --checkpoint_path CHECKPOINT_PATH path of the checkpoint to load - --device {cpu,gpu} device type to use, cpu and gpu are supported. - --nprocs NPROCS number of parallel processes to use. + --ngpu NGPU if ngpu == 0, use cpu. --opts ... options to overwrite --config file and the default config, passing in KEY VALUE pairs ``` -If you want to train on CPU, just set ``--device=cpu``. -If you want to train on multiple GPUs, just set ``--nprocs`` as num of GPU. -By default, training will be resumed from the latest checkpoint in ``--output``, if you want to start a new training, please use a new ``${OUTPUTPATH}`` with no checkpoint. -And if you want to resume from an other existing model, you should set ``checkpoint_path`` to be the checkpoint path you want to load. +If you want to train on CPU, just set `--ngpu=0`. +If you want to train on multiple GPUs, just set `--ngpu` as num of GPU. 
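+For example (an illustrative setup, not taken verbatim from this recipe): editing `./local/train.sh` to pass `--ngpu=4` and launching it with `CUDA_VISIBLE_DEVICES=0,1,2,3` runs training on 4 GPUs.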
+By default, training will be resumed from the latest checkpoint in `--output`, if you want to start a new training, please use a new `${OUTPUTPATH}` with no checkpoint. +And if you want to resume from an other existing model, you should set `checkpoint_path` to be the checkpoint path you want to load. **Note: The checkpoint path cannot contain the file extension.** ### Synthesize @@ -60,7 +58,7 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${train_output_path} ${ckpt_n ``` ```text usage: synthesize.py [-h] [--config FILE] [--checkpoint_path CHECKPOINT_PATH] - [--input INPUT] [--output OUTPUT] [--device DEVICE] + [--input INPUT] [--output OUTPUT] [--ngpu NGPU] [--opts ...] [-v] generate mel spectrogram with TransformerTTS. @@ -72,7 +70,7 @@ optional arguments: path of the checkpoint to load. --input INPUT path of the text sentences --output OUTPUT path to save outputs - --device DEVICE device type to use. + --ngpu NGPU if ngpu == 0, use cpu. --opts ... options to overwrite --config file and the default config, passing in KEY VALUE pairs -v, --verbose print msg diff --git a/examples/ljspeech/tts0/local/synthesize.sh b/examples/ljspeech/tts0/local/synthesize.sh index 91c89dd4..02147803 100755 --- a/examples/ljspeech/tts0/local/synthesize.sh +++ b/examples/ljspeech/tts0/local/synthesize.sh @@ -8,4 +8,4 @@ python3 ${BIN_DIR}/synthesize.py \ --checkpoint_path=${train_output_path}/checkpoints/${ckpt_name} \ --input=${BIN_DIR}/../sentences_en.txt \ --output=${train_output_path}/test - --device=gpu \ No newline at end of file + --ngpu=1 \ No newline at end of file diff --git a/examples/ljspeech/tts0/local/train.sh b/examples/ljspeech/tts0/local/train.sh index b8bcf5cb..a94f955a 100755 --- a/examples/ljspeech/tts0/local/train.sh +++ b/examples/ljspeech/tts0/local/train.sh @@ -6,4 +6,4 @@ train_output_path=$2 python3 ${BIN_DIR}/train.py \ --data=${preprocess_path} \ --output=${train_output_path} \ - --device=gpu \ \ No newline at end of file + --ngpu=1 \ \ No newline at end of file diff --git a/examples/ljspeech/tts1/README.md b/examples/ljspeech/tts1/README.md index 0385fdce..625f296a 100644 --- a/examples/ljspeech/tts1/README.md +++ b/examples/ljspeech/tts1/README.md @@ -53,8 +53,7 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE] - [--phones-dict PHONES_DICT] + [--ngpu NGPU] [--verbose VERBOSE] [--phones-dict PHONES_DICT] Train a TransformerTTS model with LJSpeech TTS dataset. @@ -67,8 +66,7 @@ optional arguments: dev data. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. - --nprocs NPROCS number of processes. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. --phones-dict PHONES_DICT phone vocabulary file. @@ -76,9 +74,8 @@ optional arguments: 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. 3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are save in `checkpoints/` inside this directory. -4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported. -5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'. -6. 
`--phones-dict` is the path of the phone vocabulary file. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. +5. `--phones-dict` is the path of the phone vocabulary file. ## Synthesize We use [waveflow](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/voc0) as the neural vocoder. @@ -104,7 +101,7 @@ usage: synthesize.py [-h] [--transformer-tts-config TRANSFORMER_TTS_CONFIG] [--waveflow-checkpoint WAVEFLOW_CHECKPOINT] [--phones-dict PHONES_DICT] [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--verbose VERBOSE] + [--ngpu NGPU] [--verbose VERBOSE] Synthesize with transformer tts & waveflow. @@ -127,7 +124,7 @@ optional arguments: test metadata. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. ``` `./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e.py`, which can synthesize waveform from text file. @@ -142,7 +139,7 @@ usage: synthesize_e2e.py [-h] [--waveflow-config WAVEFLOW_CONFIG] [--waveflow-checkpoint WAVEFLOW_CHECKPOINT] [--phones-dict PHONES_DICT] [--text TEXT] - [--output-dir OUTPUT_DIR] [--device DEVICE] + [--output-dir OUTPUT_DIR] [--ngpu NGPU] [--verbose VERBOSE] Synthesize with transformer tts & waveflow. @@ -165,7 +162,7 @@ optional arguments: --text TEXT text to synthesize, a 'utt_id sentence' pair per line. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. ``` 1. `--transformer-tts-config`, `--transformer-tts-checkpoint`, `--transformer-tts-stat` and `--phones-dict` are arguments for transformer_tts, which correspond to the 4 files in the transformer_tts pretrained model. @@ -173,7 +170,7 @@ optional arguments: 3. `--test-metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 4. `--text` is the text file, which contains sentences to synthesize. 5. `--output-dir` is the directory to save synthesized audio files. -6. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis. +6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model Pretrained Model can be downloaded here. 
[transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_ckpt_0.4.zip) @@ -200,6 +197,5 @@ python3 ${BIN_DIR}/synthesize_e2e.py \ --waveflow-checkpoint=waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams \ --text=${BIN_DIR}/../sentences_en.txt \ --output-dir=exp/default/test_e2e \ - --device="gpu" \ --phones-dict=transformer_tts_ljspeech_ckpt_0.4/phone_id_map.txt ``` diff --git a/examples/ljspeech/tts1/local/synthesize.sh b/examples/ljspeech/tts1/local/synthesize.sh index 5d1c9534..9fe837a4 100755 --- a/examples/ljspeech/tts1/local/synthesize.sh +++ b/examples/ljspeech/tts1/local/synthesize.sh @@ -14,5 +14,4 @@ python3 ${BIN_DIR}/synthesize.py \ --waveflow-checkpoint=waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams \ --test-metadata=dump/test/norm/metadata.jsonl \ --output-dir=${train_output_path}/test \ - --device="gpu" \ --phones-dict=dump/phone_id_map.txt diff --git a/examples/ljspeech/tts1/local/synthesize_e2e.sh b/examples/ljspeech/tts1/local/synthesize_e2e.sh index 333a5cd6..046fdb70 100755 --- a/examples/ljspeech/tts1/local/synthesize_e2e.sh +++ b/examples/ljspeech/tts1/local/synthesize_e2e.sh @@ -14,5 +14,4 @@ python3 ${BIN_DIR}/synthesize_e2e.py \ --waveflow-checkpoint=waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams \ --text=${BIN_DIR}/../sentences_en.txt \ --output-dir=${train_output_path}/test_e2e \ - --device="gpu" \ --phones-dict=dump/phone_id_map.txt diff --git a/examples/ljspeech/tts1/local/train.sh b/examples/ljspeech/tts1/local/train.sh index 8527f57f..5e255fb8 100755 --- a/examples/ljspeech/tts1/local/train.sh +++ b/examples/ljspeech/tts1/local/train.sh @@ -8,5 +8,5 @@ python3 ${BIN_DIR}/train.py \ --dev-metadata=dump/dev/norm/metadata.jsonl \ --config=${config_path} \ --output-dir=${train_output_path} \ - --nprocs=2 \ + --ngpu=2 \ --phones-dict=dump/phone_id_map.txt diff --git a/examples/ljspeech/tts3/README.md b/examples/ljspeech/tts3/README.md index dc711ce8..0bcdf372 100644 --- a/examples/ljspeech/tts3/README.md +++ b/examples/ljspeech/tts3/README.md @@ -58,8 +58,8 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE] - [--phones-dict PHONES_DICT] [--speaker-dict SPEAKER_DICT] + [--ngpu NGPU] [--verbose VERBOSE] [--phones-dict PHONES_DICT] + [--speaker-dict SPEAKER_DICT] Train a FastSpeech2 model. @@ -72,8 +72,7 @@ optional arguments: dev data. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. - --nprocs NPROCS number of processes. + --ngpu NGPU if ngpu=0, use cpu. --verbose VERBOSE verbose. --phones-dict PHONES_DICT phone vocabulary file. @@ -83,9 +82,8 @@ optional arguments: 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. 3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are save in `checkpoints/` inside this directory. -4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported. -5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'. -6. `--phones-dict` is the path of the phone vocabulary file. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. +5. 
`--phones-dict` is the path of the phone vocabulary file. ### Synthesize We use [parallel wavegan](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/voc1) as the neural vocoder. @@ -112,7 +110,7 @@ usage: synthesize.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG] [--pwg-checkpoint PWG_CHECKPOINT] [--pwg-stat PWG_STAT] [--phones-dict PHONES_DICT] [--speaker-dict SPEAKER_DICT] [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--verbose VERBOSE] + [--ngpu NGPU] [--verbose VERBOSE] Synthesize with fastspeech2 & parallel wavegan. @@ -139,7 +137,7 @@ optional arguments: test metadata. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. ``` `./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e_en.py`, which can synthesize waveform from text file. @@ -147,14 +145,15 @@ optional arguments: CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} ``` ```text -usage: synthesize_e2e_en.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG] - [--fastspeech2-checkpoint FASTSPEECH2_CHECKPOINT] - [--fastspeech2-stat FASTSPEECH2_STAT] - [--pwg-config PWG_CONFIG] - [--pwg-checkpoint PWG_CHECKPOINT] - [--pwg-stat PWG_STAT] [--phones-dict PHONES_DICT] - [--text TEXT] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--verbose VERBOSE] +usage: synthesize_e2e.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG] + [--fastspeech2-checkpoint FASTSPEECH2_CHECKPOINT] + [--fastspeech2-stat FASTSPEECH2_STAT] + [--pwg-config PWG_CONFIG] + [--pwg-checkpoint PWG_CHECKPOINT] + [--pwg-stat PWG_STAT] [--phones-dict PHONES_DICT] + [--text TEXT] [--output-dir OUTPUT_DIR] + [--inference-dir INFERENCE_DIR] [--ngpu NGPU] + [--verbose VERBOSE] Synthesize with fastspeech2 & parallel wavegan. @@ -178,7 +177,9 @@ optional arguments: --text TEXT text to synthesize, a 'utt_id sentence' pair per line. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. + --inference-dir INFERENCE_DIR + dir to save inference models + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. ``` @@ -187,7 +188,7 @@ optional arguments: 3. `--test-metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 4. `--text` is the text file, which contains sentences to synthesize. 5. `--output-dir` is the directory to save synthesized audio files. -6. `--device is` the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis. +6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model Pretrained FastSpeech2 model with no silence in the edge of audios. 
[fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_ljspeech_ckpt_0.5.zip) @@ -215,6 +216,5 @@ python3 ${BIN_DIR}/synthesize_e2e_en.py \ --pwg-stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \ --text=${BIN_DIR}/../sentences_en.txt \ --output-dir=exp/default/test_e2e \ - --device="gpu" \ --phones-dict=fastspeech2_nosil_ljspeech_ckpt_0.5/phone_id_map.txt ``` diff --git a/examples/ljspeech/tts3/local/synthesize.sh b/examples/ljspeech/tts3/local/synthesize.sh index 32dcde58..9b22abb3 100755 --- a/examples/ljspeech/tts3/local/synthesize.sh +++ b/examples/ljspeech/tts3/local/synthesize.sh @@ -15,5 +15,4 @@ python3 ${BIN_DIR}/synthesize.py \ --pwg-stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \ --test-metadata=dump/test/norm/metadata.jsonl \ --output-dir=${train_output_path}/test \ - --device="gpu" \ --phones-dict=dump/phone_id_map.txt diff --git a/examples/ljspeech/tts3/local/synthesize_e2e.sh b/examples/ljspeech/tts3/local/synthesize_e2e.sh index 28ea3a8f..c723feef 100755 --- a/examples/ljspeech/tts3/local/synthesize_e2e.sh +++ b/examples/ljspeech/tts3/local/synthesize_e2e.sh @@ -15,5 +15,4 @@ python3 ${BIN_DIR}/synthesize_e2e_en.py \ --pwg-stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \ --text=${BIN_DIR}/../sentences_en.txt \ --output-dir=${train_output_path}/test_e2e \ - --device="gpu" \ --phones-dict=dump/phone_id_map.txt diff --git a/examples/ljspeech/tts3/local/train.sh b/examples/ljspeech/tts3/local/train.sh index 847a44e3..d1302f99 100755 --- a/examples/ljspeech/tts3/local/train.sh +++ b/examples/ljspeech/tts3/local/train.sh @@ -8,5 +8,5 @@ python3 ${BIN_DIR}/train.py \ --dev-metadata=dump/dev/norm/metadata.jsonl \ --config=${config_path} \ --output-dir=${train_output_path} \ - --nprocs=1 \ + --ngpu=1 \ --phones-dict=dump/phone_id_map.txt diff --git a/examples/ljspeech/voc0/README.md b/examples/ljspeech/voc0/README.md index 6163ae42..ad2337ef 100644 --- a/examples/ljspeech/voc0/README.md +++ b/examples/ljspeech/voc0/README.md @@ -31,10 +31,9 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_ The training script requires 4 command line arguments. 1. `--data` is the path of the training dataset. 2. `--output` is the path of the output directory. -3. `--device` should be "cpu" or "gpu" -4. `--nprocs` is the number of processes to train the model in parallel. +3. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. -If you want distributed training, set a larger `--nprocs` (e.g. 4). Note that distributed training with cpu is not supported yet. +If you want distributed training, set a larger `--ngpu` (e.g. 4). Note that distributed training with cpu is not supported yet. ### Synthesize `./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from mels. @@ -46,7 +45,7 @@ Synthesize waveform. 1. We assume the `--input` is a directory containing several mel spectrograms(log magnitude) in `.npy` format. 2. The output would be saved in `--output` directory, containing several `.wav` files, each with the same name as the mel spectrogram does. 3. `--checkpoint_path` should be the path of the parameter file (`.pdparams`) to load. Note that the extention name `.pdparmas` is not included here. -4. `--device` specifies to device to run synthesis on. +6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model Pretrained Model with residual channel equals 128 can be downloaded here. 
[waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip). diff --git a/examples/ljspeech/voc0/local/synthesize.sh b/examples/ljspeech/voc0/local/synthesize.sh index 055542cf..1d5e1183 100755 --- a/examples/ljspeech/voc0/local/synthesize.sh +++ b/examples/ljspeech/voc0/local/synthesize.sh @@ -8,5 +8,5 @@ python ${BIN_DIR}/synthesize.py \ --input=${input_mel_path} \ --output=${train_output_path}/wavs/ \ --checkpoint_path=${train_output_path}/checkpoints/${ckpt_name} \ - --device="gpu" \ + --ngpu=1 \ --verbose \ No newline at end of file diff --git a/examples/ljspeech/voc0/local/train.sh b/examples/ljspeech/voc0/local/train.sh index 5c4defd9..f062cbbf 100755 --- a/examples/ljspeech/voc0/local/train.sh +++ b/examples/ljspeech/voc0/local/train.sh @@ -6,5 +6,4 @@ train_output_path=$2 python3 ${BIN_DIR}/train.py \ --data=${preprocess_path} \ --output=${train_output_path} \ - --device="gpu" \ - --nprocs=1 \ No newline at end of file + --ngpu=1 \ No newline at end of file diff --git a/examples/ljspeech/voc1/README.md b/examples/ljspeech/voc1/README.md index ba6eb002..2cc196fb 100644 --- a/examples/ljspeech/voc1/README.md +++ b/examples/ljspeech/voc1/README.md @@ -53,11 +53,10 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] - [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE] - [--batch-size BATCH_SIZE] [--max-iter MAX_ITER] - [--run-benchmark RUN_BENCHMARK] - [--profiler_options PROFILER_OPTIONS] + [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] + [--ngpu NGPU] [--verbose VERBOSE] [--batch-size BATCH_SIZE] + [--max-iter MAX_ITER] [--run-benchmark RUN_BENCHMARK] + [--profiler_options PROFILER_OPTIONS] Train a ParallelWaveGAN model. @@ -70,8 +69,7 @@ optional arguments: dev data. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. - --nprocs NPROCS number of processes. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. benchmark: @@ -91,8 +89,7 @@ benchmark: 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. 3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are save in `checkpoints/` inside this directory. -4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported. -5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ### Synthesize `./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`. @@ -102,7 +99,7 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ```text usage: synthesize.py [-h] [--config CONFIG] [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--verbose VERBOSE] + [--ngpu NGPU] [--verbose VERBOSE] Synthesize with parallel wavegan. @@ -115,7 +112,7 @@ optional arguments: dev data. --output-dir OUTPUT_DIR output dir. - --device DEVICE device to run. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. ``` @@ -123,7 +120,7 @@ optional arguments: 2. `--checkpoint` is the checkpoint to load. 
Pick one of the checkpoints from `checkpoints` inside the training output directory. 3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory. 4. `--output-dir` is the directory to save the synthesized audio files. -5. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. +5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Models Pretrained models can be downloaded here. [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip) diff --git a/examples/ljspeech/voc1/local/train.sh b/examples/ljspeech/voc1/local/train.sh index 1ef860c3..9695631e 100755 --- a/examples/ljspeech/voc1/local/train.sh +++ b/examples/ljspeech/voc1/local/train.sh @@ -10,4 +10,4 @@ python ${BIN_DIR}/train.py \ --dev-metadata=dump/dev/norm/metadata.jsonl \ --config=${config_path} \ --output-dir=${train_output_path} \ - --nprocs=1 + --ngpu=1 diff --git a/examples/other/g2p/README.md b/examples/other/g2p/README.md new file mode 100644 index 00000000..14bd0d9d --- /dev/null +++ b/examples/other/g2p/README.md @@ -0,0 +1,20 @@ +# G2P +For g2p, we use BZNSYP's phone label as the ground truth and we delete silence tokens in labels and predicted phones. + +You should Download BZNSYP from it's [Official Website](https://test.data-baker.com/data/index/source) and extract it. Assume the path to the dataset is `~/datasets/BZNSYP`. + +We use `WER` as evaluation criterion. + +# Start +Run the command below to get the results of test. +```bash +./run.sh +``` +The `avg WER` of g2p is: 0.027495061517943988 +```text + ,--------------------------------------------------------------------. + | | # Snt # Wrd | Corr Sub Del Ins Err S.Err | + |--------+-----------------+-----------------------------------------| + | Sum/Avg| 9996 299181 | 97.3 2.7 0.0 0.0 2.7 52.5 | + `--------------------------------------------------------------------' +``` diff --git a/examples/other/text_frontend/get_g2p_data.py b/examples/other/g2p/get_g2p_data.py similarity index 100% rename from examples/other/text_frontend/get_g2p_data.py rename to examples/other/g2p/get_g2p_data.py diff --git a/examples/other/g2p/run.sh b/examples/other/g2p/run.sh new file mode 100755 index 00000000..214b8b3d --- /dev/null +++ b/examples/other/g2p/run.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +source path.sh +USE_SCLITE=true + +# test g2p +echo "Start get g2p test data ..." +python3 get_g2p_data.py --root-dir=~/datasets/BZNSYP --output-dir=data/g2p +echo "Start test g2p ..." +python3 test_g2p.py --input-dir=data/g2p --output-dir=exp/g2p + +# whether use sclite to get more detail information of WER +if [ "$USE_SCLITE" = true ];then + echo "Start sclite g2p ..." + ${MAIN_ROOT}/tools/sctk/bin/sclite -i wsj -r ./exp/g2p/text.ref.clean trn -h ./exp/g2p/text.g2p trn -e utf-8 -o all +fi diff --git a/examples/other/text_frontend/test_g2p.py b/examples/other/g2p/test_g2p.py similarity index 100% rename from examples/other/text_frontend/test_g2p.py rename to examples/other/g2p/test_g2p.py diff --git a/examples/other/ge2e/README.md b/examples/other/ge2e/README.md index 1fa9677a..17e591b3 100644 --- a/examples/other/ge2e/README.md +++ b/examples/other/ge2e/README.md @@ -70,8 +70,7 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_ In `${BIN_DIR}/train.py`: 1. `--data` is the path to the preprocessed dataset. 2. 
`--output` is the directory to save results,usually a subdirectory of `runs`.It contains visualdl log files, text log files, config file and a `checkpoints` directory, which contains parameter file and optimizer state file. If `--output` already has some training results in it, the most recent parameter file and optimizer state file is loaded before training. -3. `--device` is the device type to run the training, 'cpu' and 'gpu' are supported. -4. `--nprocs` is the number of replicas to run in multiprocessing based parallel training。Currently multiprocessing based parallel training is only enabled when using 'gpu' as the devicde. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. 5. `CUDA_VISIBLE_DEVICES` can be used to specify visible devices with cuda. Other options are described below. @@ -91,7 +90,7 @@ In `${BIN_DIR}/inference.py`: 2. `--output` is the directory to save the processed results. It has the same file structure as the input dataset. Each utterance in the dataset has a corrsponding utterance embedding file in `*.npy` format. 3. `--checkpoint_path` is the path of the checkpoint to use, extension not included. 4. `--pattern` is the wildcard pattern to filter audio files for inference, defaults to `*.wav`. -5. `--device` and `--opts` have the same meaning as in the training script. +5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model The pretrained model is first trained to 1560k steps at Librispeech-other-500 and voxceleb1. Then trained at aidatatang_200h and magic_data to 3000k steps. diff --git a/examples/other/ge2e/local/inference.sh b/examples/other/ge2e/local/inference.sh index 1beebdfa..431c5309 100755 --- a/examples/other/ge2e/local/inference.sh +++ b/examples/other/ge2e/local/inference.sh @@ -10,5 +10,5 @@ python3 ${BIN_DIR}/inference.py \ --input=${infer_input} \ --output=${infer_output} \ --checkpoint_path=${train_output_path}/checkpoints/${ckpt_name} \ - --device="gpu" + --ngpu=1 diff --git a/examples/other/ge2e/local/train.sh b/examples/other/ge2e/local/train.sh index 5c4defd9..f062cbbf 100755 --- a/examples/other/ge2e/local/train.sh +++ b/examples/other/ge2e/local/train.sh @@ -6,5 +6,4 @@ train_output_path=$2 python3 ${BIN_DIR}/train.py \ --data=${preprocess_path} \ --output=${train_output_path} \ - --device="gpu" \ - --nprocs=1 \ No newline at end of file + --ngpu=1 \ No newline at end of file diff --git a/examples/other/text_frontend/README.md b/examples/other/text_frontend/README.md deleted file mode 100644 index 0bf6e72d..00000000 --- a/examples/other/text_frontend/README.md +++ /dev/null @@ -1,38 +0,0 @@ -# Chinese Text Frontend Example -Here's an example for Chinese text frontend, including g2p and text normalization. -## G2P -For g2p, we use BZNSYP's phone label as the ground truth and we delete silence tokens in labels and predicted phones. - -You should Download BZNSYP from it's [Official Website](https://test.data-baker.com/data/index/source) and extract it. Assume the path to the dataset is `~/datasets/BZNSYP`. - -We use `WER` as evaluation criterion. -## Text Normalization -For text normalization, the test data is `data/textnorm_test_cases.txt`, we use `|` as the separator of raw_data and normed_data. - -We use `CER` as evaluation criterion. -## Start -If you want to use sclite to get more detail information of WER, you should run the command below to make sclite first. -```bash -./make_sclite.sh -``` -Run the command below to get the results of test. 
-```bash -./run.sh -``` -The `avg WER` of g2p is: 0.027495061517943988 -```text - ,--------------------------------------------------------------------. - | | # Snt # Wrd | Corr Sub Del Ins Err S.Err | - |--------+-----------------+-----------------------------------------| - | Sum/Avg| 9996 299181 | 97.3 2.7 0.0 0.0 2.7 52.5 | - `--------------------------------------------------------------------' -``` - -The `avg CER` of text normalization is: 0.006388318503308237 -```text - ,-----------------------------------------------------------------. - | | # Snt # Wrd | Corr Sub Del Ins Err S.Err | - |--------+--------------+-----------------------------------------| - | Sum/Avg| 125 2254 | 99.4 0.1 0.5 0.1 0.7 3.2 | - `-----------------------------------------------------------------' -``` diff --git a/examples/other/text_frontend/make_sclite.sh b/examples/other/text_frontend/make_sclite.sh deleted file mode 100755 index db8c921c..00000000 --- a/examples/other/text_frontend/make_sclite.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -if [ ! -d "./SCTK" ];then - echo "Clone SCTK ..." - git clone https://github.com/usnistgov/SCTK - echo "Clone SCTK done!" -fi - -if [ ! -d "./SCTK/bin" ];then - echo "Start make SCTK ..." - pushd SCTK && make config && make all && make check && make install && make doc && popd - echo "SCTK make done!" -fi diff --git a/examples/other/text_frontend/run.sh b/examples/other/text_frontend/run.sh deleted file mode 100755 index 9882b057..00000000 --- a/examples/other/text_frontend/run.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -USE_SCLITE=true - -# test g2p -echo "Start get g2p test data ..." -python3 get_g2p_data.py --root-dir=~/datasets/BZNSYP --output-dir=data/g2p -echo "Start test g2p ..." -python3 test_g2p.py --input-dir=data/g2p --output-dir=exp/g2p - -# test text normalization -echo "Start get text normalization test data ..." -python3 get_textnorm_data.py --test-file=data/textnorm_test_cases.txt --output-dir=data/textnorm -echo "Start test text normalization ..." -python3 test_textnorm.py --input-dir=data/textnorm --output-dir=exp/textnorm - -# whether use sclite to get more detail information of WER -if [ "$USE_SCLITE" = true ];then - echo "Start sclite g2p ..." - ./SCTK/bin/sclite -i wsj -r ./exp/g2p/text.ref.clean trn -h ./exp/g2p/text.g2p trn -e utf-8 -o all - echo - - echo "Start sclite textnorm ..." - ./SCTK/bin/sclite -i wsj -r ./exp/textnorm/text.ref.clean trn -h ./exp/textnorm/text.tn trn -e utf-8 -o all -fi \ No newline at end of file diff --git a/examples/other/tn/README.md b/examples/other/tn/README.md new file mode 100644 index 00000000..dfefccde --- /dev/null +++ b/examples/other/tn/README.md @@ -0,0 +1,17 @@ +# Text Normalization +For text normalization, the test data is `data/textnorm_test_cases.txt`, we use `|` as the separator of raw_data and normed_data. + +We use `CER` as evaluation criterion. +## Start +Run the command below to get the results of test. +```bash +./run.sh +``` +The `avg CER` of text normalization is: 0.006388318503308237 +```text + ,-----------------------------------------------------------------. 
+ | | # Snt # Wrd | Corr Sub Del Ins Err S.Err | + |--------+--------------+-----------------------------------------| + | Sum/Avg| 125 2254 | 99.4 0.1 0.5 0.1 0.7 3.2 | + `-----------------------------------------------------------------' +``` diff --git a/examples/other/text_frontend/data/textnorm_test_cases.txt b/examples/other/tn/data/textnorm_test_cases.txt similarity index 100% rename from examples/other/text_frontend/data/textnorm_test_cases.txt rename to examples/other/tn/data/textnorm_test_cases.txt diff --git a/examples/other/text_frontend/get_textnorm_data.py b/examples/other/tn/get_textnorm_data.py similarity index 100% rename from examples/other/text_frontend/get_textnorm_data.py rename to examples/other/tn/get_textnorm_data.py diff --git a/examples/other/tn/run.sh b/examples/other/tn/run.sh new file mode 100755 index 00000000..e8298a84 --- /dev/null +++ b/examples/other/tn/run.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +source path.sh + +USE_SCLITE=true + +# test text normalization +echo "Start get text normalization test data ..." +python3 get_textnorm_data.py --test-file=data/textnorm_test_cases.txt --output-dir=data/textnorm +echo "Start test text normalization ..." +python3 test_textnorm.py --input-dir=data/textnorm --output-dir=exp/textnorm + +# whether use sclite to get more detail information of WER +if [ "$USE_SCLITE" = true ];then + echo "Start sclite textnorm ..." + ${MAIN_ROOT}/tools/sctk/bin/sclite -i wsj -r ./exp/textnorm/text.ref.clean trn -h ./exp/textnorm/text.tn trn -e utf-8 -o all +fi \ No newline at end of file diff --git a/examples/other/text_frontend/test_textnorm.py b/examples/other/tn/test_textnorm.py similarity index 100% rename from examples/other/text_frontend/test_textnorm.py rename to examples/other/tn/test_textnorm.py diff --git a/examples/vctk/tts3/README.md b/examples/vctk/tts3/README.md index 717ee7ac..994fe058 100644 --- a/examples/vctk/tts3/README.md +++ b/examples/vctk/tts3/README.md @@ -61,8 +61,8 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE] - [--phones-dict PHONES_DICT] [--speaker-dict SPEAKER_DICT] + [--ngpu NGPU] [--verbose VERBOSE] [--phones-dict PHONES_DICT] + [--speaker-dict SPEAKER_DICT] Train a FastSpeech2 model. @@ -75,8 +75,7 @@ optional arguments: dev data. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. - --nprocs NPROCS number of processes. + --ngpu NGPU if ngpu=0, use cpu. --verbose VERBOSE verbose. --phones-dict PHONES_DICT phone vocabulary file. @@ -86,9 +85,7 @@ optional arguments: 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. 3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are save in `checkpoints/` inside this directory. -4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported. -5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'. -6. `--phones-dict` is the path of the phone vocabulary file. +4. `--phones-dict` is the path of the phone vocabulary file. 
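The training entry points in this patch all follow the same `--ngpu` convention: `ngpu == 0` (or a Paddle build without CUDA) runs on CPU, `ngpu == 1` runs on a single GPU, and `ngpu > 1` spawns one worker process per GPU. A minimal sketch of that dispatch, assuming only the `--ngpu` argument (the real scripts also load a yaml config and build the dataloaders, model and trainer):

```python
import argparse

import paddle
import paddle.distributed as dist


def train_sp(args, config):
    # ngpu == 0, or a Paddle build without CUDA, falls back to the CPU device.
    if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0:
        paddle.set_device("cpu")
    else:
        paddle.set_device("gpu")
    # ... the real scripts build dataloaders, the model and the Trainer here ...
    print(f"rank {dist.get_rank()} running on {paddle.get_device()}")


def main():
    parser = argparse.ArgumentParser(description="Train a model.")
    parser.add_argument(
        "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
    args = parser.parse_args()
    config = None  # stands in for the yaml config the real scripts load

    # More than one GPU means one spawned worker process per GPU.
    if args.ngpu > 1:
        dist.spawn(train_sp, (args, config), nprocs=args.ngpu)
    else:
        train_sp(args, config)


if __name__ == "__main__":
    main()
```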
### Synthesize We use [parallel wavegan](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/vctk/voc1) as the neural vocoder. @@ -116,7 +113,7 @@ usage: synthesize.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG] [--pwg-checkpoint PWG_CHECKPOINT] [--pwg-stat PWG_STAT] [--phones-dict PHONES_DICT] [--speaker-dict SPEAKER_DICT] [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--verbose VERBOSE] + [--ngpu NGPU] [--verbose VERBOSE] Synthesize with fastspeech2 & parallel wavegan. @@ -143,7 +140,7 @@ optional arguments: test metadata. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. ``` `./local/synthesize_e2e.sh` calls `${BIN_DIR}/multi_spk_synthesize_e2e_en.py`, which can synthesize waveform from text file. @@ -161,7 +158,7 @@ usage: multi_spk_synthesize_e2e_en.py [-h] [--phones-dict PHONES_DICT] [--speaker-dict SPEAKER_DICT] [--text TEXT] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--verbose VERBOSE] + [--ngpu NGPU] [--verbose VERBOSE] Synthesize with fastspeech2 & parallel wavegan. @@ -187,7 +184,7 @@ optional arguments: --text TEXT text to synthesize, a 'utt_id sentence' pair per line. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. ``` @@ -196,7 +193,7 @@ optional arguments: 3. `--test-metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 4. `--text` is the text file, which contains sentences to synthesize. 5. `--output-dir` is the directory to save synthesized audio files. -6. `--device is` the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis. +6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model Pretrained FastSpeech2 model with no silence in the edge of audios. 
[fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_vctk_ckpt_0.5.zip) @@ -218,14 +215,13 @@ FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ python3 ${BIN_DIR}/multi_spk_synthesize_e2e_en.py \ --fastspeech2-config=fastspeech2_nosil_vctk_ckpt_0.5/default.yaml \ - --fastspeech2-checkpoint=fastspeech2_nosil_vctk_ckpt_0.5/snapshot_iter_96400.pdz \ + --fastspeech2-checkpoint=fastspeech2_nosil_vctk_ckpt_0.5/snapshot_iter_66200.pdz \ --fastspeech2-stat=fastspeech2_nosil_vctk_ckpt_0.5/speech_stats.npy \ --pwg-config=pwg_vctk_ckpt_0.5/pwg_default.yaml \ --pwg-checkpoint=pwg_vctk_ckpt_0.5/pwg_snapshot_iter_1000000.pdz \ --pwg-stat=pwg_vctk_ckpt_0.5/pwg_stats.npy \ - --text=${BIN_DIR}/../sentences.txt \ + --text=${BIN_DIR}/../sentences_en.txt \ --output-dir=exp/default/test_e2e \ - --device="gpu" \ --phones-dict=fastspeech2_nosil_vctk_ckpt_0.5/phone_id_map.txt \ --speaker-dict=fastspeech2_nosil_vctk_ckpt_0.5/speaker_id_map.txt ``` diff --git a/examples/vctk/tts3/local/synthesize.sh b/examples/vctk/tts3/local/synthesize.sh index ca112969..8165c858 100755 --- a/examples/vctk/tts3/local/synthesize.sh +++ b/examples/vctk/tts3/local/synthesize.sh @@ -15,6 +15,5 @@ python3 ${BIN_DIR}/synthesize.py \ --pwg-stat=pwg_vctk_ckpt_0.5/pwg_stats.npy \ --test-metadata=dump/test/norm/metadata.jsonl \ --output-dir=${train_output_path}/test \ - --device="gpu" \ --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt diff --git a/examples/vctk/tts3/local/synthesize_e2e.sh b/examples/vctk/tts3/local/synthesize_e2e.sh index d919bb08..e0b2a041 100755 --- a/examples/vctk/tts3/local/synthesize_e2e.sh +++ b/examples/vctk/tts3/local/synthesize_e2e.sh @@ -15,6 +15,5 @@ python3 ${BIN_DIR}/multi_spk_synthesize_e2e_en.py \ --pwg-stat=pwg_vctk_ckpt_0.5/pwg_stats.npy \ --text=${BIN_DIR}/../sentences_en.txt \ --output-dir=${train_output_path}/test_e2e \ - --device="gpu" \ --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt diff --git a/examples/vctk/tts3/local/train.sh b/examples/vctk/tts3/local/train.sh index be6051c9..3a507650 100755 --- a/examples/vctk/tts3/local/train.sh +++ b/examples/vctk/tts3/local/train.sh @@ -8,6 +8,6 @@ python3 ${BIN_DIR}/train.py \ --dev-metadata=dump/dev/norm/metadata.jsonl \ --config=${config_path} \ --output-dir=${train_output_path} \ - --nprocs=2 \ + --ngpu=1 \ --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt diff --git a/examples/vctk/voc1/README.md b/examples/vctk/voc1/README.md index cbfff32d..d2d2d48c 100644 --- a/examples/vctk/voc1/README.md +++ b/examples/vctk/voc1/README.md @@ -58,9 +58,8 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE] - [--batch-size BATCH_SIZE] [--max-iter MAX_ITER] - [--run-benchmark RUN_BENCHMARK] + [--ngpu NGPU] [--verbose VERBOSE] [--batch-size BATCH_SIZE] + [--max-iter MAX_ITER] [--run-benchmark RUN_BENCHMARK] [--profiler_options PROFILER_OPTIONS] Train a ParallelWaveGAN model. @@ -74,8 +73,7 @@ optional arguments: dev data. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. - --nprocs NPROCS number of processes. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. benchmark: @@ -95,8 +93,8 @@ benchmark: 1. 
`--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. 3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are save in `checkpoints/` inside this directory. -4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported. -5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. + ### Synthesize `./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`. ```bash @@ -105,7 +103,7 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ```text usage: synthesize.py [-h] [--config CONFIG] [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--verbose VERBOSE] + [--ngpu NGPU] [--verbose VERBOSE] Synthesize with parallel wavegan. @@ -118,7 +116,7 @@ optional arguments: dev data. --output-dir OUTPUT_DIR output dir. - --device DEVICE device to run. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. ``` @@ -126,7 +124,7 @@ optional arguments: 2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory. If you use the pretrained model, use the `pwg_snapshot_iter_400000.pdz`. 3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory. 4. `--output-dir` is the directory to save the synthesized audio files. -5. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. +5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Models Pretrained models can be downloaded here [pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_vctk_ckpt_0.5.zip). 
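The synthesis and inference scripts changed below map `--ngpu` to a Paddle device with the same three-way check everywhere. A minimal sketch of that check, using an illustrative helper name `device_from_ngpu` (the scripts in this patch inline the `if`/`elif`/`else` right after `parser.parse_args()`):

```python
import paddle


def device_from_ngpu(ngpu: int) -> str:
    # Illustrative helper: the synthesize/inference scripts inline this check.
    if ngpu == 0:
        return "cpu"
    if ngpu > 0:
        return "gpu"
    # The scripts only print "ngpu should >= 0 !" here; raising is stricter.
    raise ValueError("ngpu should be >= 0")


if __name__ == "__main__":
    paddle.set_device(device_from_ngpu(0))  # e.g. force CPU synthesis
```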
diff --git a/examples/vctk/voc1/local/train.sh b/examples/vctk/voc1/local/train.sh index 1ef860c3..9695631e 100755 --- a/examples/vctk/voc1/local/train.sh +++ b/examples/vctk/voc1/local/train.sh @@ -10,4 +10,4 @@ python ${BIN_DIR}/train.py \ --dev-metadata=dump/dev/norm/metadata.jsonl \ --config=${config_path} \ --output-dir=${train_output_path} \ - --nprocs=1 + --ngpu=1 diff --git a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py index 98cf9f8f..a90658c6 100644 --- a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py +++ b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py @@ -145,12 +145,17 @@ def main(): help="text to synthesize, a 'utt_id sentence' pair per line.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--device", type=str, default="gpu", help="device type to use.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") with open(args.fastspeech2_config) as f: fastspeech2_config = CfgNode(yaml.safe_load(f)) diff --git a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e_en.py b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e_en.py index 9e29eea1..b5d0ce17 100644 --- a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e_en.py +++ b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e_en.py @@ -154,12 +154,17 @@ def main(): help="text to synthesize, a 'utt_id sentence' pair per line.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--device", type=str, default="gpu", help="device type to use.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") with open(args.fastspeech2_config) as f: fastspeech2_config = CfgNode(yaml.safe_load(f)) diff --git a/paddlespeech/t2s/exps/fastspeech2/synthesize.py b/paddlespeech/t2s/exps/fastspeech2/synthesize.py index 1beac5ce..207275f9 100644 --- a/paddlespeech/t2s/exps/fastspeech2/synthesize.py +++ b/paddlespeech/t2s/exps/fastspeech2/synthesize.py @@ -145,12 +145,16 @@ def main(): parser.add_argument("--test-metadata", type=str, help="test metadata.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--device", type=str, default="gpu", help="device type to use.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() - - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") with open(args.fastspeech2_config) as f: fastspeech2_config = CfgNode(yaml.safe_load(f)) diff --git a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py index b6a8fc58..ff9a41ea 100644 --- a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py @@ -155,12 
+155,17 @@ def main(): parser.add_argument( "--inference-dir", type=str, help="dir to save inference models") parser.add_argument( - "--device", type=str, default="gpu", help="device type to use.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") with open(args.fastspeech2_config) as f: fastspeech2_config = CfgNode(yaml.safe_load(f)) diff --git a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_en.py b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_en.py index 7a55fbb1..6e3434a7 100644 --- a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_en.py +++ b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_en.py @@ -145,12 +145,17 @@ def main(): help="text to synthesize, a 'utt_id sentence' pair per line.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--device", type=str, default="gpu", help="device type to use.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") with open(args.fastspeech2_config) as f: fastspeech2_config = CfgNode(yaml.safe_load(f)) diff --git a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py index 92a43d5c..f0ff5655 100644 --- a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py +++ b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py @@ -160,12 +160,17 @@ def main(): parser.add_argument( "--inference-dir", type=str, help="dir to save inference models") parser.add_argument( - "--device", type=str, default="gpu", help="device type to use.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") with open(args.fastspeech2_config) as f: fastspeech2_config = CfgNode(yaml.safe_load(f)) diff --git a/paddlespeech/t2s/exps/fastspeech2/train.py b/paddlespeech/t2s/exps/fastspeech2/train.py index 5662d15d..38ac2fe3 100644 --- a/paddlespeech/t2s/exps/fastspeech2/train.py +++ b/paddlespeech/t2s/exps/fastspeech2/train.py @@ -43,7 +43,7 @@ from paddlespeech.t2s.training.trainer import Trainer def train_sp(args, config): # decides device type and whether to run in parallel # setup running environment correctly - if not paddle.is_compiled_with_cuda(): + if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0: paddle.set_device("cpu") else: paddle.set_device("gpu") @@ -174,9 +174,7 @@ def main(): parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--device", type=str, default="gpu", help="device type to use.") - parser.add_argument( - "--nprocs", type=int, default=1, help="number of processes.") + "--ngpu", type=int, default=1, help="if ngpu=0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") 
parser.add_argument( "--phones-dict", type=str, default=None, help="phone vocabulary file.") @@ -187,8 +185,6 @@ def main(): help="speaker id map file for multiple speaker model.") args = parser.parse_args() - if args.device == "cpu" and args.nprocs > 1: - raise RuntimeError("Multiprocess training on CPU is not supported.") with open(args.config) as f: config = CfgNode(yaml.safe_load(f)) @@ -202,8 +198,8 @@ def main(): ) # dispatch - if args.nprocs > 1: - dist.spawn(train_sp, (args, config), nprocs=args.nprocs) + if args.ngpu > 1: + dist.spawn(train_sp, (args, config), nprocs=args.ngpu) else: train_sp(args, config) diff --git a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/synthesize.py b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/synthesize.py index 720b08ce..9ea76a83 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/synthesize.py +++ b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/synthesize.py @@ -37,7 +37,7 @@ def main(): parser.add_argument("--test-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--device", type=str, default="gpu", help="device to run.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() @@ -53,7 +53,12 @@ def main(): f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}" ) - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") generator = MelGANGenerator(**config["generator_params"]) state_dict = paddle.load(args.checkpoint) generator.set_state_dict(state_dict["generator_params"]) diff --git a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py index 45704607..ca3c0a1f 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py @@ -50,7 +50,7 @@ def train_sp(args, config): # decides device type and whether to run in parallel # setup running environment correctly world_size = paddle.distributed.get_world_size() - if not paddle.is_compiled_with_cuda(): + if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0: paddle.set_device("cpu") else: paddle.set_device("gpu") @@ -238,14 +238,10 @@ def main(): parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--device", type=str, default="gpu", help="device type to use.") - parser.add_argument( - "--nprocs", type=int, default=1, help="number of processes.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() - if args.device == "cpu" and args.nprocs > 1: - raise RuntimeError("Multiprocess training on CPU is not supported.") with open(args.config, 'rt') as f: config = CfgNode(yaml.safe_load(f)) @@ -259,8 +255,8 @@ def main(): ) # dispatch - if args.nprocs > 1: - dist.spawn(train_sp, (args, config), nprocs=args.nprocs) + if args.ngpu > 1: + dist.spawn(train_sp, (args, config), nprocs=args.ngpu) else: train_sp(args, config) diff --git a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize.py b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize.py index ce90aaf4..f275ed44 100644 --- 
a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize.py +++ b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize.py @@ -37,7 +37,7 @@ def main(): parser.add_argument("--test-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--device", type=str, default="gpu", help="device to run.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() @@ -53,7 +53,12 @@ def main(): f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}" ) - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") generator = PWGGenerator(**config["generator_params"]) state_dict = paddle.load(args.checkpoint) generator.set_state_dict(state_dict["generator_params"]) diff --git a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py index a04a547e..ca2e3f55 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py +++ b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py @@ -92,12 +92,17 @@ def main(): parser.add_argument("--input-dir", type=str, help="input dir of wavs.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--device", type=str, default="gpu", help="device to run.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") with open(args.config) as f: config = CfgNode(yaml.safe_load(f)) diff --git a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py index 99801267..42ef8830 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py @@ -47,7 +47,7 @@ def train_sp(args, config): # decides device type and whether to run in parallel # setup running environment correctly world_size = paddle.distributed.get_world_size() - if not paddle.is_compiled_with_cuda(): + if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0: paddle.set_device("cpu") else: paddle.set_device("gpu") @@ -215,9 +215,7 @@ def main(): parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--device", type=str, default="gpu", help="device type to use.") - parser.add_argument( - "--nprocs", type=int, default=1, help="number of processes.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") benchmark_group = parser.add_argument_group( @@ -241,8 +239,6 @@ def main(): ) args = parser.parse_args() - if args.device == "cpu" and args.nprocs > 1: - raise RuntimeError("Multiprocess training on CPU is not supported.") with open(args.config, 'rt') as f: config = CfgNode(yaml.safe_load(f)) @@ -261,8 +257,8 @@ def main(): ) # dispatch - if args.nprocs > 1: - dist.spawn(train_sp, (args, config), nprocs=args.nprocs) + if args.ngpu > 1: + 
dist.spawn(train_sp, (args, config), nprocs=args.ngpu) else: train_sp(args, config) diff --git a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py index 852b0c91..782fbdf2 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py +++ b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py @@ -226,8 +226,22 @@ def main(): test_wav_files += wav_files[-sub_num_dev:] else: train_wav_files += wav_files + elif args.dataset == "aishell3": + sub_num_dev = 5 + wav_dir = rootdir / "train" / "wav" + train_wav_files = [] + dev_wav_files = [] + test_wav_files = [] + for speaker in os.listdir(wav_dir): + wav_files = sorted(list((wav_dir / speaker).rglob("*.wav"))) + if len(wav_files) > 100: + train_wav_files += wav_files[:-sub_num_dev * 2] + dev_wav_files += wav_files[-sub_num_dev * 2:-sub_num_dev] + test_wav_files += wav_files[-sub_num_dev:] + else: + train_wav_files += wav_files else: - print("dataset should in {baker, ljspeech, vctk} now!") + print("dataset should in {baker, ljspeech, vctk, aishell3} now!") train_dump_dir = dumpdir / "train" / "raw" train_dump_dir.mkdir(parents=True, exist_ok=True) diff --git a/paddlespeech/t2s/exps/ge2e/inference.py b/paddlespeech/t2s/exps/ge2e/inference.py index a5733941..eed3b794 100644 --- a/paddlespeech/t2s/exps/ge2e/inference.py +++ b/paddlespeech/t2s/exps/ge2e/inference.py @@ -51,7 +51,13 @@ def _process_utterance(ifpath: Path, def main(config, args): - paddle.set_device(args.device) + + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") # load model model = LSTMSpeakerEncoder(config.data.n_mels, config.model.num_layers, @@ -112,13 +118,6 @@ if __name__ == "__main__": parser.add_argument( "--checkpoint_path", type=str, help="path of the checkpoint to load") - # running - parser.add_argument( - "--device", - type=str, - choices=["cpu", "gpu"], - help="device type to use, cpu and gpu are supported.") - # overwrite extra config and default config parser.add_argument( "--opts", @@ -126,6 +125,9 @@ if __name__ == "__main__": help="options to overwrite --config file and the default config, passing in KEY VALUE pairs" ) + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu=0, use cpu.") + args = parser.parse_args() if args.config: config.merge_from_file(args.config) diff --git a/paddlespeech/t2s/exps/ge2e/train.py b/paddlespeech/t2s/exps/ge2e/train.py index d3a57c93..55c6daf7 100644 --- a/paddlespeech/t2s/exps/ge2e/train.py +++ b/paddlespeech/t2s/exps/ge2e/train.py @@ -102,8 +102,8 @@ def main_sp(config, args): def main(config, args): - if args.nprocs > 1 and args.device == "gpu": - dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) + if args.ngpu > 1: + dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) else: main_sp(config, args) diff --git a/paddlespeech/t2s/exps/speedyspeech/inference.py b/paddlespeech/t2s/exps/speedyspeech/inference.py index 49ce37f2..617848c5 100644 --- a/paddlespeech/t2s/exps/speedyspeech/inference.py +++ b/paddlespeech/t2s/exps/speedyspeech/inference.py @@ -96,10 +96,10 @@ def main(): input_ids = frontend.get_input_ids( sentence, merge_sentences=True, get_tone_ids=True) - phone_ids = input_ids["phone_ids"].numpy() - tone_ids = input_ids["tone_ids"].numpy() - phones = phone_ids[0] - tones = tone_ids[0] + phone_ids = input_ids["phone_ids"] + tone_ids = input_ids["tone_ids"] + phones = phone_ids[0].numpy() + tones = tone_ids[0].numpy() if args.enable_auto_log: logger.times.stamp() diff 
--git a/paddlespeech/t2s/exps/speedyspeech/synthesize.py b/paddlespeech/t2s/exps/speedyspeech/synthesize.py index 4482c179..67d56ea5 100644 --- a/paddlespeech/t2s/exps/speedyspeech/synthesize.py +++ b/paddlespeech/t2s/exps/speedyspeech/synthesize.py @@ -155,12 +155,17 @@ def main(): parser.add_argument( "--inference-dir", type=str, help="dir to save inference models") parser.add_argument( - "--device", type=str, default="gpu", help="device type to use") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose") args, _ = parser.parse_known_args() - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") with open(args.speedyspeech_config) as f: speedyspeech_config = CfgNode(yaml.safe_load(f)) diff --git a/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py b/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py index 0870d466..0e64088d 100644 --- a/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py @@ -170,13 +170,18 @@ def main(): parser.add_argument("--output-dir", type=str, help="output dir") parser.add_argument( "--inference-dir", type=str, help="dir to save inference models") - parser.add_argument( - "--device", type=str, default="gpu", help="device type to use") parser.add_argument("--verbose", type=int, default=1, help="verbose") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") args, _ = parser.parse_known_args() - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") with open(args.speedyspeech_config) as f: speedyspeech_config = CfgNode(yaml.safe_load(f)) diff --git a/paddlespeech/t2s/exps/speedyspeech/train.py b/paddlespeech/t2s/exps/speedyspeech/train.py index 772a39d7..d9a2fbf4 100644 --- a/paddlespeech/t2s/exps/speedyspeech/train.py +++ b/paddlespeech/t2s/exps/speedyspeech/train.py @@ -43,7 +43,7 @@ def train_sp(args, config): # decides device type and whether to run in parallel # setup running environment correctly world_size = paddle.distributed.get_world_size() - if not paddle.is_compiled_with_cuda(): + if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0: paddle.set_device("cpu") else: paddle.set_device("gpu") @@ -167,9 +167,7 @@ def main(): parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--device", type=str, default="gpu", help="device type to use.") - parser.add_argument( - "--nprocs", type=int, default=1, help="number of processes.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") def str2bool(str): @@ -189,8 +187,7 @@ def main(): # 这里可以多传入 max_epoch 等 args, rest = parser.parse_known_args() - if args.device == "cpu" and args.nprocs > 1: - raise RuntimeError("Multiprocess training on CPU is not supported.") + with open(args.config) as f: config = CfgNode(yaml.safe_load(f)) @@ -212,8 +209,8 @@ def main(): ) # dispatch - if args.nprocs > 1: - dist.spawn(train_sp, (args, config), nprocs=args.nprocs) + if args.ngpu > 1: + dist.spawn(train_sp, (args, config), nprocs=args.ngpu) else: train_sp(args, config) diff --git a/paddlespeech/t2s/exps/tacotron2/ljspeech.py 
b/paddlespeech/t2s/exps/tacotron2/ljspeech.py index 08db2a64..4facde40 100644 --- a/paddlespeech/t2s/exps/tacotron2/ljspeech.py +++ b/paddlespeech/t2s/exps/tacotron2/ljspeech.py @@ -67,16 +67,19 @@ class LJSpeechCollector(object): # Sort by text_len in descending order texts = [ - i for i, _ in sorted( + i + for i, _ in sorted( zip(texts, text_lens), key=lambda x: x[1], reverse=True) ] mels = [ - i for i, _ in sorted( + i + for i, _ in sorted( zip(mels, text_lens), key=lambda x: x[1], reverse=True) ] mel_lens = [ - i for i, _ in sorted( + i + for i, _ in sorted( zip(mel_lens, text_lens), key=lambda x: x[1], reverse=True) ] diff --git a/paddlespeech/t2s/exps/tacotron2/synthesize.py b/paddlespeech/t2s/exps/tacotron2/synthesize.py index 613fec02..c73c32d2 100644 --- a/paddlespeech/t2s/exps/tacotron2/synthesize.py +++ b/paddlespeech/t2s/exps/tacotron2/synthesize.py @@ -25,7 +25,12 @@ from paddlespeech.t2s.utils import display def main(config, args): - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") # model frontend = EnglishCharacter() @@ -77,7 +82,7 @@ if __name__ == "__main__": parser.add_argument("--input", type=str, help="path of the text sentences") parser.add_argument("--output", type=str, help="path to save outputs") parser.add_argument( - "--device", type=str, default="cpu", help="device type to use.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument( "--opts", nargs=argparse.REMAINDER, diff --git a/paddlespeech/t2s/exps/tacotron2/train.py b/paddlespeech/t2s/exps/tacotron2/train.py index a5f08360..8198348f 100644 --- a/paddlespeech/t2s/exps/tacotron2/train.py +++ b/paddlespeech/t2s/exps/tacotron2/train.py @@ -199,8 +199,8 @@ def main_sp(config, args): def main(config, args): - if args.nprocs > 1 and args.device == "gpu": - dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) + if args.ngpu > 1: + dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) else: main_sp(config, args) diff --git a/paddlespeech/t2s/exps/transformer_tts/synthesize.py b/paddlespeech/t2s/exps/transformer_tts/synthesize.py index 82fd8f15..666c3b72 100644 --- a/paddlespeech/t2s/exps/transformer_tts/synthesize.py +++ b/paddlespeech/t2s/exps/transformer_tts/synthesize.py @@ -117,12 +117,17 @@ def main(): parser.add_argument("--test-metadata", type=str, help="test metadata.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--device", type=str, default="gpu", help="device type to use.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") with open(args.transformer_tts_config) as f: transformer_tts_config = CfgNode(yaml.safe_load(f)) diff --git a/paddlespeech/t2s/exps/transformer_tts/synthesize_e2e.py b/paddlespeech/t2s/exps/transformer_tts/synthesize_e2e.py index 993749f0..ba197f43 100644 --- a/paddlespeech/t2s/exps/transformer_tts/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/transformer_tts/synthesize_e2e.py @@ -136,12 +136,17 @@ def main(): help="text to synthesize, a 'utt_id sentence' pair per line.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--device", type=str, default="gpu", help="device 
type to use.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") with open(args.transformer_tts_config) as f: transformer_tts_config = CfgNode(yaml.safe_load(f)) diff --git a/paddlespeech/t2s/exps/transformer_tts/train.py b/paddlespeech/t2s/exps/transformer_tts/train.py index 7d9020a3..163339f4 100644 --- a/paddlespeech/t2s/exps/transformer_tts/train.py +++ b/paddlespeech/t2s/exps/transformer_tts/train.py @@ -42,7 +42,7 @@ from paddlespeech.t2s.training.trainer import Trainer def train_sp(args, config): # decides device type and whether to run in parallel # setup running environment correctly - if not paddle.is_compiled_with_cuda(): + if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0: paddle.set_device("cpu") else: paddle.set_device("gpu") @@ -164,16 +164,12 @@ def main(): parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--device", type=str, default="gpu", help="device type to use.") - parser.add_argument( - "--nprocs", type=int, default=1, help="number of processes.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") parser.add_argument( "--phones-dict", type=str, default=None, help="phone vocabulary file.") args = parser.parse_args() - if args.device == "cpu" and args.nprocs > 1: - raise RuntimeError("Multiprocess training on CPU is not supported.") with open(args.config) as f: config = CfgNode(yaml.safe_load(f)) @@ -187,8 +183,8 @@ def main(): ) # dispatch - if args.nprocs > 1: - dist.spawn(train_sp, (args, config), nprocs=args.nprocs) + if args.ngpu > 1: + dist.spawn(train_sp, (args, config), nprocs=args.ngpu) else: train_sp(args, config) diff --git a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/train.py b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/train.py index ceae1360..34660c75 100644 --- a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/train.py +++ b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/train.py @@ -241,8 +241,8 @@ def main_sp(config, args): def main(config, args): - if args.nprocs > 1 and args.device == "gpu": - dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) + if args.gpus: + dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) else: main_sp(config, args) diff --git a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/voice_cloning.py b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/voice_cloning.py index c76ce007..2f005e72 100644 --- a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/voice_cloning.py +++ b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/voice_cloning.py @@ -140,8 +140,9 @@ def main(): "--tacotron2_params_path", type=str, help="tacotron2 params path.") parser.add_argument( "--waveflow_params_path", type=str, help="waveflow params path.") + parser.add_argument( - "--device", type=str, default="gpu", help="device type to use.") + "--ngpu", type=int, default=1, help="if ngpu=0, use cpu.") parser.add_argument( "--input-dir", @@ -151,7 +152,12 @@ def main(): args = parser.parse_args() - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu 
should >= 0 !") voice_cloning(args) diff --git a/paddlespeech/t2s/exps/waveflow/synthesize.py b/paddlespeech/t2s/exps/waveflow/synthesize.py index 4f07aa4e..53715b01 100644 --- a/paddlespeech/t2s/exps/waveflow/synthesize.py +++ b/paddlespeech/t2s/exps/waveflow/synthesize.py @@ -25,7 +25,13 @@ from paddlespeech.t2s.utils import layer_tools def main(config, args): - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") + model = ConditionalWaveFlow.from_pretrained(config, args.checkpoint_path) layer_tools.recursively_remove_weight_norm(model) model.eval() @@ -60,7 +66,7 @@ if __name__ == "__main__": help="path of directory containing mel spectrogram (in .npy format)") parser.add_argument("--output", type=str, help="path to save outputs") parser.add_argument( - "--device", type=str, default="cpu", help="device type to use.") + "--ngpu", type=int, default=1, help="if ngpu=0, use cpu.") parser.add_argument( "--opts", nargs=argparse.REMAINDER, diff --git a/paddlespeech/t2s/exps/waveflow/train.py b/paddlespeech/t2s/exps/waveflow/train.py index 9d1df13c..d500336a 100644 --- a/paddlespeech/t2s/exps/waveflow/train.py +++ b/paddlespeech/t2s/exps/waveflow/train.py @@ -139,8 +139,8 @@ def main_sp(config, args): def main(config, args): - if args.nprocs > 1 and args.device == "gpu": - dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) + if args.ngpu > 1: + dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) else: main_sp(config, args) diff --git a/paddlespeech/t2s/frontend/pinyin.py b/paddlespeech/t2s/frontend/pinyin.py deleted file mode 100644 index f99129ce..00000000 --- a/paddlespeech/t2s/frontend/pinyin.py +++ /dev/null @@ -1,333 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -A Simple Chinese Phonology using pinyin symbols. -The G2P conversion converts pinyin string to symbols. Also it can handle string -in Chinese chracters, but due to the complexity of chinese G2P, we can leave -text -> pinyin to other part of a TTS system. Other NLP techniques may be used -(e.g. tokenization, tagging, NER...) 
-""" -import re -from itertools import product - -from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin -from pypinyin.core import DefaultConverter -from pypinyin.core import Pinyin -from pypinyin.core import Style - -from paddlespeech.t2s.frontend.phonectic import Phonetics -from paddlespeech.t2s.frontend.vocab import Vocab - -_punctuations = [',', '。', '?', '!'] -_initials = [ - 'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'j', 'q', 'x', 'zh', - 'ch', 'sh', 'r', 'z', 'c', 's' -] -_finals = [ - 'ii', 'iii', 'a', 'o', 'e', 'ea', 'ai', 'ei', 'ao', 'ou', 'an', 'en', 'ang', - 'eng', 'er', 'i', 'ia', 'io', 'ie', 'iai', 'iao', 'iou', 'ian', 'ien', - 'iang', 'ieng', 'u', 'ua', 'uo', 'uai', 'uei', 'uan', 'uen', 'uang', 'ueng', - 'v', 've', 'van', 'ven', 'veng' -] -_ernized_symbol = ['&r'] -_phones = _initials + _finals + _ernized_symbol + _punctuations -_tones = ['0', '1', '2', '3', '4', '5'] - -_toned_finals = [final + tone for final, tone in product(_finals, _tones[1:])] -_toned_phonems = _initials + _toned_finals + _ernized_symbol + _punctuations - - -class ParakeetConverter(NeutralToneWith5Mixin, DefaultConverter): - pass - - -class ParakeetPinyin(Phonetics): - def __init__(self): - self.vocab_phonemes = Vocab(_phones) - self.vocab_tones = Vocab(_tones) - self.pinyin_backend = Pinyin(ParakeetConverter()) - - def convert_pypinyin_tone3(self, syllables, add_start_end=False): - phonemes, tones = _convert_to_parakeet_style_pinyin(syllables) - - if add_start_end: - start = self.vocab_phonemes.start_symbol - end = self.vocab_phonemes.end_symbol - phonemes = [start] + phonemes + [end] - - start = self.vocab_tones.start_symbol - end = self.vocab_tones.end_symbol - phonemes = [start] + tones + [end] - - phonemes = [ - item for item in phonemes if item in self.vocab_phonemes.stoi - ] - tones = [item for item in tones if item in self.vocab_tones.stoi] - return phonemes, tones - - def phoneticize(self, sentence, add_start_end=False): - """ Normalize the input text sequence and convert it into pronunciation sequence. - - Parameters - ----------- - sentence: str - The input text sequence. - - Returns - ---------- - List[str] - The list of pronunciation sequence. - """ - syllables = self.pinyin_backend.lazy_pinyin( - sentence, style=Style.TONE3, strict=True) - phonemes, tones = self.convert_pypinyin_tone3( - syllables, add_start_end=add_start_end) - return phonemes, tones - - def numericalize(self, phonemes, tones): - """ Convert pronunciation sequence into pronunciation id sequence. - - Parameters - ----------- - phonemes: List[str] - The list of pronunciation sequence. - - Returns - ---------- - List[int] - The list of pronunciation id sequence. - """ - phoneme_ids = [self.vocab_phonemes.lookup(item) for item in phonemes] - tone_ids = [self.vocab_tones.lookup(item) for item in tones] - return phoneme_ids, tone_ids - - def __call__(self, sentence, add_start_end=False): - """ Convert the input text sequence into pronunciation id sequence. - - Parameters - ----------- - sentence: str - The input text sequence. - - Returns - ---------- - List[str] - The list of pronunciation id sequence. - """ - phonemes, tones = self.phoneticize( - sentence, add_start_end=add_start_end) - phoneme_ids, tone_ids = self.numericalize(phonemes, tones) - return phoneme_ids, tone_ids - - @property - def vocab_size(self): - """ Vocab size. 
- """ - # 70 = 62 phones + 4 punctuations + 4 special tokens - return len(self.vocab_phonemes) - - @property - def tone_vocab_size(self): - # 10 = 1 non tone + 5 tone + 4 special tokens - return len(self.vocab_tones) - - -class ParakeetPinyinWithTone(Phonetics): - def __init__(self): - self.vocab = Vocab(_toned_phonems) - self.pinyin_backend = Pinyin(ParakeetConverter()) - - def convert_pypinyin_tone3(self, syllables, add_start_end=False): - phonemes = _convert_to_parakeet_style_pinyin_with_tone(syllables) - - if add_start_end: - start = self.vocab_phonemes.start_symbol - end = self.vocab_phonemes.end_symbol - phonemes = [start] + phonemes + [end] - - phonemes = [item for item in phonemes if item in self.vocab.stoi] - return phonemes - - def phoneticize(self, sentence, add_start_end=False): - """ Normalize the input text sequence and convert it into pronunciation sequence. - - Parameters - ----------- - sentence: str - The input text sequence. - - Returns - ---------- - List[str] - The list of pronunciation sequence. - """ - syllables = self.pinyin_backend.lazy_pinyin( - sentence, style=Style.TONE3, strict=True) - phonemes = self.convert_pypinyin_tone3( - syllables, add_start_end=add_start_end) - return phonemes - - def numericalize(self, phonemes): - """ Convert pronunciation sequence into pronunciation id sequence. - - Parameters - ----------- - phonemes: List[str] - The list of pronunciation sequence. - - Returns - ---------- - List[int] - The list of pronunciation id sequence. - """ - phoneme_ids = [self.vocab.lookup(item) for item in phonemes] - return phoneme_ids - - def __call__(self, sentence, add_start_end=False): - """ Convert the input text sequence into pronunciation id sequence. - - Parameters - ----------- - sentence: str - The input text sequence. - - Returns - ---------- - List[str] - The list of pronunciation id sequence. - """ - phonemes = self.phoneticize(sentence, add_start_end=add_start_end) - phoneme_ids = self.numericalize(phonemes) - return phoneme_ids - - @property - def vocab_size(self): - """ Vocab size. 
- """ - # 230 = 222 phones + 4 punctuations + 4 special tokens - return len(self.vocab) - - -def _convert_to_parakeet_convension(syllable): - # from pypinyin.Style.TONE3 to parakeet convension - tone = syllable[-1] - syllable = syllable[:-1] - - # expansion of o -> uo - syllable = re.sub(r"([bpmf])o$", r"\1uo", syllable) - - # expansion for iong, ong - syllable = syllable.replace("iong", "veng").replace("ong", "ueng") - - # expansion for ing, in - syllable = syllable.replace("ing", "ieng").replace("in", "ien") - - # expansion for un, ui, iu - syllable = syllable.replace("un", "uen") \ - .replace("ui", "uei") \ - .replace("iu", "iou") - - # rule for variants of i - syllable = syllable.replace("zi", "zii") \ - .replace("ci", "cii") \ - .replace("si", "sii") \ - .replace("zhi", "zhiii") \ - .replace("chi", "chiii") \ - .replace("shi", "shiii") \ - .replace("ri", "riii") - - # rule for y preceding i, u - syllable = syllable.replace("yi", "i").replace("yu", "v").replace("y", "i") - - # rule for w - syllable = syllable.replace("wu", "u").replace("w", "u") - - # rule for v following j, q, x - syllable = syllable.replace("ju", "jv") \ - .replace("qu", "qv") \ - .replace("xu", "xv") - - return syllable + tone - - -def _split_syllable(syllable: str): - global _punctuations - - if syllable in _punctuations: - # syllables, tones - return [syllable], ['0'] - - syllable = _convert_to_parakeet_convension(syllable) - - tone = syllable[-1] - syllable = syllable[:-1] - - phones = [] - tones = [] - - global _initials - if syllable[:2] in _initials: - phones.append(syllable[:2]) - tones.append('0') - phones.append(syllable[2:]) - tones.append(tone) - elif syllable[0] in _initials: - phones.append(syllable[0]) - tones.append('0') - phones.append(syllable[1:]) - tones.append(tone) - else: - phones.append(syllable) - tones.append(tone) - return phones, tones - - -def _convert_to_parakeet_style_pinyin(syllables): - phones, tones = [], [] - for syllable in syllables: - p, t = _split_syllable(syllable) - phones.extend(p) - tones.extend(t) - return phones, tones - - -def _split_syllable_with_tone(syllable: str): - global _punctuations - - if syllable in _punctuations: - # syllables - return [syllable] - - syllable = _convert_to_parakeet_convension(syllable) - - phones = [] - - global _initials - if syllable[:2] in _initials: - phones.append(syllable[:2]) - phones.append(syllable[2:]) - elif syllable[0] in _initials: - phones.append(syllable[0]) - phones.append(syllable[1:]) - else: - phones.append(syllable) - return phones - - -def _convert_to_parakeet_style_pinyin_with_tone(syllables): - phones = [] - for syllable in syllables: - p = _split_syllable_with_tone(syllable) - phones.extend(p) - return phones diff --git a/paddlespeech/t2s/training/cli.py b/paddlespeech/t2s/training/cli.py index 3b9fd42e..a0710fd7 100644 --- a/paddlespeech/t2s/training/cli.py +++ b/paddlespeech/t2s/training/cli.py @@ -53,8 +53,7 @@ def default_argument_parser(): parser.add_argument("--checkpoint_path", type=str, help="path of the checkpoint to load") # running - parser.add_argument("--device", type=str, choices=["cpu", "gpu"], help="device type to use, cpu and gpu are supported.") - parser.add_argument("--nprocs", type=int, default=1, help="number of parallel processes to use.") + parser.add_argument("--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") # overwrite extra config and default config parser.add_argument("--opts", nargs=argparse.REMAINDER, help="options to overwrite --config file and the default config, passing in KEY 
diff --git a/paddlespeech/t2s/training/experiment.py b/paddlespeech/t2s/training/experiment.py
index 7a6a7e99..c9e7f4cc 100644
--- a/paddlespeech/t2s/training/experiment.py
+++ b/paddlespeech/t2s/training/experiment.py
@@ -107,7 +107,12 @@ class ExperimentBase(object):
     def setup(self):
         """Setup the experiment.
         """
-        paddle.set_device(self.args.device)
+        if self.args.ngpu == 0:
+            paddle.set_device("cpu")
+        elif self.args.ngpu > 0:
+            paddle.set_device("gpu")
+        else:
+            print("ngpu should >= 0 !")
         if self.parallel:
             self.init_parallel()

@@ -128,7 +133,7 @@
         """A flag indicating whether the experiment should run with
         multiprocessing.
         """
-        return self.args.device == "gpu" and self.args.nprocs > 1
+        return self.args.ngpu > 1

     def init_parallel(self):
         """Init environment for multiprocess training.
diff --git a/tests/benchmark/pwgan/run_benchmark.sh b/tests/benchmark/pwgan/run_benchmark.sh
index be5733da..e60d8798 100755
--- a/tests/benchmark/pwgan/run_benchmark.sh
+++ b/tests/benchmark/pwgan/run_benchmark.sh
@@ -29,8 +29,8 @@ function _train(){
               --run-benchmark=true"

    case ${run_mode} in
-    sp) train_cmd="python3 paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py --nprocs=1 ${train_cmd}" ;;
-    mp) train_cmd="python3 paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py --nprocs=8 ${train_cmd}"
+    sp) train_cmd="python3 paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py --ngpu=1 ${train_cmd}" ;;
+    mp) train_cmd="python3 paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py --ngpu=8 ${train_cmd}"
        log_parse_file="mylog/workerlog.0" ;;
    *) echo "choose run_mode(sp or mp)"; exit 1;
    esac
diff --git a/tests/chains/speedyspeech/test.sh b/tests/chains/speedyspeech/test.sh
index f4441335..ccabc319 100755
--- a/tests/chains/speedyspeech/test.sh
+++ b/tests/chains/speedyspeech/test.sh
@@ -324,7 +324,7 @@ else
            gsu=${gpu//,/ }
            nump=`echo $gsu | wc -w`
            CUDA_VISIBLE_DEVICES=${gpu}
-            cmd="${python} ${run_train} --nprocs=$nump"
+            cmd="${python} ${run_train} --ngpu=$nump"
        else    # train with multi-machine
            cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_autocast} ${set_batchsize} ${set_train_params1}"
        fi