From dd36eafe34b48e9a4d5ae09c5d8280581e10c667 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Thu, 2 Dec 2021 10:05:40 +0000 Subject: [PATCH 01/50] add style_melgan --- examples/aishell3/voc1/conf/default.yaml | 4 +- examples/aishell3/voc1/local/synthesize.sh | 5 +- examples/csmsc/README.md | 1 + examples/csmsc/voc1/conf/default.yaml | 18 +- examples/csmsc/voc1/local/synthesize.sh | 5 +- examples/csmsc/voc3/conf/default.yaml | 13 +- examples/csmsc/voc3/conf/finetune.yaml | 13 +- examples/csmsc/voc3/local/synthesize.sh | 5 +- examples/csmsc/voc4/conf/default.yaml | 136 ++++++ examples/csmsc/voc4/local/preprocess.sh | 55 +++ examples/csmsc/voc4/local/synthesize.sh | 14 + examples/csmsc/voc4/local/train.sh | 13 + examples/csmsc/voc4/path.sh | 13 + examples/csmsc/voc4/run.sh | 32 ++ examples/ljspeech/voc1/conf/default.yaml | 22 +- examples/ljspeech/voc1/local/synthesize.sh | 5 +- examples/vctk/voc1/conf/default.yaml | 4 +- examples/vctk/voc1/local/synthesize.sh | 5 +- .../parallelwave_gan/synthesize.py | 103 ----- .../exps/gan_vocoder/style_melgan/__init__.py | 13 + .../exps/gan_vocoder/style_melgan/train.py | 258 +++++++++++ .../{multi_band_melgan => }/synthesize.py | 32 +- paddlespeech/t2s/models/__init__.py | 1 + paddlespeech/t2s/models/melgan/__init__.py | 2 + paddlespeech/t2s/models/melgan/melgan.py | 52 ++- .../melgan/multi_band_melgan_updater.py | 13 +- .../t2s/models/melgan/style_melgan.py | 404 ++++++++++++++++++ .../t2s/models/melgan/style_melgan_updater.py | 221 ++++++++++ .../parallel_wavegan_updater.py | 16 +- paddlespeech/t2s/modules/activation.py | 5 +- paddlespeech/t2s/modules/residual_stack.py | 22 +- paddlespeech/t2s/modules/style_encoder.py | 4 +- paddlespeech/t2s/modules/tade_res_block.py | 164 +++++++ 33 files changed, 1472 insertions(+), 201 deletions(-) create mode 100644 examples/csmsc/voc4/conf/default.yaml create mode 100755 examples/csmsc/voc4/local/preprocess.sh create mode 100755 examples/csmsc/voc4/local/synthesize.sh create mode 100755 examples/csmsc/voc4/local/train.sh create mode 100755 examples/csmsc/voc4/path.sh create mode 100755 examples/csmsc/voc4/run.sh delete mode 100644 paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize.py create mode 100644 paddlespeech/t2s/exps/gan_vocoder/style_melgan/__init__.py create mode 100644 paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py rename paddlespeech/t2s/exps/gan_vocoder/{multi_band_melgan => }/synthesize.py (79%) create mode 100644 paddlespeech/t2s/models/melgan/style_melgan.py create mode 100644 paddlespeech/t2s/models/melgan/style_melgan_updater.py create mode 100644 paddlespeech/t2s/modules/tade_res_block.py diff --git a/examples/aishell3/voc1/conf/default.yaml b/examples/aishell3/voc1/conf/default.yaml index ba2d9f2e..eb6d350d 100644 --- a/examples/aishell3/voc1/conf/default.yaml +++ b/examples/aishell3/voc1/conf/default.yaml @@ -35,7 +35,7 @@ generator_params: dropout: 0.0 # Dropout rate. 0.0 means no dropout applied. use_weight_norm: true # Whether to use weight norm. # If set to true, it will be applied to all of the conv layers. - upsample_scales: [4, 5, 3, 5] # Upsampling scales. Prodcut of these must be the same as hop size. + upsample_scales: [4, 5, 3, 5] # Upsampling scales. prod(upsample_scales) == n_shift ########################################################### # DISCRIMINATOR NETWORK ARCHITECTURE SETTING # @@ -71,7 +71,7 @@ lambda_adv: 4.0 # Loss balancing coefficient. # DATA LOADER SETTING # ########################################################### batch_size: 8 # Batch size. 
-batch_max_steps: 24000 # Length of each audio in batch. Make sure dividable by hop_size. +batch_max_steps: 24000 # Length of each audio in batch. Make sure dividable by n_shift. pin_memory: true # Whether to pin memory in Pytorch DataLoader. num_workers: 4 # Number of workers in Pytorch DataLoader. remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps. diff --git a/examples/aishell3/voc1/local/synthesize.sh b/examples/aishell3/voc1/local/synthesize.sh index 9f904ac0..d85d1b1d 100755 --- a/examples/aishell3/voc1/local/synthesize.sh +++ b/examples/aishell3/voc1/local/synthesize.sh @@ -6,8 +6,9 @@ ckpt_name=$3 FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/synthesize.py \ +python3 ${BIN_DIR}/../synthesize.py \ --config=${config_path} \ --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=${train_output_path}/test + --output-dir=${train_output_path}/test \ + --generator-type=pwgan diff --git a/examples/csmsc/README.md b/examples/csmsc/README.md index 08a51349..a59a06ed 100644 --- a/examples/csmsc/README.md +++ b/examples/csmsc/README.md @@ -9,3 +9,4 @@ * voc1 - Parallel WaveGAN * voc2 - MelGAN * voc3 - MultiBand MelGAN +* voc4 - Style MelGAN diff --git a/examples/csmsc/voc1/conf/default.yaml b/examples/csmsc/voc1/conf/default.yaml index 1363b454..21bdd040 100644 --- a/examples/csmsc/voc1/conf/default.yaml +++ b/examples/csmsc/voc1/conf/default.yaml @@ -78,7 +78,7 @@ lambda_adv: 4.0 # Loss balancing coefficient. # DATA LOADER SETTING # ########################################################### batch_size: 8 # Batch size. -batch_max_steps: 25500 # Length of each audio in batch. Make sure dividable by hop_size. +batch_max_steps: 25500 # Length of each audio in batch. Make sure dividable by n_shift. pin_memory: true # Whether to pin memory in Pytorch DataLoader. num_workers: 2 # Number of workers in Pytorch DataLoader. remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps. @@ -88,23 +88,23 @@ allow_cache: true # Whether to allow cache in dataset. If true, it requ # OPTIMIZER & SCHEDULER SETTING # ########################################################### generator_optimizer_params: - epsilon: 1.0e-6 # Generator's epsilon. + epsilon: 1.0e-6 # Generator's epsilon. weight_decay: 0.0 # Generator's weight decay coefficient. generator_scheduler_params: - learning_rate: 0.0001 # Generator's learning rate. + learning_rate: 0.0001 # Generator's learning rate. step_size: 200000 # Generator's scheduler step size. gamma: 0.5 # Generator's scheduler gamma. # At each step size, lr will be multiplied by this parameter. generator_grad_norm: 10 # Generator's gradient norm. discriminator_optimizer_params: epsilon: 1.0e-6 # Discriminator's epsilon. - weight_decay: 0.0 # Discriminator's weight decay coefficient. + weight_decay: 0.0 # Discriminator's weight decay coefficient. discriminator_scheduler_params: - learning_rate: 0.00005 # Discriminator's learning rate. - step_size: 200000 # Discriminator's scheduler step size. - gamma: 0.5 # Discriminator's scheduler gamma. - # At each step size, lr will be multiplied by this parameter. -discriminator_grad_norm: 1 # Discriminator's gradient norm. + learning_rate: 0.00005 # Discriminator's learning rate. + step_size: 200000 # Discriminator's scheduler step size. + gamma: 0.5 # Discriminator's scheduler gamma. 
+ # At each step size, lr will be multiplied by this parameter. +discriminator_grad_norm: 1 # Discriminator's gradient norm. ########################################################### # INTERVAL SETTING # diff --git a/examples/csmsc/voc1/local/synthesize.sh b/examples/csmsc/voc1/local/synthesize.sh index 9f904ac0..d85d1b1d 100755 --- a/examples/csmsc/voc1/local/synthesize.sh +++ b/examples/csmsc/voc1/local/synthesize.sh @@ -6,8 +6,9 @@ ckpt_name=$3 FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/synthesize.py \ +python3 ${BIN_DIR}/../synthesize.py \ --config=${config_path} \ --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=${train_output_path}/test + --output-dir=${train_output_path}/test \ + --generator-type=pwgan diff --git a/examples/csmsc/voc3/conf/default.yaml b/examples/csmsc/voc3/conf/default.yaml index cc27220f..1275f995 100644 --- a/examples/csmsc/voc3/conf/default.yaml +++ b/examples/csmsc/voc3/conf/default.yaml @@ -6,8 +6,7 @@ # This configuration is based on full-band MelGAN but the hop size and sampling # rate is different from the paper (16kHz vs 24kHz). The number of iteraions # is not shown in the paper so currently we train 1M iterations (not sure enough -# to converge). The optimizer setting is based on @dathudeptrai advice. -# https://github.com/kan-bayashi/ParallelWaveGAN/issues/143#issuecomment-632539906 +# to converge). ########################################################### # FEATURE EXTRACTION SETTING # @@ -30,7 +29,7 @@ generator_params: out_channels: 4 # Number of output channels. kernel_size: 7 # Kernel size of initial and final conv layers. channels: 384 # Initial number of channels for conv layers. - upsample_scales: [5, 5, 3] # List of Upsampling scales. + upsample_scales: [5, 5, 3] # List of Upsampling scales. prod(upsample_scales) == n_shift stack_kernel_size: 3 # Kernel size of dilated conv layers in residual stack. stacks: 4 # Number of stacks in a single residual stack module. use_weight_norm: True # Whether to use weight normalization. @@ -67,7 +66,7 @@ discriminator_params: use_stft_loss: true stft_loss_params: fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss. - hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss + hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss. win_lengths: [600, 1200, 240] # List of window length for STFT-based loss. window: "hann" # Window function for STFT-based loss use_subband_stft_loss: true @@ -87,7 +86,7 @@ lambda_adv: 2.5 # Loss balancing coefficient for adversarial loss. # DATA LOADER SETTING # ########################################################### batch_size: 64 # Batch size. -batch_max_steps: 16200 # Length of each audio in batch. Make sure dividable by hop_size. +batch_max_steps: 16200 # Length of each audio in batch. Make sure dividable by n_shift. num_workers: 2 # Number of workers in DataLoader. ########################################################### @@ -109,7 +108,7 @@ generator_scheduler_params: - 500000 - 600000 discriminator_optimizer_params: - epsilon: 1.0e-7 # Discriminator's epsilon. + epsilon: 1.0e-7 # Discriminator's epsilon. weight_decay: 0.0 # Discriminator's weight decay coefficient. discriminator_grad_norm: -1 # Discriminator's gradient norm. 
@@ -129,7 +128,7 @@ discriminator_scheduler_params: ########################################################### discriminator_train_start_steps: 200000 # Number of steps to start to train discriminator. train_max_steps: 1000000 # Number of training steps. -save_interval_steps: 5000 # Interval steps to save checkpoint. +save_interval_steps: 5000 # Interval steps to save checkpoint. eval_interval_steps: 1000 # Interval steps to evaluate the network. ########################################################### diff --git a/examples/csmsc/voc3/conf/finetune.yaml b/examples/csmsc/voc3/conf/finetune.yaml index 80ab6bed..8610c526 100644 --- a/examples/csmsc/voc3/conf/finetune.yaml +++ b/examples/csmsc/voc3/conf/finetune.yaml @@ -6,8 +6,7 @@ # This configuration is based on full-band MelGAN but the hop size and sampling # rate is different from the paper (16kHz vs 24kHz). The number of iteraions # is not shown in the paper so currently we train 1M iterations (not sure enough -# to converge). The optimizer setting is based on @dathudeptrai advice. -# https://github.com/kan-bayashi/ParallelWaveGAN/issues/143#issuecomment-632539906 +# to converge). ########################################################### # FEATURE EXTRACTION SETTING # @@ -30,7 +29,7 @@ generator_params: out_channels: 4 # Number of output channels. kernel_size: 7 # Kernel size of initial and final conv layers. channels: 384 # Initial number of channels for conv layers. - upsample_scales: [5, 5, 3] # List of Upsampling scales. + upsample_scales: [5, 5, 3] # List of Upsampling scales. prod(upsample_scales) == n_shift stack_kernel_size: 3 # Kernel size of dilated conv layers in residual stack. stacks: 4 # Number of stacks in a single residual stack module. use_weight_norm: True # Whether to use weight normalization. @@ -73,7 +72,7 @@ stft_loss_params: use_subband_stft_loss: true subband_stft_loss_params: fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss. - hop_sizes: [30, 60, 10] # List of hop size for STFT-based loss + hop_sizes: [30, 60, 10] # List of hop size for STFT-based loss. win_lengths: [150, 300, 60] # List of window length for STFT-based loss. window: "hann" # Window function for STFT-based loss @@ -87,7 +86,7 @@ lambda_adv: 2.5 # Loss balancing coefficient for adversarial loss. # DATA LOADER SETTING # ########################################################### batch_size: 64 # Batch size. -batch_max_steps: 16200 # Length of each audio in batch. Make sure dividable by hop_size. +batch_max_steps: 16200 # Length of each audio in batch. Make sure dividable by n_shift. num_workers: 2 # Number of workers in DataLoader. ########################################################### @@ -109,7 +108,7 @@ generator_scheduler_params: - 500000 - 600000 discriminator_optimizer_params: - epsilon: 1.0e-7 # Discriminator's epsilon. + epsilon: 1.0e-7 # Discriminator's epsilon. weight_decay: 0.0 # Discriminator's weight decay coefficient. discriminator_grad_norm: -1 # Discriminator's gradient norm. @@ -129,7 +128,7 @@ discriminator_scheduler_params: ########################################################### discriminator_train_start_steps: 200000 # Number of steps to start to train discriminator. train_max_steps: 2000000 # Number of training steps. -save_interval_steps: 1000 # Interval steps to save checkpoint. +save_interval_steps: 1000 # Interval steps to save checkpoint. eval_interval_steps: 1000 # Interval steps to evaluate the network. 
 ###########################################################
diff --git a/examples/csmsc/voc3/local/synthesize.sh b/examples/csmsc/voc3/local/synthesize.sh
index 9f904ac0..22d879fa 100755
--- a/examples/csmsc/voc3/local/synthesize.sh
+++ b/examples/csmsc/voc3/local/synthesize.sh
@@ -6,8 +6,9 @@ ckpt_name=$3
 
 FLAGS_allocator_strategy=naive_best_fit \
 FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python3 ${BIN_DIR}/synthesize.py \
+python3 ${BIN_DIR}/../synthesize.py \
     --config=${config_path} \
     --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
     --test-metadata=dump/test/norm/metadata.jsonl \
-    --output-dir=${train_output_path}/test
+    --output-dir=${train_output_path}/test \
+    --generator-type=mb_melgan
diff --git a/examples/csmsc/voc4/conf/default.yaml b/examples/csmsc/voc4/conf/default.yaml
new file mode 100644
index 00000000..cad4cf9b
--- /dev/null
+++ b/examples/csmsc/voc4/conf/default.yaml
@@ -0,0 +1,136 @@
+# This is the configuration file for the CSMSC dataset. This configuration is
+# based on the StyleMelGAN paper, but uses MSE loss instead of hinge loss.
+# batch_size = 8 also works well, so you can reduce the batch size (e.g. to 8
+# or 16) if you want to accelerate training. The upsampling scales are modified
+# to fit the shift size of 300 pt.
+# NOTE: batch_max_steps(24000) == prod(noise_upsample_scales)(80) * prod(upsample_scales)(300)
+
+###########################################################
+#                FEATURE EXTRACTION SETTING               #
+###########################################################
+fs: 24000                # Sampling rate.
+n_fft: 2048              # FFT size. (in samples)
+n_shift: 300             # Hop size. (in samples)
+win_length: 1200         # Window length. (in samples)
+                         # If set to null, it will be the same as fft_size.
+window: "hann"           # Window function.
+n_mels: 80               # Number of mel basis.
+fmin: 80                 # Minimum freq in mel basis calculation. (Hz)
+fmax: 7600               # Maximum frequency in mel basis calculation. (Hz)
+
+###########################################################
+#         GENERATOR NETWORK ARCHITECTURE SETTING          #
+###########################################################
+generator_params:
+    in_channels: 128               # Number of input noise channels.
+    aux_channels: 80               # Number of auxiliary (mel) input channels.
+    channels: 64                   # Initial number of channels for conv layers.
+    out_channels: 1                # Number of output channels.
+    kernel_size: 9                 # Kernel size of initial and final conv layers.
+    dilation: 2                    # Dilation factor for conv layers.
+    bias: True                     # Whether to add bias parameter in convolution layers.
+    noise_upsample_scales: [10, 2, 2, 2]    # List of noise upsampling scales.
+    noise_upsample_activation: "leakyrelu"  # Activation function for noise upsampling.
+    noise_upsample_activation_params:       # Parameters of the above activation function.
+        negative_slope: 0.2
+    upsample_scales: [5, 1, 5, 1, 3, 1, 2, 2, 1]     # List of upsampling scales. prod(upsample_scales) == n_shift
+    upsample_mode: "nearest"       # Upsampling mode in TADE layer.
+    gated_function: "softmax"      # Gated function in TADEResBlock ("softmax" or "sigmoid").
+    use_weight_norm: True          # Whether to use weight normalization.
+
+###########################################################
+#       DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
+###########################################################
+discriminator_params:
+    repeats: 4                                 # Number of repetitions of random window discrimination.
+    window_sizes: [512, 1024, 2048, 4096]      # List of random window sizes.
+    pqmf_params:                               # List of PQMF parameters for each discriminator.
+        - [1, None, None, None]
+        - [2, 62, 0.26700, 9.0]
+        - [4, 62, 0.14200, 9.0]
+        - [8, 62, 0.07949, 9.0]
+    discriminator_params:
+        out_channels: 1                  # Number of output channels.
+        kernel_sizes: [5, 3]             # List of kernel size.
+        channels: 16                     # Number of channels of the initial conv layer.
+        max_downsample_channels: 512     # Maximum number of channels of downsampling layers.
+        bias: True                       # Whether to add bias parameter in convolution layers.
+        downsample_scales: [4, 4, 4, 1]  # List of downsampling scales.
+        nonlinear_activation: "leakyrelu"  # Nonlinear activation function.
+        nonlinear_activation_params:       # Parameters of nonlinear activation function.
+            negative_slope: 0.2
+        use_weight_norm: True            # Whether to use weight norm.
+
+
+###########################################################
+#                   STFT LOSS SETTING                     #
+###########################################################
+use_stft_loss: true
+stft_loss_params:
+    fft_sizes: [1024, 2048, 512]  # List of FFT size for STFT-based loss.
+    hop_sizes: [120, 240, 50]     # List of hop size for STFT-based loss.
+    win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
+    window: "hann"                # Window function for STFT-based loss.
+lambda_aux: 1.0                   # Loss balancing coefficient for aux loss.
+
+###########################################################
+#               ADVERSARIAL LOSS SETTING                  #
+###########################################################
+lambda_adv: 1.0                   # Loss balancing coefficient for adv loss.
+generator_adv_loss_params:
+    average_by_discriminators: false     # Whether to average loss by #discriminators.
+discriminator_adv_loss_params:
+    average_by_discriminators: false     # Whether to average loss by #discriminators.
+
+###########################################################
+#                  DATA LOADER SETTING                    #
+###########################################################
+batch_size: 32              # Batch size.
+# batch_max_steps(24000) == prod(noise_upsample_scales)(80) * prod(upsample_scales)(300, n_shift)
+batch_max_steps: 24000      # Length of each audio in batch. Make sure divisible by n_shift.
+num_workers: 2              # Number of workers in DataLoader.
+
+###########################################################
+#             OPTIMIZER & SCHEDULER SETTING               #
+###########################################################
+generator_optimizer_params:
+    beta1: 0.5                    # Generator's Adam beta1.
+    beta2: 0.9                    # Generator's Adam beta2.
+    weight_decay: 0.0             # Generator's weight decay coefficient.
+generator_scheduler_params:
+    learning_rate: 1.0e-4         # Generator's learning rate.
+    gamma: 0.5                    # Generator's scheduler gamma.
+    milestones:                   # At each milestone, lr will be multiplied by gamma.
+        - 100000
+        - 300000
+        - 500000
+        - 700000
+        - 900000
+generator_grad_norm: -1           # Generator's gradient norm.
+discriminator_optimizer_params:
+    beta1: 0.5                    # Discriminator's Adam beta1.
+    beta2: 0.9                    # Discriminator's Adam beta2.
+    weight_decay: 0.0             # Discriminator's weight decay coefficient.
+discriminator_scheduler_params:
+    learning_rate: 2.0e-4         # Discriminator's learning rate.
+    gamma: 0.5                    # Discriminator's scheduler gamma.
+    milestones:                   # At each milestone, lr will be multiplied by gamma.
+        - 200000
+        - 400000
+        - 600000
+        - 800000
+discriminator_grad_norm: -1       # Discriminator's gradient norm.
+
+###########################################################
+#                    INTERVAL SETTING                     #
+###########################################################
+discriminator_train_start_steps: 100000    # Number of steps to start to train discriminator.
+train_max_steps: 1500000                   # Number of training steps.
+save_interval_steps: 5000                  # Interval steps to save checkpoint.
+eval_interval_steps: 1000                  # Interval steps to evaluate the network.
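+# NOTE: a quick illustrative check of the settings above and below (this comment
+# is not read by the trainer): with save_interval_steps = 5000 and num_snapshots
+# = 10 (see OTHER SETTING below), only the newest 10 * 5000 = 50000 steps' worth
+# of checkpoints are kept on disk, e.g. snapshot_iter_50000.pdz (the default
+# `ckpt_name` in run.sh) is the newest checkpoint after the first 50k steps.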
+ +########################################################### +# OTHER SETTING # +########################################################### +num_snapshots: 10 # max number of snapshots to keep while training +seed: 42 # random seed for paddle, random, and np.random diff --git a/examples/csmsc/voc4/local/preprocess.sh b/examples/csmsc/voc4/local/preprocess.sh new file mode 100755 index 00000000..61d6d62b --- /dev/null +++ b/examples/csmsc/voc4/local/preprocess.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +stage=0 +stop_stage=100 + +config_path=$1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # get durations from MFA's result + echo "Generate durations.txt from MFA results ..." + python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ + --inputdir=./baker_alignment_tone \ + --output=durations.txt \ + --config=${config_path} +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # extract features + echo "Extract features ..." + python3 ${BIN_DIR}/../preprocess.py \ + --rootdir=~/datasets/BZNSYP/ \ + --dataset=baker \ + --dumpdir=dump \ + --dur-file=durations.txt \ + --config=${config_path} \ + --cut-sil=True \ + --num-cpu=20 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # get features' stats(mean and std) + echo "Get features' stats ..." + python3 ${MAIN_ROOT}/utils/compute_statistics.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --field-name="feats" +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # normalize, dev and test should use train's stats + echo "Normalize ..." + + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --dumpdir=dump/train/norm \ + --stats=dump/train/feats_stats.npy + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump/dev/raw/metadata.jsonl \ + --dumpdir=dump/dev/norm \ + --stats=dump/train/feats_stats.npy + + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump/test/raw/metadata.jsonl \ + --dumpdir=dump/test/norm \ + --stats=dump/train/feats_stats.npy +fi diff --git a/examples/csmsc/voc4/local/synthesize.sh b/examples/csmsc/voc4/local/synthesize.sh new file mode 100755 index 00000000..527e5f83 --- /dev/null +++ b/examples/csmsc/voc4/local/synthesize.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/../synthesize.py \ + --config=${config_path} \ + --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ + --test-metadata=dump/test/norm/metadata.jsonl \ + --output-dir=${train_output_path}/test \ + --generator-type=style_melgan diff --git a/examples/csmsc/voc4/local/train.sh b/examples/csmsc/voc4/local/train.sh new file mode 100755 index 00000000..9695631e --- /dev/null +++ b/examples/csmsc/voc4/local/train.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 + +FLAGS_cudnn_exhaustive_search=true \ +FLAGS_conv_workspace_size_limit=4000 \ +python ${BIN_DIR}/train.py \ + --train-metadata=dump/train/norm/metadata.jsonl \ + --dev-metadata=dump/dev/norm/metadata.jsonl \ + --config=${config_path} \ + --output-dir=${train_output_path} \ + --ngpu=1 diff --git a/examples/csmsc/voc4/path.sh b/examples/csmsc/voc4/path.sh new file mode 100755 index 00000000..f68ea3be --- /dev/null +++ b/examples/csmsc/voc4/path.sh @@ -0,0 +1,13 @@ +#!/bin/bash +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to 
avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
+
+MODEL=style_melgan
+export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL}
\ No newline at end of file
diff --git a/examples/csmsc/voc4/run.sh b/examples/csmsc/voc4/run.sh
new file mode 100755
index 00000000..3e7d7e2a
--- /dev/null
+++ b/examples/csmsc/voc4/run.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+set -e
+source path.sh
+
+gpus=0,1
+stage=0
+stop_stage=100
+
+conf_path=conf/default.yaml
+train_output_path=exp/default
+ckpt_name=snapshot_iter_50000.pdz
+
+# with the following command, you can choose the stage range you want to run
+# such as `./run.sh --stage 0 --stop-stage 0`
+# this cannot be mixed with positional arguments `$1`, `$2` ...
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # prepare data
+    ./local/preprocess.sh ${conf_path} || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # train the model; all `ckpt` files are saved under the `train_output_path/checkpoints/` dir
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # synthesize
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
diff --git a/examples/ljspeech/voc1/conf/default.yaml b/examples/ljspeech/voc1/conf/default.yaml
index 2edec3b9..fb97ea8e 100644
--- a/examples/ljspeech/voc1/conf/default.yaml
+++ b/examples/ljspeech/voc1/conf/default.yaml
@@ -35,7 +35,7 @@ generator_params:
     dropout: 0.0                   # Dropout rate. 0.0 means no dropout applied.
     use_weight_norm: true          # Whether to use weight norm.
                                    # If set to true, it will be applied to all of the conv layers.
-    upsample_scales: [4, 4, 4, 4]  # Upsampling scales. Prodcut of these must be the same as hop size.
+    upsample_scales: [4, 4, 4, 4]  # Upsampling scales. prod(upsample_scales) == n_shift
 
 ###########################################################
 #       DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
@@ -60,7 +60,7 @@ stft_loss_params:
     fft_sizes: [1024, 2048, 512]  # List of FFT size for STFT-based loss.
     hop_sizes: [120, 240, 50]     # List of hop size for STFT-based loss
     win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
-    window: "hann"                 # Window function for STFT-based loss
+    window: "hann"                # Window function for STFT-based loss
 
 ###########################################################
 #               ADVERSARIAL LOSS SETTING                  #
@@ -71,7 +71,7 @@ lambda_adv: 4.0        # Loss balancing coefficient.
 #                  DATA LOADER SETTING                    #
 ###########################################################
 batch_size: 8              # Batch size.
-batch_max_steps: 25600     # Length of each audio in batch. Make sure dividable by hop_size.
+batch_max_steps: 25600     # Length of each audio in batch. Make sure dividable by n_shift.
 pin_memory: true           # Whether to pin memory in Pytorch DataLoader.
 num_workers: 4             # Number of workers in Pytorch DataLoader.
 remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps.
@@ -84,20 +84,20 @@ generator_optimizer_params:
     epsilon: 1.0e-6            # Generator's epsilon.
     weight_decay: 0.0          # Generator's weight decay coefficient.
 generator_scheduler_params:
-    learning_rate: 0.0001       # Generator's learning rate.
+    learning_rate: 0.0001      # Generator's learning rate.
     step_size: 200000          # Generator's scheduler step size.
     gamma: 0.5                 # Generator's scheduler gamma.
                                # At each step size, lr will be multiplied by this parameter.
generator_grad_norm: 10 # Generator's gradient norm. discriminator_optimizer_params: - epsilon: 1.0e-6 # Discriminator's epsilon. - weight_decay: 0.0 # Discriminator's weight decay coefficient. + epsilon: 1.0e-6 # Discriminator's epsilon. + weight_decay: 0.0 # Discriminator's weight decay coefficient. discriminator_scheduler_params: - learning_rate: 0.00005 # Discriminator's learning rate. - step_size: 200000 # Discriminator's scheduler step size. - gamma: 0.5 # Discriminator's scheduler gamma. - # At each step size, lr will be multiplied by this parameter. -discriminator_grad_norm: 1 # Discriminator's gradient norm. + learning_rate: 0.00005 # Discriminator's learning rate. + step_size: 200000 # Discriminator's scheduler step size. + gamma: 0.5 # Discriminator's scheduler gamma. + # At each step size, lr will be multiplied by this parameter. +discriminator_grad_norm: 1 # Discriminator's gradient norm. ########################################################### # INTERVAL SETTING # diff --git a/examples/ljspeech/voc1/local/synthesize.sh b/examples/ljspeech/voc1/local/synthesize.sh index 9f904ac0..d85d1b1d 100755 --- a/examples/ljspeech/voc1/local/synthesize.sh +++ b/examples/ljspeech/voc1/local/synthesize.sh @@ -6,8 +6,9 @@ ckpt_name=$3 FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/synthesize.py \ +python3 ${BIN_DIR}/../synthesize.py \ --config=${config_path} \ --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=${train_output_path}/test + --output-dir=${train_output_path}/test \ + --generator-type=pwgan diff --git a/examples/vctk/voc1/conf/default.yaml b/examples/vctk/voc1/conf/default.yaml index ba2d9f2e..eb6d350d 100644 --- a/examples/vctk/voc1/conf/default.yaml +++ b/examples/vctk/voc1/conf/default.yaml @@ -35,7 +35,7 @@ generator_params: dropout: 0.0 # Dropout rate. 0.0 means no dropout applied. use_weight_norm: true # Whether to use weight norm. # If set to true, it will be applied to all of the conv layers. - upsample_scales: [4, 5, 3, 5] # Upsampling scales. Prodcut of these must be the same as hop size. + upsample_scales: [4, 5, 3, 5] # Upsampling scales. prod(upsample_scales) == n_shift ########################################################### # DISCRIMINATOR NETWORK ARCHITECTURE SETTING # @@ -71,7 +71,7 @@ lambda_adv: 4.0 # Loss balancing coefficient. # DATA LOADER SETTING # ########################################################### batch_size: 8 # Batch size. -batch_max_steps: 24000 # Length of each audio in batch. Make sure dividable by hop_size. +batch_max_steps: 24000 # Length of each audio in batch. Make sure dividable by n_shift. pin_memory: true # Whether to pin memory in Pytorch DataLoader. num_workers: 4 # Number of workers in Pytorch DataLoader. remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps. 
diff --git a/examples/vctk/voc1/local/synthesize.sh b/examples/vctk/voc1/local/synthesize.sh index 9f904ac0..d85d1b1d 100755 --- a/examples/vctk/voc1/local/synthesize.sh +++ b/examples/vctk/voc1/local/synthesize.sh @@ -6,8 +6,9 @@ ckpt_name=$3 FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/synthesize.py \ +python3 ${BIN_DIR}/../synthesize.py \ --config=${config_path} \ --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=${train_output_path}/test + --output-dir=${train_output_path}/test \ + --generator-type=pwgan diff --git a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize.py b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize.py deleted file mode 100644 index f275ed44..00000000 --- a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import os -from pathlib import Path - -import jsonlines -import numpy as np -import paddle -import soundfile as sf -import yaml -from paddle import distributed as dist -from timer import timer -from yacs.config import CfgNode - -from paddlespeech.t2s.datasets.data_table import DataTable -from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator - - -def main(): - parser = argparse.ArgumentParser( - description="Synthesize with parallel wavegan.") - parser.add_argument( - "--config", type=str, help="parallel wavegan config file.") - parser.add_argument("--checkpoint", type=str, help="snapshot to load.") - parser.add_argument("--test-metadata", type=str, help="dev data.") - parser.add_argument("--output-dir", type=str, help="output dir.") - parser.add_argument( - "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") - parser.add_argument("--verbose", type=int, default=1, help="verbose.") - - args = parser.parse_args() - - with open(args.config) as f: - config = CfgNode(yaml.safe_load(f)) - - print("========Args========") - print(yaml.safe_dump(vars(args))) - print("========Config========") - print(config) - print( - f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}" - ) - - if args.ngpu == 0: - paddle.set_device("cpu") - elif args.ngpu > 0: - paddle.set_device("gpu") - else: - print("ngpu should >= 0 !") - generator = PWGGenerator(**config["generator_params"]) - state_dict = paddle.load(args.checkpoint) - generator.set_state_dict(state_dict["generator_params"]) - - generator.remove_weight_norm() - generator.eval() - with jsonlines.open(args.test_metadata, 'r') as reader: - metadata = list(reader) - - test_dataset = DataTable( - metadata, - fields=['utt_id', 'feats'], - converters={ - 'utt_id': None, - 'feats': np.load, - }) - output_dir = Path(args.output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - - N = 0 - T = 0 - for example in test_dataset: - utt_id = example['utt_id'] - mel = 
example['feats'] - mel = paddle.to_tensor(mel) # (T, C) - with timer() as t: - with paddle.no_grad(): - wav = generator.inference(c=mel) - wav = wav.numpy() - N += wav.size - T += t.elapse - speed = wav.size / t.elapse - rtf = config.fs / speed - print( - f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}." - ) - sf.write(str(output_dir / (utt_id + ".wav")), wav, samplerate=config.fs) - print(f"generation speed: {N / T}Hz, RTF: {config.fs / (N / T) }") - - -if __name__ == "__main__": - main() diff --git a/paddlespeech/t2s/exps/gan_vocoder/style_melgan/__init__.py b/paddlespeech/t2s/exps/gan_vocoder/style_melgan/__init__.py new file mode 100644 index 00000000..185a92b8 --- /dev/null +++ b/paddlespeech/t2s/exps/gan_vocoder/style_melgan/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py b/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py new file mode 100644 index 00000000..bc746467 --- /dev/null +++ b/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py @@ -0,0 +1,258 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import argparse +import logging +import os +import shutil +from pathlib import Path + +import jsonlines +import numpy as np +import paddle +import yaml +from paddle import DataParallel +from paddle import distributed as dist +from paddle import nn +from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler +from paddle.optimizer import Adam +from paddle.optimizer.lr import MultiStepDecay +from yacs.config import CfgNode + +from paddlespeech.t2s.datasets.data_table import DataTable +from paddlespeech.t2s.datasets.vocoder_batch_fn import Clip +from paddlespeech.t2s.models.melgan import StyleMelGANDiscriminator +from paddlespeech.t2s.models.melgan import StyleMelGANEvaluator +from paddlespeech.t2s.models.melgan import StyleMelGANGenerator +from paddlespeech.t2s.models.melgan import StyleMelGANUpdater +from paddlespeech.t2s.modules.losses import DiscriminatorAdversarialLoss +from paddlespeech.t2s.modules.losses import GeneratorAdversarialLoss +from paddlespeech.t2s.modules.losses import MultiResolutionSTFTLoss +from paddlespeech.t2s.training.extensions.snapshot import Snapshot +from paddlespeech.t2s.training.extensions.visualizer import VisualDL +from paddlespeech.t2s.training.seeding import seed_everything +from paddlespeech.t2s.training.trainer import Trainer + + +def train_sp(args, config): + # decides device type and whether to run in parallel + # setup running environment correctly + world_size = paddle.distributed.get_world_size() + if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0: + paddle.set_device("cpu") + else: + paddle.set_device("gpu") + if world_size > 1: + paddle.distributed.init_parallel_env() + + # set the random seed, it is a must for multiprocess training + seed_everything(config.seed) + + print( + f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}", + ) + + # dataloader has been too verbose + logging.getLogger("DataLoader").disabled = True + + # construct dataset for training and validation + with jsonlines.open(args.train_metadata, 'r') as reader: + train_metadata = list(reader) + train_dataset = DataTable( + data=train_metadata, + fields=["wave", "feats"], + converters={ + "wave": np.load, + "feats": np.load, + }, ) + with jsonlines.open(args.dev_metadata, 'r') as reader: + dev_metadata = list(reader) + dev_dataset = DataTable( + data=dev_metadata, + fields=["wave", "feats"], + converters={ + "wave": np.load, + "feats": np.load, + }, ) + + # collate function and dataloader + train_sampler = DistributedBatchSampler( + train_dataset, + batch_size=config.batch_size, + shuffle=True, + drop_last=True) + dev_sampler = DistributedBatchSampler( + dev_dataset, + batch_size=config.batch_size, + shuffle=False, + drop_last=False) + print("samplers done!") + + if "aux_context_window" in config.generator_params: + aux_context_window = config.generator_params.aux_context_window + else: + aux_context_window = 0 + train_batch_fn = Clip( + batch_max_steps=config.batch_max_steps, + hop_size=config.n_shift, + aux_context_window=aux_context_window) + + train_dataloader = DataLoader( + train_dataset, + batch_sampler=train_sampler, + collate_fn=train_batch_fn, + num_workers=config.num_workers) + + dev_dataloader = DataLoader( + dev_dataset, + batch_sampler=dev_sampler, + collate_fn=train_batch_fn, + num_workers=config.num_workers) + print("dataloaders done!") + + generator = StyleMelGANGenerator(**config["generator_params"]) + discriminator = StyleMelGANDiscriminator(**config["discriminator_params"]) + if world_size > 1: + generator = 
DataParallel(generator)
+        discriminator = DataParallel(discriminator)
+    print("models done!")
+
+    criterion_stft = MultiResolutionSTFTLoss(**config["stft_loss_params"])
+
+    criterion_gen_adv = GeneratorAdversarialLoss(
+        **config["generator_adv_loss_params"])
+    criterion_dis_adv = DiscriminatorAdversarialLoss(
+        **config["discriminator_adv_loss_params"])
+    print("criterions done!")
+
+    lr_schedule_g = MultiStepDecay(**config["generator_scheduler_params"])
+    # gradient clipping is only applied when *_grad_norm > 0 (it is -1 in the default config)
+    generator_grad_norm = config["generator_grad_norm"]
+    gradient_clip_g = nn.ClipGradByGlobalNorm(
+        generator_grad_norm) if generator_grad_norm > 0 else None
+    print("gradient_clip_g:", gradient_clip_g)
+
+    optimizer_g = Adam(
+        learning_rate=lr_schedule_g,
+        grad_clip=gradient_clip_g,
+        parameters=generator.parameters(),
+        **config["generator_optimizer_params"])
+    lr_schedule_d = MultiStepDecay(**config["discriminator_scheduler_params"])
+    discriminator_grad_norm = config["discriminator_grad_norm"]
+    gradient_clip_d = nn.ClipGradByGlobalNorm(
+        discriminator_grad_norm) if discriminator_grad_norm > 0 else None
+    print("gradient_clip_d:", gradient_clip_d)
+    optimizer_d = Adam(
+        learning_rate=lr_schedule_d,
+        grad_clip=gradient_clip_d,
+        parameters=discriminator.parameters(),
+        **config["discriminator_optimizer_params"])
+    print("optimizers done!")
+
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    if dist.get_rank() == 0:
+        config_name = args.config.split("/")[-1]
+        # copy conf to output_dir
+        shutil.copyfile(args.config, output_dir / config_name)
+
+    updater = StyleMelGANUpdater(
+        models={
+            "generator": generator,
+            "discriminator": discriminator,
+        },
+        optimizers={
+            "generator": optimizer_g,
+            "discriminator": optimizer_d,
+        },
+        criterions={
+            "stft": criterion_stft,
+            "gen_adv": criterion_gen_adv,
+            "dis_adv": criterion_dis_adv,
+        },
+        schedulers={
+            "generator": lr_schedule_g,
+            "discriminator": lr_schedule_d,
+        },
+        dataloader=train_dataloader,
+        discriminator_train_start_steps=config.discriminator_train_start_steps,
+        lambda_adv=config.lambda_adv,
+        output_dir=output_dir)
+
+    evaluator = StyleMelGANEvaluator(
+        models={
+            "generator": generator,
+            "discriminator": discriminator,
+        },
+        criterions={
+            "stft": criterion_stft,
+            "gen_adv": criterion_gen_adv,
+            "dis_adv": criterion_dis_adv,
+        },
+        dataloader=dev_dataloader,
+        lambda_adv=config.lambda_adv,
+        output_dir=output_dir)
+
+    trainer = Trainer(
+        updater,
+        stop_trigger=(config.train_max_steps, "iteration"),
+        out=output_dir)
+
+    if dist.get_rank() == 0:
+        trainer.extend(
+            evaluator, trigger=(config.eval_interval_steps, 'iteration'))
+        trainer.extend(VisualDL(output_dir), trigger=(1, 'iteration'))
+        trainer.extend(
+            Snapshot(max_size=config.num_snapshots),
+            trigger=(config.save_interval_steps, 'iteration'))
+
+    print("Trainer Done!")
+    trainer.run()
+
+
+def main():
+    # parse args and config and redirect to train_sp
+
+    parser = argparse.ArgumentParser(description="Train a StyleMelGAN model.")
+    parser.add_argument(
+        "--config", type=str, help="config file to overwrite default config.")
+    parser.add_argument("--train-metadata", type=str, help="training data.")
+    parser.add_argument("--dev-metadata", type=str, help="dev data.")
+    parser.add_argument("--output-dir", type=str, help="output dir.")
+    parser.add_argument(
+        "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
+    parser.add_argument("--verbose", type=int, default=1, help="verbose.")
+
+    args = parser.parse_args()
+
+    with open(args.config, 'rt') as f:
+        config = CfgNode(yaml.safe_load(f))
+
+    print("========Args========")
+    print(yaml.safe_dump(vars(args)))
+    print("========Config========")
+    print(config)
+    print(
+        f"master sees the world size: {dist.get_world_size()}, from pid: {os.getpid()}"
+    )
+
+    # dispatch
+    if args.ngpu > 1:
+        dist.spawn(train_sp, (args, config), nprocs=args.ngpu)
+    else:
+        train_sp(args, config)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/synthesize.py b/paddlespeech/t2s/exps/gan_vocoder/synthesize.py
similarity index 79%
rename from paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/synthesize.py
rename to paddlespeech/t2s/exps/gan_vocoder/synthesize.py
index 988d4590..d7fd2f94 100644
--- a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/synthesize.py
+++ b/paddlespeech/t2s/exps/gan_vocoder/synthesize.py
@@ -24,15 +24,19 @@ from paddle import distributed as dist
 from timer import timer
 from yacs.config import CfgNode
 
+import paddlespeech
 from paddlespeech.t2s.datasets.data_table import DataTable
-from paddlespeech.t2s.models.melgan import MelGANGenerator
 
 
 def main():
-    parser = argparse.ArgumentParser(
-        description="Synthesize with multi band melgan.")
+    parser = argparse.ArgumentParser(description="Synthesize with GANVocoder.")
     parser.add_argument(
-        "--config", type=str, help="multi band melgan config file.")
+        "--generator-type",
+        type=str,
+        default="pwgan",
+        help="type of GANVocoder, should be in {pwgan, mb_melgan, style_melgan} now"
+    )
+    parser.add_argument("--config", type=str, help="GANVocoder config file.")
     parser.add_argument("--checkpoint", type=str, help="snapshot to load.")
     parser.add_argument("--test-metadata", type=str, help="dev data.")
     parser.add_argument("--output-dir", type=str, help="output dir.")
@@ -59,15 +63,29 @@ def main():
         paddle.set_device("gpu")
     else:
         print("ngpu should >= 0 !")
-    generator = MelGANGenerator(**config["generator_params"])
+
+    class_map = {
+        "hifigan": "HiFiGANGenerator",
+        "mb_melgan": "MelGANGenerator",
+        "pwgan": "PWGGenerator",
+        "style_melgan": "StyleMelGANGenerator",
+    }
+
+    generator_type = args.generator_type
+
+    assert generator_type in class_map
+
+    print("generator_type:", generator_type)
+
+    generator_class = getattr(paddlespeech.t2s.models,
+                              class_map[generator_type])
+    generator = generator_class(**config["generator_params"])
     state_dict = paddle.load(args.checkpoint)
     generator.set_state_dict(state_dict["generator_params"])
-    generator.remove_weight_norm()
     generator.eval()
     with jsonlines.open(args.test_metadata, 'r') as reader:
         metadata = list(reader)
-
     test_dataset = DataTable(
         metadata,
         fields=['utt_id', 'feats'],
diff --git a/paddlespeech/t2s/models/__init__.py b/paddlespeech/t2s/models/__init__.py
index 66720649..601bd9d6 100644
--- a/paddlespeech/t2s/models/__init__.py
+++ b/paddlespeech/t2s/models/__init__.py
@@ -14,6 +14,7 @@
 from .fastspeech2 import *
 from .melgan import *
 from .parallel_wavegan import *
+from .speedyspeech import *
 from .tacotron2 import *
 from .transformer_tts import *
 from .waveflow import *
diff --git a/paddlespeech/t2s/models/melgan/__init__.py b/paddlespeech/t2s/models/melgan/__init__.py
index d4f557db..df8ccd92 100644
--- a/paddlespeech/t2s/models/melgan/__init__.py
+++ b/paddlespeech/t2s/models/melgan/__init__.py
@@ -13,3 +13,5 @@
 # limitations under the License.
from .melgan import * from .multi_band_melgan_updater import * +from .style_melgan import * +from .style_melgan_updater import * diff --git a/paddlespeech/t2s/models/melgan/melgan.py b/paddlespeech/t2s/models/melgan/melgan.py index 809403f6..8dfc05a0 100644 --- a/paddlespeech/t2s/models/melgan/melgan.py +++ b/paddlespeech/t2s/models/melgan/melgan.py @@ -21,6 +21,7 @@ import numpy as np import paddle from paddle import nn +from paddlespeech.t2s.modules.activation import get_activation from paddlespeech.t2s.modules.causal_conv import CausalConv1D from paddlespeech.t2s.modules.causal_conv import CausalConv1DTranspose from paddlespeech.t2s.modules.nets_utils import initialize @@ -41,7 +42,7 @@ class MelGANGenerator(nn.Layer): upsample_scales: List[int]=[8, 8, 2, 2], stack_kernel_size: int=3, stacks: int=3, - nonlinear_activation: str="LeakyReLU", + nonlinear_activation: str="leakyrelu", nonlinear_activation_params: Dict[str, Any]={"negative_slope": 0.2}, pad: str="Pad1D", pad_params: Dict[str, Any]={"mode": "reflect"}, @@ -88,16 +89,19 @@ class MelGANGenerator(nn.Layer): """ super().__init__() + # initialize parameters + initialize(self, init_type) + + # for compatibility + if nonlinear_activation == "LeakyReLU": + nonlinear_activation = "leakyrelu" + # check hyper parameters is valid assert channels >= np.prod(upsample_scales) assert channels % (2**len(upsample_scales)) == 0 if not use_causal_conv: assert (kernel_size - 1 ) % 2 == 0, "Not support even number kernel size." - - # initialize parameters - initialize(self, init_type) - layers = [] if not use_causal_conv: layers += [ @@ -118,7 +122,8 @@ class MelGANGenerator(nn.Layer): for i, upsample_scale in enumerate(upsample_scales): # add upsampling layer layers += [ - getattr(nn, nonlinear_activation)(**nonlinear_activation_params) + get_activation(nonlinear_activation, + **nonlinear_activation_params) ] if not use_causal_conv: layers += [ @@ -158,7 +163,7 @@ class MelGANGenerator(nn.Layer): # add final layer layers += [ - getattr(nn, nonlinear_activation)(**nonlinear_activation_params) + get_activation(nonlinear_activation, **nonlinear_activation_params) ] if not use_causal_conv: layers += [ @@ -242,7 +247,6 @@ class MelGANGenerator(nn.Layer): This initialization follows official implementation manner. https://github.com/descriptinc/melgan-neurips/blob/master/mel2wav/modules.py """ - # 定义参数为float的正态分布。 dist = paddle.distribution.Normal(loc=0.0, scale=0.02) @@ -287,10 +291,11 @@ class MelGANDiscriminator(nn.Layer): max_downsample_channels: int=1024, bias: bool=True, downsample_scales: List[int]=[4, 4, 4, 4], - nonlinear_activation: str="LeakyReLU", + nonlinear_activation: str="leakyrelu", nonlinear_activation_params: Dict[str, Any]={"negative_slope": 0.2}, pad: str="Pad1D", - pad_params: Dict[str, Any]={"mode": "reflect"}, ): + pad_params: Dict[str, Any]={"mode": "reflect"}, + init_type: str="xavier_uniform", ): """Initilize MelGAN discriminator module. Parameters ---------- @@ -321,6 +326,14 @@ class MelGANDiscriminator(nn.Layer): Hyperparameters for padding function. 
""" super().__init__() + + # for compatibility + if nonlinear_activation == "LeakyReLU": + nonlinear_activation = "leakyrelu" + + # initialize parameters + initialize(self, init_type) + self.layers = nn.LayerList() # check kernel size is valid @@ -338,8 +351,8 @@ class MelGANDiscriminator(nn.Layer): channels, int(np.prod(kernel_sizes)), bias_attr=bias), - getattr(nn, nonlinear_activation)( - **nonlinear_activation_params), )) + get_activation(nonlinear_activation, ** + nonlinear_activation_params), )) # add downsample layers in_chs = channels @@ -355,8 +368,8 @@ class MelGANDiscriminator(nn.Layer): padding=downsample_scale * 5, groups=in_chs // 4, bias_attr=bias, ), - getattr(nn, nonlinear_activation)( - **nonlinear_activation_params), )) + get_activation(nonlinear_activation, ** + nonlinear_activation_params), )) in_chs = out_chs # add final layers @@ -369,8 +382,8 @@ class MelGANDiscriminator(nn.Layer): kernel_sizes[0], padding=(kernel_sizes[0] - 1) // 2, bias_attr=bias, ), - getattr(nn, nonlinear_activation)( - **nonlinear_activation_params), )) + get_activation(nonlinear_activation, ** + nonlinear_activation_params), )) self.layers.append( nn.Conv1D( out_chs, @@ -419,7 +432,7 @@ class MelGANMultiScaleDiscriminator(nn.Layer): max_downsample_channels: int=1024, bias: bool=True, downsample_scales: List[int]=[4, 4, 4, 4], - nonlinear_activation: str="LeakyReLU", + nonlinear_activation: str="leakyrelu", nonlinear_activation_params: Dict[str, Any]={"negative_slope": 0.2}, pad: str="Pad1D", pad_params: Dict[str, Any]={"mode": "reflect"}, @@ -461,9 +474,14 @@ class MelGANMultiScaleDiscriminator(nn.Layer): Whether to use causal convolution. """ super().__init__() + # initialize parameters initialize(self, init_type) + # for compatibility + if nonlinear_activation == "LeakyReLU": + nonlinear_activation = "leakyrelu" + self.discriminators = nn.LayerList() # add discriminators diff --git a/paddlespeech/t2s/models/melgan/multi_band_melgan_updater.py b/paddlespeech/t2s/models/melgan/multi_band_melgan_updater.py index a5d4cdeb..75e99627 100644 --- a/paddlespeech/t2s/models/melgan/multi_band_melgan_updater.py +++ b/paddlespeech/t2s/models/melgan/multi_band_melgan_updater.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import logging +from pathlib import Path from typing import Dict import paddle @@ -41,7 +42,7 @@ class MBMelGANUpdater(StandardUpdater): dataloader: DataLoader, discriminator_train_start_steps: int, lambda_adv: float, - output_dir=None): + output_dir: Path=None): self.models = models self.generator: Layer = models['generator'] self.discriminator: Layer = models['discriminator'] @@ -159,11 +160,11 @@ class MBMelGANUpdater(StandardUpdater): class MBMelGANEvaluator(StandardEvaluator): def __init__(self, - models, - criterions, - dataloader, - lambda_adv, - output_dir=None): + models: Dict[str, Layer], + criterions: Dict[str, Layer], + dataloader: DataLoader, + lambda_adv: float, + output_dir: Path=None): self.models = models self.generator = models['generator'] self.discriminator = models['discriminator'] diff --git a/paddlespeech/t2s/models/melgan/style_melgan.py b/paddlespeech/t2s/models/melgan/style_melgan.py new file mode 100644 index 00000000..4725a8d0 --- /dev/null +++ b/paddlespeech/t2s/models/melgan/style_melgan.py @@ -0,0 +1,404 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Modified from espnet(https://github.com/espnet/espnet)
+"""StyleMelGAN Modules."""
+import copy
+import math
+from typing import Any
+from typing import Dict
+from typing import List
+
+import numpy as np
+import paddle
+import paddle.nn.functional as F
+from paddle import nn
+
+from paddlespeech.t2s.models.melgan import MelGANDiscriminator as BaseDiscriminator
+from paddlespeech.t2s.modules.activation import get_activation
+from paddlespeech.t2s.modules.nets_utils import initialize
+from paddlespeech.t2s.modules.pqmf import PQMF
+from paddlespeech.t2s.modules.tade_res_block import TADEResBlock
+
+
+class StyleMelGANGenerator(nn.Layer):
+    """Style MelGAN generator module."""
+
+    def __init__(
+            self,
+            in_channels: int=128,
+            aux_channels: int=80,
+            channels: int=64,
+            out_channels: int=1,
+            kernel_size: int=9,
+            dilation: int=2,
+            bias: bool=True,
+            noise_upsample_scales: List[int]=[11, 2, 2, 2],
+            noise_upsample_activation: str="leakyrelu",
+            noise_upsample_activation_params: Dict[str,
+                                                   Any]={"negative_slope": 0.2},
+            upsample_scales: List[int]=[2, 2, 2, 2, 2, 2, 2, 2, 1],
+            upsample_mode: str="linear",
+            gated_function: str="softmax",
+            use_weight_norm: bool=True,
+            init_type: str="xavier_uniform", ):
+        """Initialize Style MelGAN generator.
+        Parameters
+        ----------
+        in_channels : int
+            Number of input noise channels.
+        aux_channels : int
+            Number of auxiliary input channels.
+        channels : int
+            Number of channels for conv layer.
+        out_channels : int
+            Number of output channels.
+        kernel_size : int
+            Kernel size of conv layers.
+        dilation : int
+            Dilation factor for conv layers.
+        bias : bool
+            Whether to add bias parameter in convolution layers.
+        noise_upsample_scales : list
+            List of noise upsampling scales.
+        noise_upsample_activation : str
+            Activation function module name for noise upsampling.
+        noise_upsample_activation_params : dict
+            Hyperparameters for the above activation function.
+        upsample_scales : list
+            List of upsampling scales.
+        upsample_mode : str
+            Upsampling mode in TADE layer.
+        gated_function : str
+            Gated function in TADEResBlock ("softmax" or "sigmoid").
+        use_weight_norm : bool
+            Whether to use weight norm.
+            If set to true, it will be applied to all of the conv layers.
+ """ + super().__init__() + + # initialize parameters + initialize(self, init_type) + + self.in_channels = in_channels + noise_upsample = [] + in_chs = in_channels + for noise_upsample_scale in noise_upsample_scales: + noise_upsample.append( + nn.Conv1DTranspose( + in_chs, + channels, + noise_upsample_scale * 2, + stride=noise_upsample_scale, + padding=noise_upsample_scale // 2 + noise_upsample_scale % + 2, + output_padding=noise_upsample_scale % 2, + bias_attr=bias, )) + noise_upsample.append( + get_activation(noise_upsample_activation, ** + noise_upsample_activation_params)) + in_chs = channels + self.noise_upsample = nn.Sequential(*noise_upsample) + self.noise_upsample_factor = np.prod(noise_upsample_scales) + + self.blocks = nn.LayerList() + aux_chs = aux_channels + for upsample_scale in upsample_scales: + self.blocks.append( + TADEResBlock( + in_channels=channels, + aux_channels=aux_chs, + kernel_size=kernel_size, + dilation=dilation, + bias=bias, + upsample_factor=upsample_scale, + upsample_mode=upsample_mode, + gated_function=gated_function, ), ) + aux_chs = channels + self.upsample_factor = np.prod(upsample_scales) + + self.output_conv = nn.Sequential( + nn.Conv1D( + channels, + out_channels, + kernel_size, + 1, + bias_attr=bias, + padding=(kernel_size - 1) // 2, ), + nn.Tanh(), ) + + nn.initializer.set_global_initializer(None) + + # apply weight norm + if use_weight_norm: + self.apply_weight_norm() + + # reset parameters + self.reset_parameters() + + def forward(self, c, z=None): + """Calculate forward propagation. + Parameters + ---------- + c : Tensor + Auxiliary input tensor (B, channels, T). + z : Tensor + Input noise tensor (B, in_channels, 1). + Returns + ---------- + Tensor + Output tensor (B, out_channels, T ** prod(upsample_scales)). + """ + # batch_max_steps(24000) == noise_upsample_factor(80) * upsample_factor(300) + if z is None: + z = paddle.randn([paddle.shape(c)[0], self.in_channels, 1]) + # (B, in_channels, noise_upsample_factor). + x = self.noise_upsample(z) + for block in self.blocks: + x, c = block(x, c) + x = self.output_conv(x) + return x + + def apply_weight_norm(self): + """Recursively apply weight normalization to all the Convolution layers + in the sublayers. + """ + + def _apply_weight_norm(layer): + if isinstance(layer, (nn.Conv1D, nn.Conv1DTranspose)): + nn.utils.weight_norm(layer) + + self.apply(_apply_weight_norm) + + def remove_weight_norm(self): + """Recursively remove weight normalization from all the Convolution + layers in the sublayers. + """ + + def _remove_weight_norm(layer): + try: + if layer: + nn.utils.remove_weight_norm(layer) + except ValueError: + pass + + self.apply(_remove_weight_norm) + + def reset_parameters(self): + """Reset parameters. + This initialization follows official implementation manner. + https://github.com/descriptinc/melgan-neurips/blob/master/mel2wav/modules.py + """ + # 定义参数为float的正态分布。 + dist = paddle.distribution.Normal(loc=0.0, scale=0.02) + + def _reset_parameters(m): + if isinstance(m, nn.Conv1D) or isinstance(m, nn.Conv1DTranspose): + w = dist.sample(m.weight.shape) + m.weight.set_value(w) + + self.apply(_reset_parameters) + + def inference(self, c): + """Perform inference. + Parameters + ---------- + c : Tensor + Input tensor (T, in_channels). + Returns + ---------- + Tensor + Output tensor (T ** prod(upsample_scales), out_channels). 
+        """
+        # (1, in_channels, T)
+        c = c.transpose([1, 0]).unsqueeze(0)
+        c_shape = paddle.shape(c)
+        # prepare noise input
+        # there is a bug in Paddle int division, we must convert an int tensor to int here
+        noise_size = (1, self.in_channels,
+                      math.ceil(int(c_shape[2]) / self.noise_upsample_factor))
+        # (1, in_channels, T/noise_upsample_factor)
+        noise = paddle.randn(noise_size)
+        # (1, in_channels, T)
+        x = self.noise_upsample(noise)
+        x_shape = paddle.shape(x)
+        total_length = c_shape[2] * self.upsample_factor
+        c = F.pad(
+            c, (0, x_shape[2] - c_shape[2]), "replicate", data_format="NCL")
+        # c.shape[2] == x.shape[2] here
+        # (1, in_channels, T*prod(upsample_scales))
+        for block in self.blocks:
+            x, c = block(x, c)
+        x = self.output_conv(x)[..., :total_length]
+        return x.squeeze(0).transpose([1, 0])
+
+
+# Doesn't StyleMelGANDiscriminator need to remove weight norm?
+class StyleMelGANDiscriminator(nn.Layer):
+    """Style MelGAN discriminator module."""
+
+    def __init__(
+            self,
+            repeats: int=2,
+            window_sizes: List[int]=[512, 1024, 2048, 4096],
+            pqmf_params: List[List[int]]=[
+                [1, None, None, None],
+                [2, 62, 0.26700, 9.0],
+                [4, 62, 0.14200, 9.0],
+                [8, 62, 0.07949, 9.0],
+            ],
+            discriminator_params: Dict[str, Any]={
+                "out_channels": 1,
+                "kernel_sizes": [5, 3],
+                "channels": 16,
+                "max_downsample_channels": 512,
+                "bias": True,
+                "downsample_scales": [4, 4, 4, 1],
+                "nonlinear_activation": "leakyrelu",
+                "nonlinear_activation_params": {
+                    "negative_slope": 0.2
+                },
+                "pad": "Pad1D",
+                "pad_params": {
+                    "mode": "reflect"
+                },
+            },
+            use_weight_norm: bool=True,
+            init_type: str="xavier_uniform", ):
+        """Initialize Style MelGAN discriminator.
+        Parameters
+        ----------
+        repeats : int
+            Number of repetitions to apply RWD (random window discriminators).
+        window_sizes : list
+            List of random window sizes.
+        pqmf_params : list
+            List of lists of parameters for PQMF modules.
+        discriminator_params : dict
+            Parameters for base discriminator module.
+        use_weight_norm : bool
+            Whether to apply weight normalization.
+        """
+        super().__init__()
+
+        # initialize parameters
+        initialize(self, init_type)
+
+        # window size check
+        assert len(window_sizes) == len(pqmf_params)
+        sizes = [ws // p[0] for ws, p in zip(window_sizes, pqmf_params)]
+        assert len(window_sizes) == sum([sizes[0] == size for size in sizes])
+
+        self.repeats = repeats
+        self.window_sizes = window_sizes
+        self.pqmfs = nn.LayerList()
+        self.discriminators = nn.LayerList()
+        for pqmf_param in pqmf_params:
+            d_params = copy.deepcopy(discriminator_params)
+            d_params["in_channels"] = pqmf_param[0]
+            if pqmf_param[0] == 1:
+                self.pqmfs.append(nn.Identity())
+            else:
+                self.pqmfs.append(PQMF(*pqmf_param))
+            self.discriminators.append(BaseDiscriminator(**d_params))
+
+        nn.initializer.set_global_initializer(None)
+
+        # apply weight norm
+        if use_weight_norm:
+            self.apply_weight_norm()
+
+        # reset parameters
+        self.reset_parameters()
+
+    def forward(self, x):
+        """Calculate forward propagation.
+        Parameters
+        ----------
+        x : Tensor
+            Input tensor (B, 1, T).
+        Returns
+        ----------
+        List
+            List of discriminator outputs, #items in the list will be
+            equal to repeats * #discriminators.
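+            Each item is the list of intermediate feature maps and the final
+            output produced by one base discriminator on a random window.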
+        """
+        outs = []
+        for _ in range(self.repeats):
+            outs += self._forward(x)
+        return outs
+
+    def _forward(self, x):
+        outs = []
+        for idx, (ws, pqmf, disc) in enumerate(
+                zip(self.window_sizes, self.pqmfs, self.discriminators)):
+            start_idx = int(np.random.randint(paddle.shape(x)[-1] - ws))
+            x_ = x[:, :, start_idx:start_idx + ws]
+            if idx == 0:
+                # nn.Identity()
+                x_ = pqmf(x_)
+            else:
+                x_ = pqmf.analysis(x_)
+            outs += [disc(x_)]
+        return outs
+
+    def apply_weight_norm(self):
+        """Recursively apply weight normalization to all the Convolution layers
+        in the sublayers.
+        """
+
+        def _apply_weight_norm(layer):
+            if isinstance(layer, (nn.Conv1D, nn.Conv1DTranspose)):
+                nn.utils.weight_norm(layer)
+
+        self.apply(_apply_weight_norm)
+
+    def remove_weight_norm(self):
+        """Recursively remove weight normalization from all the Convolution
+        layers in the sublayers.
+        """
+
+        def _remove_weight_norm(layer):
+            try:
+                nn.utils.remove_weight_norm(layer)
+            except ValueError:
+                pass
+
+        self.apply(_remove_weight_norm)
+
+    def reset_parameters(self):
+        """Reset parameters.
+        This initialization follows official implementation manner.
+        https://github.com/descriptinc/melgan-neurips/blob/master/mel2wav/modules.py
+        """
+        # define a Normal distribution with float parameters
+        dist = paddle.distribution.Normal(loc=0.0, scale=0.02)
+
+        def _reset_parameters(m):
+            if isinstance(m, nn.Conv1D) or isinstance(m, nn.Conv1DTranspose):
+                w = dist.sample(m.weight.shape)
+                m.weight.set_value(w)
+
+        self.apply(_reset_parameters)
+
+
+class StyleMelGANInference(nn.Layer):
+    def __init__(self, normalizer, style_melgan_generator):
+        super().__init__()
+        self.normalizer = normalizer
+        self.style_melgan_generator = style_melgan_generator
+
+    def forward(self, logmel):
+        normalized_mel = self.normalizer(logmel)
+        wav = self.style_melgan_generator.inference(normalized_mel)
+        return wav
diff --git a/paddlespeech/t2s/models/melgan/style_melgan_updater.py b/paddlespeech/t2s/models/melgan/style_melgan_updater.py
new file mode 100644
index 00000000..49054aa7
--- /dev/null
+++ b/paddlespeech/t2s/models/melgan/style_melgan_updater.py
@@ -0,0 +1,221 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
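+"""StyleMelGAN updater and evaluator modules."""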
+import logging
+from pathlib import Path
+from typing import Dict
+
+import paddle
+from paddle import distributed as dist
+from paddle.io import DataLoader
+from paddle.nn import Layer
+from paddle.optimizer import Optimizer
+from paddle.optimizer.lr import LRScheduler
+
+from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator
+from paddlespeech.t2s.training.reporter import report
+from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater
+from paddlespeech.t2s.training.updaters.standard_updater import UpdaterState
+logging.basicConfig(
+    format='%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s',
+    datefmt='[%Y-%m-%d %H:%M:%S]')
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+
+class StyleMelGANUpdater(StandardUpdater):
+    def __init__(self,
+                 models: Dict[str, Layer],
+                 optimizers: Dict[str, Optimizer],
+                 criterions: Dict[str, Layer],
+                 schedulers: Dict[str, LRScheduler],
+                 dataloader: DataLoader,
+                 discriminator_train_start_steps: int,
+                 lambda_adv: float,
+                 lambda_aux: float=1.0,
+                 output_dir: Path=None):
+        self.models = models
+        self.generator: Layer = models['generator']
+        self.discriminator: Layer = models['discriminator']
+
+        self.optimizers = optimizers
+        self.optimizer_g: Optimizer = optimizers['generator']
+        self.optimizer_d: Optimizer = optimizers['discriminator']
+
+        self.criterions = criterions
+        self.criterion_stft = criterions['stft']
+        self.criterion_gen_adv = criterions["gen_adv"]
+        self.criterion_dis_adv = criterions["dis_adv"]
+
+        self.schedulers = schedulers
+        self.scheduler_g = schedulers['generator']
+        self.scheduler_d = schedulers['discriminator']
+
+        self.dataloader = dataloader
+
+        self.discriminator_train_start_steps = discriminator_train_start_steps
+        self.lambda_adv = lambda_adv
+        self.lambda_aux = lambda_aux
+        self.state = UpdaterState(iteration=0, epoch=0)
+
+        self.train_iterator = iter(self.dataloader)
+
+        log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
+        self.filehandler = logging.FileHandler(str(log_file))
+        logger.addHandler(self.filehandler)
+        self.logger = logger
+        self.msg = ""
+
+    def update_core(self, batch):
+        self.msg = "Rank: {}, ".format(dist.get_rank())
+        losses_dict = {}
+
+        # parse batch
+        wav, mel = batch
+        # Generator
+        # (B, out_channels, T ** prod(upsample_scales)
+        wav_ = self.generator(mel)
+
+        # initialize
+        gen_loss = 0.0
+
+        # full band Multi-resolution stft loss
+        sc_loss, mag_loss = self.criterion_stft(wav_, wav)
+        gen_loss += sc_loss + mag_loss
+        report("train/spectral_convergence_loss", float(sc_loss))
+        report("train/log_stft_magnitude_loss", float(mag_loss))
+        losses_dict["spectral_convergence_loss"] = float(sc_loss)
+        losses_dict["log_stft_magnitude_loss"] = float(mag_loss)
+
+        gen_loss *= self.lambda_aux
+
+        ## Adversarial loss
+        if self.state.iteration > self.discriminator_train_start_steps:
+            p_ = self.discriminator(wav_)
+            adv_loss = self.criterion_gen_adv(p_)
+            report("train/adversarial_loss", float(adv_loss))
+            losses_dict["adversarial_loss"] = float(adv_loss)
+            gen_loss += self.lambda_adv * adv_loss
+
+        report("train/generator_loss", float(gen_loss))
+        losses_dict["generator_loss"] = float(gen_loss)
+
+        self.optimizer_g.clear_grad()
+        gen_loss.backward()
+
+        self.optimizer_g.step()
+        self.scheduler_g.step()
+
+        # Discriminator
+        if self.state.iteration > self.discriminator_train_start_steps:
+            # re-compute wav_ which leads to better quality
+            with paddle.no_grad():
+                wav_ = self.generator(mel)
+
+            p = self.discriminator(wav)
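+            # detach the re-generated waveform so that the discriminator loss
+            # does not backpropagate into the generator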
+            p_ = self.discriminator(wav_.detach())
+            real_loss, fake_loss = self.criterion_dis_adv(p_, p)
+            dis_loss = real_loss + fake_loss
+            report("train/real_loss", float(real_loss))
+            report("train/fake_loss", float(fake_loss))
+            report("train/discriminator_loss", float(dis_loss))
+            losses_dict["real_loss"] = float(real_loss)
+            losses_dict["fake_loss"] = float(fake_loss)
+            losses_dict["discriminator_loss"] = float(dis_loss)
+
+            self.optimizer_d.clear_grad()
+            dis_loss.backward()
+
+            self.optimizer_d.step()
+            self.scheduler_d.step()
+
+        self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
+                              for k, v in losses_dict.items())
+
+
+class StyleMelGANEvaluator(StandardEvaluator):
+    def __init__(self,
+                 models: Dict[str, Layer],
+                 criterions: Dict[str, Layer],
+                 dataloader: DataLoader,
+                 lambda_adv: float,
+                 lambda_aux: float=1.0,
+                 output_dir: Path=None):
+        self.models = models
+        self.generator = models['generator']
+        self.discriminator = models['discriminator']
+
+        self.criterions = criterions
+        self.criterion_stft = criterions['stft']
+        self.criterion_gen_adv = criterions["gen_adv"]
+        self.criterion_dis_adv = criterions["dis_adv"]
+
+        self.dataloader = dataloader
+        self.lambda_adv = lambda_adv
+        self.lambda_aux = lambda_aux
+
+        log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
+        self.filehandler = logging.FileHandler(str(log_file))
+        logger.addHandler(self.filehandler)
+        self.logger = logger
+        self.msg = ""
+
+    def evaluate_core(self, batch):
+        # logging.debug("Evaluate: ")
+        self.msg = "Evaluate: "
+        losses_dict = {}
+
+        wav, mel = batch
+        # Generator
+        # (B, out_channels, T ** prod(upsample_scales)
+        wav_ = self.generator(mel)
+
+        ## Adversarial loss
+        p_ = self.discriminator(wav_)
+        adv_loss = self.criterion_gen_adv(p_)
+
+        report("eval/adversarial_loss", float(adv_loss))
+        losses_dict["adversarial_loss"] = float(adv_loss)
+        gen_loss = self.lambda_adv * adv_loss
+
+        # initialize
+        aux_loss = 0.0
+        # Multi-resolution stft loss
+        sc_loss, mag_loss = self.criterion_stft(wav_, wav)
+        aux_loss += sc_loss + mag_loss
+        report("eval/spectral_convergence_loss", float(sc_loss))
+        report("eval/log_stft_magnitude_loss", float(mag_loss))
+        losses_dict["spectral_convergence_loss"] = float(sc_loss)
+        losses_dict["log_stft_magnitude_loss"] = float(mag_loss)
+
+        aux_loss *= self.lambda_aux
+        gen_loss += aux_loss
+
+        report("eval/generator_loss", float(gen_loss))
+        losses_dict["generator_loss"] = float(gen_loss)
+
+        # Discriminator
+        p = self.discriminator(wav)
+        real_loss, fake_loss = self.criterion_dis_adv(p_, p)
+        dis_loss = real_loss + fake_loss
+        report("eval/real_loss", float(real_loss))
+        report("eval/fake_loss", float(fake_loss))
+        report("eval/discriminator_loss", float(dis_loss))
+
+        losses_dict["real_loss"] = float(real_loss)
+        losses_dict["fake_loss"] = float(fake_loss)
+        losses_dict["discriminator_loss"] = float(dis_loss)
+
+        self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
+                              for k, v in losses_dict.items())
+        self.logger.info(self.msg)
diff --git a/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan_updater.py b/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan_updater.py
index 4e3daaa3..8ed50bc8 100644
--- a/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan_updater.py
+++ b/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan_updater.py
@@ -10,8 +10,10 @@
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and -# limitations under the License. +# limitations +# under the License. import logging +from pathlib import Path from typing import Dict import paddle @@ -42,7 +44,7 @@ class PWGUpdater(StandardUpdater): dataloader: DataLoader, discriminator_train_start_steps: int, lambda_adv: float, - output_dir=None): + output_dir: Path=None): self.models = models self.generator: Layer = models['generator'] self.discriminator: Layer = models['discriminator'] @@ -155,11 +157,11 @@ class PWGUpdater(StandardUpdater): class PWGEvaluator(StandardEvaluator): def __init__(self, - models, - criterions, - dataloader, - lambda_adv, - output_dir=None): + models: Dict[str, Layer], + criterions: Dict[str, Layer], + dataloader: DataLoader, + lambda_adv: float, + output_dir: Path=None): self.models = models self.generator = models['generator'] self.discriminator = models['discriminator'] diff --git a/paddlespeech/t2s/modules/activation.py b/paddlespeech/t2s/modules/activation.py index f5b0af6e..8d8cd62e 100644 --- a/paddlespeech/t2s/modules/activation.py +++ b/paddlespeech/t2s/modules/activation.py @@ -27,7 +27,7 @@ class GLU(nn.Layer): return F.glu(xs, axis=self.dim) -def get_activation(act): +def get_activation(act, **kwargs): """Return activation function.""" activation_funcs = { @@ -35,8 +35,9 @@ def get_activation(act): "tanh": paddle.nn.Tanh, "relu": paddle.nn.ReLU, "selu": paddle.nn.SELU, + "leakyrelu": paddle.nn.LeakyReLU, "swish": paddle.nn.Swish, "glu": GLU } - return activation_funcs[act]() + return activation_funcs[act](**kwargs) diff --git a/paddlespeech/t2s/modules/residual_stack.py b/paddlespeech/t2s/modules/residual_stack.py index 236f41d3..ee3ba64d 100644 --- a/paddlespeech/t2s/modules/residual_stack.py +++ b/paddlespeech/t2s/modules/residual_stack.py @@ -18,6 +18,7 @@ from typing import Dict from paddle import nn +from paddlespeech.t2s.modules.activation import get_activation from paddlespeech.t2s.modules.causal_conv import CausalConv1D @@ -30,7 +31,7 @@ class ResidualStack(nn.Layer): channels: int=32, dilation: int=1, bias: bool=True, - nonlinear_activation: str="LeakyReLU", + nonlinear_activation: str="leakyrelu", nonlinear_activation_params: Dict[str, Any]={"negative_slope": 0.2}, pad: str="Pad1D", pad_params: Dict[str, Any]={"mode": "reflect"}, @@ -58,14 +59,17 @@ class ResidualStack(nn.Layer): Whether to use causal convolution. """ super().__init__() + # for compatibility + if nonlinear_activation == "LeakyReLU": + nonlinear_activation = "leakyrelu" # defile residual stack part if not use_causal_conv: assert (kernel_size - 1 ) % 2 == 0, "Not support even number kernel size." 
self.stack = nn.Sequential( - getattr(nn, nonlinear_activation)( - **nonlinear_activation_params), + get_activation(nonlinear_activation, + **nonlinear_activation_params), getattr(nn, pad)((kernel_size - 1) // 2 * dilation, **pad_params), nn.Conv1D( @@ -74,13 +78,13 @@ class ResidualStack(nn.Layer): kernel_size, dilation=dilation, bias_attr=bias), - getattr(nn, nonlinear_activation)( - **nonlinear_activation_params), + get_activation(nonlinear_activation, + **nonlinear_activation_params), nn.Conv1D(channels, channels, 1, bias_attr=bias), ) else: self.stack = nn.Sequential( - getattr(nn, nonlinear_activation)( - **nonlinear_activation_params), + get_activation(nonlinear_activation, + **nonlinear_activation_params), CausalConv1D( channels, channels, @@ -89,8 +93,8 @@ class ResidualStack(nn.Layer): bias=bias, pad=pad, pad_params=pad_params, ), - getattr(nn, nonlinear_activation)( - **nonlinear_activation_params), + get_activation(nonlinear_activation, + **nonlinear_activation_params), nn.Conv1D(channels, channels, 1, bias_attr=bias), ) # defile extra layer for skip connection diff --git a/paddlespeech/t2s/modules/style_encoder.py b/paddlespeech/t2s/modules/style_encoder.py index e76226f3..9d4b83a2 100644 --- a/paddlespeech/t2s/modules/style_encoder.py +++ b/paddlespeech/t2s/modules/style_encoder.py @@ -298,8 +298,8 @@ class MultiHeadedAttention(BaseMultiHeadedAttention): def __init__(self, q_dim, k_dim, v_dim, n_head, n_feat, dropout_rate=0.0): """Initialize multi head attention module.""" - # NOTE(kan-bayashi): Do not use super().__init__() here since we want to - # overwrite BaseMultiHeadedAttention.__init__() method. + # Do not use super().__init__() here since we want to + # overwrite BaseMultiHeadedAttention.__init__() method. nn.Layer.__init__(self) assert n_feat % n_head == 0 # We assume d_v always equals d_k diff --git a/paddlespeech/t2s/modules/tade_res_block.py b/paddlespeech/t2s/modules/tade_res_block.py new file mode 100644 index 00000000..19b07639 --- /dev/null +++ b/paddlespeech/t2s/modules/tade_res_block.py @@ -0,0 +1,164 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# Modified from espnet(https://github.com/espnet/espnet)
+"""StyleMelGAN's TADEResBlock Modules."""
+from functools import partial
+
+import paddle.nn.functional as F
+from paddle import nn
+
+
+class TADELayer(nn.Layer):
+    """TADE Layer module."""
+
+    def __init__(
+            self,
+            in_channels: int=64,
+            aux_channels: int=80,
+            kernel_size: int=9,
+            bias: bool=True,
+            upsample_factor: int=2,
+            upsample_mode: str="nearest", ):
+        """Initialize TADE layer."""
+        super().__init__()
+        self.norm = nn.InstanceNorm1D(
+            in_channels, momentum=0.1, data_format="NCL")
+        self.aux_conv = nn.Sequential(
+            nn.Conv1D(
+                aux_channels,
+                in_channels,
+                kernel_size,
+                1,
+                bias_attr=bias,
+                padding=(kernel_size - 1) // 2, ), )
+        self.gated_conv = nn.Sequential(
+            nn.Conv1D(
+                in_channels,
+                in_channels * 2,
+                kernel_size,
+                1,
+                bias_attr=bias,
+                padding=(kernel_size - 1) // 2, ), )
+        self.upsample = nn.Upsample(
+            scale_factor=upsample_factor, mode=upsample_mode)
+
+    def forward(self, x, c):
+        """Calculate forward propagation.
+        Parameters
+        ----------
+        x : Tensor
+            Input tensor (B, in_channels, T).
+        c : Tensor
+            Auxiliary input tensor (B, aux_channels, T).
+        Returns
+        ----------
+        Tensor
+            Output tensor (B, in_channels, T * upsample_factor).
+        Tensor
+            Upsampled aux tensor (B, in_channels, T * upsample_factor).
+        """
+
+        x = self.norm(x)
+        # 'bilinear', 'bicubic' and 'nearest' only support 4-D tensor.
+        c = self.upsample(c.unsqueeze(-1))
+        c = c[:, :, :, 0]
+
+        c = self.aux_conv(c)
+        cg = self.gated_conv(c)
+        cg1, cg2 = cg.split(2, axis=1)
+        # 'bilinear', 'bicubic' and 'nearest' only support 4-D tensor.
+        y = cg1 * self.upsample(x.unsqueeze(-1))[:, :, :, 0] + cg2
+        return y, c
+
+
+class TADEResBlock(nn.Layer):
+    """TADEResBlock module."""
+
+    def __init__(
+            self,
+            in_channels: int=64,
+            aux_channels: int=80,
+            kernel_size: int=9,
+            dilation: int=2,
+            bias: bool=True,
+            upsample_factor: int=2,
+            # this differs in paddle: the mode can only be "linear" when the input is 3-D
+            upsample_mode: str="nearest",
+            gated_function: str="softmax", ):
+        """Initialize TADEResBlock module."""
+        super().__init__()
+        self.tade1 = TADELayer(
+            in_channels=in_channels,
+            aux_channels=aux_channels,
+            kernel_size=kernel_size,
+            bias=bias,
+            upsample_factor=1,
+            upsample_mode=upsample_mode, )
+        self.gated_conv1 = nn.Conv1D(
+            in_channels,
+            in_channels * 2,
+            kernel_size,
+            1,
+            bias_attr=bias,
+            padding=(kernel_size - 1) // 2, )
+        self.tade2 = TADELayer(
+            in_channels=in_channels,
+            aux_channels=in_channels,
+            kernel_size=kernel_size,
+            bias=bias,
+            upsample_factor=upsample_factor,
+            upsample_mode=upsample_mode, )
+        self.gated_conv2 = nn.Conv1D(
+            in_channels,
+            in_channels * 2,
+            kernel_size,
+            1,
+            bias_attr=bias,
+            dilation=dilation,
+            padding=(kernel_size - 1) // 2 * dilation, )
+        self.upsample = nn.Upsample(
+            scale_factor=upsample_factor, mode=upsample_mode)
+        if gated_function == "softmax":
+            self.gated_function = partial(F.softmax, axis=1)
+        elif gated_function == "sigmoid":
+            self.gated_function = F.sigmoid
+        else:
+            raise ValueError(f"{gated_function} is not supported.")
+
+    def forward(self, x, c):
+        """Calculate forward propagation.
+        Parameters
+        ----------
+        x : Tensor
+            Input tensor (B, in_channels, T).
+        c : Tensor
+            Auxiliary input tensor (B, aux_channels, T).
+        Returns
+        ----------
+        Tensor
+            Output tensor (B, in_channels, T * upsample_factor).
+        Tensor
+            Upsampled auxiliary tensor (B, in_channels, T * upsample_factor).
+        """
+        residual = x
+        x, c = self.tade1(x, c)
+        x = self.gated_conv1(x)
+        xa, xb = x.split(2, axis=1)
+        x = self.gated_function(xa) * F.tanh(xb)
+        x, c = self.tade2(x, c)
+        x = self.gated_conv2(x)
+        xa, xb = x.split(2, axis=1)
+        x = self.gated_function(xa) * F.tanh(xb)
+        # 'bilinear', 'bicubic' and 'nearest' only support 4-D tensor.
+        return self.upsample(residual.unsqueeze(-1))[:, :, :, 0] + x, c

From 7bfafc83103d461d8caf5476b542711b4cea8a49 Mon Sep 17 00:00:00 2001
From: TianYuan
Date: Thu, 2 Dec 2021 11:11:46 +0000
Subject: [PATCH 02/50] add style_melgan readme, test=tts

---
 examples/csmsc/voc4/README.md | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 examples/csmsc/voc4/README.md

diff --git a/examples/csmsc/voc4/README.md b/examples/csmsc/voc4/README.md
new file mode 100644
index 00000000..ba7d54d4
--- /dev/null
+++ b/examples/csmsc/voc4/README.md
@@ -0,0 +1 @@
+# Style MelGAN with CSMSC

From a0f74ef63f885d3a5f7588d251b2d0b93eb41492 Mon Sep 17 00:00:00 2001
From: TianYuan
Date: Thu, 2 Dec 2021 11:19:09 +0000
Subject: [PATCH 03/50] add style_melgan readme, test=tts

---
 examples/csmsc/voc4/README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/csmsc/voc4/README.md b/examples/csmsc/voc4/README.md
index ba7d54d4..8d956faf 100644
--- a/examples/csmsc/voc4/README.md
+++ b/examples/csmsc/voc4/README.md
@@ -1 +1,2 @@
 # Style MelGAN with CSMSC
+This example contains code used to train a [Style MelGAN](https://arxiv.org/abs/2011.01557) model with [Chinese Standard Mandarin Speech Corpus](https://www.data-baker.com/open_source.html).

From 075aeee7f0297cb9903ef086e7b913c27a7b76b5 Mon Sep 17 00:00:00 2001
From: TianYuan
Date: Thu, 2 Dec 2021 12:18:43 +0000
Subject: [PATCH 04/50] add style_melgan readme, test=tts

---
 examples/aishell3/voc1/README.md |   2 +-
 examples/csmsc/voc1/README.md    |   2 +-
 examples/csmsc/voc3/README.md    |   2 +-
 examples/csmsc/voc4/README.md    | 109 +++++++++++++++++++++++++++++++
 examples/ljspeech/voc1/README.md |   2 +-
 examples/vctk/voc1/README.md     |   2 +-
 6 files changed, 114 insertions(+), 5 deletions(-)

diff --git a/examples/aishell3/voc1/README.md b/examples/aishell3/voc1/README.md
index 9189eb72..de7e04a6 100644
--- a/examples/aishell3/voc1/README.md
+++ b/examples/aishell3/voc1/README.md
@@ -105,7 +105,7 @@ benchmark:
 4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
 
 ### Synthesizing
-`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
+`./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
 ```bash
 CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
 ```
diff --git a/examples/csmsc/voc1/README.md b/examples/csmsc/voc1/README.md
index e6ee7b4a..b13d5896 100644
--- a/examples/csmsc/voc1/README.md
+++ b/examples/csmsc/voc1/README.md
@@ -95,7 +95,7 @@ benchmark:
 4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
 
 ### Synthesizing
-`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
+`./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
 ```bash
 CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
 ```
diff --git a/examples/csmsc/voc3/README.md b/examples/csmsc/voc3/README.md
index 52ca51e9..99cef233 100644
--- a/examples/csmsc/voc3/README.md
+++ b/examples/csmsc/voc3/README.md
@@ -80,7 +80,7 @@ optional arguments:
 4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
 
 ### Synthesizing
-`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
+`./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
 ```bash
 CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
 ```
diff --git a/examples/csmsc/voc4/README.md b/examples/csmsc/voc4/README.md
index 8d956faf..86030e39 100644
--- a/examples/csmsc/voc4/README.md
+++ b/examples/csmsc/voc4/README.md
@@ -1,2 +1,111 @@
 # Style MelGAN with CSMSC
 This example contains code used to train a [Style MelGAN](https://arxiv.org/abs/2011.01557) model with [Chinese Standard Mandarin Speech Corpus](https://www.data-baker.com/open_source.html).
+## Dataset
+### Download and Extract
+Download CSMSC from the [official website](https://www.data-baker.com/data/index/source) and extract it to `~/datasets`. Then the dataset is in directory `~/datasets/BZNSYP`.
+
+### Get MFA Result and Extract
+We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut the silence at the edges of audio.
+You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/mfa) of our repo.
+
+## Get Started
+Assume the path to the dataset is `~/datasets/BZNSYP`.
+Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`.
+Run the command below to
+1. **source path**.
+2. preprocess the dataset.
+3. train the model.
+4. synthesize wavs.
+    - synthesize waveform from `metadata.jsonl`.
+```bash
+./run.sh
+```
+You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage. For example, running the following command will only preprocess the dataset.
+```bash
+./run.sh --stage 0 --stop-stage 0
+```
+### Data Preprocessing
+```bash
+./local/preprocess.sh ${conf_path}
+```
+When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below.
+
+```text
+dump
+├── dev
+│   ├── norm
+│   └── raw
+├── test
+│   ├── norm
+│   └── raw
+└── train
+    ├── norm
+    ├── raw
+    └── feats_stats.npy
+```
+The dataset is split into 3 parts, namely `train`, `dev` and `test`, each of which contains a `norm` and `raw` subfolder. The `raw` folder contains the log magnitude of the mel spectrogram of each utterance, while the `norm` folder contains the normalized spectrogram. The statistics used to normalize the spectrogram are computed from the training set, which is located in `dump/train/feats_stats.npy`.
+
+Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains the id and paths to the spectrogram of each utterance.
+
+### Model Training
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
+```
+`./local/train.sh` calls `${BIN_DIR}/train.py`.
+Here's the complete help message.
+
+```text
+usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
+                [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
+                [--ngpu NGPU] [--verbose VERBOSE]
+
+Train a Multi-Band MelGAN model.
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --config CONFIG       config file to overwrite default config.
+  --train-metadata TRAIN_METADATA
+                        training data.
+  --dev-metadata DEV_METADATA
+                        dev data.
+  --output-dir OUTPUT_DIR
+                        output dir.
+  --ngpu NGPU           if ngpu == 0, use cpu.
+  --verbose VERBOSE     verbose.
+```
+
+1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
+2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder.
+3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory.
+4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
+
+### Synthesizing
+`./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
+```
+```text
+usage: synthesize.py [-h] [--config CONFIG] [--checkpoint CHECKPOINT]
+                     [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR]
+                     [--ngpu NGPU] [--verbose VERBOSE]
+
+Synthesize with multi band melgan.
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --config CONFIG       multi band melgan config file.
+  --checkpoint CHECKPOINT
+                        snapshot to load.
+  --test-metadata TEST_METADATA
+                        dev data.
+  --output-dir OUTPUT_DIR
+                        output dir.
+  --ngpu NGPU           if ngpu == 0, use cpu.
+  --verbose VERBOSE     verbose.
+```
+
+1. `--config` is the vocoder config file. You should use the same config with which the model is trained.
+2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory.
+3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory.
+4. `--output-dir` is the directory to save the synthesized audio files.
+5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
diff --git a/examples/ljspeech/voc1/README.md b/examples/ljspeech/voc1/README.md
index 3830156f..5c556124 100644
--- a/examples/ljspeech/voc1/README.md
+++ b/examples/ljspeech/voc1/README.md
@@ -95,7 +95,7 @@ benchmark:
 4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
 
 ### Synthesizing
-`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
+`./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
 ```bash
 CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
 ```
diff --git a/examples/vctk/voc1/README.md b/examples/vctk/voc1/README.md
index 6aa311fb..6d7b3256 100644
--- a/examples/vctk/voc1/README.md
+++ b/examples/vctk/voc1/README.md
@@ -100,7 +100,7 @@ benchmark:
 4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
 
 ### Synthesizing
-`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
+`./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
```bash CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} ``` From 7b2ecb6eedeaa74f222aee07a7871715e3078fd9 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 3 Dec 2021 15:54:18 +0000 Subject: [PATCH 05/50] add style_melgan, test=tts --- .../t2s/models/melgan/style_melgan_updater.py | 86 ++++++++++--------- 1 file changed, 46 insertions(+), 40 deletions(-) diff --git a/paddlespeech/t2s/models/melgan/style_melgan_updater.py b/paddlespeech/t2s/models/melgan/style_melgan_updater.py index 49054aa7..b0cb4ed6 100644 --- a/paddlespeech/t2s/models/melgan/style_melgan_updater.py +++ b/paddlespeech/t2s/models/melgan/style_melgan_updater.py @@ -40,8 +40,9 @@ class StyleMelGANUpdater(StandardUpdater): criterions: Dict[str, Layer], schedulers: Dict[str, LRScheduler], dataloader: DataLoader, - discriminator_train_start_steps: int, - lambda_adv: float, + generator_train_start_steps: int=0, + discriminator_train_start_steps: int=100000, + lambda_adv: float=1.0, lambda_aux: float=1.0, output_dir: Path=None): self.models = models @@ -63,11 +64,12 @@ class StyleMelGANUpdater(StandardUpdater): self.dataloader = dataloader + self.generator_train_start_steps = generator_train_start_steps self.discriminator_train_start_steps = discriminator_train_start_steps self.lambda_adv = lambda_adv self.lambda_aux = lambda_aux - self.state = UpdaterState(iteration=0, epoch=0) + self.state = UpdaterState(iteration=0, epoch=0) self.train_iterator = iter(self.dataloader) log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) @@ -79,42 +81,45 @@ class StyleMelGANUpdater(StandardUpdater): def update_core(self, batch): self.msg = "Rank: {}, ".format(dist.get_rank()) losses_dict = {} - # parse batch wav, mel = batch + # Generator - # (B, out_channels, T ** prod(upsample_scales) - wav_ = self.generator(mel) + if self.state.iteration > self.generator_train_start_steps: + # (B, out_channels, T ** prod(upsample_scales) + wav_ = self.generator(mel) - # initialize - gen_loss = 0.0 + # initialize + gen_loss = 0.0 + aux_loss = 0.0 - # full band Multi-resolution stft loss - sc_loss, mag_loss = self.criterion_stft(wav_, wav) - gen_loss += sc_loss + mag_loss - report("train/spectral_convergence_loss", float(sc_loss)) - report("train/log_stft_magnitude_loss", float(mag_loss)) - losses_dict["spectral_convergence_loss"] = float(sc_loss) - losses_dict["log_stft_magnitude_loss"] = float(mag_loss) + # full band multi-resolution stft loss + sc_loss, mag_loss = self.criterion_stft(wav_, wav) + aux_loss += sc_loss + mag_loss + report("train/spectral_convergence_loss", float(sc_loss)) + report("train/log_stft_magnitude_loss", float(mag_loss)) + losses_dict["spectral_convergence_loss"] = float(sc_loss) + losses_dict["log_stft_magnitude_loss"] = float(mag_loss) - gen_loss *= self.lambda_aux + gen_loss += aux_loss * self.lambda_aux - ## Adversarial loss - if self.state.iteration > self.discriminator_train_start_steps: - p_ = self.discriminator(wav_) - adv_loss = self.criterion_gen_adv(p_) - report("train/adversarial_loss", float(adv_loss)) - losses_dict["adversarial_loss"] = float(adv_loss) - gen_loss += self.lambda_adv * adv_loss + # adversarial loss + if self.state.iteration > self.discriminator_train_start_steps: + p_ = self.discriminator(wav_) + adv_loss = self.criterion_gen_adv(p_) + report("train/adversarial_loss", float(adv_loss)) + losses_dict["adversarial_loss"] = float(adv_loss) - report("train/generator_loss", float(gen_loss)) - losses_dict["generator_loss"] = float(gen_loss) + gen_loss 
+= self.lambda_adv * adv_loss
 
-        report("train/generator_loss", float(gen_loss))
-        losses_dict["generator_loss"] = float(gen_loss)
+            report("train/generator_loss", float(gen_loss))
+            losses_dict["generator_loss"] = float(gen_loss)
 
-        self.optimizer_g.clear_grad()
-        gen_loss.backward()
+            self.optimizer_g.clear_grad()
+            gen_loss.backward()
 
-        self.optimizer_g.step()
-        self.scheduler_g.step()
+            self.optimizer_g.step()
+            self.scheduler_g.step()
 
         # Discriminator
         if self.state.iteration > self.discriminator_train_start_steps:
@@ -148,7 +153,7 @@ class StyleMelGANEvaluator(StandardEvaluator):
                  models: Dict[str, Layer],
                  criterions: Dict[str, Layer],
                  dataloader: DataLoader,
-                 lambda_adv: float,
+                 lambda_adv: float=1.0,
                  lambda_aux: float=1.0,
                  output_dir: Path=None):
         self.models = models
@@ -161,6 +166,7 @@ class StyleMelGANEvaluator(StandardEvaluator):
         self.criterion_dis_adv = criterions["dis_adv"]
 
         self.dataloader = dataloader
+
         self.lambda_adv = lambda_adv
         self.lambda_aux = lambda_aux
 
@@ -171,26 +177,27 @@ class StyleMelGANEvaluator(StandardEvaluator):
         self.msg = ""
 
     def evaluate_core(self, batch):
-        # logging.debug("Evaluate: ")
         self.msg = "Evaluate: "
         losses_dict = {}
-        wav, mel = batch
+        wav, mel = batch
+
         # Generator
         # (B, out_channels, T ** prod(upsample_scales)
         wav_ = self.generator(mel)
 
-        ## Adversarial loss
+        # initialize
+        gen_loss = 0.0
+        aux_loss = 0.0
+
+        # adversarial loss
         p_ = self.discriminator(wav_)
         adv_loss = self.criterion_gen_adv(p_)
-
         report("eval/adversarial_loss", float(adv_loss))
         losses_dict["adversarial_loss"] = float(adv_loss)
-        gen_loss = self.lambda_adv * adv_loss
 
-        # initialize
-        aux_loss = 0.0
-        # Multi-resolution stft loss
+        gen_loss += self.lambda_adv * adv_loss
+
+        # multi-resolution stft loss
         sc_loss, mag_loss = self.criterion_stft(wav_, wav)
         aux_loss += sc_loss + mag_loss
         report("eval/spectral_convergence_loss", float(sc_loss))
@@ -198,8 +205,7 @@ class StyleMelGANEvaluator(StandardEvaluator):
         losses_dict["spectral_convergence_loss"] = float(sc_loss)
         losses_dict["log_stft_magnitude_loss"] = float(mag_loss)
 
-        aux_loss *= self.lambda_aux
-        gen_loss += aux_loss
+        gen_loss += aux_loss * self.lambda_aux
 
         report("eval/generator_loss", float(gen_loss))
         losses_dict["generator_loss"] = float(gen_loss)

From db1b0e037b7cd191f7264c64b13c365430a0f55e Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Thu, 9 Dec 2021 17:39:06 +0800
Subject: [PATCH 06/50] Update README.md

---
 demos/README.md | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/demos/README.md b/demos/README.md
index 2183c1f2..28bab8bb 100644
--- a/demos/README.md
+++ b/demos/README.md
@@ -1 +1,10 @@
-# Demos for PaddleSpeech
+# Speech Applications based on PaddleSpeech
+
+The directory contains many speech applications in multiple scenarios.
+
+* audio tagging - tag audio labels in video
+* metaverse - 2D AR with TTS
+* speech recognition - video understanding
+* speech translation - end-to-end speech translation
+* story talker - book reader based on OCR and TTS
+* style_fs2 - multi-style control for FastSpeech2 model

From 252e0ceb71e5379aa6187ac90f6b5d13b75e1917 Mon Sep 17 00:00:00 2001
From: TianYuan
Date: Thu, 9 Dec 2021 18:19:06 +0800
Subject: [PATCH 07/50] Update README.md

---
 demos/metaverse/README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/demos/metaverse/README.md b/demos/metaverse/README.md
index 6f879a97..f41b7999 100644
--- a/demos/metaverse/README.md
+++ b/demos/metaverse/README.md
@@ -16,6 +16,8 @@ Run `run.sh` to complete all the essential procedures, including the installatio
 ```
 In `run.sh`, it will execute `source path.sh` firstly, which will set the environment variants.
+If you would like to try your own sentence, please replace the sentence in the `sentences.txt`.
+
 If you would like to try your own image, please replace the image `download/Lamarr.png` in the shell script.
 
 The result has shown on our [notebook](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/tutorial/tts/tts_tutorial.ipynb).

From 061823e53f66a781a23077abb7b7e33bb1273330 Mon Sep 17 00:00:00 2001
From: TianYuan
Date: Thu, 9 Dec 2021 18:19:21 +0800
Subject: [PATCH 08/50] Update README.md

---
 demos/metaverse/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/demos/metaverse/README.md b/demos/metaverse/README.md
index f41b7999..67c6b7c2 100644
--- a/demos/metaverse/README.md
+++ b/demos/metaverse/README.md
@@ -16,7 +16,7 @@ Run `run.sh` to complete all the essential procedures, including the installatio
 ```
 In `run.sh`, it will execute `source path.sh` firstly, which will set the environment variants.
 
-If you would like to try your own sentence, please replace the sentence in the `sentences.txt`.
+If you would like to try your own sentence, please replace the sentence in `sentences.txt`.
 
 If you would like to try your own image, please replace the image `download/Lamarr.png` in the shell script.
 
From d3aa7c6168b13fc00513a9b77046952e9572f5c6 Mon Sep 17 00:00:00 2001
From: TianYuan
Date: Thu, 9 Dec 2021 18:19:27 +0800
Subject: [PATCH 09/50] Update README.md

---
 demos/style_fs2/README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/demos/style_fs2/README.md b/demos/style_fs2/README.md
index c80b5731..d8daabde 100644
--- a/demos/style_fs2/README.md
+++ b/demos/style_fs2/README.md
@@ -18,6 +18,10 @@ Run the following command line to get started:
 ```
 ./run.sh
 ```
+In `run.sh`, it will execute `source path.sh` firstly, which will set the environment variables.
+
+If you would like to try your own sentence, please replace the sentence in `sentences.txt`.
+ For more details, please see `style_syn.py` The audio samples are in [style-control-in-fastspeech2](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html#style-control-in-fastspeech2) From 31510d088ce29a86821ae4c4854fcd15f5925441 Mon Sep 17 00:00:00 2001 From: gongel Date: Thu, 9 Dec 2021 11:08:10 +0000 Subject: [PATCH 10/50] refactor: rm kaldi_io --- paddlespeech/cli/st/infer.py | 15 +++++++-------- setup.py | 1 - 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/paddlespeech/cli/st/infer.py b/paddlespeech/cli/st/infer.py index d7b53a07..2bc98512 100644 --- a/paddlespeech/cli/st/infer.py +++ b/paddlespeech/cli/st/infer.py @@ -18,11 +18,14 @@ from typing import List from typing import Optional from typing import Union -import kaldi_io +import kaldiio import numpy as np import paddle import soundfile from kaldiio import WriteHelper +from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer +from paddlespeech.s2t.utils.dynamic_import import dynamic_import +from paddlespeech.s2t.utils.utility import UpdateConfig from yacs.config import CfgNode from ..executor import BaseExecutor @@ -30,9 +33,6 @@ from ..utils import cli_register from ..utils import download_and_decompress from ..utils import logger from ..utils import MODEL_HOME -from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer -from paddlespeech.s2t.utils.dynamic_import import dynamic_import -from paddlespeech.s2t.utils.utility import UpdateConfig __all__ = ["STExecutor"] @@ -234,7 +234,7 @@ class STExecutor(BaseExecutor): f"{utt_name} {wav_file}".encode("utf8")) fbank_extract_process.stdin.close() fbank_feat = dict( - kaldi_io.read_mat_ark(fbank_extract_process.stdout))[utt_name] + kaldiio.load_ark(fbank_extract_process.stdout))[utt_name] extract_command = ["compute-kaldi-pitch-feats", "scp:-", "ark:-"] pitch_extract_process = subprocess.Popen( @@ -251,8 +251,7 @@ class STExecutor(BaseExecutor): stdout=subprocess.PIPE, stderr=subprocess.PIPE) pitch_extract_process.stdin.close() - pitch_feat = dict( - kaldi_io.read_mat_ark(pitch_process.stdout))[utt_name] + pitch_feat = dict(kaldiio.load_ark(pitch_process.stdout))[utt_name] concated_feat = np.concatenate((fbank_feat, pitch_feat), axis=1) raw_feat = f"{utt_name}.raw" with WriteHelper( @@ -272,7 +271,7 @@ class STExecutor(BaseExecutor): stdin=cmvn_process.stdout, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - norm_feat = dict(kaldi_io.read_mat_ark(process.stdout))[utt_name] + norm_feat = dict(kaldiio.load_ark(process.stdout))[utt_name] self._inputs["audio"] = paddle.to_tensor(norm_feat).unsqueeze(0) self._inputs["audio_len"] = paddle.to_tensor( self._inputs["audio"].shape[1], dtype="int64") diff --git a/setup.py b/setup.py index 039ab82a..9aaaa6eb 100644 --- a/setup.py +++ b/setup.py @@ -37,7 +37,6 @@ requirements = { "jieba", "jsonlines", "kaldiio", - "kaldi_io", "librosa", "loguru", "matplotlib", From dc60aeb8c228583a86fae0fbdaf5f2c1c4d23a7f Mon Sep 17 00:00:00 2001 From: gongel Date: Thu, 9 Dec 2021 11:43:01 +0000 Subject: [PATCH 11/50] format --- paddlespeech/cli/st/infer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddlespeech/cli/st/infer.py b/paddlespeech/cli/st/infer.py index 2bc98512..32f9d425 100644 --- a/paddlespeech/cli/st/infer.py +++ b/paddlespeech/cli/st/infer.py @@ -23,9 +23,6 @@ import numpy as np import paddle import soundfile from kaldiio import WriteHelper -from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer -from 
paddlespeech.s2t.utils.dynamic_import import dynamic_import -from paddlespeech.s2t.utils.utility import UpdateConfig from yacs.config import CfgNode from ..executor import BaseExecutor @@ -33,6 +30,9 @@ from ..utils import cli_register from ..utils import download_and_decompress from ..utils import logger from ..utils import MODEL_HOME +from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer +from paddlespeech.s2t.utils.dynamic_import import dynamic_import +from paddlespeech.s2t.utils.utility import UpdateConfig __all__ = ["STExecutor"] From a3055460c279d52b15c95ba9ea18ec4a0084908b Mon Sep 17 00:00:00 2001 From: Mingxue-Xu <92848346+Mingxue-Xu@users.noreply.github.com> Date: Thu, 9 Dec 2021 19:55:27 +0800 Subject: [PATCH 12/50] Update README.md Add CLI test examples for QuickStart. --- README.md | 54 ++++++++++++++---------------------------------------- 1 file changed, 14 insertions(+), 40 deletions(-) diff --git a/README.md b/README.md index da413001..98594089 100644 --- a/README.md +++ b/README.md @@ -105,49 +105,23 @@ If you want to set up PaddleSpeech in other environment, please see the [install ## Quick Start -Developers can have a try of our model with only a few lines of code. - -A tiny DeepSpeech2 **Speech-to-Text** model training on toy set of LibriSpeech: +Developers can have a try of our models with [PaddleSpeech Command Line](./paddlespeech/cli/README.md). Change `--input` to test your own audio/text file. +**Audio Classification** ```shell -cd examples/tiny/asr0/ -# source the environment -source path.sh -source ../../../utils/parse_options.sh -# prepare data -bash ./local/data.sh -# train model, all `ckpt` under `exp` dir, if you use paddlepaddle-gpu, you can set CUDA_VISIBLE_DEVICES before the train script -./local/train.sh conf/deepspeech2.yaml deepspeech2 offline -# avg n best model to get the test model, in this case, n = 1 -avg.sh best exp/deepspeech2/checkpoints 1 -# evaluate the test model -./local/test.sh conf/deepspeech2.yaml exp/deepspeech2/checkpoints/avg_1 offline +paddlespeech cls --input ./test_audio.wav ``` - -For **Text-to-Speech**, try pretrained FastSpeech2 + Parallel WaveGAN on CSMSC: +**Automatic Speech Recognition** +```shell +paddlespeech asr --lang zh --sr 16000 --input ./input.wav +``` +**Speech Translation** (English to Chinese) +```shell +paddlespeech st --input ./test_audio.wav +``` +**Text-to-Speech** ```shell -cd examples/csmsc/tts3 -# download the pretrained models and unaip them -wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip -unzip pwg_baker_ckpt_0.4.zip -wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip -unzip fastspeech2_nosil_baker_ckpt_0.4.zip -# source the environment -source path.sh -# run end-to-end synthesize -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/synthesize_e2e.py \ - --fastspeech2-config=fastspeech2_nosil_baker_ckpt_0.4/default.yaml \ - --fastspeech2-checkpoint=fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \ - --fastspeech2-stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \ - --pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \ - --pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ - --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ - --text=${BIN_DIR}/../sentences.txt \ - --output-dir=exp/default/test_e2e \ - --inference-dir=exp/default/inference \ - 
--phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt +paddlespeech tts --lang zh --input ./test_text.txt ``` If you want to try more functions like training and tuning, please see [Speech-to-Text Quick Start](./docs/source/asr/quick_start.md) and [Text-to-Speech Quick Start](./docs/source/tts/quick_start.md). @@ -315,7 +289,7 @@ PaddleSpeech Text-to-Speech mainly contains three modules: *Text Frontend*, *Aco ## Tutorials -Normally, [Speech SoTA](https://paperswithcode.com/area/speech) gives you an overview of the hot academic topics in speech. To focus on the tasks in PaddleSpeech, you will find the following guidelines are helpful to grasp the core ideas. +Normally, [Speech SoTA](https://paperswithcode.com/area/speech) and [Audio SoTA](https://paperswithcode.com/area/audio) give you an overview of the hot academic topics in this area. To focus on the tasks in PaddleSpeech, you will find the following guidelines are helpful to grasp the core ideas. - [Overview](./docs/source/introduction.md) - Quick Start From 9289ec8bf0bc8056bcc8c243a757002f3b3c73a8 Mon Sep 17 00:00:00 2001 From: Mingxue-Xu <92848346+Mingxue-Xu@users.noreply.github.com> Date: Thu, 9 Dec 2021 19:59:59 +0800 Subject: [PATCH 13/50] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 98594089..3228d8db 100644 --- a/README.md +++ b/README.md @@ -289,7 +289,7 @@ PaddleSpeech Text-to-Speech mainly contains three modules: *Text Frontend*, *Aco ## Tutorials -Normally, [Speech SoTA](https://paperswithcode.com/area/speech) and [Audio SoTA](https://paperswithcode.com/area/audio) give you an overview of the hot academic topics in this area. To focus on the tasks in PaddleSpeech, you will find the following guidelines are helpful to grasp the core ideas. +Normally, [Speech SoTA](https://paperswithcode.com/area/speech), [Audio SoTA](https://paperswithcode.com/area/audio) and [Music SoTA](https://paperswithcode.com/area/music) give you an overview of the hot academic topics in the related area. To focus on the tasks in PaddleSpeech, you will find the following guidelines are helpful to grasp the core ideas. - [Overview](./docs/source/introduction.md) - Quick Start From ae885b2e648884a4934d33b0708161e038d8f43a Mon Sep 17 00:00:00 2001 From: TianYuan Date: Thu, 9 Dec 2021 20:10:47 +0800 Subject: [PATCH 14/50] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3228d8db..4e50bf09 100644 --- a/README.md +++ b/README.md @@ -121,7 +121,7 @@ paddlespeech st --input ./test_audio.wav ``` **Text-to-Speech** ```shell -paddlespeech tts --lang zh --input ./test_text.txt +paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!" ``` If you want to try more functions like training and tuning, please see [Speech-to-Text Quick Start](./docs/source/asr/quick_start.md) and [Text-to-Speech Quick Start](./docs/source/tts/quick_start.md). 
From 9f3d9aee138db7d6bb5b2a89b3abb2b615a16f0f Mon Sep 17 00:00:00 2001 From: TianYuan Date: Thu, 9 Dec 2021 20:14:06 +0800 Subject: [PATCH 15/50] Update README.md --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 4e50bf09..fcf5953a 100644 --- a/README.md +++ b/README.md @@ -109,19 +109,19 @@ Developers can have a try of our models with [PaddleSpeech Command Line](./paddl **Audio Classification** ```shell -paddlespeech cls --input ./test_audio.wav +paddlespeech cls --input input.wav ``` **Automatic Speech Recognition** ```shell -paddlespeech asr --lang zh --sr 16000 --input ./input.wav +paddlespeech asr --lang zh --input input_16k.wav ``` **Speech Translation** (English to Chinese) ```shell -paddlespeech st --input ./test_audio.wav +paddlespeech st --input input_16k.wav ``` **Text-to-Speech** ```shell -paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!" +paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!" --output output.wav ``` If you want to try more functions like training and tuning, please see [Speech-to-Text Quick Start](./docs/source/asr/quick_start.md) and [Text-to-Speech Quick Start](./docs/source/tts/quick_start.md). From f701882b6604326c1298d3be3799cf5a616d8c3c Mon Sep 17 00:00:00 2001 From: TianYuan Date: Thu, 9 Dec 2021 12:46:15 +0000 Subject: [PATCH 16/50] update add_style_melgan --- paddlespeech/t2s/models/melgan/melgan.py | 9 +++------ .../models/parallel_wavegan/parallel_wavegan_updater.py | 4 ++-- paddlespeech/t2s/modules/residual_stack.py | 3 +-- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/paddlespeech/t2s/models/melgan/melgan.py b/paddlespeech/t2s/models/melgan/melgan.py index 8dfc05a0..32fcf658 100644 --- a/paddlespeech/t2s/models/melgan/melgan.py +++ b/paddlespeech/t2s/models/melgan/melgan.py @@ -93,8 +93,7 @@ class MelGANGenerator(nn.Layer): initialize(self, init_type) # for compatibility - if nonlinear_activation == "LeakyReLU": - nonlinear_activation = "leakyrelu" + nonlinear_activation = nonlinear_activation.lower() # check hyper parameters is valid assert channels >= np.prod(upsample_scales) @@ -328,8 +327,7 @@ class MelGANDiscriminator(nn.Layer): super().__init__() # for compatibility - if nonlinear_activation == "LeakyReLU": - nonlinear_activation = "leakyrelu" + nonlinear_activation = nonlinear_activation.lower() # initialize parameters initialize(self, init_type) @@ -479,8 +477,7 @@ class MelGANMultiScaleDiscriminator(nn.Layer): initialize(self, init_type) # for compatibility - if nonlinear_activation == "LeakyReLU": - nonlinear_activation = "leakyrelu" + nonlinear_activation = nonlinear_activation.lower() self.discriminators = nn.LayerList() diff --git a/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan_updater.py b/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan_updater.py index 8ed50bc8..79707aa4 100644 --- a/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan_updater.py +++ b/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan_updater.py @@ -10,8 +10,7 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations -# under the License. +# limitations under the License. 
import logging from pathlib import Path from typing import Dict @@ -28,6 +27,7 @@ from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator from paddlespeech.t2s.training.reporter import report from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater from paddlespeech.t2s.training.updaters.standard_updater import UpdaterState + logging.basicConfig( format='%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s', datefmt='[%Y-%m-%d %H:%M:%S]') diff --git a/paddlespeech/t2s/modules/residual_stack.py b/paddlespeech/t2s/modules/residual_stack.py index ee3ba64d..b4f95229 100644 --- a/paddlespeech/t2s/modules/residual_stack.py +++ b/paddlespeech/t2s/modules/residual_stack.py @@ -60,8 +60,7 @@ class ResidualStack(nn.Layer): """ super().__init__() # for compatibility - if nonlinear_activation == "LeakyReLU": - nonlinear_activation = "leakyrelu" + nonlinear_activation = nonlinear_activation.lower() # defile residual stack part if not use_causal_conv: From 3701fba0be50a680a8ff3d4eb4fd46209fd21905 Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Thu, 9 Dec 2021 20:03:56 +0800 Subject: [PATCH 17/50] Update download logic and fix README typos. --- demos/audio_tagging/README.md | 2 +- demos/speech_recognition/README.md | 2 +- demos/speech_translation/README.md | 2 +- paddlespeech/cli/asr/infer.py | 2 +- paddlespeech/cli/cls/infer.py | 6 +- paddlespeech/cli/download.py | 57 ++---------------- paddlespeech/cli/log.py | 60 +++++++++++++++++++ paddlespeech/cli/st/infer.py | 2 +- paddlespeech/cli/tts/infer.py | 4 +- paddlespeech/cli/utils.py | 95 ++---------------------------- 10 files changed, 81 insertions(+), 151 deletions(-) create mode 100644 paddlespeech/cli/log.py diff --git a/demos/audio_tagging/README.md b/demos/audio_tagging/README.md index 5073393d..1144cbb1 100644 --- a/demos/audio_tagging/README.md +++ b/demos/audio_tagging/README.md @@ -22,7 +22,7 @@ wget https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav https://paddlespeech ### 3. Usage - Command Line(Recommended) ```bash - paddlespeech cls --input ~/cat.wav --topk 10 + paddlespeech cls --input ./cat.wav --topk 10 ``` Usage: ```bash diff --git a/demos/speech_recognition/README.md b/demos/speech_recognition/README.md index 60ee8e4d..c9116531 100644 --- a/demos/speech_recognition/README.md +++ b/demos/speech_recognition/README.md @@ -22,7 +22,7 @@ wget https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech. ### 3. Usage - Command Line(Recommended) ```bash - paddlespeech asr --input ~/zh.wav + paddlespeech asr --input ./zh.wav ``` Usage: ```bash diff --git a/demos/speech_translation/README.md b/demos/speech_translation/README.md index b2f29168..8bb322c5 100644 --- a/demos/speech_translation/README.md +++ b/demos/speech_translation/README.md @@ -22,7 +22,7 @@ wget https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech. ### 3. 
Usage - Command Line(Recommended) ```bash - paddlespeech st --input ~/en.wav + paddlespeech st --input ./en.wav ``` Usage: ```bash diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py index 2db239c0..00f21293 100644 --- a/paddlespeech/cli/asr/infer.py +++ b/paddlespeech/cli/asr/infer.py @@ -27,9 +27,9 @@ import yaml from yacs.config import CfgNode from ..executor import BaseExecutor +from ..log import logger from ..utils import cli_register from ..utils import download_and_decompress -from ..utils import logger from ..utils import MODEL_HOME from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.transform.transformation import Transformation diff --git a/paddlespeech/cli/cls/infer.py b/paddlespeech/cli/cls/infer.py index 795d59f6..37f2a9d2 100644 --- a/paddlespeech/cli/cls/infer.py +++ b/paddlespeech/cli/cls/infer.py @@ -20,14 +20,14 @@ from typing import Union import numpy as np import paddle import yaml -from paddleaudio import load -from paddleaudio.features import LogMelSpectrogram from ..executor import BaseExecutor +from ..log import logger from ..utils import cli_register from ..utils import download_and_decompress -from ..utils import logger from ..utils import MODEL_HOME +from paddleaudio import load +from paddleaudio.features import LogMelSpectrogram from paddlespeech.s2t.utils.dynamic_import import dynamic_import __all__ = ['CLSExecutor'] diff --git a/paddlespeech/cli/download.py b/paddlespeech/cli/download.py index 8de9f045..0f09b6fa 100644 --- a/paddlespeech/cli/download.py +++ b/paddlespeech/cli/download.py @@ -20,49 +20,21 @@ import os import os.path as osp import shutil import subprocess -import sys import tarfile import time import zipfile import requests +from tqdm import tqdm -try: - from tqdm import tqdm -except: +from .log import logger - class tqdm(object): - def __init__(self, total=None): - self.total = total - self.n = 0 - - def update(self, n): - self.n += n - if self.total is None: - sys.stderr.write("\r{0:.1f} bytes".format(self.n)) - else: - sys.stderr.write( - "\r{0:.1f}%".format(100 * self.n / float(self.total))) - sys.stderr.flush() - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - sys.stderr.write('\n') - - -import logging -logger = logging.getLogger(__name__) - -__all__ = ['get_weights_path_from_url'] - -WEIGHTS_HOME = osp.expanduser("~/.cache/paddle/hapi/weights") +__all__ = ['get_path_from_url'] DOWNLOAD_RETRY_LIMIT = 3 -def is_url(path): +def _is_url(path): """ Whether path is URL. Args: @@ -71,25 +43,6 @@ def is_url(path): return path.startswith('http://') or path.startswith('https://') -def get_weights_path_from_url(url, md5sum=None): - """Get weights path from WEIGHT_HOME, if not exists, - download it from url. - Args: - url (str): download url - md5sum (str): md5 sum of download package - - Returns: - str: a local path to save downloaded weights. - Examples: - .. 
code-block:: python - from paddle.utils.download import get_weights_path_from_url - resnet18_pretrained_weight_url = 'https://paddle-hapi.bj.bcebos.com/models/resnet18.pdparams' - local_weight_path = get_weights_path_from_url(resnet18_pretrained_weight_url) - """ - path = get_path_from_url(url, WEIGHTS_HOME, md5sum) - return path - - def _map_path(url, root_dir): # parse path after download under root_dir fname = osp.split(url)[-1] @@ -135,7 +88,7 @@ def get_path_from_url(url, from paddle.fluid.dygraph.parallel import ParallelEnv - assert is_url(url), "downloading from {} not a url".format(url) + assert _is_url(url), "downloading from {} not a url".format(url) # parse path after download to decompress under root_dir fullpath = _map_path(url, root_dir) # Mainly used to solve the problem of downloading data from different diff --git a/paddlespeech/cli/log.py b/paddlespeech/cli/log.py new file mode 100644 index 00000000..891b71a9 --- /dev/null +++ b/paddlespeech/cli/log.py @@ -0,0 +1,60 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import functools +import logging + +__all__ = [ + 'logger', +] + + +class Logger(object): + def __init__(self, name: str=None): + name = 'PaddleSpeech' if not name else name + self.logger = logging.getLogger(name) + + log_config = { + 'DEBUG': 10, + 'INFO': 20, + 'TRAIN': 21, + 'EVAL': 22, + 'WARNING': 30, + 'ERROR': 40, + 'CRITICAL': 50, + 'EXCEPTION': 100, + } + for key, level in log_config.items(): + logging.addLevelName(level, key) + if key == 'EXCEPTION': + self.__dict__[key.lower()] = self.logger.exception + else: + self.__dict__[key.lower()] = functools.partial(self.__call__, + level) + + self.format = logging.Formatter( + fmt='[%(asctime)-15s] [%(levelname)8s] [%(filename)s] [L%(lineno)d] - %(message)s' + ) + + self.handler = logging.StreamHandler() + self.handler.setFormatter(self.format) + + self.logger.addHandler(self.handler) + self.logger.setLevel(logging.DEBUG) + self.logger.propagate = False + + def __call__(self, log_level: str, msg: str): + self.logger.log(log_level, msg) + + +logger = Logger() diff --git a/paddlespeech/cli/st/infer.py b/paddlespeech/cli/st/infer.py index 32f9d425..6bb82821 100644 --- a/paddlespeech/cli/st/infer.py +++ b/paddlespeech/cli/st/infer.py @@ -26,9 +26,9 @@ from kaldiio import WriteHelper from yacs.config import CfgNode from ..executor import BaseExecutor +from ..log import logger from ..utils import cli_register from ..utils import download_and_decompress -from ..utils import logger from ..utils import MODEL_HOME from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.utils.dynamic_import import dynamic_import diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py index 771b7d6d..d5eac2b2 100644 --- a/paddlespeech/cli/tts/infer.py +++ b/paddlespeech/cli/tts/infer.py @@ -25,9 +25,9 @@ import yaml from yacs.config import CfgNode from ..executor import BaseExecutor +from ..log import logger 
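The new `paddlespeech/cli/log.py` above gives the CLI a single shared logger and registers extra severity levels (`TRAIN`, `EVAL`, and so on) alongside the standard ones. A minimal standard-library sketch of what that `logging.addLevelName` registration does (the names and format string here are illustrative):

```python
import logging

# Register a custom level the same way the Logger class above does.
logging.addLevelName(21, 'TRAIN')

logging.basicConfig(format='[%(levelname)8s] %(message)s', level=logging.DEBUG)
log = logging.getLogger('sketch')

log.log(21, 'epoch 1 | step 100 | loss 0.42')  # prints: [   TRAIN] epoch 1 | ...
```

In the class above, each registered level additionally becomes a bound method such as `logger.train(...)` via `functools.partial`.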
from ..utils import cli_register from ..utils import download_and_decompress -from ..utils import logger from ..utils import MODEL_HOME from paddlespeech.s2t.utils.dynamic_import import dynamic_import from paddlespeech.t2s.frontend import English @@ -535,7 +535,7 @@ class TTSExecutor(BaseExecutor): wav = self.voc_inference(mel) self._outputs['wav'] = wav - def postprocess(self, output: str='output.wav'): + def postprocess(self, output: str='output.wav') -> Union[str, os.PathLike]: """ Output postprocess and return results. This method get model output from self._outputs and convert it into human-readable results. diff --git a/paddlespeech/cli/utils.py b/paddlespeech/cli/utils.py index 6ae6e7e5..8ba780a7 100644 --- a/paddlespeech/cli/utils.py +++ b/paddlespeech/cli/utils.py @@ -11,15 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import functools -import hashlib -import logging import os import tarfile import zipfile from typing import Any from typing import Dict -from typing import List from paddle.framework import load @@ -31,7 +27,6 @@ __all__ = [ 'get_command', 'download_and_decompress', 'load_state_dict_from_url', - 'logger', ] @@ -59,23 +54,6 @@ def get_command(name: str) -> Any: return com['_entry'] -def _md5check(filepath: os.PathLike, md5sum: str) -> bool: - logger.info("File {} md5 checking...".format(filepath)) - md5 = hashlib.md5() - with open(filepath, 'rb') as f: - for chunk in iter(lambda: f.read(4096), b""): - md5.update(chunk) - calc_md5sum = md5.hexdigest() - - if calc_md5sum != md5sum: - logger.info("File {} md5 check failed, {}(calc) != " - "{}(base)".format(filepath, calc_md5sum, md5sum)) - return False - else: - logger.info("File {} md5 check passed.".format(filepath)) - return True - - def _get_uncompress_path(filepath: os.PathLike) -> os.PathLike: file_dir = os.path.dirname(filepath) if tarfile.is_tarfile(filepath): @@ -86,11 +64,12 @@ def _get_uncompress_path(filepath: os.PathLike) -> os.PathLike: file_list = files.namelist() else: return file_dir - if _is_a_single_file(file_list): + + if download._is_a_single_file(file_list): rootpath = file_list[0] uncompressed_path = os.path.join(file_dir, rootpath) - elif _is_a_single_dir(file_list): - rootpath = os.path.splitext(file_list[0])[0].split(os.sep)[0] + elif download._is_a_single_dir(file_list): + rootpath = os.path.splitext(file_list[0])[0].split(os.sep)[-1] uncompressed_path = os.path.join(file_dir, rootpath) else: rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1] @@ -100,28 +79,6 @@ def _get_uncompress_path(filepath: os.PathLike) -> os.PathLike: return uncompressed_path -def _is_a_single_file(file_list: List[os.PathLike]) -> bool: - if len(file_list) == 1 and file_list[0].find(os.sep) < -1: - return True - return False - - -def _is_a_single_dir(file_list: List[os.PathLike]) -> bool: - new_file_list = [] - for file_path in file_list: - if '/' in file_path: - file_path = file_path.replace('/', os.sep) - elif '\\' in file_path: - file_path = file_path.replace('\\', os.sep) - new_file_list.append(file_path) - - file_name = new_file_list[0].split(os.sep)[0] - for i in range(1, len(new_file_list)): - if file_name != new_file_list[i].split(os.sep)[0]: - return False - return True - - def download_and_decompress(archive: Dict[str, str], path: str) -> os.PathLike: """ Download archieves and decompress to specific path. 
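For orientation, a sketch of how the `download_and_decompress` helper above is called. The `url` and `md5` values below are placeholders rather than a real released archive; real entries live in each executor's pretrained-model table:

```python
from paddlespeech.cli.utils import MODEL_HOME, download_and_decompress

# Placeholder archive entry -- both keys are required by the assert above.
archive = {
    'url': 'https://example.com/models/some_model.tar.gz',  # hypothetical
    'md5': '0123456789abcdef0123456789abcdef',              # hypothetical
}
# Skips the download when the file already exists and its md5 matches,
# then decompresses (once) and returns the unpacked directory.
uncompressed_dir = download_and_decompress(archive, MODEL_HOME)
```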
@@ -133,7 +90,8 @@ def download_and_decompress(archive: Dict[str, str], path: str) -> os.PathLike:
         'Dictionary keys of "url" and "md5" are required in the archive, but got: {}'.format(list(archive.keys()))
 
     filepath = os.path.join(path, os.path.basename(archive['url']))
-    if os.path.isfile(filepath) and _md5check(filepath, archive['md5']):
+    if os.path.isfile(filepath) and download._md5check(filepath,
+                                                       archive['md5']):
         uncompress_path = _get_uncompress_path(filepath)
         if not os.path.isdir(uncompress_path):
             download._decompress(filepath)
@@ -183,44 +141,3 @@ def _get_sub_home(directory):
 
 PPSPEECH_HOME = _get_paddlespcceh_home()
 MODEL_HOME = _get_sub_home('models')
-
-
-class Logger(object):
-    def __init__(self, name: str=None):
-        name = 'PaddleSpeech' if not name else name
-        self.logger = logging.getLogger(name)
-
-        log_config = {
-            'DEBUG': 10,
-            'INFO': 20,
-            'TRAIN': 21,
-            'EVAL': 22,
-            'WARNING': 30,
-            'ERROR': 40,
-            'CRITICAL': 50,
-            'EXCEPTION': 100,
-        }
-        for key, level in log_config.items():
-            logging.addLevelName(level, key)
-            if key == 'EXCEPTION':
-                self.__dict__[key.lower()] = self.logger.exception
-            else:
-                self.__dict__[key.lower()] = functools.partial(self.__call__,
-                                                               level)
-
-        self.format = logging.Formatter(
-            fmt='[%(asctime)-15s] [%(levelname)8s] [%(filename)s] [L%(lineno)d] - %(message)s'
-        )
-
-        self.handler = logging.StreamHandler()
-        self.handler.setFormatter(self.format)
-
-        self.logger.addHandler(self.handler)
-        self.logger.setLevel(logging.DEBUG)
-        self.logger.propagate = False
-
-    def __call__(self, log_level: str, msg: str):
-        self.logger.log(log_level, msg)
-
-
-logger = Logger()

From 1909f2f620c7eeee7faff2cada38cd59695bfb27 Mon Sep 17 00:00:00 2001
From: KP <109694228@qq.com>
Date: Thu, 9 Dec 2021 21:07:32 +0800
Subject: [PATCH 18/50] Add tts demo.

---
 demos/text_to_speech/README.md | 102 +++++++++++++++++++++++++++++++++
 paddlespeech/cli/tts/infer.py  |   5 +-
 paddlespeech/cli/utils.py      |   7 ++-
 3 files changed, 112 insertions(+), 2 deletions(-)
 create mode 100644 demos/text_to_speech/README.md

diff --git a/demos/text_to_speech/README.md b/demos/text_to_speech/README.md
new file mode 100644
index 00000000..9751adf2
--- /dev/null
+++ b/demos/text_to_speech/README.md
@@ -0,0 +1,102 @@
+# TTS(Text To Speech)
+
+## Introduction
+Text-to-speech (TTS) is a natural language modeling process that requires changing units of text into units of speech for audio presentation.
+
+This demo is an implementation to generate audio from the given text. It can be done by a single command or a few lines in python using `PaddleSpeech`.
+
+## Usage
+### 1. Installation
+```bash
+pip install paddlespeech
+```
+
+### 2. Prepare Input
+Input of this demo should be a text of the specific language that can be passed via argument.
+
+
+### 3. Usage
+- Command Line(Recommended)
+  ```bash
+  paddlespeech tts --input 今天的天气不错啊
+  ```
+  Usage:
+  ```bash
+  paddlespeech tts --help
+  ```
+  Arguments:
+  - `input`(required): Input text to generate.
+  - `am`: Acoustic model type of tts task. Default: `fastspeech2_csmsc`.
+  - `am_config`: Config of acoustic model. Use default config when it is None. Default: `None`.
+  - `am_ckpt`: Acoustic model checkpoint. Use pretrained model when it is None. Default: `None`.
+  - `am_stat`: Mean and standard deviation used to normalize spectrogram when training acoustic model. Default: `None`.
+  - `phones_dict`: Phone vocabulary file. Default: `None`.
+  - `tones_dict`: Tone vocabulary file. Default: `None`.
+  - `speaker_dict`: Speaker id map file. Default: `None`.
+  - `spk_id`: Speaker id for multi speaker acoustic model. Default: `0`.
+  - `voc`: Vocoder type of tts task. Default: `pwgan_csmsc`.
+  - `voc_config`: Config of vocoder. Use default config when it is None. Default: `None`.
+  - `voc_ckpt`: Vocoder checkpoint. Use pretrained model when it is None. Default: `None`.
+  - `voc_stat`: Mean and standard deviation used to normalize spectrogram when training vocoder. Default: `None`.
+  - `lang`: Language of tts task. Default: `zh`.
+  - `device`: Choose device to execute model inference. Default: default device of paddlepaddle in current environment.
+  - `output`: Output wave filepath. Default: `output.wav`.
+
+  Output:
+  ```bash
+  [2021-12-09 20:49:58,955] [    INFO] [log.py] [L57] - Wave file has been generated: output.wav
+  ```
+
+- Python API
+  ```python
+  import paddle
+  from paddlespeech.cli import TTSExecutor
+
+  tts_executor = TTSExecutor()
+  wav_file = tts_executor(
+      text='今天的天气不错啊',
+      output='output.wav',
+      am='fastspeech2_csmsc',
+      am_config=None,
+      am_ckpt=None,
+      am_stat=None,
+      spk_id=0,
+      phones_dict=None,
+      tones_dict=None,
+      speaker_dict=None,
+      voc='pwgan_csmsc',
+      voc_config=None,
+      voc_ckpt=None,
+      voc_stat=None,
+      lang='zh',
+      device=paddle.get_device())
+  print('Wave file has been generated: {}'.format(wav_file))
+  ```
+
+  Output:
+  ```bash
+  Wave file has been generated: output.wav
+  ```
+
+
+### 4.Pretrained Models
+
+Here is a list of pretrained models released by PaddleSpeech that can be used by command and python api:
+
+- Acoustic model
+  | Model | Language
+  | :--- | :---: |
+  | speedyspeech_csmsc| zh
+  | fastspeech2_csmsc| zh
+  | fastspeech2_aishell3| zh
+  | fastspeech2_ljspeech| en
+  | fastspeech2_vctk| en
+
+- Vocoder
+  | Model | Language
+  | :--- | :---: |
+  | pwgan_csmsc| zh
+  | pwgan_aishell3| zh
+  | pwgan_ljspeech| en
+  | pwgan_vctk| en
+  | mb_melgan_csmsc| zh
diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py
index d5eac2b2..8fe5f90a 100644
--- a/paddlespeech/cli/tts/infer.py
+++ b/paddlespeech/cli/tts/infer.py
@@ -236,6 +236,7 @@ class TTSExecutor(BaseExecutor):
         self.parser.add_argument(
             "--am_stat",
             type=str,
+            default=None,
             help="mean and standard deviation used to normalize spectrogram when training acoustic model."
         )
         self.parser.add_argument(
@@ -282,6 +283,7 @@ self.parser.add_argument(
         self.parser.add_argument(
             "--voc_stat",
             type=str,
+            default=None,
             help="mean and standard deviation used to normalize spectrogram when training voc."
         )
         # other
@@ -543,6 +545,7 @@ class TTSExecutor(BaseExecutor):
         Returns:
             Union[str, os.PathLike]: Human-readable results such as texts and audio files.
""" + output = os.path.abspath(os.path.expanduser(output)) sf.write( output, self._outputs['wav'].numpy(), samplerate=self.am_config.fs) return output @@ -593,7 +596,7 @@ class TTSExecutor(BaseExecutor): lang=lang, device=device, output=output) - logger.info('TTS Result Saved in: {}'.format(res)) + logger.info('Wave file has been generated: {}'.format(res)) return True except Exception as e: logger.exception(e) diff --git a/paddlespeech/cli/utils.py b/paddlespeech/cli/utils.py index 8ba780a7..ee31b771 100644 --- a/paddlespeech/cli/utils.py +++ b/paddlespeech/cli/utils.py @@ -56,12 +56,14 @@ def get_command(name: str) -> Any: def _get_uncompress_path(filepath: os.PathLike) -> os.PathLike: file_dir = os.path.dirname(filepath) + is_zip_file = False if tarfile.is_tarfile(filepath): files = tarfile.open(filepath, "r:*") file_list = files.getnames() elif zipfile.is_zipfile(filepath): files = zipfile.ZipFile(filepath, 'r') file_list = files.namelist() + is_zip_file = True else: return file_dir @@ -69,7 +71,10 @@ def _get_uncompress_path(filepath: os.PathLike) -> os.PathLike: rootpath = file_list[0] uncompressed_path = os.path.join(file_dir, rootpath) elif download._is_a_single_dir(file_list): - rootpath = os.path.splitext(file_list[0])[0].split(os.sep)[-1] + if is_zip_file: + rootpath = os.path.splitext(file_list[0])[0].split(os.sep)[0] + else: + rootpath = os.path.splitext(file_list[0])[0].split(os.sep)[-1] uncompressed_path = os.path.join(file_dir, rootpath) else: rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1] From 662b10dbeda3141f8a2b2496058f1fa6132b4e82 Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Fri, 10 Dec 2021 10:30:04 +0800 Subject: [PATCH 19/50] Add run.sh. --- demos/audio_tagging/run.sh | 4 ++++ demos/speech_recognition/run.sh | 4 ++++ demos/speech_translation/run.sh | 4 ++++ demos/text_to_speech/run.sh | 3 +++ 4 files changed, 15 insertions(+) create mode 100755 demos/audio_tagging/run.sh create mode 100755 demos/speech_recognition/run.sh create mode 100755 demos/speech_translation/run.sh create mode 100755 demos/text_to_speech/run.sh diff --git a/demos/audio_tagging/run.sh b/demos/audio_tagging/run.sh new file mode 100755 index 00000000..b30eba35 --- /dev/null +++ b/demos/audio_tagging/run.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/dog.wav +paddlespeech cls --input ./cat.wav --topk 10 diff --git a/demos/speech_recognition/run.sh b/demos/speech_recognition/run.sh new file mode 100755 index 00000000..5efc8b81 --- /dev/null +++ b/demos/speech_recognition/run.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +paddlespeech asr --input ./zh.wav diff --git a/demos/speech_translation/run.sh b/demos/speech_translation/run.sh new file mode 100755 index 00000000..6619bd91 --- /dev/null +++ b/demos/speech_translation/run.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +paddlespeech st --input ./en.wav diff --git a/demos/text_to_speech/run.sh b/demos/text_to_speech/run.sh new file mode 100755 index 00000000..c2487aee --- /dev/null +++ b/demos/text_to_speech/run.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +paddlespeech tts --input 今天的天气不错啊 From 6fa373001d84630a36d4a5d297c38258b455ccab Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 10 Dec 2021 14:16:33 +0800 Subject: [PATCH 
20/50] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index fcf5953a..c98fe136 100644 --- a/README.md +++ b/README.md @@ -309,7 +309,7 @@ Normally, [Speech SoTA](https://paperswithcode.com/area/speech), [Audio SoTA](ht - [Test Audio Samples](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html) and [PaddleSpeech VS. Espnet](https://paddlespeech.readthedocs.io/en/latest/tts/demo_2.html) - [Released Models](./docs/source/released_model.md) -The TTS module is originally called [Parakeet](https://github.com/PaddlePaddle/Parakeet), and now merged with DeepSpeech. If you are interested in academic research about this function, please see [TTS research overview](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/docs/source/tts#overview). Also, [this document](https://paddlespeech.readthedocs.io/en/latest/tts/models_introduction.html) is a good guideline for the pipeline components. +The TTS module is originally called [Parakeet](https://github.com/PaddlePaddle/Parakeet), and now merged with DeepSpeech. If you are interested in academic research about this function, please see [TTS research overview](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/docs/source/tts#overview). Also, [this document](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/tts/models_introduction.md) is a good guideline for the pipeline components. ## FAQ and Contributing From d4a76e41cd792c610581ff50222819e320f8f114 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 10 Dec 2021 14:29:56 +0800 Subject: [PATCH 21/50] Update released_model.md --- docs/source/released_model.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/released_model.md b/docs/source/released_model.md index 367b7c4b..58ad2286 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -37,7 +37,7 @@ Model Type | Dataset| Example Link | Pretrained Models|Static Models|Siize(stati Tacotron2|LJSpeech|[tacotron2-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.3.zip)||| TransformerTTS| LJSpeech| [transformer-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts1)|[transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/transformer_tts/transformer_tts_ljspeech_ckpt_0.4.zip)||| SpeedySpeech| CSMSC | [speedyspeech-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts2) |[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_ckpt_0.5.zip)|[speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip)|12MB| -FastSpeech2| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip)|[fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip)|157MB| +FastSpeech2| CSMSC 
|[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip)
[fastspeech2_conformer_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_conformer_baker_ckpt_0.5.zip)|[fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip)|157MB| FastSpeech2| AISHELL-3 |[fastspeech2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3)|[fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip)||| FastSpeech2| LJSpeech |[fastspeech2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts3)|[fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)||| FastSpeech2| VCTK |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip)||| From 92b6af82e4c6965a243dc4b65c73802d3483369c Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 10 Dec 2021 14:31:30 +0800 Subject: [PATCH 22/50] Update released_model.md --- docs/source/released_model.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/released_model.md b/docs/source/released_model.md index 58ad2286..f0f8df1a 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -37,7 +37,8 @@ Model Type | Dataset| Example Link | Pretrained Models|Static Models|Siize(stati Tacotron2|LJSpeech|[tacotron2-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.3.zip)||| TransformerTTS| LJSpeech| [transformer-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts1)|[transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/transformer_tts/transformer_tts_ljspeech_ckpt_0.4.zip)||| SpeedySpeech| CSMSC | [speedyspeech-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts2) |[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_ckpt_0.5.zip)|[speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip)|12MB| -FastSpeech2| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip)
[fastspeech2_conformer_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_conformer_baker_ckpt_0.5.zip)|[fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip)|157MB| +FastSpeech2| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip)|[fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip)|157MB| +FastSpeech2-Conformer| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_conformer_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_conformer_baker_ckpt_0.5.zip)||| FastSpeech2| AISHELL-3 |[fastspeech2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3)|[fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip)||| FastSpeech2| LJSpeech |[fastspeech2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts3)|[fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)||| FastSpeech2| VCTK |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip)||| From df5fe035e5d724e80cefe9401cf66fb443c58fce Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 10 Dec 2021 15:31:49 +0800 Subject: [PATCH 23/50] Update README.md --- paddlespeech/cli/README.md | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/paddlespeech/cli/README.md b/paddlespeech/cli/README.md index 264d66f7..25f1f718 100644 --- a/paddlespeech/cli/README.md +++ b/paddlespeech/cli/README.md @@ -3,10 +3,26 @@ The simplest approach to use PaddleSpeech models. ## Help - `paddlespeech help` + ```bash + paddlespeech help + ``` + ## Audio Classification + ```bash + paddlespeech cls --input input.wav + ``` - ## ASR - `paddlespeech asr --input ./test_audio.wav` - - ## Multi-label Classification - `paddlespeech cls --input ./test_audio.wav` + ## Automatic Speech Recognition + ``` + paddlespeech asr --lang zh --input input_16k.wav + ``` + + ## Speech Translation (English to Chinese) + ```bash + paddlespeech st --input input_16k.wav + ``` + + ## Text-to-Speech + ```bash + paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!" 
--output output.wav + ``` + From 59f051a867ce29f1005a4de9764547775b74b2ca Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 10 Dec 2021 15:49:11 +0800 Subject: [PATCH 24/50] Update README.md --- demos/text_to_speech/README.md | 39 ++++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/demos/text_to_speech/README.md b/demos/text_to_speech/README.md index 9751adf2..caa372ed 100644 --- a/demos/text_to_speech/README.md +++ b/demos/text_to_speech/README.md @@ -1,3 +1,4 @@ + # TTS(Text To Speech) ## Introduction @@ -13,15 +14,35 @@ pip install paddlespeech ### 2. Prepare Input Input of this demo should be a text of the specific language that can be passed via argument. - - ### 3. Usage -- Command Line(Recommended) - ```bash - paddlespeech tts --input 今天的天气不错啊 - ``` - Usage: - ```bash +- Command Line (Recommended) + - Chinese + ```bash + paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!" + ``` + The default acoustic model is `Fastspeech2`, and the default vocoder is `Parallel WaveGAN`. + - Chinese, use `SpeedySpeech` as acoustic model + ```bash + paddlespeech tts --am speedyspeech_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" + ``` + - Chinese, multi speaker + ```bash + paddlespeech tts --am fastspeech2_aishell3 --voc pwgan_aishell3 --input "你好,欢迎使用百度飞桨深度学习框架!" --spk_id 0 + ``` + You can change `spk_id` here. + - English + ```bash + paddlespeech tts --am fastspeech2_ljspeech --voc pwgan_ljspeech --lang en --input "hello world" + ``` + - English, multi speaker + ```bash + paddlespeech tts --am fastspeech2_vctk --voc pwgan_vctk --input "hello, boys" --lang en --spk_id 0 + ``` + You can change `spk_id` here. + + +- Usage: +```bash paddlespeech tts --help ``` Arguments: @@ -79,7 +100,7 @@ Input of this demo should be a text of the specific language that can be passed ``` -### 4.Pretrained Models +### 4. Pretrained Models Here is a list of pretrained models released by PaddleSpeech that can be used by command and python api: From 11a48901bac5c8a3df9506ba883cfcbd160af0ac Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 10 Dec 2021 15:51:17 +0800 Subject: [PATCH 25/50] Update README.md --- demos/text_to_speech/README.md | 36 +++++++++++++++++----------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/demos/text_to_speech/README.md b/demos/text_to_speech/README.md index caa372ed..b8b75f83 100644 --- a/demos/text_to_speech/README.md +++ b/demos/text_to_speech/README.md @@ -19,26 +19,26 @@ Input of this demo should be a text of the specific language that can be passed - Chinese ```bash paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!" - ``` + ``` The default acoustic model is `Fastspeech2`, and the default vocoder is `Parallel WaveGAN`. - - Chinese, use `SpeedySpeech` as acoustic model + - Chinese, use `SpeedySpeech` as acoustic model ```bash - paddlespeech tts --am speedyspeech_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" - ``` - - Chinese, multi speaker - ```bash - paddlespeech tts --am fastspeech2_aishell3 --voc pwgan_aishell3 --input "你好,欢迎使用百度飞桨深度学习框架!" --spk_id 0 - ``` - You can change `spk_id` here. - - English - ```bash - paddlespeech tts --am fastspeech2_ljspeech --voc pwgan_ljspeech --lang en --input "hello world" - ``` - - English, multi speaker - ```bash - paddlespeech tts --am fastspeech2_vctk --voc pwgan_vctk --input "hello, boys" --lang en --spk_id 0 - ``` - You can change `spk_id` here. + paddlespeech tts --am speedyspeech_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" 
+ ``` + - Chinese, multi speaker + ```bash + paddlespeech tts --am fastspeech2_aishell3 --voc pwgan_aishell3 --input "你好,欢迎使用百度飞桨深度学习框架!" --spk_id 0 + ``` + You can change `spk_id` here. + - English + ```bash + paddlespeech tts --am fastspeech2_ljspeech --voc pwgan_ljspeech --lang en --input "hello world" + ``` + - English, multi speaker + ```bash + paddlespeech tts --am fastspeech2_vctk --voc pwgan_vctk --input "hello, boys" --lang en --spk_id 0 + ``` + You can change `spk_id` here. - Usage: From 02b3b09987e5daf558c3a492c5146be14ead43ee Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 10 Dec 2021 15:53:40 +0800 Subject: [PATCH 26/50] Update README.md --- demos/text_to_speech/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/demos/text_to_speech/README.md b/demos/text_to_speech/README.md index b8b75f83..518f9adc 100644 --- a/demos/text_to_speech/README.md +++ b/demos/text_to_speech/README.md @@ -25,20 +25,20 @@ Input of this demo should be a text of the specific language that can be passed ```bash paddlespeech tts --am speedyspeech_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" ``` - - Chinese, multi speaker + - Chinese, multi speaker ```bash paddlespeech tts --am fastspeech2_aishell3 --voc pwgan_aishell3 --input "你好,欢迎使用百度飞桨深度学习框架!" --spk_id 0 ``` You can change `spk_id` here. - - English + - English ```bash paddlespeech tts --am fastspeech2_ljspeech --voc pwgan_ljspeech --lang en --input "hello world" ``` - English, multi speaker ```bash paddlespeech tts --am fastspeech2_vctk --voc pwgan_vctk --input "hello, boys" --lang en --spk_id 0 - ``` - You can change `spk_id` here. + ``` + You can change `spk_id` here. - Usage: From 88a8ed4fc306588589e476fba8ee3460228c01aa Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 10 Dec 2021 15:54:05 +0800 Subject: [PATCH 27/50] Update README.md --- demos/text_to_speech/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demos/text_to_speech/README.md b/demos/text_to_speech/README.md index 518f9adc..2474a900 100644 --- a/demos/text_to_speech/README.md +++ b/demos/text_to_speech/README.md @@ -1,5 +1,5 @@ -# TTS(Text To Speech) +# TTS (Text To Speech) ## Introduction Text-to-speech (TTS) is a natural language modeling process that requires changing units of text into units of speech for audio presentation. From a34a6b1d2071914fd737437f858caf2157b7af50 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 10 Dec 2021 15:56:35 +0800 Subject: [PATCH 28/50] Update README.md --- demos/text_to_speech/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/demos/text_to_speech/README.md b/demos/text_to_speech/README.md index 2474a900..a07120e2 100644 --- a/demos/text_to_speech/README.md +++ b/demos/text_to_speech/README.md @@ -37,12 +37,12 @@ Input of this demo should be a text of the specific language that can be passed - English, multi speaker ```bash paddlespeech tts --am fastspeech2_vctk --voc pwgan_vctk --input "hello, boys" --lang en --spk_id 0 - ``` + ``` You can change `spk_id` here. 
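The `spk_id` switch shown above works the same way through the Python API. A sketch mirroring this README's `TTSExecutor` example, but with the multi-speaker AISHELL-3 models; the particular `spk_id` value is arbitrary, and valid ids come from the model's `speaker_dict`:

```python
import paddle
from paddlespeech.cli import TTSExecutor

tts_executor = TTSExecutor()
wav_file = tts_executor(
    text='你好,欢迎使用百度飞桨深度学习框架!',
    output='output_spk1.wav',
    am='fastspeech2_aishell3',   # multi-speaker acoustic model
    am_config=None,
    am_ckpt=None,
    am_stat=None,
    spk_id=1,                    # assumed valid id; see the model's speaker_dict
    phones_dict=None,
    tones_dict=None,
    speaker_dict=None,
    voc='pwgan_aishell3',        # matching multi-speaker vocoder
    voc_config=None,
    voc_ckpt=None,
    voc_stat=None,
    lang='zh',
    device=paddle.get_device())
print('Wave file has been generated: {}'.format(wav_file))
```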
- -- Usage: -```bash + Usage: + + ```bash paddlespeech tts --help ``` Arguments: From f9672e16ae0523bdfc2ffff3565a80cf1ba73bbf Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 10 Dec 2021 15:57:40 +0800 Subject: [PATCH 29/50] Update README.md --- demos/text_to_speech/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/demos/text_to_speech/README.md b/demos/text_to_speech/README.md index a07120e2..a1198b4b 100644 --- a/demos/text_to_speech/README.md +++ b/demos/text_to_speech/README.md @@ -1,4 +1,5 @@ + # TTS (Text To Speech) ## Introduction @@ -39,8 +40,7 @@ Input of this demo should be a text of the specific language that can be passed paddlespeech tts --am fastspeech2_vctk --voc pwgan_vctk --input "hello, boys" --lang en --spk_id 0 ``` You can change `spk_id` here. - - Usage: + Usage: ```bash paddlespeech tts --help From 689c44dc59c6ca3d6c1f444588081d0430ec89df Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 10 Dec 2021 16:03:09 +0800 Subject: [PATCH 30/50] Update README.md --- demos/text_to_speech/README.md | 44 +++++++++++++++++----------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/demos/text_to_speech/README.md b/demos/text_to_speech/README.md index a1198b4b..f25dc606 100644 --- a/demos/text_to_speech/README.md +++ b/demos/text_to_speech/README.md @@ -17,29 +17,29 @@ pip install paddlespeech Input of this demo should be a text of the specific language that can be passed via argument. ### 3. Usage - Command Line (Recommended) - - Chinese - ```bash - paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!" - ``` - The default acoustic model is `Fastspeech2`, and the default vocoder is `Parallel WaveGAN`. - - Chinese, use `SpeedySpeech` as acoustic model - ```bash - paddlespeech tts --am speedyspeech_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" - ``` - - Chinese, multi speaker - ```bash - paddlespeech tts --am fastspeech2_aishell3 --voc pwgan_aishell3 --input "你好,欢迎使用百度飞桨深度学习框架!" --spk_id 0 - ``` - You can change `spk_id` here. + - Chinese + ```bash + paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!" + ``` + The default acoustic model is `Fastspeech2`, and the default vocoder is `Parallel WaveGAN`. + - Chinese, use `SpeedySpeech` as acoustic model + ```bash + paddlespeech tts --am speedyspeech_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" + ``` + - Chinese, multi speaker + ```bash + paddlespeech tts --am fastspeech2_aishell3 --voc pwgan_aishell3 --input "你好,欢迎使用百度飞桨深度学习框架!" --spk_id 0 + ``` + You can change `spk_id` here. - English - ```bash - paddlespeech tts --am fastspeech2_ljspeech --voc pwgan_ljspeech --lang en --input "hello world" - ``` - - English, multi speaker - ```bash - paddlespeech tts --am fastspeech2_vctk --voc pwgan_vctk --input "hello, boys" --lang en --spk_id 0 - ``` - You can change `spk_id` here. + ```bash + paddlespeech tts --am fastspeech2_ljspeech --voc pwgan_ljspeech --lang en --input "hello world" + ``` + - English, multi speaker + ```bash + paddlespeech tts --am fastspeech2_vctk --voc pwgan_vctk --input "hello, boys" --lang en --spk_id 0 + ``` + You can change `spk_id` here. 
Usage: ```bash From 1d4002409f96cce2d68d7de10c3d13a1b5866f31 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Fri, 10 Dec 2021 08:07:40 +0000 Subject: [PATCH 31/50] separate the sox and soxbindings with the requirements --- paddlespeech/s2t/frontend/audio.py | 38 ++++++++++++++++++--------- paddlespeech/s2t/transform/perturb.py | 28 ++++++++++++++------ setup.py | 4 +-- 3 files changed, 47 insertions(+), 23 deletions(-) diff --git a/paddlespeech/s2t/frontend/audio.py b/paddlespeech/s2t/frontend/audio.py index 65dccad3..d494cc4f 100644 --- a/paddlespeech/s2t/frontend/audio.py +++ b/paddlespeech/s2t/frontend/audio.py @@ -21,7 +21,6 @@ import struct import numpy as np import resampy import soundfile -import soxbindings as sox from scipy import signal from .utility import convert_samples_from_float32 @@ -98,7 +97,7 @@ class AudioSegment(): :param file: Input audio filepath or file object. :type file: str|file :param start: Start time in seconds. If start is negative, it wraps - around from the end. If not provided, this function + around from the end. If not provided, this function reads from the very beginning. :type start: float :param end: End time in seconds. If end is negative, it wraps around @@ -199,7 +198,7 @@ class AudioSegment(): @classmethod def from_bytes(cls, bytes): """Create audio segment from a byte string containing audio samples. - + :param bytes: Byte string containing audio samples. :type bytes: str :return: Audio segment instance. @@ -217,7 +216,7 @@ class AudioSegment(): :type *segments: tuple of AudioSegment :return: Audio segment instance as concatenating results. :rtype: AudioSegment - :raises ValueError: If the number of segments is zero, or if the + :raises ValueError: If the number of segments is zero, or if the sample_rate of any segments does not match. :raises TypeError: If any segment is not AudioSegment instance. """ @@ -251,7 +250,7 @@ class AudioSegment(): def to_wav_file(self, filepath, dtype='float32'): """Save audio segment to disk as wav file. - + :param filepath: WAV filepath or file object to save the audio segment. :type filepath: str|file @@ -297,7 +296,7 @@ class AudioSegment(): def to_bytes(self, dtype='float32'): """Create a byte string containing the audio content. - + :param dtype: Data type for export samples. Options: 'int16', 'int32', 'float32', 'float64'. Default is 'float32'. :type dtype: str @@ -309,7 +308,7 @@ class AudioSegment(): def to(self, dtype='int16'): """Create a `dtype` audio content. - + :param dtype: Data type for export samples. Options: 'int16', 'int32', 'float32', 'float64'. Default is 'float32'. :type dtype: str @@ -323,8 +322,8 @@ class AudioSegment(): """Apply gain in decibels to samples. Note that this is an in-place transformation. - - :param gain: Gain in decibels to apply to samples. + + :param gain: Gain in decibels to apply to samples. :type gain: float|1darray """ self._samples *= 10.**(gain / 20.) @@ -333,7 +332,7 @@ class AudioSegment(): """Change the audio speed by linear interpolation. Note that this is an in-place transformation. 
- + :param speed_rate: Rate of speed change: speed_rate > 1.0, speed up the audio; speed_rate = 1.0, unchanged; @@ -355,6 +354,19 @@ class AudioSegment(): # self._samples = np.interp(new_indices, old_indices, self._samples) # sox, slow + try: + import soxbindings as sox + except: + try: + from paddlespeech.s2t.utils import dynamic_pip_install + package = "sox" + dynamic_pip_install.install(package) + package = "soxbindings" + dynamic_pip_install.install(package) + import soxbindings as sox + except: + raise RuntimeError("Can not install soxbindings on your system." ) + tfm = sox.Transformer() tfm.set_globals(multithread=False) tfm.speed(speed_rate) @@ -405,7 +417,7 @@ class AudioSegment(): :param prior_samples: Prior strength in number of samples. :type prior_samples: float :param startup_delay: Default 0.0s. If provided, this function will - accrue statistics for the first startup_delay + accrue statistics for the first startup_delay seconds before applying online normalization. :type startup_delay: float """ @@ -557,7 +569,7 @@ class AudioSegment(): :param impulse_segment: Impulse response segments. :type impulse_segment: AudioSegment :param allow_resample: Indicates whether resampling is allowed when - the impulse_segment has a different sample + the impulse_segment has a different sample rate from this signal. :type allow_resample: bool :raises ValueError: If the sample rate is not match between two @@ -695,7 +707,7 @@ class AudioSegment(): def _convert_samples_from_float32(self, samples, dtype): """Convert sample type from float32 to dtype. - + Audio sample type is usually integer or float-point. For integer type, float32 will be rescaled from [-1, 1] to the maximum range supported by the integer type. diff --git a/paddlespeech/s2t/transform/perturb.py b/paddlespeech/s2t/transform/perturb.py index 873adb0b..90144197 100644 --- a/paddlespeech/s2t/transform/perturb.py +++ b/paddlespeech/s2t/transform/perturb.py @@ -16,7 +16,6 @@ import librosa import numpy import scipy import soundfile -import soxbindings as sox from paddlespeech.s2t.io.reader import SoundHDF5File @@ -115,10 +114,10 @@ class SpeedPerturbationSox(): and sox-speed just to resample the input, i.e pitch and tempo are changed both. - To speed up or slow down the sound of a file, - use speed to modify the pitch and the duration of the file. - This raises the speed and reduces the time. - The default factor is 1.0 which makes no change to the audio. + To speed up or slow down the sound of a file, + use speed to modify the pitch and the duration of the file. + This raises the speed and reduces the time. + The default factor is 1.0 which makes no change to the audio. 2.0 doubles speed, thus time length is cut by a half and pitch is one interval higher. "Why use speed option instead of tempo -s in SoX for speed perturbation" @@ -130,7 +129,7 @@ class SpeedPerturbationSox(): speed option: sox -t wav input.wav -t wav output.speed0.9.wav speed 0.9 - If we use speed option like above, the pitch of audio also will be changed, + If we use speed option like above, the pitch of audio also will be changed, but the tempo option does not change the pitch. 
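The numpy path in `change_speed` above survives only as a comment, so here is a runnable standalone version of that linear-interpolation speed change. Like sox's `speed` effect (and unlike `tempo`, which the SpeedPerturbationSox notes above contrast it with), it shifts pitch together with duration. This is a sketch, not the class method itself:

```python
import numpy as np

def change_speed_np(samples: np.ndarray, speed_rate: float) -> np.ndarray:
    """Resample-style speed change: duration scales by 1/speed_rate,
    and pitch shifts with it (same audible effect as `sox speed`)."""
    if speed_rate <= 0:
        raise ValueError("speed_rate should be greater than zero.")
    old_length = samples.shape[0]
    new_length = int(old_length / speed_rate)
    old_indices = np.arange(old_length)
    new_indices = np.linspace(start=0, stop=old_length, num=new_length)
    return np.interp(new_indices, old_indices, samples)

fast = change_speed_np(np.random.randn(16000), speed_rate=1.1)  # ~0.91 s at 16 kHz
```

The lazy `soxbindings` import in the same patch (try the import, fall back to `dynamic_pip_install`, and raise a clear `RuntimeError` otherwise) is what lets this sox-backed path stay optional at install time.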
""" @@ -146,6 +145,19 @@ class SpeedPerturbationSox(): self.keep_length = keep_length self.state = numpy.random.RandomState(seed) + try: + import soxbindings as sox + except: + try: + from paddlespeech.s2t.utils import dynamic_pip_install + package = "sox" + dynamic_pip_install.install(package) + package = "soxbindings" + dynamic_pip_install.install(package) + import soxbindings as sox + except: + raise RuntimeError("Can not install soxbindings on your system." ) + if utt2ratio is not None: self.utt2ratio = {} # Use the scheduled ratio for each utterances @@ -168,8 +180,8 @@ class SpeedPerturbationSox(): def __repr__(self): if self.utt2ratio is None: return f"""{self.__class__.__name__}( - lower={self.lower}, - upper={self.upper}, + lower={self.lower}, + upper={self.upper}, keep_length={self.keep_length}, sample_rate={self.sr})""" diff --git a/setup.py b/setup.py index 9aaaa6eb..1ac671f1 100644 --- a/setup.py +++ b/setup.py @@ -55,8 +55,6 @@ requirements = { "scipy", "sentencepiece~=0.1.96", "soundfile~=0.10", - "sox", - "soxbindings", "textgrid", "timer", "tqdm", @@ -74,6 +72,8 @@ requirements = { "Pillow", "pybind11", "snakeviz", + "sox", + "soxbindings", "unidecode", "yq", "pre-commit", From d97379d7cea6ebc5ee07981554f91d6026f99c77 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 10 Dec 2021 16:11:14 +0800 Subject: [PATCH 32/50] Update README.md --- demos/text_to_speech/README.md | 44 ++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/demos/text_to_speech/README.md b/demos/text_to_speech/README.md index f25dc606..769189e3 100644 --- a/demos/text_to_speech/README.md +++ b/demos/text_to_speech/README.md @@ -1,5 +1,3 @@ - - # TTS (Text To Speech) ## Introduction @@ -18,29 +16,33 @@ Input of this demo should be a text of the specific language that can be passed ### 3. Usage - Command Line (Recommended) - Chinese - ```bash - paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!" - ``` - The default acoustic model is `Fastspeech2`, and the default vocoder is `Parallel WaveGAN`. - - Chinese, use `SpeedySpeech` as acoustic model - ```bash - paddlespeech tts --am speedyspeech_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" - ``` + + The default acoustic model is `Fastspeech2`, and the default vocoder is `Parallel WaveGAN`. + ```bash + paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!" + ``` + - Chinese, use `SpeedySpeech` as acoustic model + ```bash + paddlespeech tts --am speedyspeech_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" + ``` - Chinese, multi speaker - ```bash - paddlespeech tts --am fastspeech2_aishell3 --voc pwgan_aishell3 --input "你好,欢迎使用百度飞桨深度学习框架!" --spk_id 0 - ``` - You can change `spk_id` here. + + You can change `spk_id` here. + ```bash + paddlespeech tts --am fastspeech2_aishell3 --voc pwgan_aishell3 --input "你好,欢迎使用百度飞桨深度学习框架!" --spk_id 0 + ``` + - English - ```bash - paddlespeech tts --am fastspeech2_ljspeech --voc pwgan_ljspeech --lang en --input "hello world" - ``` + ```bash + paddlespeech tts --am fastspeech2_ljspeech --voc pwgan_ljspeech --lang en --input "hello world" + ``` - English, multi speaker - ```bash + + You can change `spk_id` here. + ```bash paddlespeech tts --am fastspeech2_vctk --voc pwgan_vctk --input "hello, boys" --lang en --spk_id 0 - ``` - You can change `spk_id` here. 
- Usage: + ``` + Usage: ```bash paddlespeech tts --help From b8d8fdccd6997723072c55b4b1d5cbc02a574676 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 10 Dec 2021 16:22:33 +0800 Subject: [PATCH 33/50] Update quick_start.md --- docs/source/tts/quick_start.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/tts/quick_start.md b/docs/source/tts/quick_start.md index e6ad46fb..64c13bdf 100644 --- a/docs/source/tts/quick_start.md +++ b/docs/source/tts/quick_start.md @@ -19,7 +19,7 @@ The models in PaddleSpeech TTS have the following mapping relationship: ## Quick Start -Let's take a FastSpeech2 + Parallel WaveGAN with CSMSC dataset for instance. (./examples/csmsc/)(https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc) +Let's take a FastSpeech2 + Parallel WaveGAN with CSMSC dataset for instance. [./examples/csmsc/](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc) ### Train Parallel WaveGAN with CSMSC - Go to directory From 094d05f6b8c54007b24a87fc8d1d6d6cd9444d78 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 10 Dec 2021 16:23:04 +0800 Subject: [PATCH 34/50] Update quick_start.md --- docs/source/tts/quick_start.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/tts/quick_start.md b/docs/source/tts/quick_start.md index 64c13bdf..a50f0a8b 100644 --- a/docs/source/tts/quick_start.md +++ b/docs/source/tts/quick_start.md @@ -19,7 +19,7 @@ The models in PaddleSpeech TTS have the following mapping relationship: ## Quick Start -Let's take a FastSpeech2 + Parallel WaveGAN with CSMSC dataset for instance. [./examples/csmsc/](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc) +Let's take a FastSpeech2 + Parallel WaveGAN with CSMSC dataset for instance. 
[examples/csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc)

### Train Parallel WaveGAN with CSMSC
- Go to directory

From aa04e2652f8b6add8a56e176a6a8396a196f02d7 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Fri, 10 Dec 2021 08:50:02 +0000
Subject: [PATCH 35/50] rm useless comment

---
 examples/aishell/asr1/conf/chunk_conformer.yaml       | 2 +-
 examples/aishell/asr1/conf/transformer.yaml           | 2 +-
 examples/callcenter/asr1/conf/chunk_conformer.yaml    | 2 +-
 examples/callcenter/asr1/conf/conformer.yaml          | 2 +-
 examples/librispeech/asr1/conf/chunk_conformer.yaml   | 2 +-
 examples/librispeech/asr1/conf/chunk_transformer.yaml | 2 +-
 examples/librispeech/asr1/conf/conformer.yaml         | 2 +-
 examples/librispeech/asr2/conf/transformer.yaml       | 2 +-
 examples/ted_en_zh/st0/conf/transformer.yaml          | 2 +-
 examples/ted_en_zh/st1/conf/transformer.yaml          | 2 +-
 examples/timit/asr1/conf/transformer.yaml             | 2 +-
 examples/tiny/asr1/conf/chunk_confermer.yaml          | 2 +-
 examples/tiny/asr1/conf/chunk_transformer.yaml        | 2 +-
 examples/tiny/asr1/conf/conformer.yaml                | 2 +-
 examples/tiny/asr1/conf/transformer.yaml              | 2 +-
 examples/wenetspeech/asr1/conf/conformer.yaml         | 2 +-
 16 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/examples/aishell/asr1/conf/chunk_conformer.yaml b/examples/aishell/asr1/conf/chunk_conformer.yaml
index e07cd07c..80b45587 100644
--- a/examples/aishell/asr1/conf/chunk_conformer.yaml
+++ b/examples/aishell/asr1/conf/chunk_conformer.yaml
@@ -90,7 +90,7 @@ training:
   optim_conf:
     lr: 0.001
     weight_decay: 1e-6
-  scheduler: warmuplr # pytorch v1.1.0+ required
+  scheduler: warmuplr
   scheduler_conf:
     warmup_steps: 25000
     lr_decay: 1.0

diff --git a/examples/aishell/asr1/conf/transformer.yaml b/examples/aishell/asr1/conf/transformer.yaml
index d13f9e2f..60ec0180 100644
--- a/examples/aishell/asr1/conf/transformer.yaml
+++ b/examples/aishell/asr1/conf/transformer.yaml
@@ -80,7 +80,7 @@ training:
   optim_conf:
     lr: 0.002
     weight_decay: 1e-6
-  scheduler: warmuplr # pytorch v1.1.0+ required
+  scheduler: warmuplr
   scheduler_conf:
     warmup_steps: 25000
     lr_decay: 1.0

diff --git a/examples/callcenter/asr1/conf/chunk_conformer.yaml b/examples/callcenter/asr1/conf/chunk_conformer.yaml
index d20d2b9a..69959c68 100644
--- a/examples/callcenter/asr1/conf/chunk_conformer.yaml
+++ b/examples/callcenter/asr1/conf/chunk_conformer.yaml
@@ -88,7 +88,7 @@ training:
   optim_conf:
     lr: 0.001
     weight_decay: 1e-6
-  scheduler: warmuplr # pytorch v1.1.0+ required
+  scheduler: warmuplr
   scheduler_conf:
     warmup_steps: 25000
     lr_decay: 1.0

diff --git a/examples/callcenter/asr1/conf/conformer.yaml b/examples/callcenter/asr1/conf/conformer.yaml
index f86cd4a3..80c15abb 100644
--- a/examples/callcenter/asr1/conf/conformer.yaml
+++ b/examples/callcenter/asr1/conf/conformer.yaml
@@ -83,7 +83,7 @@ training:
   optim_conf:
     lr: 0.002
     weight_decay: 1e-6
-  scheduler: warmuplr # pytorch v1.1.0+ required
+  scheduler: warmuplr
   scheduler_conf:
     warmup_steps: 25000
     lr_decay: 1.0

diff --git a/examples/librispeech/asr1/conf/chunk_conformer.yaml b/examples/librispeech/asr1/conf/chunk_conformer.yaml
index 4a574190..54580664 100644
--- a/examples/librispeech/asr1/conf/chunk_conformer.yaml
+++ b/examples/librispeech/asr1/conf/chunk_conformer.yaml
@@ -90,7 +90,7 @@ training:
   optim_conf:
     lr: 0.001
     weight_decay: 1e-06
-  scheduler: warmuplr # pytorch v1.1.0+ required
+  scheduler: warmuplr
   scheduler_conf:
     warmup_steps: 25000
     lr_decay: 1.0

diff --git a/examples/librispeech/asr1/conf/chunk_transformer.yaml b/examples/librispeech/asr1/conf/chunk_transformer.yaml
index c2644daf..70a9dc6a 100644
--- a/examples/librispeech/asr1/conf/chunk_transformer.yaml
+++ b/examples/librispeech/asr1/conf/chunk_transformer.yaml
@@ -83,7 +83,7 @@ training:
   optim_conf:
     lr: 0.001
     weight_decay: 1e-06
-  scheduler: warmuplr # pytorch v1.1.0+ required
+  scheduler: warmuplr
   scheduler_conf:
     warmup_steps: 25000
     lr_decay: 1.0

diff --git a/examples/librispeech/asr1/conf/conformer.yaml b/examples/librispeech/asr1/conf/conformer.yaml
index 684b6297..ca934eb1 100644
--- a/examples/librispeech/asr1/conf/conformer.yaml
+++ b/examples/librispeech/asr1/conf/conformer.yaml
@@ -86,7 +86,7 @@ training:
   optim_conf:
     lr: 0.004
     weight_decay: 1e-06
-  scheduler: warmuplr # pytorch v1.1.0+ required
+  scheduler: warmuplr
   scheduler_conf:
     warmup_steps: 25000
     lr_decay: 1.0

diff --git a/examples/librispeech/asr2/conf/transformer.yaml b/examples/librispeech/asr2/conf/transformer.yaml
index 3e9350ab..00240743 100644
--- a/examples/librispeech/asr2/conf/transformer.yaml
+++ b/examples/librispeech/asr2/conf/transformer.yaml
@@ -75,7 +75,7 @@ optim: adam
 optim_conf:
   global_grad_clip: 5.0
   weight_decay: 1.0e-06
-scheduler: warmuplr # pytorch v1.1.0+ required
+scheduler: warmuplr
 scheduler_conf:
   lr: 0.004
   warmup_steps: 25000

diff --git a/examples/ted_en_zh/st0/conf/transformer.yaml b/examples/ted_en_zh/st0/conf/transformer.yaml
index 5a05fa46..6ed75be4 100644
--- a/examples/ted_en_zh/st0/conf/transformer.yaml
+++ b/examples/ted_en_zh/st0/conf/transformer.yaml
@@ -82,7 +82,7 @@ training:
   optim_conf:
     lr: 0.004
     weight_decay: 1e-06
-  scheduler: warmuplr # pytorch v1.1.0+ required
+  scheduler: warmuplr
   scheduler_conf:
     warmup_steps: 25000
     lr_decay: 1.0

diff --git a/examples/ted_en_zh/st1/conf/transformer.yaml b/examples/ted_en_zh/st1/conf/transformer.yaml
index d553bde7..3bef7bc5 100644
--- a/examples/ted_en_zh/st1/conf/transformer.yaml
+++ b/examples/ted_en_zh/st1/conf/transformer.yaml
@@ -82,7 +82,7 @@ training:
   optim_conf:
     lr: 0.004
     weight_decay: 1e-06
-  scheduler: warmuplr # pytorch v1.1.0+ required
+  scheduler: warmuplr
   scheduler_conf:
     warmup_steps: 25000
     lr_decay: 1.0

diff --git a/examples/timit/asr1/conf/transformer.yaml b/examples/timit/asr1/conf/transformer.yaml
index 89ae2fd3..af05a6ce 100644
--- a/examples/timit/asr1/conf/transformer.yaml
+++ b/examples/timit/asr1/conf/transformer.yaml
@@ -80,7 +80,7 @@ training:
   optim_conf:
     lr: 0.004
     weight_decay: 1e-06
-  scheduler: warmuplr # pytorch v1.1.0+ required
+  scheduler: warmuplr
   scheduler_conf:
     warmup_steps: 1200
     lr_decay: 1.0

diff --git a/examples/tiny/asr1/conf/chunk_confermer.yaml b/examples/tiny/asr1/conf/chunk_confermer.yaml
index 728a82e3..76b97adf 100644
--- a/examples/tiny/asr1/conf/chunk_confermer.yaml
+++ b/examples/tiny/asr1/conf/chunk_confermer.yaml
@@ -90,7 +90,7 @@ training:
   optim_conf:
     lr: 0.001
     weight_decay: 1e-06
-  scheduler: warmuplr # pytorch v1.1.0+ required
+  scheduler: warmuplr
   scheduler_conf:
     warmup_steps: 25000
     lr_decay: 1.0

diff --git a/examples/tiny/asr1/conf/chunk_transformer.yaml b/examples/tiny/asr1/conf/chunk_transformer.yaml
index 7c927122..5f1991f9 100644
--- a/examples/tiny/asr1/conf/chunk_transformer.yaml
+++ b/examples/tiny/asr1/conf/chunk_transformer.yaml
@@ -83,7 +83,7 @@ training:
   optim_conf:
     lr: 0.002
     weight_decay: 1e-06
-  scheduler: warmuplr # pytorch v1.1.0+ required
+  scheduler: warmuplr
   scheduler_conf:
     warmup_steps: 25000
     lr_decay: 1.0

diff --git a/examples/tiny/asr1/conf/conformer.yaml b/examples/tiny/asr1/conf/conformer.yaml
index 21cc1128..b2937c1b 100644
--- a/examples/tiny/asr1/conf/conformer.yaml
+++ b/examples/tiny/asr1/conf/conformer.yaml
@@ -86,7 +86,7 @@ training:
   optim_conf:
     lr: 0.002
     weight_decay: 1e-06
-  scheduler: warmuplr # pytorch v1.1.0+ required
+  scheduler: warmuplr
   scheduler_conf:
     warmup_steps: 25000
     lr_decay: 1.0

diff --git a/examples/tiny/asr1/conf/transformer.yaml b/examples/tiny/asr1/conf/transformer.yaml
index f4645c68..f5319756 100644
--- a/examples/tiny/asr1/conf/transformer.yaml
+++ b/examples/tiny/asr1/conf/transformer.yaml
@@ -80,7 +80,7 @@ training:
   optim_conf:
     lr: 0.002
     weight_decay: 1e-06
-  scheduler: warmuplr # pytorch v1.1.0+ required
+  scheduler: warmuplr
   scheduler_conf:
     warmup_steps: 25000
     lr_decay: 1.0

diff --git a/examples/wenetspeech/asr1/conf/conformer.yaml b/examples/wenetspeech/asr1/conf/conformer.yaml
index a3a42ec6..fc040a79 100644
--- a/examples/wenetspeech/asr1/conf/conformer.yaml
+++ b/examples/wenetspeech/asr1/conf/conformer.yaml
@@ -87,7 +87,7 @@ training:
   optim_conf:
     lr: 0.001
     weight_decay: 1e-6
-  scheduler: warmuplr # pytorch v1.1.0+ required
+  scheduler: warmuplr
   scheduler_conf:
     warmup_steps: 5000
     lr_decay: 1.0
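The `scheduler: warmuplr` key these configs keep refers to a Noam-style warmup schedule: the learning rate ramps up for `warmup_steps` steps and then decays with the inverse square root of the step count. The sketch below illustrates what the `lr` and `warmup_steps` values in the hunks above control; it follows the common ESPnet-style formulation, and the exact constants inside PaddleSpeech's scheduler class are an assumption here, not taken from this patch.

```python
# Minimal sketch of a Noam-style "warmuplr" schedule (assumed formulation):
# linear ramp for `warmup_steps` steps, then step**-0.5 decay.
def warmup_lr(step: int, base_lr: float = 0.002, warmup_steps: int = 25000) -> float:
    step = max(step, 1)  # guard against step 0
    return base_lr * warmup_steps**0.5 * min(step**-0.5, step * warmup_steps**-1.5)

# The two branches meet at step == warmup_steps, where the peak rate base_lr is reached.
assert abs(warmup_lr(25000) - 0.002) < 1e-9
assert warmup_lr(1000) < warmup_lr(25000) > warmup_lr(100000)
```

Under this reading, the removed `# pytorch v1.1.0+ required` comment was a leftover from the PyTorch-based code the recipes were ported from and has no meaning in a Paddle config, which is why the patch deletes it without changing any values.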
From b1c80c45e01777701ab1d1f91b41cb9e58835c5b Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Fri, 10 Dec 2021 09:32:27 +0000
Subject: [PATCH 36/50] remove ctc grad norm type in config

---
 examples/aishell/asr0/conf/deepspeech2_online.yaml     |  2 +-
 examples/librispeech/asr0/conf/deepspeech2.yaml        |  2 +-
 examples/librispeech/asr0/conf/deepspeech2_online.yaml |  2 +-
 examples/librispeech/asr1/conf/chunk_conformer.yaml    |  2 --
 examples/librispeech/asr1/conf/chunk_transformer.yaml  |  2 --
 examples/librispeech/asr1/conf/conformer.yaml          |  2 --
 examples/librispeech/asr1/conf/transformer.yaml        |  4 +---
 examples/librispeech/asr2/conf/transformer.yaml        |  2 --
 examples/ted_en_zh/st0/conf/transformer.yaml           |  2 --
 examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml  |  2 --
 examples/ted_en_zh/st1/conf/transformer.yaml           |  2 --
 examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml  |  2 --
 examples/timit/asr1/conf/transformer.yaml              |  2 --
 examples/tiny/asr0/conf/deepspeech2.yaml               |  2 +-
 examples/tiny/asr0/conf/deepspeech2_online.yaml        |  2 +-
 examples/tiny/asr1/conf/chunk_confermer.yaml           |  2 --
 examples/tiny/asr1/conf/chunk_transformer.yaml         |  2 --
 examples/tiny/asr1/conf/conformer.yaml                 |  2 --
 examples/tiny/asr1/conf/transformer.yaml               |  2 --
 examples/wenetspeech/asr1/conf/conformer.yaml          |  2 --
 paddlespeech/s2t/models/ds2/deepspeech2.py             | 10 ++++------
 paddlespeech/s2t/models/ds2_online/deepspeech2.py      |  8 ++++----
 paddlespeech/s2t/models/u2/u2.py                       |  6 ++++--
 paddlespeech/s2t/models/u2_st/u2_st.py                 |  6 ++++--
 tests/unit/asr/u2_model_test.py                        |  4 ----
 25 files changed, 22 insertions(+), 54 deletions(-)

diff --git a/examples/aishell/asr0/conf/deepspeech2_online.yaml b/examples/aishell/asr0/conf/deepspeech2_online.yaml
index 010d8f15..2f63f4de 100644
--- a/examples/aishell/asr0/conf/deepspeech2_online.yaml
+++ b/examples/aishell/asr0/conf/deepspeech2_online.yaml
@@ -43,7 +43,7 @@ model:
   fc_layers_size_list: -1,
   use_gru: False
   blank_id: 0
-  ctc_grad_norm_type: null
+
 
 training:
   n_epoch: 65

diff --git a/examples/librispeech/asr0/conf/deepspeech2.yaml b/examples/librispeech/asr0/conf/deepspeech2.yaml
index 70fa3fcb..f3574e15 100644
--- a/examples/librispeech/asr0/conf/deepspeech2.yaml
+++ b/examples/librispeech/asr0/conf/deepspeech2.yaml
@@ -41,7 +41,7 @@ model:
   use_gru: False
   share_rnn_weights: True
   blank_id: 0
-  ctc_grad_norm_type: null
+
 
 training:
   n_epoch: 50

diff --git a/examples/librispeech/asr0/conf/deepspeech2_online.yaml b/examples/librispeech/asr0/conf/deepspeech2_online.yaml
index 3e07862d..0d16bc57 100644
--- a/examples/librispeech/asr0/conf/deepspeech2_online.yaml
+++ b/examples/librispeech/asr0/conf/deepspeech2_online.yaml
@@ -43,7 +43,7 @@ model:
   fc_layers_size_list: 512, 256
   use_gru: False
   blank_id: 0
-  ctc_grad_norm_type: null
+
 
 training:
   n_epoch: 50

diff --git a/examples/librispeech/asr1/conf/chunk_conformer.yaml b/examples/librispeech/asr1/conf/chunk_conformer.yaml
index 54580664..7f593037 100644
--- a/examples/librispeech/asr1/conf/chunk_conformer.yaml
+++ b/examples/librispeech/asr1/conf/chunk_conformer.yaml
@@ -76,8 +76,6 @@ model:
   # hybrid CTC/attention
   model_conf:
     ctc_weight: 0.3
-    ctc_dropoutrate: 0.0
-    ctc_grad_norm_type: null
     lsm_weight: 0.1     # label smoothing option
     length_normalized_loss: false

diff --git a/examples/librispeech/asr1/conf/chunk_transformer.yaml b/examples/librispeech/asr1/conf/chunk_transformer.yaml
index 70a9dc6a..366d6de0 100644
--- a/examples/librispeech/asr1/conf/chunk_transformer.yaml
+++ b/examples/librispeech/asr1/conf/chunk_transformer.yaml
@@ -69,8 +69,6 @@ model:
   # hybrid CTC/attention
   model_conf:
     ctc_weight: 0.3
-    ctc_dropoutrate: 0.0
-    ctc_grad_norm_type: null
     lsm_weight: 0.1     # label smoothing option
     length_normalized_loss: false

diff --git a/examples/librispeech/asr1/conf/conformer.yaml b/examples/librispeech/asr1/conf/conformer.yaml
index ca934eb1..f02f24dc 100644
--- a/examples/librispeech/asr1/conf/conformer.yaml
+++ b/examples/librispeech/asr1/conf/conformer.yaml
@@ -72,8 +72,6 @@ model:
   # hybrid CTC/attention
   model_conf:
     ctc_weight: 0.3
-    ctc_dropoutrate: 0.0
-    ctc_grad_norm_type: null
     lsm_weight: 0.1     # label smoothing option
     length_normalized_loss: false

diff --git a/examples/librispeech/asr1/conf/transformer.yaml b/examples/librispeech/asr1/conf/transformer.yaml
index 0cc0dae6..a90efe48 100644
--- a/examples/librispeech/asr1/conf/transformer.yaml
+++ b/examples/librispeech/asr1/conf/transformer.yaml
@@ -29,8 +29,6 @@ model:
   # hybrid CTC/attention
   model_conf:
     ctc_weight: 0.3
-    ctc_dropoutrate: 0.0
-    ctc_grad_norm_type: null
     lsm_weight: 0.1     # label smoothing option
     length_normalized_loss: false
@@ -81,7 +79,7 @@ training:
   optim_conf:
     lr: 0.004
     weight_decay: 1e-06
-  scheduler: warmuplr # pytorch v1.1.0+ required
+  scheduler: warmuplr
   scheduler_conf:
     warmup_steps: 25000
     lr_decay: 1.0

diff --git a/examples/librispeech/asr2/conf/transformer.yaml b/examples/librispeech/asr2/conf/transformer.yaml
index 00240743..a16563a5 100644
--- a/examples/librispeech/asr2/conf/transformer.yaml
+++ b/examples/librispeech/asr2/conf/transformer.yaml
@@ -30,8 +30,6 @@ model:
   # hybrid CTC/attention
   model_conf:
     ctc_weight: 0.3
-    ctc_dropoutrate: 0.0
-    ctc_grad_norm_type: null
     lsm_weight: 0.1     # label smoothing option
     length_normalized_loss: false

diff --git a/examples/ted_en_zh/st0/conf/transformer.yaml b/examples/ted_en_zh/st0/conf/transformer.yaml
index 6ed75be4..36f287b1 100644
--- a/examples/ted_en_zh/st0/conf/transformer.yaml
+++ b/examples/ted_en_zh/st0/conf/transformer.yaml
@@ -68,8 +68,6 @@ model:
   model_conf:
     asr_weight: 0.0
     ctc_weight: 0.0
-    ctc_dropoutrate: 0.0
-    ctc_grad_norm_type: null
     lsm_weight: 0.1     # label smoothing option
     length_normalized_loss: false

diff --git a/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml b/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml
index 7e886cca..78887d3c 100644
--- a/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml
+++ b/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml
@@ -68,8 +68,6 @@ model:
   model_conf:
     asr_weight: 0.5
     ctc_weight: 0.3
-    ctc_dropoutrate: 0.0
-    ctc_grad_norm_type: null
     lsm_weight: 0.1     # label smoothing option
     length_normalized_loss: false

diff --git a/examples/ted_en_zh/st1/conf/transformer.yaml b/examples/ted_en_zh/st1/conf/transformer.yaml
index 3bef7bc5..609c5824 100644
--- a/examples/ted_en_zh/st1/conf/transformer.yaml
+++ b/examples/ted_en_zh/st1/conf/transformer.yaml
@@ -68,8 +68,6 @@ model:
   model_conf:
     asr_weight: 0.0
     ctc_weight: 0.0
-    ctc_dropoutrate: 0.0
-    ctc_grad_norm_type: null
     lsm_weight: 0.1     # label smoothing option
     length_normalized_loss: false

diff --git a/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml b/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml
index 3175aad9..10eccd1e 100644
--- a/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml
+++ b/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml
@@ -68,8 +68,6 @@ model:
   model_conf:
     asr_weight: 0.5
     ctc_weight: 0.3
-    ctc_dropoutrate: 0.0
-    ctc_grad_norm_type: null
     lsm_weight: 0.1     # label smoothing option
     length_normalized_loss: false

diff --git a/examples/timit/asr1/conf/transformer.yaml b/examples/timit/asr1/conf/transformer.yaml
index af05a6ce..f518cc5e 100644
--- a/examples/timit/asr1/conf/transformer.yaml
+++ b/examples/timit/asr1/conf/transformer.yaml
@@ -66,8 +66,6 @@ model:
   # hybrid CTC/attention
   model_conf:
     ctc_weight: 0.5
-    ctc_dropoutrate: 0.0
-    ctc_grad_norm_type: null
     lsm_weight: 0.1     # label smoothing option
     length_normalized_loss: false

diff --git a/examples/tiny/asr0/conf/deepspeech2.yaml b/examples/tiny/asr0/conf/deepspeech2.yaml
index ba453aad..7d841d47 100644
--- a/examples/tiny/asr0/conf/deepspeech2.yaml
+++ b/examples/tiny/asr0/conf/deepspeech2.yaml
@@ -42,7 +42,7 @@ model:
   use_gru: False
   share_rnn_weights: True
   blank_id: 0
-  ctc_grad_norm_type: null
+
 
 training:
   n_epoch: 5

diff --git a/examples/tiny/asr0/conf/deepspeech2_online.yaml b/examples/tiny/asr0/conf/deepspeech2_online.yaml
index 36c774e3..393b6439 100644
--- a/examples/tiny/asr0/conf/deepspeech2_online.yaml
+++ b/examples/tiny/asr0/conf/deepspeech2_online.yaml
@@ -44,7 +44,7 @@ model:
   fc_layers_size_list: 512, 256
   use_gru: True
   blank_id: 0
-  ctc_grad_norm_type: null
+
 
 training:
   n_epoch: 5

diff --git a/examples/tiny/asr1/conf/chunk_confermer.yaml b/examples/tiny/asr1/conf/chunk_confermer.yaml
index 76b97adf..ad27478d 100644
--- a/examples/tiny/asr1/conf/chunk_confermer.yaml
+++ b/examples/tiny/asr1/conf/chunk_confermer.yaml
@@ -76,8 +76,6 @@ model:
   # hybrid CTC/attention
   model_conf:
     ctc_weight: 0.3
-    ctc_dropoutrate: 0.0
-    ctc_grad_norm_type: null
     lsm_weight: 0.1     # label smoothing option
     length_normalized_loss: false

diff --git a/examples/tiny/asr1/conf/chunk_transformer.yaml b/examples/tiny/asr1/conf/chunk_transformer.yaml
index 5f1991f9..298518fb 100644
--- a/examples/tiny/asr1/conf/chunk_transformer.yaml
+++ b/examples/tiny/asr1/conf/chunk_transformer.yaml
@@ -69,8 +69,6 @@ model:
   # hybrid CTC/attention
   model_conf:
     ctc_weight: 0.3
-    ctc_dropoutrate: 0.0
-    ctc_grad_norm_type: null
     lsm_weight: 0.1     # label smoothing option
     length_normalized_loss: false

diff --git a/examples/tiny/asr1/conf/conformer.yaml b/examples/tiny/asr1/conf/conformer.yaml
index b2937c1b..eb850902 100644
--- a/examples/tiny/asr1/conf/conformer.yaml
+++ b/examples/tiny/asr1/conf/conformer.yaml
@@ -72,8 +72,6 @@ model:
   # hybrid CTC/attention
   model_conf:
     ctc_weight: 0.3
-    ctc_dropoutrate: 0.0
-    ctc_grad_norm_type: null
     lsm_weight: 0.1     # label smoothing option
     length_normalized_loss: false

diff --git a/examples/tiny/asr1/conf/transformer.yaml b/examples/tiny/asr1/conf/transformer.yaml
index f5319756..c641d1f5 100644
--- a/examples/tiny/asr1/conf/transformer.yaml
+++ b/examples/tiny/asr1/conf/transformer.yaml
@@ -66,8 +66,6 @@ model:
   # hybrid CTC/attention
   model_conf:
     ctc_weight: 0.3
-    ctc_dropoutrate: 0.0
-    ctc_grad_norm_type: null
     lsm_weight: 0.1     # label smoothing option
     length_normalized_loss: false

diff --git a/examples/wenetspeech/asr1/conf/conformer.yaml b/examples/wenetspeech/asr1/conf/conformer.yaml
index fc040a79..a438236d 100644
--- a/examples/wenetspeech/asr1/conf/conformer.yaml
+++ b/examples/wenetspeech/asr1/conf/conformer.yaml
@@ -33,8 +33,6 @@ model:
   # hybrid CTC/attention
   model_conf:
     ctc_weight: 0.3
-    ctc_dropoutrate: 0.0
-    ctc_grad_norm_type: null
     lsm_weight: 0.1     # label smoothing option
     length_normalized_loss: false

diff --git a/paddlespeech/s2t/models/ds2/deepspeech2.py b/paddlespeech/s2t/models/ds2/deepspeech2.py
index 317abc69..f0a553ec 100644
--- a/paddlespeech/s2t/models/ds2/deepspeech2.py
+++ b/paddlespeech/s2t/models/ds2/deepspeech2.py
@@ -129,7 +129,7 @@ class DeepSpeech2Model(nn.Layer):
                 rnn_layer_size=1024,  #RNN layer size (number of RNN cells).
                 use_gru=True,  #Use gru if set True. Use simple rnn if set False.
                 share_rnn_weights=True,  #Whether to share input-hidden weights between forward and backward directional RNNs. Notice that for GRU, weight sharing is not supported.
-                ctc_grad_norm_type='instance', ))
+                ctc_grad_norm_type=None, ))
         if config is not None:
             config.merge_from_other_cfg(default)
         return default
@@ -143,7 +143,7 @@ class DeepSpeech2Model(nn.Layer):
                  use_gru=False,
                  share_rnn_weights=True,
                  blank_id=0,
-                 ctc_grad_norm_type='instance'):
+                 ctc_grad_norm_type=None):
         super().__init__()
         self.encoder = CRNNEncoder(
             feat_size=feat_size,
@@ -220,16 +220,14 @@ class DeepSpeech2Model(nn.Layer):
         """
         model = cls(
             feat_size=dataloader.collate_fn.feature_size,
-            #feat_size=dataloader.dataset.feature_size,
             dict_size=dataloader.collate_fn.vocab_size,
-            #dict_size=dataloader.dataset.vocab_size,
             num_conv_layers=config.model.num_conv_layers,
             num_rnn_layers=config.model.num_rnn_layers,
             rnn_size=config.model.rnn_layer_size,
             use_gru=config.model.use_gru,
             share_rnn_weights=config.model.share_rnn_weights,
             blank_id=config.model.blank_id,
-            ctc_grad_norm_type=config.model.ctc_grad_norm_type, )
+            ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), )
         infos = Checkpoint().load_parameters(
             model, checkpoint_path=checkpoint_path)
         logger.info(f"checkpoint info: {infos}")
@@ -257,7 +255,7 @@ class DeepSpeech2Model(nn.Layer):
             use_gru=config.use_gru,
             share_rnn_weights=config.share_rnn_weights,
             blank_id=config.blank_id,
-            ctc_grad_norm_type=config.ctc_grad_norm_type, )
+            ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), )
         return model

diff --git a/paddlespeech/s2t/models/ds2_online/deepspeech2.py b/paddlespeech/s2t/models/ds2_online/deepspeech2.py
index d134239f..85876bce 100644
--- a/paddlespeech/s2t/models/ds2_online/deepspeech2.py
+++ b/paddlespeech/s2t/models/ds2_online/deepspeech2.py
@@ -255,7 +255,7 @@ class DeepSpeech2ModelOnline(nn.Layer):
                 fc_layers_size_list=[512, 256],
                 use_gru=True,  #Use gru if set True. Use simple rnn if set False.
                 blank_id=0,  # index of blank in vocab.txt
-                ctc_grad_norm_type='instance', ))
+                ctc_grad_norm_type=None, ))
         if config is not None:
             config.merge_from_other_cfg(default)
         return default
@@ -272,7 +272,7 @@ class DeepSpeech2ModelOnline(nn.Layer):
                  fc_layers_size_list=[512, 256],
                  use_gru=False,
                  blank_id=0,
-                 ctc_grad_norm_type='instance', ):
+                 ctc_grad_norm_type=None, ):
         super().__init__()
         self.encoder = CRNNEncoder(
             feat_size=feat_size,
@@ -361,7 +361,7 @@ class DeepSpeech2ModelOnline(nn.Layer):
             fc_layers_size_list=config.model.fc_layers_size_list,
             use_gru=config.model.use_gru,
             blank_id=config.model.blank_id,
-            ctc_grad_norm_type=config.model.ctc_grad_norm_type, )
+            ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), )
         infos = Checkpoint().load_parameters(
             model, checkpoint_path=checkpoint_path)
         logger.info(f"checkpoint info: {infos}")
@@ -391,7 +391,7 @@ class DeepSpeech2ModelOnline(nn.Layer):
             fc_layers_size_list=config.fc_layers_size_list,
             use_gru=config.use_gru,
             blank_id=config.blank_id,
-            ctc_grad_norm_type=config.ctc_grad_norm_type, )
+            ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), )
         return model

diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py
index 4f833372..8053ed3a 100644
--- a/paddlespeech/s2t/models/u2/u2.py
+++ b/paddlespeech/s2t/models/u2/u2.py
@@ -894,14 +894,16 @@ class U2Model(U2DecodeModel):
         # ctc decoder and ctc loss
         model_conf = configs['model_conf']
+        dropout_rate = model_conf.get('ctc_dropout_rate', 0.0)
+        grad_norm_type = model_conf.get('ctc_grad_norm_type', None)
         ctc = CTCDecoder(
             odim=vocab_size,
             enc_n_units=encoder.output_size(),
             blank_id=0,
-            dropout_rate=model_conf['ctc_dropoutrate'],
+            dropout_rate=dropout_rate,
             reduction=True,  # sum
             batch_average=True,  # sum / batch_size
-            grad_norm_type=model_conf['ctc_grad_norm_type'])
+            grad_norm_type=grad_norm_type)
 
         return vocab_size, encoder, decoder, ctc

diff --git a/paddlespeech/s2t/models/u2_st/u2_st.py b/paddlespeech/s2t/models/u2_st/u2_st.py
index a83e6707..3a23804f 100644
--- a/paddlespeech/s2t/models/u2_st/u2_st.py
+++ b/paddlespeech/s2t/models/u2_st/u2_st.py
@@ -655,14 +655,16 @@ class U2STModel(U2STBaseModel):
                 **configs['decoder_conf'])
 
             # ctc decoder and ctc loss
             model_conf = configs['model_conf']
+            dropout_rate = model_conf.get('ctc_dropout_rate', 0.0)
+            grad_norm_type = model_conf.get('ctc_grad_norm_type', None)
             ctc = CTCDecoder(
                 odim=vocab_size,
                 enc_n_units=encoder.output_size(),
                 blank_id=0,
-                dropout_rate=model_conf['ctc_dropoutrate'],
+                dropout_rate=dropout_rate,
                 reduction=True,  # sum
                 batch_average=True,  # sum / batch_size
-                grad_norm_type=model_conf['ctc_grad_norm_type'])
+                grad_norm_type=grad_norm_type)
 
             return vocab_size, encoder, (st_decoder, decoder, ctc)
         else:

diff --git a/tests/unit/asr/u2_model_test.py b/tests/unit/asr/u2_model_test.py
index f46c6d40..5b11d2ad 100644
--- a/tests/unit/asr/u2_model_test.py
+++ b/tests/unit/asr/u2_model_test.py
@@ -74,8 +74,6 @@ class TestU2Model(unittest.TestCase):
         model_conf:
             ctc_weight: 0.3
             lsm_weight: 0.1     # label smoothing option
-            ctc_dropoutrate: 0.0
-            ctc_grad_norm_type: null
            length_normalized_loss: false
        """
        cfg = CN().load_cfg(conf_str)
@@ -128,8 +126,6 @@ class TestU2Model(unittest.TestCase):
         model_conf:
             ctc_weight: 0.3
             lsm_weight: 0.1     # label smoothing option
-            ctc_dropoutrate: 0.0
-            ctc_grad_norm_type: null
            length_normalized_loss: false
        """
        cfg = CN().load_cfg(conf_str)
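The common thread in this patch: the CTC options become optional. Instead of every YAML recipe carrying `ctc_dropoutrate` and `ctc_grad_norm_type`, the model code reads them with `dict.get()` and falls back to safe defaults when a key is absent. A minimal sketch of that convention, with an illustrative config dict rather than the actual PaddleSpeech classes:

```python
# Sketch of the optional-config convention this patch introduces:
# missing keys fall back to defaults instead of raising KeyError.
model_conf = {"ctc_weight": 0.3, "lsm_weight": 0.1}  # trimmed config, no CTC keys

dropout_rate = model_conf.get("ctc_dropout_rate", 0.0)      # -> 0.0
grad_norm_type = model_conf.get("ctc_grad_norm_type", None)  # -> None

# The pre-patch lookups would fail on the trimmed configs above:
# dropout_rate = model_conf["ctc_dropoutrate"]  # KeyError

print(dropout_rate, grad_norm_type)
```

This is why the patch can delete the keys from twenty config files in one sweep: old configs that still carry the keys keep working, and new configs simply omit them.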
From e884540fec82c52824838524838b73ebfd152b8f Mon Sep 17 00:00:00 2001
From: TianYuan
Date: Mon, 13 Dec 2021 14:32:24 +0800
Subject: [PATCH 37/50] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index c98fe136..6c7aa30b 100644
--- a/README.md
+++ b/README.md
@@ -265,7 +265,7 @@ PaddleSpeech Text-to-Speech mainly contains three modules: *Text Frontend*, *Acoustic Model* and *Vocoder*.
     <td>Voice Cloning</td>
     <td>GE2E</td>
-    <td>AISHELL-3, etc.</td>
+    <td>Librispeech, etc.</td>
     <td>
     ge2e

From c759fef0aacefdcb89daa37f8ed39e471fd91e9a Mon Sep 17 00:00:00 2001
From: huangyuxin
Date: Mon, 13 Dec 2021 06:38:16 +0000
Subject: [PATCH 38/50] move pypi-kenlm from install requirements to develop
 requirements

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 1ac671f1..a5b773ed 100644
--- a/setup.py
+++ b/setup.py
@@ -46,7 +46,6 @@ requirements = {
         "paddleaudio",
         "paddlespeech_feat",
         "praatio~=4.1",
-        "pypi-kenlm",
         "pypinyin",
         "python-dateutil",
         "pyworld",
@@ -71,6 +70,7 @@ requirements = {
         "phkit",
         "Pillow",
         "pybind11",
+        "pypi-kenlm",
         "snakeviz",
         "sox",
         "soxbindings",

From 9e31a606d10a3b34e8b236637f01b3257e786ed0 Mon Sep 17 00:00:00 2001
From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com>
Date: Mon, 13 Dec 2021 14:46:20 +0800
Subject: [PATCH 39/50] set default encoding utf8 for win (#1101)

Co-authored-by: KP <109694228@qq.com>
---
 paddlespeech/cli/__init__.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/paddlespeech/cli/__init__.py b/paddlespeech/cli/__init__.py
index 99a53c37..c82168ae 100644
--- a/paddlespeech/cli/__init__.py
+++ b/paddlespeech/cli/__init__.py
@@ -11,9 +11,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+import _locale
 from .asr import ASRExecutor
 from .base_commands import BaseCommand
 from .base_commands import HelpCommand
 from .cls import CLSExecutor
 from .st import STExecutor
 from .tts import TTSExecutor
+
+_locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8'])
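On Windows, `locale.getdefaultlocale()` typically reports the ANSI code page of the system locale (for example `cp936` on a Chinese-locale machine), which breaks reading UTF-8 vocabulary and model files. Since CPython's `locale.getdefaultlocale()` looks up `_locale._getdefaultlocale()` at call time, the one-line override above forces every such caller to see UTF-8. A minimal standalone sketch of the effect (the pre-patch value in the first print depends on the platform):

```python
import _locale
import locale

# Before the override: platform-dependent result; on a Chinese-locale
# Windows box this could be ('zh_CN', 'cp936').
print(locale.getdefaultlocale())

# The same override the diff adds to paddlespeech/cli/__init__.py:
_locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8'])

# After the override: every caller of locale.getdefaultlocale() sees UTF-8.
print(locale.getdefaultlocale())  # -> ('en_US', 'utf8')
```

Placing the override in `paddlespeech/cli/__init__.py` means it takes effect as soon as any CLI command is imported, before any file with a UTF-8 dependency is opened.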
From eeadee1e7f83d8397b578665143fcd022bffe5df Mon Sep 17 00:00:00 2001
From: Mingxue-Xu <92848346+Mingxue-Xu@users.noreply.github.com>
Date: Mon, 13 Dec 2021 14:46:34 +0800
Subject: [PATCH 40/50] [README] Update ST and AC info in README.md

---
 README.md                     | 102 ++++++++++++++++++++++++++-------
 docs/source/released_model.md |  15 +++++
 2 files changed, 97 insertions(+), 20 deletions(-)

diff --git a/README.md b/README.md
index 6c7aa30b..5004df5d 100644
--- a/README.md
+++ b/README.md
@@ -23,7 +23,7 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme
 4.What is the goal of this project? -->
 
-**PaddleSpeech** is an open-source toolkit on [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) platform for a variety of critical tasks in speech, with the state-of-art and influential models.
+**PaddleSpeech** is an open-source toolkit on [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) platform for a variety of critical tasks in speech and audio, with the state-of-art and influential models.
 
 ##### Speech-to-Text
 
@@ -86,26 +86,49 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme
 For more synthesized audios, please refer to [PaddleSpeech Text-to-Speech samples](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html).
 
+##### Speech Translation
+
+<table style="width:100%">
+  <tr>
+      <td align="center"> Input Audio </td>
+      <td align="center"> Translations Result </td>
+  </tr>
+  <tr>
+      <td align="center">
+      </td>
+      <td align="center">我 在 这栋 建筑 的 古老 门上 敲门。</td>
+  </tr>
+</table>
+
 Via the easy-to-use, efficient, flexible and scalable implementation, our vision is to empower both industrial application and academic research, including training, inference & testing modules, and deployment process. To be more specific, this toolkit features at:
-- **Fast and Light-weight**: we provide high-speed and ultra-lightweight models that are convenient for industrial deployment.
+- **Ease of Use**: low barriers to install, and [CLI](#quick-start) is available to quick-start your journey.
+- **Align to the State-of-the-Art**: we provide high-speed and ultra-lightweight models, and also cutting edge technology.
 - **Rule-based Chinese frontend**: our frontend contains Text Normalization and Grapheme-to-Phoneme (G2P, including Polyphone and Tone Sandhi). Moreover, we use self-defined linguistic rules to adapt Chinese context.
 - **Varieties of Functions that Vitalize both Industrial and Academia**:
-  - *Implementation of critical audio tasks*: this toolkit contains audio functions like Speech Translation, Automatic Speech Recognition, Text-to-Speech Synthesis, Voice Cloning, etc.
+  - *Implementation of critical audio tasks*: this toolkit contains audio functions like Audio Classification, Speech Translation, Automatic Speech Recognition, Text-to-Speech Synthesis, etc.
   - *Integration of mainstream models and datasets*: the toolkit implements modules that participate in the whole pipeline of the speech tasks, and uses mainstream datasets like LibriSpeech, LJSpeech, AIShell, CSMSC, etc. See also [model list](#model-list) for more details.
-  - *Cascaded models application*: as an extension of the application of traditional audio tasks, we combine the workflows of aforementioned tasks with other fields like Natural language processing (NLP), like Punctuation Restoration.
+  - *Cascaded models application*: as an extension of the typical traditional audio tasks, we combine the workflows of the aforementioned tasks with other fields like Natural language processing (NLP) and Computer Vision (CV).
 
 ## Installation
-The base environment in this page is
-- Ubuntu 16.04
-- python>=3.7
-- paddlepaddle>=2.2.0
-
-If you want to set up PaddleSpeech in other environment, please see the [installation](./docs/source/install.md) documents for all the alternatives.
+We strongly recommend our users to install PaddleSpeech in *Linux* with *python>=3.7* and *paddlepaddle>=2.2.0*, where `paddlespeech` can be easily installed with `pip`:
+```shell
+pip install paddlespeech
+```
+If you want to set up in other environment, please see the [installation](./docs/source/install.md) for all the alternatives.
 
 ## Quick Start
-Developers can have a try of our models with [PaddleSpeech Command Line](./paddlespeech/cli/README.md). Change `--input` to test your own audio/text file.
+Developers can have a try of our models with [PaddleSpeech Command Line](./paddlespeech/cli/README.md). Change `--input` to test your own audio/text.
 
 **Audio Classification**
 ```shell
 paddlespeech cls --input input.wav
 ```
 **Speech Translation**
 ```shell
 paddlespeech st --input input_16k.wav
 ```
 **Text-to-Speech**
 ```shell
 paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!" --output output.wav
 ```
 
-If you want to try more functions like training and tuning, please see [Speech-to-Text Quick Start](./docs/source/asr/quick_start.md) and [Text-to-Speech Quick Start](./docs/source/tts/quick_start.md).
+If you want to try more functions like training and tuning, please have a look at [Speech-to-Text Quick Start](./docs/source/asr/quick_start.md) and [Text-to-Speech Quick Start](./docs/source/tts/quick_start.md).
 
 ## Model List
 
-PaddleSpeech supports a series of most popular models, summarized in [released models](./docs/source/released_model.md) with available pretrained models.
+PaddleSpeech supports a series of most popular models. They are summarized in [released models](./docs/source/released_model.md) and attached with available pretrained models.
 
-Speech-to-Text module contains *Acoustic Model* and *Language Model*, with the following details:
+**Speech-to-Text** contains *Acoustic Model* and *Language Model*, with the following details:
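The executors that patch 39 re-exports from `paddlespeech.cli` can also be driven from Python instead of the shell. The sketch below assumes the executors are callable objects and that the keyword names mirror the CLI flags; the argument names (`audio_file`, `text`, `output`) are assumptions for illustration, not verified against this revision:

```python
from paddlespeech.cli import ASRExecutor, TTSExecutor

# Roughly equivalent in spirit to `paddlespeech asr --input input_16k.wav`.
asr = ASRExecutor()
result = asr(audio_file="input_16k.wav")  # keyword name is an assumption
print(result)

# Roughly equivalent in spirit to `paddlespeech tts --input ... --output output.wav`.
tts = TTSExecutor()
tts(text="你好,欢迎使用百度飞桨深度学习框架!", output="output.wav")  # keyword names are assumptions
```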