commit 35c37ace17 (parent 58b24aa49f)
@@ -0,0 +1,115 @@
# This is the hyperparameter configuration file for Parallel WaveGAN.
# Please make sure this is adjusted for the VCTK corpus. If you want to
# apply it to another dataset, you might need to carefully change some parameters.
# This configuration requires 12 GB of GPU memory and takes ~3 days on an RTX TITAN.

###########################################################
#                FEATURE EXTRACTION SETTING                #
###########################################################
fs: 24000            # Sampling rate.
n_fft: 2048          # FFT size. (in samples)
n_shift: 300         # Hop size. (in samples)
win_length: 1200     # Window length. (in samples)
                     # If set to null, it will be the same as n_fft.
window: "hann"       # Window function.
n_mels: 80           # Number of mel basis.
fmin: 80             # Minimum frequency in mel basis calculation. (Hz)
fmax: 7600           # Maximum frequency in mel basis calculation. (Hz)

###########################################################
#         GENERATOR NETWORK ARCHITECTURE SETTING           #
###########################################################
generator_params:
    in_channels: 1        # Number of input channels.
    out_channels: 1       # Number of output channels.
    kernel_size: 3        # Kernel size of dilated convolution.
    layers: 30            # Number of residual block layers.
    stacks: 3             # Number of stacks, i.e., dilation cycles.
    residual_channels: 64 # Number of channels in residual conv.
    gate_channels: 128    # Number of channels in gated conv.
    skip_channels: 64     # Number of channels in skip conv.
    aux_channels: 80      # Number of channels for auxiliary feature conv.
                          # Must be the same as n_mels.
    aux_context_window: 2 # Context window size for auxiliary feature.
                          # If set to 2, previous 2 and future 2 frames will be considered.
    dropout: 0.0          # Dropout rate. 0.0 means no dropout applied.
    use_weight_norm: true # Whether to use weight norm.
                          # If set to true, it will be applied to all of the conv layers.
    upsample_scales: [4, 5, 3, 5] # Upsampling scales. Product of these must equal
                                  # the hop size (n_shift): 4 * 5 * 3 * 5 = 300.

###########################################################
#       DISCRIMINATOR NETWORK ARCHITECTURE SETTING         #
###########################################################
discriminator_params:
    in_channels: 1        # Number of input channels.
    out_channels: 1       # Number of output channels.
    kernel_size: 3        # Kernel size of conv layers.
    layers: 10            # Number of conv layers.
    conv_channels: 64     # Number of channels in conv layers.
    bias: true            # Whether to use bias parameter in conv.
    use_weight_norm: true # Whether to use weight norm.
                          # If set to true, it will be applied to all of the conv layers.
    nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv.
    nonlinear_activation_params:      # Nonlinear function parameters.
        negative_slope: 0.2           # Alpha in LeakyReLU.

###########################################################
#                   STFT LOSS SETTING                      #
###########################################################
stft_loss_params:
    fft_sizes: [1024, 2048, 512]  # List of FFT sizes for STFT-based loss.
    hop_sizes: [120, 240, 50]     # List of hop sizes for STFT-based loss.
    win_lengths: [600, 1200, 240] # List of window lengths for STFT-based loss.
    window: "hann"                # Window function for STFT-based loss.

###########################################################
#               ADVERSARIAL LOSS SETTING                   #
###########################################################
lambda_adv: 4.0 # Loss balancing coefficient.

###########################################################
#                  DATA LOADER SETTING                     #
###########################################################
batch_size: 8               # Batch size.
batch_max_steps: 24000      # Length of each audio in batch. Make sure it is divisible by the hop size (n_shift).
pin_memory: true            # Whether to pin memory in DataLoader.
num_workers: 4              # Number of workers in DataLoader.
remove_short_samples: true  # Whether to remove samples whose length is less than batch_max_steps.
allow_cache: true           # Whether to allow cache in dataset. If true, it requires CPU memory.

###########################################################
#             OPTIMIZER & SCHEDULER SETTING                #
###########################################################
generator_optimizer_params:
    epsilon: 1.0e-6       # Generator's epsilon.
    weight_decay: 0.0     # Generator's weight decay coefficient.
generator_scheduler_params:
    learning_rate: 0.0001 # Generator's learning rate.
    step_size: 200000     # Generator's scheduler step size.
    gamma: 0.5            # Generator's scheduler gamma.
                          # At each step size, lr will be multiplied by this parameter.
generator_grad_norm: 10   # Generator's gradient norm.
discriminator_optimizer_params:
    epsilon: 1.0e-6        # Discriminator's epsilon.
    weight_decay: 0.0      # Discriminator's weight decay coefficient.
discriminator_scheduler_params:
    learning_rate: 0.00005 # Discriminator's learning rate.
    step_size: 200000      # Discriminator's scheduler step size.
    gamma: 0.5             # Discriminator's scheduler gamma.
                           # At each step size, lr will be multiplied by this parameter.
discriminator_grad_norm: 1 # Discriminator's gradient norm.

###########################################################
#                    INTERVAL SETTING                      #
###########################################################
discriminator_train_start_steps: 100000 # Number of steps after which to start training the discriminator.
train_max_steps: 1000000                # Number of training steps.
save_interval_steps: 5000               # Interval steps to save checkpoint.
eval_interval_steps: 1000               # Interval steps to evaluate the network.

###########################################################
#                     OTHER SETTING                        #
###########################################################
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
num_snapshots: 10                # Max number of snapshots to keep while training.
seed: 42                         # Random seed for paddle, random, and np.random.
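
The config comments above encode two cross-field constraints: `aux_channels` must equal `n_mels`, and the product of `upsample_scales` must equal the hop size `n_shift` (here 4 * 5 * 3 * 5 = 300). A minimal sanity-check sketch, assuming the file is saved as `conf/default.yaml` (the path is an assumption):

```python
# Minimal config sanity check; the config path is an assumption.
import math
import yaml

with open("conf/default.yaml") as f:
    cfg = yaml.safe_load(f)

gen = cfg["generator_params"]
assert gen["aux_channels"] == cfg["n_mels"], "aux_channels must equal n_mels"
assert math.prod(gen["upsample_scales"]) == cfg["n_shift"], \
    "product of upsample_scales must equal the hop size (n_shift)"
print("config OK")
```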

@@ -0,0 +1,55 @@
#!/bin/bash

stage=0
stop_stage=100

config_path=$1

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # get durations from MFA's results
    echo "Generate durations.txt from MFA results ..."
    python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
        --inputdir=./aishell3_alignment_tone \
        --output=durations.txt \
        --config=${config_path}
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # extract features
    echo "Extract features ..."
    python3 ${BIN_DIR}/../preprocess.py \
        --rootdir=~/datasets/data_aishell3/ \
        --dataset=aishell3 \
        --dumpdir=dump \
        --dur-file=durations.txt \
        --config=${config_path} \
        --cut-sil=True \
        --num-cpu=20
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # get features' stats (mean and std)
    echo "Get features' stats ..."
    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --field-name="feats"
fi

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # normalize; dev and test should use train's stats
    echo "Normalize ..."

    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --dumpdir=dump/train/norm \
        --stats=dump/train/feats_stats.npy

    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump/dev/raw/metadata.jsonl \
        --dumpdir=dump/dev/norm \
        --stats=dump/train/feats_stats.npy

    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump/test/raw/metadata.jsonl \
        --dumpdir=dump/test/norm \
        --stats=dump/train/feats_stats.npy
fi

@@ -0,0 +1,13 @@
#!/bin/bash

config_path=$1
train_output_path=$2
ckpt_name=$3

FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/synthesize.py \
    --config=${config_path} \
    --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
    --test-metadata=dump/test/norm/metadata.jsonl \
    --output-dir=${train_output_path}/test

@@ -0,0 +1,13 @@
#!/bin/bash

config_path=$1
train_output_path=$2

FLAGS_cudnn_exhaustive_search=true \
FLAGS_conv_workspace_size_limit=4000 \
python3 ${BIN_DIR}/train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=${config_path} \
    --output-dir=${train_output_path} \
    --ngpu=1

@@ -0,0 +1,13 @@
#!/bin/bash
export MAIN_ROOT=$(realpath ${PWD}/../../../)

export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C

export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}

MODEL=parallelwave_gan
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL}

@@ -0,0 +1,32 @@
#!/bin/bash

set -e
source path.sh

gpus=0
stage=0
stop_stage=100

conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_5000.pdz

# With the following command, you can choose the stage range you want to run,
# e.g. `./run.sh --stage 0 --stop-stage 0`.
# Note that this cannot be combined with positional arguments (`$1`, `$2`, ...).
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # prepare data
    ./local/preprocess.sh ${conf_path} || exit 1
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # train model; all `ckpt` files are saved under the `train_output_path/checkpoints/` dir
    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit 1
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # synthesize
    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit 1
fi

@@ -0,0 +1,20 @@
# G2P
For g2p, we use BZNSYP's phone labels as the ground truth, and we delete silence tokens from both the labels and the predicted phones.

You should download BZNSYP from its [Official Website](https://test.data-baker.com/data/index/source) and extract it. Assume the path to the dataset is `~/datasets/BZNSYP`.

We use `WER` as the evaluation criterion.
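
As a rough illustration of the scoring described above, silence-stripping plus WER might look like the sketch below. This is an illustration only, not the actual `test_g2p.py`; the silence-token set and the helper names are assumptions.

```python
# Hypothetical sketch of the metric; the real test_g2p.py may differ.
SILENCE_TOKENS = {"sil", "sp"}  # assumed silence labels

def edit_distance(ref, hyp):
    """Levenshtein distance between two token sequences (single-row DP)."""
    dp = list(range(len(hyp) + 1))
    for i, r in enumerate(ref, 1):
        prev, dp[0] = dp[0], i
        for j, h in enumerate(hyp, 1):
            prev, dp[j] = dp[j], min(dp[j] + 1, dp[j - 1] + 1, prev + (r != h))
    return dp[-1]

def wer(ref_phones, hyp_phones):
    """Phone-level error rate after dropping silence tokens from both sides."""
    ref = [p for p in ref_phones if p not in SILENCE_TOKENS]
    hyp = [p for p in hyp_phones if p not in SILENCE_TOKENS]
    return edit_distance(ref, hyp) / max(len(ref), 1)
```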

## Start
Run the command below to get the test results.
```bash
./run.sh
```
The `avg WER` of g2p is: 0.027495061517943988
```text
,--------------------------------------------------------------------.
|        |  # Snt   # Wrd  |  Corr    Sub    Del    Ins    Err  S.Err |
|--------+-----------------+-----------------------------------------|
| Sum/Avg|   9996   299181 |  97.3    2.7    0.0    0.0    2.7   52.5 |
`--------------------------------------------------------------------'
```

@@ -0,0 +1,16 @@
#!/bin/bash

source path.sh
USE_SCLITE=true

# test g2p
echo "Start get g2p test data ..."
python3 get_g2p_data.py --root-dir=~/datasets/BZNSYP --output-dir=data/g2p
echo "Start test g2p ..."
python3 test_g2p.py --input-dir=data/g2p --output-dir=exp/g2p

# whether to use sclite to get more detailed information of WER
if [ "$USE_SCLITE" = true ]; then
    echo "Start sclite g2p ..."
    ${MAIN_ROOT}/tools/sctk/bin/sclite -i wsj -r ./exp/g2p/text.ref.clean trn -h ./exp/g2p/text.g2p trn -e utf-8 -o all
fi

@@ -1,38 +0,0 @@
# Chinese Text Frontend Example
Here's an example for the Chinese text frontend, including g2p and text normalization.
## G2P
For g2p, we use BZNSYP's phone labels as the ground truth, and we delete silence tokens from both the labels and the predicted phones.

You should download BZNSYP from its [Official Website](https://test.data-baker.com/data/index/source) and extract it. Assume the path to the dataset is `~/datasets/BZNSYP`.

We use `WER` as the evaluation criterion.
## Text Normalization
For text normalization, the test data is `data/textnorm_test_cases.txt`; we use `|` as the separator between raw_data and normed_data.

We use `CER` as the evaluation criterion.
## Start
If you want to use sclite to get more detailed information about WER, you should run the command below to build sclite first.
```bash
./make_sclite.sh
```
Run the command below to get the test results.
```bash
./run.sh
```
The `avg WER` of g2p is: 0.027495061517943988
```text
,--------------------------------------------------------------------.
|        |  # Snt   # Wrd  |  Corr    Sub    Del    Ins    Err  S.Err |
|--------+-----------------+-----------------------------------------|
| Sum/Avg|   9996   299181 |  97.3    2.7    0.0    0.0    2.7   52.5 |
`--------------------------------------------------------------------'
```

The `avg CER` of text normalization is: 0.006388318503308237
```text
,-----------------------------------------------------------------.
|        | # Snt  # Wrd |  Corr    Sub    Del    Ins    Err  S.Err |
|--------+--------------+-----------------------------------------|
| Sum/Avg|   125    2254|  99.4    0.1    0.5    0.1    0.7    3.2 |
`-----------------------------------------------------------------'
```

@@ -1,13 +0,0 @@
#!/bin/bash

if [ ! -d "./SCTK" ]; then
    echo "Clone SCTK ..."
    git clone https://github.com/usnistgov/SCTK
    echo "Clone SCTK done!"
fi

if [ ! -d "./SCTK/bin" ]; then
    echo "Start make SCTK ..."
    pushd SCTK && make config && make all && make check && make install && make doc && popd
    echo "SCTK make done!"
fi

@@ -1,25 +0,0 @@
#!/bin/bash

USE_SCLITE=true

# test g2p
echo "Start get g2p test data ..."
python3 get_g2p_data.py --root-dir=~/datasets/BZNSYP --output-dir=data/g2p
echo "Start test g2p ..."
python3 test_g2p.py --input-dir=data/g2p --output-dir=exp/g2p

# test text normalization
echo "Start get text normalization test data ..."
python3 get_textnorm_data.py --test-file=data/textnorm_test_cases.txt --output-dir=data/textnorm
echo "Start test text normalization ..."
python3 test_textnorm.py --input-dir=data/textnorm --output-dir=exp/textnorm

# whether to use sclite to get more detailed information of WER and CER
if [ "$USE_SCLITE" = true ]; then
    echo "Start sclite g2p ..."
    ./SCTK/bin/sclite -i wsj -r ./exp/g2p/text.ref.clean trn -h ./exp/g2p/text.g2p trn -e utf-8 -o all
    echo

    echo "Start sclite textnorm ..."
    ./SCTK/bin/sclite -i wsj -r ./exp/textnorm/text.ref.clean trn -h ./exp/textnorm/text.tn trn -e utf-8 -o all
fi

@@ -0,0 +1,17 @@
# Text Normalization
For text normalization, the test data is `data/textnorm_test_cases.txt`; we use `|` as the separator between raw_data and normed_data.

We use `CER` as the evaluation criterion.
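
A minimal sketch of how such `|`-separated cases could be loaded and scored is shown below. This is an illustration only, not the actual `test_textnorm.py`; the `normalize` callable in the usage comment is an assumption.

```python
# Hypothetical sketch; the real test_textnorm.py may differ.
def load_cases(path):
    """Yield (raw_data, normed_data) pairs; each line is 'raw|normed'."""
    with open(path, encoding="utf-8") as f:
        for line in f:
            raw, ref = line.rstrip("\n").split("|", 1)
            yield raw, ref

def cer(ref, hyp):
    """Character error rate: Levenshtein distance / reference length."""
    dp = list(range(len(hyp) + 1))
    for i, r in enumerate(ref, 1):
        prev, dp[0] = dp[0], i
        for j, h in enumerate(hyp, 1):
            prev, dp[j] = dp[j], min(dp[j] + 1, dp[j - 1] + 1, prev + (r != h))
    return dp[-1] / max(len(ref), 1)

# Usage, with `normalize` standing in for the text normalizer under test:
# scores = [cer(ref, normalize(raw)) for raw, ref in load_cases("data/textnorm_test_cases.txt")]
```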

## Start
Run the command below to get the test results.
```bash
./run.sh
```
The `avg CER` of text normalization is: 0.006388318503308237
```text
,-----------------------------------------------------------------.
|        | # Snt  # Wrd |  Corr    Sub    Del    Ins    Err  S.Err |
|--------+--------------+-----------------------------------------|
| Sum/Avg|   125    2254|  99.4    0.1    0.5    0.1    0.7    3.2 |
`-----------------------------------------------------------------'
```

@@ -0,0 +1,17 @@
#!/bin/bash

source path.sh

USE_SCLITE=true

# test text normalization
echo "Start get text normalization test data ..."
python3 get_textnorm_data.py --test-file=data/textnorm_test_cases.txt --output-dir=data/textnorm
echo "Start test text normalization ..."
python3 test_textnorm.py --input-dir=data/textnorm --output-dir=exp/textnorm

# whether to use sclite to get more detailed information of CER
if [ "$USE_SCLITE" = true ]; then
    echo "Start sclite textnorm ..."
    ${MAIN_ROOT}/tools/sctk/bin/sclite -i wsj -r ./exp/textnorm/text.ref.clean trn -h ./exp/textnorm/text.tn trn -e utf-8 -o all
fi