Merge branch 'develop' of https://github.com/PaddlePaddle/PaddleSpeech into develop
commit 5564c7c105
@@ -0,0 +1,98 @@
############################################
#           Network Architecture           #
############################################
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: conformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1   # sublayer output dropout
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
    normalize_before: True
    cnn_module_kernel: 15
    use_cnn_module: True
    activation_type: 'swish'
    pos_enc_layer_type: 'rope_pos'  # abs_pos, rel_pos, rope_pos
    selfattention_layer_type: 'rel_selfattn'  # unused
    causal: true
    use_dynamic_chunk: true
    cnn_module_norm: 'layer_norm'   # using nn.LayerNorm makes model converge faster
    use_dynamic_left_chunk: false
# decoder related
decoder: transformer    # transformer, bitransformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    r_num_blocks: 0     # only for bitransformer
    dropout_rate: 0.1   # sublayer output dropout
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    reverse_weight: 0.0 # only for bitransformer
    length_normalized_loss: false
    init_type: 'kaiming_uniform'    # !Warning: needed for convergence

###########################################
#                  Data                   #
###########################################
train_manifest: data/manifest.train
dev_manifest: data/manifest.dev
test_manifest: data/manifest.test

###########################################
#               Dataloader                #
###########################################
vocab_filepath: data/lang_char/vocab.txt
spm_model_prefix: ''
unit_type: 'char'
preprocess_config: conf/preprocess.yaml
feat_dim: 80
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0    # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
batch_size: 32
maxlen_in: 512  # if input length > maxlen-in, batch size is automatically reduced
maxlen_out: 150 # if output length > maxlen-out, batch size is automatically reduced
minibatches: 0  # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
num_workers: 2
subsampling_factor: 1
num_encs: 1

###########################################
#                Training                 #
###########################################
n_epoch: 240
accum_grad: 1
global_grad_clip: 5.0
dist_sampler: True
optim: adam
optim_conf:
    lr: 0.001
    weight_decay: 1.0e-6
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000
    lr_decay: 1.0
log_interval: 100
checkpoint:
    kbest_n: 50
    latest_n: 5
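A minimal sketch, assuming PyYAML is available, of loading a config like the one above to inspect a few fields before training; the file name used here is hypothetical, and the training entry point parses the config with its own utilities:

import yaml

# Hypothetical path to the config shown above.
CONFIG_PATH = "conf/chunk_conformer.yaml"

with open(CONFIG_PATH, "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

enc = cfg["encoder_conf"]
# e.g. conformer, 12 blocks, output size 256, rope_pos positional encoding
print(cfg["encoder"], enc["num_blocks"], enc["output_size"], enc["pos_enc_layer_type"])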
@@ -0,0 +1,98 @@
############################################
#           Network Architecture           #
############################################
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: conformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1   # sublayer output dropout
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
    normalize_before: True
    cnn_module_kernel: 15
    use_cnn_module: True
    activation_type: 'swish'
    pos_enc_layer_type: 'rope_pos'  # abs_pos, rel_pos, rope_pos
    selfattention_layer_type: 'rel_selfattn'  # unused
    causal: true
    use_dynamic_chunk: true
    cnn_module_norm: 'layer_norm'   # using nn.LayerNorm makes model converge faster
    use_dynamic_left_chunk: false
# decoder related
decoder: bitransformer  # transformer, bitransformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 3
    r_num_blocks: 3     # only for bitransformer
    dropout_rate: 0.1   # sublayer output dropout
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    reverse_weight: 0.3 # only for bitransformer
    length_normalized_loss: false
    init_type: 'kaiming_uniform'    # !Warning: needed for convergence

###########################################
#                  Data                   #
###########################################
train_manifest: data/manifest.train
dev_manifest: data/manifest.dev
test_manifest: data/manifest.test

###########################################
#               Dataloader                #
###########################################
vocab_filepath: data/lang_char/vocab.txt
spm_model_prefix: ''
unit_type: 'char'
preprocess_config: conf/preprocess.yaml
feat_dim: 80
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0    # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
batch_size: 32
maxlen_in: 512  # if input length > maxlen-in, batch size is automatically reduced
maxlen_out: 150 # if output length > maxlen-out, batch size is automatically reduced
minibatches: 0  # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
num_workers: 2
subsampling_factor: 1
num_encs: 1

###########################################
#                Training                 #
###########################################
n_epoch: 240
accum_grad: 1
global_grad_clip: 5.0
dist_sampler: True
optim: adam
optim_conf:
    lr: 0.001
    weight_decay: 1.0e-6
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000
    lr_decay: 1.0
log_interval: 100
checkpoint:
    kbest_n: 50
    latest_n: 5
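This second config differs from the first mainly in the decoder: bitransformer with 3 forward and 3 reverse blocks, and a non-zero reverse_weight. A rough sketch of how ctc_weight and reverse_weight are conventionally combined in U2-style hybrid CTC/attention training; treat the exact formula as an assumption rather than a statement of this code base's implementation:

def combined_loss(loss_ctc, loss_att_l2r, loss_att_r2l,
                  ctc_weight=0.3, reverse_weight=0.3):
    # Attention loss mixes the left-to-right and right-to-left decoder branches.
    loss_att = (1.0 - reverse_weight) * loss_att_l2r + reverse_weight * loss_att_r2l
    # The final loss interpolates the CTC and attention losses.
    return ctc_weight * loss_ctc + (1.0 - ctc_weight) * loss_att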
@@ -0,0 +1,46 @@
#!/bin/bash

train_output_path=$1

stage=0
stop_stage=0

# pwgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${BIN_DIR}/../inference.py \
        --inference_dir=${train_output_path}/inference \
        --am=speedyspeech_csmsc \
        --voc=pwgan_csmsc \
        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt \
        --device xpu
fi

# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    python3 ${BIN_DIR}/../inference.py \
        --inference_dir=${train_output_path}/inference \
        --am=speedyspeech_csmsc \
        --voc=mb_melgan_csmsc \
        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt \
        --device xpu
fi

# hifigan
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    python3 ${BIN_DIR}/../inference.py \
        --inference_dir=${train_output_path}/inference \
        --am=speedyspeech_csmsc \
        --voc=hifigan_csmsc \
        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt \
        --device xpu
fi
@@ -0,0 +1,122 @@
#!/bin/bash

config_path=$1
train_output_path=$2
ckpt_name=$3

stage=0
stop_stage=0

# pwgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    FLAGS_allocator_strategy=naive_best_fit \
    python3 ${BIN_DIR}/../synthesize_e2e.py \
        --am=speedyspeech_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/feats_stats.npy \
        --voc=pwgan_csmsc \
        --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
        --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
        --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
        --lang=zh \
        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt \
        --inference_dir=${train_output_path}/inference \
        --ngpu=0 \
        --nxpu=1
fi

# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    FLAGS_allocator_strategy=naive_best_fit \
    python3 ${BIN_DIR}/../synthesize_e2e.py \
        --am=speedyspeech_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/feats_stats.npy \
        --voc=mb_melgan_csmsc \
        --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
        --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz \
        --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --lang=zh \
        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt \
        --inference_dir=${train_output_path}/inference \
        --ngpu=0 \
        --nxpu=1
fi

# the pretrained models haven't been released yet
# style melgan
# style melgan's Dygraph to Static Graph conversion is not ready yet
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    FLAGS_allocator_strategy=naive_best_fit \
    python3 ${BIN_DIR}/../synthesize_e2e.py \
        --am=speedyspeech_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/feats_stats.npy \
        --voc=style_melgan_csmsc \
        --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
        --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
        --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --lang=zh \
        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt \
        --ngpu=0 \
        --nxpu=1
        # --inference_dir=${train_output_path}/inference
fi

# hifigan
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    FLAGS_allocator_strategy=naive_best_fit \
    python3 ${BIN_DIR}/../synthesize_e2e.py \
        --am=speedyspeech_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/feats_stats.npy \
        --voc=hifigan_csmsc \
        --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
        --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
        --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --lang=zh \
        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt \
        --inference_dir=${train_output_path}/inference \
        --ngpu=0 \
        --nxpu=1
fi

# wavernn
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    echo "in wavernn syn_e2e"
    FLAGS_allocator_strategy=naive_best_fit \
    python3 ${BIN_DIR}/../synthesize_e2e.py \
        --am=speedyspeech_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/feats_stats.npy \
        --voc=wavernn_csmsc \
        --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
        --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
        --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
        --lang=zh \
        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt \
        --inference_dir=${train_output_path}/inference \
        --ngpu=0 \
        --nxpu=1
fi
@@ -0,0 +1,110 @@
#!/bin/bash

config_path=$1
train_output_path=$2
ckpt_name=$3
stage=0
stop_stage=0

# pwgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    FLAGS_allocator_strategy=naive_best_fit \
    python3 ${BIN_DIR}/../synthesize.py \
        --am=speedyspeech_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/feats_stats.npy \
        --voc=pwgan_csmsc \
        --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
        --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
        --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
        --test_metadata=dump/test/norm/metadata.jsonl \
        --output_dir=${train_output_path}/test \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt \
        --ngpu=0 \
        --nxpu=1
fi

# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    FLAGS_allocator_strategy=naive_best_fit \
    python3 ${BIN_DIR}/../synthesize.py \
        --am=speedyspeech_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/feats_stats.npy \
        --voc=mb_melgan_csmsc \
        --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
        --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz \
        --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --test_metadata=dump/test/norm/metadata.jsonl \
        --output_dir=${train_output_path}/test \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt \
        --ngpu=0 \
        --nxpu=1
fi

# style melgan
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    FLAGS_allocator_strategy=naive_best_fit \
    python3 ${BIN_DIR}/../synthesize.py \
        --am=speedyspeech_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/feats_stats.npy \
        --voc=style_melgan_csmsc \
        --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
        --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
        --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --test_metadata=dump/test/norm/metadata.jsonl \
        --output_dir=${train_output_path}/test \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt \
        --ngpu=0 \
        --nxpu=1
fi

# hifigan
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    echo "in hifigan syn"
    FLAGS_allocator_strategy=naive_best_fit \
    python3 ${BIN_DIR}/../synthesize.py \
        --am=speedyspeech_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/feats_stats.npy \
        --voc=hifigan_csmsc \
        --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
        --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
        --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --test_metadata=dump/test/norm/metadata.jsonl \
        --output_dir=${train_output_path}/test \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt \
        --ngpu=0 \
        --nxpu=1
fi

# wavernn
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    echo "in wavernn syn"
    FLAGS_allocator_strategy=naive_best_fit \
    python3 ${BIN_DIR}/../synthesize.py \
        --am=speedyspeech_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/feats_stats.npy \
        --voc=wavernn_csmsc \
        --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
        --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
        --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
        --test_metadata=dump/test/norm/metadata.jsonl \
        --output_dir=${train_output_path}/test \
        --tones_dict=dump/tone_id_map.txt \
        --phones_dict=dump/phone_id_map.txt \
        --ngpu=0 \
        --nxpu=1
fi
@@ -0,0 +1,16 @@
#!/bin/bash

config_path=$1
train_output_path=$2

python ${BIN_DIR}/train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=${config_path} \
    --output-dir=${train_output_path} \
    --ngpu=0 \
    --nxpu=1 \
    --phones-dict=dump/phone_id_map.txt \
    --tones-dict=dump/tone_id_map.txt \
    --use-relative-path=True
@@ -0,0 +1,42 @@
#!/bin/bash

set -e
source path.sh

xpus=0,1
stage=0
stop_stage=100

conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_76.pdz

# with the following command, you can choose the stage range you want to run
# such as `./run_xpu.sh --stage 0 --stop-stage 0`
# this cannot be mixed with passing `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # prepare data
    ./local/preprocess.sh ${conf_path} || exit -1
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # train model, all `ckpt` under `train_output_path/checkpoints/` dir
    FLAGS_selected_xpus=${xpus} ./local/train_xpu.sh ${conf_path} ${train_output_path} || exit -1
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # synthesize, vocoder is pwgan by default
    FLAGS_selected_xpus=${xpus} ./local/synthesize_xpu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # synthesize_e2e, vocoder is pwgan by default
    FLAGS_selected_xpus=${xpus} ./local/synthesize_e2e_xpu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi

if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    # inference with static model
    FLAGS_selected_xpus=${xpus} ./local/inference_xpu.sh ${train_output_path} || exit -1
fi
@@ -0,0 +1,55 @@
#!/bin/bash

train_output_path=$1

stage=0
stop_stage=0

# pwgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${BIN_DIR}/../inference.py \
        --inference_dir=${train_output_path}/inference \
        --am=fastspeech2_csmsc \
        --voc=pwgan_csmsc \
        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --device xpu
fi

# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    python3 ${BIN_DIR}/../inference.py \
        --inference_dir=${train_output_path}/inference \
        --am=fastspeech2_csmsc \
        --voc=mb_melgan_csmsc \
        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --device xpu
fi

# hifigan
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    python3 ${BIN_DIR}/../inference.py \
        --inference_dir=${train_output_path}/inference \
        --am=fastspeech2_csmsc \
        --voc=hifigan_csmsc \
        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --device xpu
fi

# wavernn
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    python3 ${BIN_DIR}/../inference.py \
        --inference_dir=${train_output_path}/inference \
        --am=fastspeech2_csmsc \
        --voc=wavernn_csmsc \
        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --device xpu
fi
@@ -0,0 +1,119 @@
#!/bin/bash

config_path=$1
train_output_path=$2
ckpt_name=$3

stage=0
stop_stage=0

# pwgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    FLAGS_allocator_strategy=naive_best_fit \
    python3 ${BIN_DIR}/../synthesize_e2e.py \
        --am=fastspeech2_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/speech_stats.npy \
        --voc=pwgan_csmsc \
        --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
        --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
        --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
        --lang=zh \
        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --inference_dir=${train_output_path}/inference \
        --ngpu=0 \
        --nxpu=1
fi

# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    FLAGS_allocator_strategy=naive_best_fit \
    python3 ${BIN_DIR}/../synthesize_e2e.py \
        --am=fastspeech2_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/speech_stats.npy \
        --voc=mb_melgan_csmsc \
        --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
        --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz \
        --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --lang=zh \
        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --inference_dir=${train_output_path}/inference \
        --ngpu=0 \
        --nxpu=1
fi

# the pretrained models haven't been released yet
# style melgan
# style melgan's Dygraph to Static Graph conversion is not ready yet
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    FLAGS_allocator_strategy=naive_best_fit \
    python3 ${BIN_DIR}/../synthesize_e2e.py \
        --am=fastspeech2_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/speech_stats.npy \
        --voc=style_melgan_csmsc \
        --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
        --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
        --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --lang=zh \
        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --ngpu=0 \
        --nxpu=1
        # --inference_dir=${train_output_path}/inference
fi

# hifigan
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    echo "in hifigan syn_e2e"
    FLAGS_allocator_strategy=naive_best_fit \
    python3 ${BIN_DIR}/../synthesize_e2e.py \
        --am=fastspeech2_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/speech_stats.npy \
        --voc=hifigan_csmsc \
        --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
        --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
        --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --lang=zh \
        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --inference_dir=${train_output_path}/inference \
        --ngpu=0 \
        --nxpu=1
fi

# wavernn
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    echo "in wavernn syn_e2e"
    FLAGS_allocator_strategy=naive_best_fit \
    python3 ${BIN_DIR}/../synthesize_e2e.py \
        --am=fastspeech2_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/speech_stats.npy \
        --voc=wavernn_csmsc \
        --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
        --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
        --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
        --lang=zh \
        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --inference_dir=${train_output_path}/inference \
        --ngpu=0 \
        --nxpu=1
fi
@@ -0,0 +1,105 @@
#!/bin/bash

config_path=$1
train_output_path=$2
ckpt_name=$3
stage=0
stop_stage=0

# pwgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    FLAGS_allocator_strategy=naive_best_fit \
    python3 ${BIN_DIR}/../synthesize.py \
        --am=fastspeech2_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/speech_stats.npy \
        --voc=pwgan_csmsc \
        --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
        --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
        --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
        --test_metadata=dump/test/norm/metadata.jsonl \
        --output_dir=${train_output_path}/test \
        --phones_dict=dump/phone_id_map.txt \
        --ngpu=0 \
        --nxpu=1
fi

# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    FLAGS_allocator_strategy=naive_best_fit \
    python3 ${BIN_DIR}/../synthesize.py \
        --am=fastspeech2_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/speech_stats.npy \
        --voc=mb_melgan_csmsc \
        --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
        --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz \
        --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --test_metadata=dump/test/norm/metadata.jsonl \
        --output_dir=${train_output_path}/test \
        --phones_dict=dump/phone_id_map.txt \
        --ngpu=0 \
        --nxpu=1
fi

# style melgan
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    FLAGS_allocator_strategy=naive_best_fit \
    python3 ${BIN_DIR}/../synthesize.py \
        --am=fastspeech2_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/speech_stats.npy \
        --voc=style_melgan_csmsc \
        --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
        --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
        --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --test_metadata=dump/test/norm/metadata.jsonl \
        --output_dir=${train_output_path}/test \
        --phones_dict=dump/phone_id_map.txt \
        --ngpu=0 \
        --nxpu=1
fi

# hifigan
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    echo "in hifigan syn"
    FLAGS_allocator_strategy=naive_best_fit \
    python3 ${BIN_DIR}/../synthesize.py \
        --am=fastspeech2_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/speech_stats.npy \
        --voc=hifigan_csmsc \
        --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
        --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
        --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --test_metadata=dump/test/norm/metadata.jsonl \
        --output_dir=${train_output_path}/test \
        --phones_dict=dump/phone_id_map.txt \
        --ngpu=0 \
        --nxpu=1
fi

# wavernn
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    echo "in wavernn syn"
    FLAGS_allocator_strategy=naive_best_fit \
    python3 ${BIN_DIR}/../synthesize.py \
        --am=fastspeech2_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/speech_stats.npy \
        --voc=wavernn_csmsc \
        --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
        --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
        --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
        --test_metadata=dump/test/norm/metadata.jsonl \
        --output_dir=${train_output_path}/test \
        --phones_dict=dump/phone_id_map.txt \
        --ngpu=0 \
        --nxpu=1
fi
@@ -0,0 +1,13 @@
#!/bin/bash

config_path=$1
train_output_path=$2

python3 ${BIN_DIR}/train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=${config_path} \
    --output-dir=${train_output_path} \
    --ngpu=0 \
    --nxpu=1 \
    --phones-dict=dump/phone_id_map.txt
@@ -0,0 +1,42 @@
#!/bin/bash

set -e
source path.sh

xpus=0,1
stage=0
stop_stage=100

conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_153.pdz

# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this cannot be mixed with passing `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # prepare data
    ./local/preprocess.sh ${conf_path} || exit -1
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # train model, all `ckpt` under `train_output_path/checkpoints/` dir
    FLAGS_selected_xpus=${xpus} ./local/train_xpu.sh ${conf_path} ${train_output_path} || exit -1
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # synthesize, vocoder is pwgan by default
    FLAGS_selected_xpus=${xpus} ./local/synthesize_xpu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # synthesize_e2e, vocoder is pwgan by default
    FLAGS_selected_xpus=${xpus} ./local/synthesize_e2e_xpu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi

if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    # inference with static model, vocoder is pwgan by default
    FLAGS_selected_xpus=${xpus} ./local/inference_xpu.sh ${train_output_path} || exit -1
fi
@@ -1,86 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle.fluid import core
from paddle.fluid import layers
from paddle.fluid.dygraph import base as imperative_base

from paddlespeech.s2t.utils.log import Log

__all__ = ["ClipGradByGlobalNormWithLog"]

logger = Log(__name__).getlog()


class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm):
    def __init__(self, clip_norm):
        super().__init__(clip_norm)

    def __repr__(self):
        return f"{self.__class__.__name__}(global_clip_norm={self.clip_norm})"

    @imperative_base.no_grad
    def _dygraph_clip(self, params_grads):
        params_and_grads = []
        sum_square_list = []
        for i, (p, g) in enumerate(params_grads):
            if g is None:
                continue
            if getattr(p, 'need_clip', True) is False:
                continue
            merge_grad = g
            if g.type == core.VarDesc.VarType.SELECTED_ROWS:
                merge_grad = layers.merge_selected_rows(g)
                merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
            square = paddle.square(merge_grad)
            sum_square = paddle.sum(square)
            sum_square_list.append(sum_square)

            # debug log, not dump all since slow down train process
            if i < 10:
                logger.debug(
                    f"Grad Before Clip: {p.name}: {float(sum_square.sqrt())}")

        # all parameters have been filterd out
        if len(sum_square_list) == 0:
            return params_grads

        global_norm_var = paddle.concat(sum_square_list)
        global_norm_var = paddle.sum(global_norm_var)
        global_norm_var = paddle.sqrt(global_norm_var)

        # debug log
        logger.debug(f"Grad Global Norm: {float(global_norm_var)}!!!!")

        max_global_norm = paddle.full(
            shape=[1], dtype=global_norm_var.dtype, fill_value=self.clip_norm)
        clip_var = paddle.divide(
            x=max_global_norm,
            y=paddle.maximum(x=global_norm_var, y=max_global_norm))
        for i, (p, g) in enumerate(params_grads):
            if g is None:
                continue
            if getattr(p, 'need_clip', True) is False:
                params_and_grads.append((p, g))
                continue
            new_grad = paddle.multiply(x=g, y=clip_var)
            params_and_grads.append((p, new_grad))

            # debug log, not dump all since slow down train process
            if i < 10:
                logger.debug(
                    f"Grad After Clip: {p.name}: {float(new_grad.square().sum().sqrt())}"
                )

        return params_and_grads
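For reference, the clipping rule that the removed class implements (the same as paddle.nn.ClipGradByGlobalNorm, plus debug logging): every gradient is scaled by clip_norm / max(global_norm, clip_norm), where global_norm is the l2 norm over all gradients. A minimal NumPy sketch of that rule:

import numpy as np

def clip_by_global_norm(grads, clip_norm=5.0):
    # l2 norm over all gradient elements of all parameters
    global_norm = np.sqrt(sum(float((g ** 2).sum()) for g in grads))
    # scale <= 1.0; gradients are left unchanged when global_norm <= clip_norm
    scale = clip_norm / max(global_norm, clip_norm)
    return [g * scale for g in grads]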
@@ -0,0 +1,13 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@@ -0,0 +1,14 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .phonectic import English
@@ -0,0 +1,36 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

import yaml


class Polyphonic():
    def __init__(self):
        with open(
                os.path.join(
                    os.path.dirname(os.path.abspath(__file__)),
                    'polyphonic.yaml'),
                'r',
                encoding='utf-8') as polyphonic_file:
            # parse the yaml file
            polyphonic_dict = yaml.load(polyphonic_file, Loader=yaml.FullLoader)
        self.polyphonic_words = polyphonic_dict["polyphonic"]

    def correct_pronunciation(self, word, pinyin):
        # if the word is recorded in the dictionary, return the corrected pinyin
        if word in self.polyphonic_words.keys():
            pinyin = self.polyphonic_words[word]
        # otherwise return the original pinyin
        return pinyin
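A small usage sketch for the Polyphonic class above; the import path and the example word and pinyin are assumptions, and the actual correction depends on the entries shipped in polyphonic.yaml:

from paddlespeech.t2s.frontend.polyphonic import Polyphonic  # assumed module path

corrector = Polyphonic()
# If the word has an entry in polyphonic.yaml its pinyin is replaced,
# otherwise the original pinyin is returned unchanged.
print(corrector.correct_pronunciation("着急", ["zhao2", "ji2"]))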
@@ -1,4 +1,4 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,5 +1,5 @@
 #!/bin/bash
 set -ex

-PYTHON=python3.7
+PYTHON=python3.8
 test -d venv || virtualenv -p ${PYTHON} venv
@@ -0,0 +1,28 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddlespeech.t2s.frontend.en_frontend import English as EnFrontend

if __name__ == '__main__':

    fe = EnFrontend()

    text = "AI for Sceience"
    phonemes = fe.phoneticize(text)
    print(text)
    print(phonemes)

    text = "eight"
    phonemes = fe.phoneticize(text)
    print(text)
    print(phonemes)
@@ -0,0 +1,83 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddlespeech.t2s.frontend.ssml.xml_processor import MixTextProcessor

if __name__ == '__main__':
    text = "你好吗,<speak>我们的声学模型使用了 Fast Speech Two。前浪<say-as pinyin='dao3'>倒</say-as>在沙滩上,沙滩上倒了一堆<say-as pinyin='tu3'>土</say-as>。 想象<say-as pinyin='gan1 gan1'>干干</say-as>的树干<say-as pinyin='dao3'>倒</say-as>了, 里面有个干尸,不知是被谁<say-as pinyin='gan4'>干</say-as>死的。</speak>thank you."

    # SSML: 13
    # 0 ['你好吗,', []]
    # 1 ['我们的声学模型使用了FastSpeechTwo。前浪', []]
    # 2 ['倒', ['dao3']]
    # 3 ['在沙滩上,沙滩上倒了一堆', []]
    # 4 ['土', ['tu3']]
    # 5 ['。想象', []]
    # 6 ['干干', ['gan1', 'gan1']]
    # 7 ['的树干', []]
    # 8 ['倒', ['dao3']]
    # 9 ['了,里面有个干尸,不知是被谁', []]
    # 10 ['干', ['gan4']]
    # 11 ['死的。', []]
    # 12 ['thank you.', []]
    inputs = MixTextProcessor.get_pinyin_split(text)
    print(f"SSML get_pinyin_split: {len(inputs)}")
    for i, sub in enumerate(inputs):
        print(i, sub)
    print()

    # SSML get_dom_split: 13
    # 0 你好吗,
    # 1 我们的声学模型使用了 Fast Speech Two。前浪
    # 2 <say-as pinyin="dao3">倒</say-as>
    # 3 在沙滩上,沙滩上倒了一堆
    # 4 <say-as pinyin="tu3">土</say-as>
    # 5 。 想象
    # 6 <say-as pinyin="gan1 gan1">干干</say-as>
    # 7 的树干
    # 8 <say-as pinyin="dao3">倒</say-as>
    # 9 了, 里面有个干尸,不知是被谁
    # 10 <say-as pinyin="gan4">干</say-as>
    # 11 死的。
    # 12 thank you.
    inputs = MixTextProcessor.get_dom_split(text)
    print(f"SSML get_dom_split: {len(inputs)}")
    for i, sub in enumerate(inputs):
        print(i, sub)
    print()

    # SSML object.get_pinyin_split: 246
    # <speak>我们的声学模型使用了 Fast Speech Two。前浪<say-as pinyin='dao3'>倒</say-as>在沙滩上,沙滩上倒了一堆<say-as pinyin='tu3'>土</say-as>。 想象<say-as pinyin='gan1 gan1'>干干</say-as>的树干<say-as pinyin='dao3'>倒</say-as>了, 里面有个干尸,不知是被谁<say-as pinyin='gan4'>干</say-as>死的。</speak>
    outs = MixTextProcessor().get_xml_content(text)
    print(f"SSML object.get_pinyin_split: {len(outs)}")
    print(outs)
    print()

    # SSML object.get_content_split: 3
    # 0 你好吗,
    # 1 <speak>我们的声学模型使用了 Fast Speech Two。前浪<say-as pinyin='dao3'>倒</say-as>在沙滩上,沙滩上倒了一堆<say-as pinyin='tu3'>土</say-as>。 想象<say-as pinyin='gan1 gan1'>干干</say-as>的树干<say-as pinyin='dao3'>
    #   倒</say-as>了, 里面有个干尸,不知是被谁<say-as pinyin='gan4'>干</say-as>死的。</speak>
    # 2 thank you.
    outs = MixTextProcessor().get_content_split(text)
    print(f"SSML object.get_content_split: {len(outs)}")
    for i, sub in enumerate(outs):
        print(i, sub)
    print()

    import json

    import xmltodict
    text = "<speak>我们的声学模型使用了 Fast Speech Two。前浪<say-as pinyin='dao3'>倒</say-as>在沙滩上,沙滩上倒了一堆<say-as pinyin='tu3'>土</say-as>。 想象<say-as pinyin='gan1 gan1'>干干</say-as>的树干<say-as pinyin='dao3'>倒</say-as>了, 里面有个干尸,不知是被谁<say-as pinyin='gan4'>干</say-as>死的。</speak>"
    ssml = xmltodict.parse(text)
    print(json.dumps(ssml))
    print(ssml['speak'].keys())
    print(ssml['speak']['#text'])
    print(ssml['speak']['say-as'])