Merge branch 'develop' of https://github.com/PaddlePaddle/PaddleSpeech into develop
commit 5564c7c105
@ -0,0 +1,98 @@
############################################
#             Network Architecture         #
############################################
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: conformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1   # sublayer output dropout
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
    normalize_before: True
    cnn_module_kernel: 15
    use_cnn_module: True
    activation_type: 'swish'
    pos_enc_layer_type: 'rope_pos' # abs_pos, rel_pos, rope_pos
    selfattention_layer_type: 'rel_selfattn' # unused
    causal: true
    use_dynamic_chunk: true
    cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes the model converge faster
    use_dynamic_left_chunk: false
# decoder related
decoder: transformer # transformer, bitransformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    r_num_blocks: 0     # only for bitransformer
    dropout_rate: 0.1   # sublayer output dropout
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    reverse_weight: 0.0 # only for bitransformer
    length_normalized_loss: false
    init_type: 'kaiming_uniform' # !Warning: may affect convergence

###########################################
#                Data                     #
###########################################
train_manifest: data/manifest.train
dev_manifest: data/manifest.dev
test_manifest: data/manifest.test

###########################################
#              Dataloader                 #
###########################################
vocab_filepath: data/lang_char/vocab.txt
spm_model_prefix: ''
unit_type: 'char'
preprocess_config: conf/preprocess.yaml
feat_dim: 80
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0    # feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
batch_size: 32
maxlen_in: 512  # if input length > maxlen_in, the batch size is automatically reduced
maxlen_out: 150 # if output length > maxlen_out, the batch size is automatically reduced
minibatches: 0  # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
num_workers: 2
subsampling_factor: 1
num_encs: 1

###########################################
#              Training                   #
###########################################
n_epoch: 240
accum_grad: 1
global_grad_clip: 5.0
dist_sampler: True
optim: adam
optim_conf:
    lr: 0.001
    weight_decay: 1.0e-6
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000
    lr_decay: 1.0
log_interval: 100
checkpoint:
    kbest_n: 50
    latest_n: 5
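For reference, a config like the one above is plain YAML and can be inspected before training; a minimal sketch of loading and sanity-checking it (the save path conf/chunk_conformer.yaml is an assumption, not something this diff names):

    # Minimal sketch of loading and sanity-checking the config above.
    # The path "conf/chunk_conformer.yaml" is an assumed save location.
    import yaml

    with open("conf/chunk_conformer.yaml", encoding="utf-8") as f:
        cfg = yaml.safe_load(f)

    enc = cfg["encoder_conf"]
    # the per-head dimension must divide the attention dimension evenly
    assert enc["output_size"] % enc["attention_heads"] == 0
    print(cfg["encoder"], enc["pos_enc_layer_type"])  # -> conformer rope_pos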
@ -0,0 +1,98 @@
############################################
#             Network Architecture         #
############################################
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: conformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1   # sublayer output dropout
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
    normalize_before: True
    cnn_module_kernel: 15
    use_cnn_module: True
    activation_type: 'swish'
    pos_enc_layer_type: 'rope_pos' # abs_pos, rel_pos, rope_pos
    selfattention_layer_type: 'rel_selfattn' # unused
    causal: true
    use_dynamic_chunk: true
    cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes the model converge faster
    use_dynamic_left_chunk: false
# decoder related
decoder: bitransformer # transformer, bitransformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 3
    r_num_blocks: 3     # only for bitransformer
    dropout_rate: 0.1   # sublayer output dropout
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    reverse_weight: 0.3 # only for bitransformer
    length_normalized_loss: false
    init_type: 'kaiming_uniform' # !Warning: may affect convergence

###########################################
#                Data                     #
###########################################
train_manifest: data/manifest.train
dev_manifest: data/manifest.dev
test_manifest: data/manifest.test

###########################################
#              Dataloader                 #
###########################################
vocab_filepath: data/lang_char/vocab.txt
spm_model_prefix: ''
unit_type: 'char'
preprocess_config: conf/preprocess.yaml
feat_dim: 80
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0    # feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
batch_size: 32
maxlen_in: 512  # if input length > maxlen_in, the batch size is automatically reduced
maxlen_out: 150 # if output length > maxlen_out, the batch size is automatically reduced
minibatches: 0  # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
num_workers: 2
subsampling_factor: 1
num_encs: 1

###########################################
#              Training                   #
###########################################
n_epoch: 240
accum_grad: 1
global_grad_clip: 5.0
dist_sampler: True
optim: adam
optim_conf:
    lr: 0.001
    weight_decay: 1.0e-6
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000
    lr_decay: 1.0
log_interval: 100
checkpoint:
    kbest_n: 50
    latest_n: 5
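This second config enables the bidirectional decoder (decoder: bitransformer, r_num_blocks: 3, reverse_weight: 0.3). A hedged sketch of the conventional WeNet-style hybrid CTC/attention combination these weights imply (illustrative names, not the repo's exact code):

    # Sketch of the conventional hybrid CTC/attention objective; all names are illustrative.
    def hybrid_loss(loss_ctc, loss_att_l2r, loss_att_r2l,
                    ctc_weight=0.3, reverse_weight=0.3):
        # the right-to-left (reverse) decoder loss only contributes when reverse_weight > 0
        loss_att = (1.0 - reverse_weight) * loss_att_l2r + reverse_weight * loss_att_r2l
        # interpolate the CTC loss and the attention decoder loss
        return ctc_weight * loss_ctc + (1.0 - ctc_weight) * loss_att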
@ -0,0 +1,46 @@
#!/bin/bash

train_output_path=$1

stage=0
stop_stage=0

# pwgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${BIN_DIR}/../inference.py \
        --inference_dir=${train_output_path}/inference \
        --am=speedyspeech_csmsc \
        --voc=pwgan_csmsc \
        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt \
        --device xpu
fi

# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    python3 ${BIN_DIR}/../inference.py \
        --inference_dir=${train_output_path}/inference \
        --am=speedyspeech_csmsc \
        --voc=mb_melgan_csmsc \
        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt \
        --device xpu
fi

# hifigan
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    python3 ${BIN_DIR}/../inference.py \
        --inference_dir=${train_output_path}/inference \
        --am=speedyspeech_csmsc \
        --voc=hifigan_csmsc \
        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt \
        --device xpu
fi
@ -0,0 +1,122 @@
#!/bin/bash

config_path=$1
train_output_path=$2
ckpt_name=$3

stage=0
stop_stage=0

# pwgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    FLAGS_allocator_strategy=naive_best_fit \
    python3 ${BIN_DIR}/../synthesize_e2e.py \
        --am=speedyspeech_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/feats_stats.npy \
        --voc=pwgan_csmsc \
        --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
        --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
        --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
        --lang=zh \
        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt \
        --inference_dir=${train_output_path}/inference \
        --ngpu=0 \
        --nxpu=1
fi

# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    FLAGS_allocator_strategy=naive_best_fit \
    python3 ${BIN_DIR}/../synthesize_e2e.py \
        --am=speedyspeech_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/feats_stats.npy \
        --voc=mb_melgan_csmsc \
        --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
        --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz \
        --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --lang=zh \
        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt \
        --inference_dir=${train_output_path}/inference \
        --ngpu=0 \
        --nxpu=1
fi

# the pretrained models haven't been released yet
# style melgan
# style melgan's dygraph-to-static-graph conversion is not ready yet
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    FLAGS_allocator_strategy=naive_best_fit \
    python3 ${BIN_DIR}/../synthesize_e2e.py \
        --am=speedyspeech_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/feats_stats.npy \
        --voc=style_melgan_csmsc \
        --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
        --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
        --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --lang=zh \
        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt \
        --ngpu=0 \
        --nxpu=1
        # --inference_dir=${train_output_path}/inference
fi

# hifigan
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    FLAGS_allocator_strategy=naive_best_fit \
    python3 ${BIN_DIR}/../synthesize_e2e.py \
        --am=speedyspeech_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/feats_stats.npy \
        --voc=hifigan_csmsc \
        --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
        --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
        --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --lang=zh \
        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt \
        --inference_dir=${train_output_path}/inference \
        --ngpu=0 \
        --nxpu=1
fi

# wavernn
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    echo "in wavernn syn_e2e"
    FLAGS_allocator_strategy=naive_best_fit \
    python3 ${BIN_DIR}/../synthesize_e2e.py \
        --am=speedyspeech_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/feats_stats.npy \
        --voc=wavernn_csmsc \
        --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
        --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
        --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
        --lang=zh \
        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt \
        --inference_dir=${train_output_path}/inference \
        --ngpu=0 \
        --nxpu=1
fi
@ -0,0 +1,110 @@
#!/bin/bash

config_path=$1
train_output_path=$2
ckpt_name=$3
stage=0
stop_stage=0

# pwgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    FLAGS_allocator_strategy=naive_best_fit \
    python3 ${BIN_DIR}/../synthesize.py \
        --am=speedyspeech_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/feats_stats.npy \
        --voc=pwgan_csmsc \
        --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
        --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
        --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
        --test_metadata=dump/test/norm/metadata.jsonl \
        --output_dir=${train_output_path}/test \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt \
        --ngpu=0 \
        --nxpu=1
fi

# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    FLAGS_allocator_strategy=naive_best_fit \
    python3 ${BIN_DIR}/../synthesize.py \
        --am=speedyspeech_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/feats_stats.npy \
        --voc=mb_melgan_csmsc \
        --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
        --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz \
        --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --test_metadata=dump/test/norm/metadata.jsonl \
        --output_dir=${train_output_path}/test \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt \
        --ngpu=0 \
        --nxpu=1
fi

# style melgan
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    FLAGS_allocator_strategy=naive_best_fit \
    python3 ${BIN_DIR}/../synthesize.py \
        --am=speedyspeech_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/feats_stats.npy \
        --voc=style_melgan_csmsc \
        --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
        --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
        --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --test_metadata=dump/test/norm/metadata.jsonl \
        --output_dir=${train_output_path}/test \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt \
        --ngpu=0 \
        --nxpu=1
fi

# hifigan
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    echo "in hifigan syn"
    FLAGS_allocator_strategy=naive_best_fit \
    python3 ${BIN_DIR}/../synthesize.py \
        --am=speedyspeech_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/feats_stats.npy \
        --voc=hifigan_csmsc \
        --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
        --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
        --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --test_metadata=dump/test/norm/metadata.jsonl \
        --output_dir=${train_output_path}/test \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt \
        --ngpu=0 \
        --nxpu=1
fi

# wavernn
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    echo "in wavernn syn"
    FLAGS_allocator_strategy=naive_best_fit \
    python3 ${BIN_DIR}/../synthesize.py \
        --am=speedyspeech_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/feats_stats.npy \
        --voc=wavernn_csmsc \
        --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
        --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
        --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
        --test_metadata=dump/test/norm/metadata.jsonl \
        --output_dir=${train_output_path}/test \
        --tones_dict=dump/tone_id_map.txt \
        --phones_dict=dump/phone_id_map.txt \
        --ngpu=0 \
        --nxpu=1
fi
|
@ -0,0 +1,16 @@
#!/bin/bash

config_path=$1
train_output_path=$2

python ${BIN_DIR}/train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=${config_path} \
    --output-dir=${train_output_path} \
    --ngpu=0 \
    --nxpu=1 \
    --phones-dict=dump/phone_id_map.txt \
    --tones-dict=dump/tone_id_map.txt \
    --use-relative-path=True
@ -0,0 +1,42 @@
#!/bin/bash

set -e
source path.sh

xpus=0,1
stage=0
stop_stage=100

conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_76.pdz

# with the following command, you can choose the stage range you want to run
# such as `./run_xpu.sh --stage 0 --stop-stage 0`
# this cannot be combined with the positional arguments `$1`, `$2`, ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # prepare data
    ./local/preprocess.sh ${conf_path} || exit -1
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # train the model; all `ckpt` files go under the `train_output_path/checkpoints/` dir
    FLAGS_selected_xpus=${xpus} ./local/train_xpu.sh ${conf_path} ${train_output_path} || exit -1
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # synthesize; the vocoder is pwgan by default
    FLAGS_selected_xpus=${xpus} ./local/synthesize_xpu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # synthesize_e2e; the vocoder is pwgan by default
    FLAGS_selected_xpus=${xpus} ./local/synthesize_e2e_xpu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi

if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    # inference with the static model
    FLAGS_selected_xpus=${xpus} ./local/inference_xpu.sh ${train_output_path} || exit -1
fi
@ -0,0 +1,55 @@
#!/bin/bash

train_output_path=$1

stage=0
stop_stage=0

# pwgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${BIN_DIR}/../inference.py \
        --inference_dir=${train_output_path}/inference \
        --am=fastspeech2_csmsc \
        --voc=pwgan_csmsc \
        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --device xpu
fi

# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    python3 ${BIN_DIR}/../inference.py \
        --inference_dir=${train_output_path}/inference \
        --am=fastspeech2_csmsc \
        --voc=mb_melgan_csmsc \
        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --device xpu
fi

# hifigan
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    python3 ${BIN_DIR}/../inference.py \
        --inference_dir=${train_output_path}/inference \
        --am=fastspeech2_csmsc \
        --voc=hifigan_csmsc \
        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --device xpu
fi

# wavernn
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    python3 ${BIN_DIR}/../inference.py \
        --inference_dir=${train_output_path}/inference \
        --am=fastspeech2_csmsc \
        --voc=wavernn_csmsc \
        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --device xpu
fi
@ -0,0 +1,119 @@
#!/bin/bash

config_path=$1
train_output_path=$2
ckpt_name=$3

stage=0
stop_stage=0

# pwgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    FLAGS_allocator_strategy=naive_best_fit \
    python3 ${BIN_DIR}/../synthesize_e2e.py \
        --am=fastspeech2_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/speech_stats.npy \
        --voc=pwgan_csmsc \
        --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
        --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
        --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
        --lang=zh \
        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --inference_dir=${train_output_path}/inference \
        --ngpu=0 \
        --nxpu=1
fi

# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    FLAGS_allocator_strategy=naive_best_fit \
    python3 ${BIN_DIR}/../synthesize_e2e.py \
        --am=fastspeech2_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/speech_stats.npy \
        --voc=mb_melgan_csmsc \
        --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
        --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz \
        --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --lang=zh \
        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --inference_dir=${train_output_path}/inference \
        --ngpu=0 \
        --nxpu=1
fi

# the pretrained models haven't been released yet
# style melgan
# style melgan's dygraph-to-static-graph conversion is not ready yet
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    FLAGS_allocator_strategy=naive_best_fit \
    python3 ${BIN_DIR}/../synthesize_e2e.py \
        --am=fastspeech2_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/speech_stats.npy \
        --voc=style_melgan_csmsc \
        --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
        --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
        --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --lang=zh \
        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --ngpu=0 \
        --nxpu=1
        # --inference_dir=${train_output_path}/inference
fi

# hifigan
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    echo "in hifigan syn_e2e"
    FLAGS_allocator_strategy=naive_best_fit \
    python3 ${BIN_DIR}/../synthesize_e2e.py \
        --am=fastspeech2_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/speech_stats.npy \
        --voc=hifigan_csmsc \
        --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
        --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
        --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --lang=zh \
        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --inference_dir=${train_output_path}/inference \
        --ngpu=0 \
        --nxpu=1
fi

# wavernn
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    echo "in wavernn syn_e2e"
    FLAGS_allocator_strategy=naive_best_fit \
    python3 ${BIN_DIR}/../synthesize_e2e.py \
        --am=fastspeech2_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/speech_stats.npy \
        --voc=wavernn_csmsc \
        --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
        --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
        --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
        --lang=zh \
        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --inference_dir=${train_output_path}/inference \
        --ngpu=0 \
        --nxpu=1
fi
@ -0,0 +1,105 @@
#!/bin/bash

config_path=$1
train_output_path=$2
ckpt_name=$3
stage=0
stop_stage=0

# pwgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    FLAGS_allocator_strategy=naive_best_fit \
    python3 ${BIN_DIR}/../synthesize.py \
        --am=fastspeech2_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/speech_stats.npy \
        --voc=pwgan_csmsc \
        --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
        --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
        --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
        --test_metadata=dump/test/norm/metadata.jsonl \
        --output_dir=${train_output_path}/test \
        --phones_dict=dump/phone_id_map.txt \
        --ngpu=0 \
        --nxpu=1
fi

# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    FLAGS_allocator_strategy=naive_best_fit \
    python3 ${BIN_DIR}/../synthesize.py \
        --am=fastspeech2_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/speech_stats.npy \
        --voc=mb_melgan_csmsc \
        --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
        --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz \
        --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --test_metadata=dump/test/norm/metadata.jsonl \
        --output_dir=${train_output_path}/test \
        --phones_dict=dump/phone_id_map.txt \
        --ngpu=0 \
        --nxpu=1
fi

# style melgan
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    FLAGS_allocator_strategy=naive_best_fit \
    python3 ${BIN_DIR}/../synthesize.py \
        --am=fastspeech2_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/speech_stats.npy \
        --voc=style_melgan_csmsc \
        --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
        --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
        --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --test_metadata=dump/test/norm/metadata.jsonl \
        --output_dir=${train_output_path}/test \
        --phones_dict=dump/phone_id_map.txt \
        --ngpu=0 \
        --nxpu=1
fi

# hifigan
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    echo "in hifigan syn"
    FLAGS_allocator_strategy=naive_best_fit \
    python3 ${BIN_DIR}/../synthesize.py \
        --am=fastspeech2_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/speech_stats.npy \
        --voc=hifigan_csmsc \
        --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
        --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
        --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --test_metadata=dump/test/norm/metadata.jsonl \
        --output_dir=${train_output_path}/test \
        --phones_dict=dump/phone_id_map.txt \
        --ngpu=0 \
        --nxpu=1
fi

# wavernn
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    echo "in wavernn syn"
    FLAGS_allocator_strategy=naive_best_fit \
    python3 ${BIN_DIR}/../synthesize.py \
        --am=fastspeech2_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/speech_stats.npy \
        --voc=wavernn_csmsc \
        --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
        --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
        --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
        --test_metadata=dump/test/norm/metadata.jsonl \
        --output_dir=${train_output_path}/test \
        --phones_dict=dump/phone_id_map.txt \
        --ngpu=0 \
        --nxpu=1
fi
@ -0,0 +1,13 @@
#!/bin/bash

config_path=$1
train_output_path=$2

python3 ${BIN_DIR}/train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=${config_path} \
    --output-dir=${train_output_path} \
    --ngpu=0 \
    --nxpu=1 \
    --phones-dict=dump/phone_id_map.txt
@ -0,0 +1,42 @@
#!/bin/bash

set -e
source path.sh

xpus=0,1
stage=0
stop_stage=100

conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_153.pdz

# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this cannot be combined with the positional arguments `$1`, `$2`, ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # prepare data
    ./local/preprocess.sh ${conf_path} || exit -1
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # train the model; all `ckpt` files go under the `train_output_path/checkpoints/` dir
    FLAGS_selected_xpus=${xpus} ./local/train_xpu.sh ${conf_path} ${train_output_path} || exit -1
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # synthesize; the vocoder is pwgan by default
    FLAGS_selected_xpus=${xpus} ./local/synthesize_xpu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # synthesize_e2e; the vocoder is pwgan by default
    FLAGS_selected_xpus=${xpus} ./local/synthesize_e2e_xpu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi

if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    # inference with the static model; the vocoder is pwgan by default
    FLAGS_selected_xpus=${xpus} ./local/inference_xpu.sh ${train_output_path} || exit -1
fi
@ -1,86 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle.fluid import core
from paddle.fluid import layers
from paddle.fluid.dygraph import base as imperative_base

from paddlespeech.s2t.utils.log import Log

__all__ = ["ClipGradByGlobalNormWithLog"]

logger = Log(__name__).getlog()


class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm):
    def __init__(self, clip_norm):
        super().__init__(clip_norm)

    def __repr__(self):
        return f"{self.__class__.__name__}(global_clip_norm={self.clip_norm})"

    @imperative_base.no_grad
    def _dygraph_clip(self, params_grads):
        params_and_grads = []
        sum_square_list = []
        for i, (p, g) in enumerate(params_grads):
            if g is None:
                continue
            if getattr(p, 'need_clip', True) is False:
                continue
            merge_grad = g
            if g.type == core.VarDesc.VarType.SELECTED_ROWS:
                merge_grad = layers.merge_selected_rows(g)
                merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
            square = paddle.square(merge_grad)
            sum_square = paddle.sum(square)
            sum_square_list.append(sum_square)

            # debug log; do not dump all since that slows down the train process
            if i < 10:
                logger.debug(
                    f"Grad Before Clip: {p.name}: {float(sum_square.sqrt())}")

        # all parameters have been filtered out
        if len(sum_square_list) == 0:
            return params_grads

        global_norm_var = paddle.concat(sum_square_list)
        global_norm_var = paddle.sum(global_norm_var)
        global_norm_var = paddle.sqrt(global_norm_var)

        # debug log
        logger.debug(f"Grad Global Norm: {float(global_norm_var)}!!!!")

        max_global_norm = paddle.full(
            shape=[1], dtype=global_norm_var.dtype, fill_value=self.clip_norm)
        clip_var = paddle.divide(
            x=max_global_norm,
            y=paddle.maximum(x=global_norm_var, y=max_global_norm))
        for i, (p, g) in enumerate(params_grads):
            if g is None:
                continue
            if getattr(p, 'need_clip', True) is False:
                params_and_grads.append((p, g))
                continue
            new_grad = paddle.multiply(x=g, y=clip_var)
            params_and_grads.append((p, new_grad))

            # debug log; do not dump all since that slows down the train process
            if i < 10:
                logger.debug(
                    f"Grad After Clip: {p.name}: {float(new_grad.square().sum().sqrt())}"
                )

        return params_and_grads
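The file deleted above subclassed paddle.nn.ClipGradByGlobalNorm only to add debug logging; the clipping math itself is stock Paddle behavior. A minimal sketch of attaching the stock clipper to an optimizer, matching global_grad_clip: 5.0 from the configs (the Linear layer is an illustrative stand-in for a real model):

    import paddle

    model = paddle.nn.Linear(10, 10)  # stand-in for a real model
    # clip gradients so their global L2 norm never exceeds 5.0
    clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=5.0)
    optimizer = paddle.optimizer.Adam(
        learning_rate=0.001,
        weight_decay=1.0e-6,
        parameters=model.parameters(),
        grad_clip=clip)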
@ -0,0 +1,13 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@ -0,0 +1,14 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .phonectic import English
@ -0,0 +1,36 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

import yaml


class Polyphonic():
    def __init__(self):
        with open(
                os.path.join(
                    os.path.dirname(os.path.abspath(__file__)),
                    'polyphonic.yaml'),
                'r',
                encoding='utf-8') as polyphonic_file:
            # parse the yaml file
            polyphonic_dict = yaml.load(polyphonic_file, Loader=yaml.FullLoader)
        self.polyphonic_words = polyphonic_dict["polyphonic"]

    def correct_pronunciation(self, word, pinyin):
        # if the word is listed in the dictionary, return the corrected pronunciation
        if word in self.polyphonic_words.keys():
            pinyin = self.polyphonic_words[word]
        # otherwise return the original pronunciation
        return pinyin
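A hedged usage sketch of the Polyphonic class above; the word and readings are made-up examples and assume a matching entry exists in polyphonic.yaml:

    # illustrative only: assumes '石头' has an entry in polyphonic.yaml
    corrector = Polyphonic()
    pinyin = corrector.correct_pronunciation('石头', ['shi2', 'tou2'])
    # prints the dictionary reading if the word is listed, otherwise the input unchanged
    print(pinyin)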
@ -1,4 +1,4 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -1,5 +1,5 @@
 #!/bin/bash
 set -ex

-PYTHON=python3.7
+PYTHON=python3.8
 test -d venv || virtualenv -p ${PYTHON} venv
@ -0,0 +1,28 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddlespeech.t2s.frontend.en_frontend import English as EnFrontend

if __name__ == '__main__':
    fe = EnFrontend()

    text = "AI for Science"
    phonemes = fe.phoneticize(text)
    print(text)
    print(phonemes)

    text = "eight"
    phonemes = fe.phoneticize(text)
    print(text)
    print(phonemes)
@ -0,0 +1,83 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddlespeech.t2s.frontend.ssml.xml_processor import MixTextProcessor

if __name__ == '__main__':
    text = "你好吗,<speak>我们的声学模型使用了 Fast Speech Two。前浪<say-as pinyin='dao3'>倒</say-as>在沙滩上,沙滩上倒了一堆<say-as pinyin='tu3'>土</say-as>。 想象<say-as pinyin='gan1 gan1'>干干</say-as>的树干<say-as pinyin='dao3'>倒</say-as>了, 里面有个干尸,不知是被谁<say-as pinyin='gan4'>干</say-as>死的。</speak>thank you."

    # SSML: 13
    # 0 ['你好吗,', []]
    # 1 ['我们的声学模型使用了FastSpeechTwo。前浪', []]
    # 2 ['倒', ['dao3']]
    # 3 ['在沙滩上,沙滩上倒了一堆', []]
    # 4 ['土', ['tu3']]
    # 5 ['。想象', []]
    # 6 ['干干', ['gan1', 'gan1']]
    # 7 ['的树干', []]
    # 8 ['倒', ['dao3']]
    # 9 ['了,里面有个干尸,不知是被谁', []]
    # 10 ['干', ['gan4']]
    # 11 ['死的。', []]
    # 12 ['thank you.', []]
    inputs = MixTextProcessor.get_pinyin_split(text)
    print(f"SSML get_pinyin_split: {len(inputs)}")
    for i, sub in enumerate(inputs):
        print(i, sub)
    print()

    # SSML get_dom_split: 13
    # 0 你好吗,
    # 1 我们的声学模型使用了 Fast Speech Two。前浪
    # 2 <say-as pinyin="dao3">倒</say-as>
    # 3 在沙滩上,沙滩上倒了一堆
    # 4 <say-as pinyin="tu3">土</say-as>
    # 5 。 想象
    # 6 <say-as pinyin="gan1 gan1">干干</say-as>
    # 7 的树干
    # 8 <say-as pinyin="dao3">倒</say-as>
    # 9 了, 里面有个干尸,不知是被谁
    # 10 <say-as pinyin="gan4">干</say-as>
    # 11 死的。
    # 12 thank you.
    inputs = MixTextProcessor.get_dom_split(text)
    print(f"SSML get_dom_split: {len(inputs)}")
    for i, sub in enumerate(inputs):
        print(i, sub)
    print()

    # SSML object.get_pinyin_split: 246
    # <speak>我们的声学模型使用了 Fast Speech Two。前浪<say-as pinyin='dao3'>倒</say-as>在沙滩上,沙滩上倒了一堆<say-as pinyin='tu3'>土</say-as>。 想象<say-as pinyin='gan1 gan1'>干干</say-as>的树干<say-as pinyin='dao3'>倒</say-as>了, 里面有个干尸,不知是被谁<say-as pinyin='gan4'>干</say-as>死的。</speak>
    outs = MixTextProcessor().get_xml_content(text)
    print(f"SSML object.get_pinyin_split: {len(outs)}")
    print(outs)
    print()

    # SSML object.get_content_split: 3
    # 0 你好吗,
    # 1 <speak>我们的声学模型使用了 Fast Speech Two。前浪<say-as pinyin='dao3'>倒</say-as>在沙滩上,沙滩上倒了一堆<say-as pinyin='tu3'>土</say-as>。 想象<say-as pinyin='gan1 gan1'>干干</say-as>的树干<say-as pinyin='dao3'>倒</say-as>了, 里面有个干尸,不知是被谁<say-as pinyin='gan4'>干</say-as>死的。</speak>
    # 2 thank you.
    outs = MixTextProcessor().get_content_split(text)
    print(f"SSML object.get_content_split: {len(outs)}")
    for i, sub in enumerate(outs):
        print(i, sub)
    print()

    import json

    import xmltodict
    text = "<speak>我们的声学模型使用了 Fast Speech Two。前浪<say-as pinyin='dao3'>倒</say-as>在沙滩上,沙滩上倒了一堆<say-as pinyin='tu3'>土</say-as>。 想象<say-as pinyin='gan1 gan1'>干干</say-as>的树干<say-as pinyin='dao3'>倒</say-as>了, 里面有个干尸,不知是被谁<say-as pinyin='gan4'>干</say-as>死的。</speak>"
    ssml = xmltodict.parse(text)
    print(json.dumps(ssml))
    print(ssml['speak'].keys())
    print(ssml['speak']['#text'])
    print(ssml['speak']['say-as'])