parent
3eef444dec
commit
3ce5dff460
@ -1,4 +1,11 @@
|
||||
# Aishell3
|
||||
|
||||
* tts0 - fastspeech2
|
||||
* vc0 - tactron2 voice clone
|
||||
* tts0 - Tacotron2
|
||||
* tts1 - TransformerTTS
|
||||
* tts2 - SpeedySpeech
|
||||
* tts3 - FastSpeech2
|
||||
* voc0 - WaveFlow
|
||||
* voc1 - Parallel WaveGAN
|
||||
* voc2 - MelGAN
|
||||
* voc3 - MultiBand MelGAN
|
||||
* vc0 - Tacotron2 Voice Clone with GE2E
|
||||
|
@ -1,15 +1,20 @@
|
||||
#!/bin/bash
|
||||
|
||||
config_path=$1
|
||||
train_output_path=$2
|
||||
ckpt_name=$3
|
||||
|
||||
FLAGS_allocator_strategy=naive_best_fit \
|
||||
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
|
||||
python3 ../synthesize.py \
|
||||
--fastspeech2-config=conf/default.yaml \
|
||||
--fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_32769.pdz_bak\
|
||||
python3 ${BIN_DIR}/synthesize.py \
|
||||
--fastspeech2-config=${config_path} \
|
||||
--fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
|
||||
--fastspeech2-stat=dump/train/speech_stats.npy \
|
||||
--pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
|
||||
--pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
|
||||
--pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
|
||||
--test-metadata=dump/test/norm/metadata.jsonl \
|
||||
--output-dir=exp/default/test \
|
||||
--output-dir=${train_output_path}/test \
|
||||
--device="gpu" \
|
||||
--phones-dict=dump/phone_id_map.txt \
|
||||
--speaker-dict=dump/speaker_id_map.txt
|
@ -1,15 +1,20 @@
|
||||
#!/bin/bash
|
||||
|
||||
config_path=$1
|
||||
train_output_path=$2
|
||||
ckpt_name=$3
|
||||
|
||||
FLAGS_allocator_strategy=naive_best_fit \
|
||||
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
|
||||
python3 synthesize_e2e.py \
|
||||
--fastspeech2-config=conf/default.yaml \
|
||||
--fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_32769.pdz_bak \
|
||||
python3 ${BIN_DIR}/multi_spk_synthesize_e2e.py \
|
||||
--fastspeech2-config=${config_path} \
|
||||
--fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
|
||||
--fastspeech2-stat=dump/train/speech_stats.npy \
|
||||
--pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
|
||||
--pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
|
||||
--pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
|
||||
--text=../sentences_en.txt \
|
||||
--output-dir=exp/default/test_e2e \
|
||||
--text=${BIN_DIR}/../sentences.txt \
|
||||
--output-dir=${train_output_path}/test_e2e \
|
||||
--device="gpu" \
|
||||
--phones-dict=dump/phone_id_map.txt \
|
||||
--speaker-dict=dump/speaker_id_map.txt
|
@ -1,10 +1,13 @@
|
||||
#!/bin/bash
|
||||
|
||||
python3 ../train.py \
|
||||
config_path=$1
|
||||
train_output_path=$2
|
||||
|
||||
python3 ${BIN_DIR}/train.py \
|
||||
--train-metadata=dump/train/norm/metadata.jsonl \
|
||||
--dev-metadata=dump/dev/norm/metadata.jsonl \
|
||||
--config=conf/default.yaml \
|
||||
--output-dir=exp/default \
|
||||
--config=${config_path} \
|
||||
--output-dir=${train_output_path} \
|
||||
--nprocs=2 \
|
||||
--phones-dict=dump/phone_id_map.txt \
|
||||
--speaker-dict=dump/speaker_id_map.txt
|
@ -0,0 +1,13 @@
|
||||
#!/bin/bash
# Environment setup: exports MAIN_ROOT, PATH, LC_ALL, PYTHONDONTWRITEBYTECODE,
# PYTHONIOENCODING, PYTHONPATH and BIN_DIR.
# No `set` options are changed here, so the file stays safe to `source`.

# Three directory levels above the current working directory.
# Assigned separately from `export` so a failing realpath is not hidden behind
# the (always successful) exit status of `export`.
MAIN_ROOT=$(realpath "${PWD}/../../../")
export MAIN_ROOT

export PATH="${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}"
export LC_ALL=C

export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
# ${PYTHONPATH:-} yields the same value as ${PYTHONPATH} but also works when
# the sourcing shell runs with `set -u` and PYTHONPATH is unset.
export PYTHONPATH="${MAIN_ROOT}:${PYTHONPATH:-}"

MODEL=fastspeech2
export BIN_DIR="${MAIN_ROOT}/parakeet/exps/${MODEL}"
|
@ -0,0 +1,33 @@
|
||||
#!/bin/bash
# Staged driver: every stage whose index lies in [stage, stop_stage] is run.
#   0: ./local/preprocess.sh       1: ./local/train.sh
#   2: ./local/synthesize.sh       3: ./local/synthesize_e2e.sh

set -e
source path.sh

gpus=0,1
stage=0
stop_stage=100

conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_482.pdz

# Expansions are quoted so that values containing spaces or glob characters
# reach the stage scripts as single, unmodified arguments.

if [[ ${stage} -le 0 && ${stop_stage} -ge 0 ]]; then
  # prepare data
  ./local/preprocess.sh "${conf_path}" || exit -1
fi

if [[ ${stage} -le 1 && ${stop_stage} -ge 1 ]]; then
  # train model, all `ckpt` under `train_output_path/checkpoints/` dir
  CUDA_VISIBLE_DEVICES="${gpus}" ./local/train.sh "${conf_path}" "${train_output_path}" || exit -1
fi

if [[ ${stage} -le 2 && ${stop_stage} -ge 2 ]]; then
  # synthesize, vocoder is pwgan
  CUDA_VISIBLE_DEVICES="${gpus}" ./local/synthesize.sh "${conf_path}" "${train_output_path}" "${ckpt_name}" || exit -1
fi

if [[ ${stage} -le 3 && ${stop_stage} -ge 3 ]]; then
  # synthesize_e2e, vocoder is pwgan
  CUDA_VISIBLE_DEVICES="${gpus}" ./local/synthesize_e2e.sh "${conf_path}" "${train_output_path}" "${ckpt_name}" || exit -1
fi
|
Before Width: | Height: | Size: 221 KiB After Width: | Height: | Size: 221 KiB |
Before Width: | Height: | Size: 550 KiB After Width: | Height: | Size: 550 KiB |
Before Width: | Height: | Size: 514 KiB After Width: | Height: | Size: 514 KiB |
@ -0,0 +1,43 @@
|
||||
#!/bin/bash
# Data preparation in four stages; stages within [stage, stop_stage] are run:
#   0. ${BIN_DIR}/../ge2e/inference.py: <input> -> <preprocess_path>/embed
#   1. process_wav.py: <input>/wav -> <preprocess_path>/normalized_wav
#   2. preprocess_transcription.py: <input> -> <preprocess_path>
#   3. extract_mel.py: <preprocess_path>/normalized_wav -> <preprocess_path>/mel
# Arguments:
#   $1 - input dataset directory
#   $2 - output directory for the preprocessed data
#   $3 - value forwarded to process_wav.py as --alignment
#   $4 - value forwarded to ge2e/inference.py as --checkpoint_path
# Environment:
#   BIN_DIR - directory holding the Python entry points

# Abort on the first failing stage. Without this, later stages would still run
# on missing data and only the last stage's status would reach the caller.
set -e

stage=0
stop_stage=100

input=${1:?usage: ${0##*/} <input> <preprocess_path> <alignment> <ge2e_ckpt_path>}
preprocess_path=${2:?missing argument 2: preprocess_path}
alignment=${3:?missing argument 3: alignment}
ge2e_ckpt_path=${4:?missing argument 4: ge2e_ckpt_path}
: "${BIN_DIR:?BIN_DIR is not set}"

if [[ ${stage} -le 0 && ${stop_stage} -ge 0 ]]; then
  python3 "${BIN_DIR}/../ge2e/inference.py" \
    --input="${input}" \
    --output="${preprocess_path}/embed" \
    --device="gpu" \
    --checkpoint_path="${ge2e_ckpt_path}"
fi

if [[ ${stage} -le 1 && ${stop_stage} -ge 1 ]]; then
  echo "Process wav ..."
  python3 "${BIN_DIR}/process_wav.py" \
    --input="${input}/wav" \
    --output="${preprocess_path}/normalized_wav" \
    --alignment="${alignment}"
fi

if [[ ${stage} -le 2 && ${stop_stage} -ge 2 ]]; then
  python3 "${BIN_DIR}/preprocess_transcription.py" \
    --input="${input}" \
    --output="${preprocess_path}"
fi

if [[ ${stage} -le 3 && ${stop_stage} -ge 3 ]]; then
  python3 "${BIN_DIR}/extract_mel.py" \
    --input="${preprocess_path}/normalized_wav" \
    --output="${preprocess_path}/mel"
fi
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
File diff suppressed because one or more lines are too long
@ -0,0 +1,9 @@
|
||||
#!/bin/bash
# Launches ${BIN_DIR}/train.py on the GPU.
# Arguments:
#   $1 - preprocessed data directory (forwarded as --data)
#   $2 - training output directory (forwarded as --output)
# Environment:
#   BIN_DIR - directory holding train.py

preprocess_path=${1:?usage: ${0##*/} <preprocess_path> <train_output_path>}
train_output_path=${2:?missing argument 2: train_output_path}
: "${BIN_DIR:?BIN_DIR is not set}"

# Quoted so paths with spaces stay single arguments.
python3 "${BIN_DIR}/train.py" \
  --data="${preprocess_path}" \
  --output="${train_output_path}" \
  --device="gpu"
|
@ -0,0 +1,14 @@
|
||||
#!/bin/bash
# Launches ${BIN_DIR}/voice_cloning.py with three model parameter files.
# Arguments:
#   $1 - GE2E parameter file (--ge2e_params_path)
#   $2 - Tacotron2 parameter file (--tacotron2_params_path)
#   $3 - WaveFlow parameter file (--waveflow_params_path)
#   $4 - input directory (--input-dir)
#   $5 - output directory (--output-dir)
# Environment:
#   BIN_DIR - directory holding voice_cloning.py

ge2e_params_path=${1:?usage: ${0##*/} <ge2e_params> <tacotron2_params> <waveflow_params> <vc_input> <vc_output>}
tacotron2_params_path=${2:?missing argument 2: tacotron2_params_path}
waveflow_params_path=${3:?missing argument 3: waveflow_params_path}
vc_input=${4:?missing argument 4: vc_input}
vc_output=${5:?missing argument 5: vc_output}
: "${BIN_DIR:?BIN_DIR is not set}"

# Quoted so paths with spaces stay single arguments.
python3 "${BIN_DIR}/voice_cloning.py" \
  --ge2e_params_path="${ge2e_params_path}" \
  --tacotron2_params_path="${tacotron2_params_path}" \
  --waveflow_params_path="${waveflow_params_path}" \
  --input-dir="${vc_input}" \
  --output-dir="${vc_output}"
|
@ -0,0 +1,13 @@
|
||||
#!/bin/bash
# Environment setup: exports MAIN_ROOT, PATH, LC_ALL, PYTHONDONTWRITEBYTECODE,
# PYTHONIOENCODING, PYTHONPATH and BIN_DIR.
# No `set` options are changed here, so the file stays safe to `source`.

# Three directory levels above the current working directory.
# Assigned separately from `export` so a failing realpath is not hidden behind
# the (always successful) exit status of `export`.
MAIN_ROOT=$(realpath "${PWD}/../../../")
export MAIN_ROOT

export PATH="${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}"
export LC_ALL=C

export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
# ${PYTHONPATH:-} yields the same value as ${PYTHONPATH} but also works when
# the sourcing shell runs with `set -u` and PYTHONPATH is unset.
export PYTHONPATH="${MAIN_ROOT}:${PYTHONPATH:-}"

MODEL=tacotron2_ge2e
export BIN_DIR="${MAIN_ROOT}/parakeet/exps/${MODEL}"
|
@ -0,0 +1,40 @@
|
||||
#!/bin/bash
# Staged driver: every stage whose index lies in [stage, stop_stage] is run.
#   0: ./local/preprocess.sh   1: ./local/train.sh   2: ./local/voice_cloning.sh

set -e
source path.sh

gpus=0
stage=0
stop_stage=100

# Left unquoted on purpose: the leading ~ must be tilde-expanded here.
input=~/datasets/data_aishell3/train
preprocess_path=dump
alignment=./alignment

# not include ".pdparams" here
ge2e_ckpt_path=./ge2e_ckpt_0.3/step-3000000
train_output_path=output
# include ".pdparams" here
ge2e_params_path=${ge2e_ckpt_path}.pdparams
tacotron2_params_path=${train_output_path}/checkpoints/step-1000.pdparams
# pretrained model
# tacotron2_params_path=./tacotron2_aishell3_ckpt_0.3/step-450000.pdparams
waveflow_params_path=./waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams
vc_input=ref_audio
vc_output=syn_audio

# Expansions below are quoted so that paths containing spaces (for example
# under $HOME) reach the stage scripts as single arguments.

if [[ ${stage} -le 0 && ${stop_stage} -ge 0 ]]; then
  # prepare data
  CUDA_VISIBLE_DEVICES="${gpus}" ./local/preprocess.sh "${input}" "${preprocess_path}" "${alignment}" "${ge2e_ckpt_path}" || exit -1
fi

if [[ ${stage} -le 1 && ${stop_stage} -ge 1 ]]; then
  CUDA_VISIBLE_DEVICES="${gpus}" ./local/train.sh "${preprocess_path}" "${train_output_path}" || exit -1
fi

if [[ ${stage} -le 2 && ${stop_stage} -ge 2 ]]; then
  CUDA_VISIBLE_DEVICES="${gpus}" ./local/voice_cloning.sh "${ge2e_params_path}" "${tacotron2_params_path}" "${waveflow_params_path}" "${vc_input}" "${vc_output}" || exit -1
fi
|
||||
|
||||
|
@ -0,0 +1,11 @@
|
||||
|
||||
# CSMSC
|
||||
|
||||
* tts0 - Tacotron2
|
||||
* tts1 - TransformerTTS
|
||||
* tts2 - SpeedySpeech
|
||||
* tts3 - FastSpeech2
|
||||
* voc0 - WaveFlow
|
||||
* voc1 - Parallel WaveGAN
|
||||
* voc2 - MelGAN
|
||||
* voc3 - MultiBand MelGAN
|
@ -1,8 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
python3 inference.py \
|
||||
--inference-dir=exp/default/inference \
|
||||
--text=../sentences.txt \
|
||||
--output-dir=exp/default/pd_infer_out \
|
||||
--phones-dict=dump/phone_id_map.txt \
|
||||
--tones-dict=dump/tone_id_map.txt
|
@ -0,0 +1,10 @@
|
||||
#!/bin/bash
# Launches ${BIN_DIR}/inference.py, reading from <train_output_path>/inference
# and writing to <train_output_path>/pd_infer_out.
# Arguments:
#   $1 - training output directory
# Environment:
#   BIN_DIR - directory holding inference.py

train_output_path=${1:?usage: ${0##*/} <train_output_path>}
: "${BIN_DIR:?BIN_DIR is not set}"

# Quoted so paths with spaces stay single arguments.
python3 "${BIN_DIR}/inference.py" \
  --inference-dir="${train_output_path}/inference" \
  --text="${BIN_DIR}/../sentences.txt" \
  --output-dir="${train_output_path}/pd_infer_out" \
  --phones-dict=dump/phone_id_map.txt \
  --tones-dict=dump/tone_id_map.txt
|
@ -1,16 +1,20 @@
|
||||
#!/bin/bash
|
||||
config_path=$1
|
||||
train_output_path=$2
|
||||
ckpt_name=$3
|
||||
|
||||
FLAGS_allocator_strategy=naive_best_fit \
|
||||
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
|
||||
python3 ../synthesize.py \
|
||||
--speedyspeech-config=conf/default.yaml \
|
||||
--speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_11400.pdz \
|
||||
python3 ${BIN_DIR}/synthesize.py \
|
||||
--speedyspeech-config=${config_path} \
|
||||
--speedyspeech-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
|
||||
--speedyspeech-stat=dump/train/feats_stats.npy \
|
||||
--pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
|
||||
--pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
|
||||
--pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
|
||||
--test-metadata=dump/test/norm/metadata.jsonl \
|
||||
--output-dir=exp/default/test \
|
||||
--inference-dir=exp/default/inference \
|
||||
--output-dir=${train_output_path}/test \
|
||||
--inference-dir=${train_output_path}/inference \
|
||||
--phones-dict=dump/phone_id_map.txt \
|
||||
--tones-dict=dump/tone_id_map.txt \
|
||||
--device="gpu"
|
@ -1,16 +1,21 @@
|
||||
#!/bin/bash
|
||||
|
||||
config_path=$1
|
||||
train_output_path=$2
|
||||
ckpt_name=$3
|
||||
|
||||
FLAGS_allocator_strategy=naive_best_fit \
|
||||
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
|
||||
python synthesize_e2e.py \
|
||||
--speedyspeech-config=conf/default.yaml \
|
||||
--speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_11400.pdz \
|
||||
python3 ${BIN_DIR}/synthesize_e2e.py \
|
||||
--speedyspeech-config=${config_path} \
|
||||
--speedyspeech-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
|
||||
--speedyspeech-stat=dump/train/feats_stats.npy \
|
||||
--pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
|
||||
--pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
|
||||
--pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
|
||||
--text=../sentences.txt \
|
||||
--output-dir=exp/default/test_e2e \
|
||||
--inference-dir=exp/default/inference \
|
||||
--text=${BIN_DIR}/../sentences.txt \
|
||||
--output-dir=${train_output_path}/test_e2e \
|
||||
--inference-dir=${train_output_path}/inference \
|
||||
--device="gpu" \
|
||||
--phones-dict=dump/phone_id_map.txt \
|
||||
--tones-dict=dump/tone_id_map.txt
|
@ -1,11 +1,14 @@
|
||||
|
||||
#!/bin/bash
|
||||
|
||||
python ../train.py \
|
||||
config_path=$1
|
||||
train_output_path=$2
|
||||
|
||||
python ${BIN_DIR}/train.py \
|
||||
--train-metadata=dump/train/norm/metadata.jsonl \
|
||||
--dev-metadata=dump/dev/norm/metadata.jsonl \
|
||||
--config=conf/default.yaml \
|
||||
--output-dir=exp/default \
|
||||
--config=${config_path} \
|
||||
--output-dir=${train_output_path} \
|
||||
--nprocs=2 \
|
||||
--phones-dict=dump/phone_id_map.txt \
|
||||
--tones-dict=dump/tone_id_map.txt \
|
@ -0,0 +1,13 @@
|
||||
#!/bin/bash
# Environment setup: exports MAIN_ROOT, PATH, LC_ALL, PYTHONDONTWRITEBYTECODE,
# PYTHONIOENCODING, PYTHONPATH and BIN_DIR.
# No `set` options are changed here, so the file stays safe to `source`.

# Three directory levels above the current working directory.
# Assigned separately from `export` so a failing realpath is not hidden behind
# the (always successful) exit status of `export`.
MAIN_ROOT=$(realpath "${PWD}/../../../")
export MAIN_ROOT

export PATH="${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}"
export LC_ALL=C

export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
# ${PYTHONPATH:-} yields the same value as ${PYTHONPATH} but also works when
# the sourcing shell runs with `set -u` and PYTHONPATH is unset.
export PYTHONPATH="${MAIN_ROOT}:${PYTHONPATH:-}"

MODEL=speedyspeech
export BIN_DIR="${MAIN_ROOT}/parakeet/exps/${MODEL}"
|
@ -0,0 +1,37 @@
|
||||
#!/bin/bash
# Staged driver: every stage whose index lies in [stage, stop_stage] is run.
#   0: ./local/preprocess.sh       1: ./local/train.sh
#   2: ./local/synthesize.sh       3: ./local/synthesize_e2e.sh
#   4: ./local/inference.sh

set -e
source path.sh

gpus=0,1
stage=0
stop_stage=100

conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_76.pdz

# Expansions are quoted so that values containing spaces or glob characters
# reach the stage scripts as single, unmodified arguments.

if [[ ${stage} -le 0 && ${stop_stage} -ge 0 ]]; then
  # prepare data
  ./local/preprocess.sh "${conf_path}" || exit -1
fi

if [[ ${stage} -le 1 && ${stop_stage} -ge 1 ]]; then
  # train model, all `ckpt` under `train_output_path/checkpoints/` dir
  CUDA_VISIBLE_DEVICES="${gpus}" ./local/train.sh "${conf_path}" "${train_output_path}" || exit -1
fi

if [[ ${stage} -le 2 && ${stop_stage} -ge 2 ]]; then
  # synthesize, vocoder is pwgan
  CUDA_VISIBLE_DEVICES="${gpus}" ./local/synthesize.sh "${conf_path}" "${train_output_path}" "${ckpt_name}" || exit -1
fi

if [[ ${stage} -le 3 && ${stop_stage} -ge 3 ]]; then
  # synthesize_e2e, vocoder is pwgan
  CUDA_VISIBLE_DEVICES="${gpus}" ./local/synthesize_e2e.sh "${conf_path}" "${train_output_path}" "${ckpt_name}" || exit -1
fi

if [[ ${stage} -le 4 && ${stop_stage} -ge 4 ]]; then
  # inference with static model
  CUDA_VISIBLE_DEVICES="${gpus}" ./local/inference.sh "${train_output_path}" || exit -1
fi
|
@ -1,14 +1,19 @@
|
||||
#!/bin/bash
|
||||
|
||||
config_path=$1
|
||||
train_output_path=$2
|
||||
ckpt_name=$3
|
||||
|
||||
FLAGS_allocator_strategy=naive_best_fit \
|
||||
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
|
||||
python3 ../synthesize.py \
|
||||
--fastspeech2-config=conf/default.yaml \
|
||||
--fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_76000.pdz \
|
||||
python3 ${BIN_DIR}/synthesize.py \
|
||||
--fastspeech2-config=${config_path} \
|
||||
--fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
|
||||
--fastspeech2-stat=dump/train/speech_stats.npy \
|
||||
--pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
|
||||
--pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
|
||||
--pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
|
||||
--test-metadata=dump/test/norm/metadata.jsonl \
|
||||
--output-dir=exp/default/test \
|
||||
--output-dir=${train_output_path}/test \
|
||||
--device="gpu" \
|
||||
--phones-dict=dump/phone_id_map.txt
|
@ -1,14 +1,19 @@
|
||||
#!/bin/bash
|
||||
|
||||
config_path=$1
|
||||
train_output_path=$2
|
||||
ckpt_name=$3
|
||||
|
||||
FLAGS_allocator_strategy=naive_best_fit \
|
||||
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
|
||||
python3 synthesize_e2e.py \
|
||||
--fastspeech2-config=conf/default.yaml \
|
||||
--fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_153.pdz \
|
||||
python3 ${BIN_DIR}/synthesize_e2e.py \
|
||||
--fastspeech2-config=${config_path} \
|
||||
--fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
|
||||
--fastspeech2-stat=dump/train/speech_stats.npy \
|
||||
--pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
|
||||
--pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
|
||||
--pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
|
||||
--text=../sentences.txt \
|
||||
--output-dir=exp/default/test_e2e \
|
||||
--text=${BIN_DIR}/../sentences.txt \
|
||||
--output-dir=${train_output_path}/test_e2e \
|
||||
--device="gpu" \
|
||||
--phones-dict=dump/phone_id_map.txt
|
@ -0,0 +1,12 @@
|
||||
#!/bin/bash
# Launches ${BIN_DIR}/train.py with the train/dev metadata under dump/.
# Arguments:
#   $1 - config file (forwarded as --config)
#   $2 - training output directory (forwarded as --output-dir)
# Environment:
#   BIN_DIR - directory holding train.py

config_path=${1:?usage: ${0##*/} <config_path> <train_output_path>}
train_output_path=${2:?missing argument 2: train_output_path}
: "${BIN_DIR:?BIN_DIR is not set}"

# Quoted so paths with spaces stay single arguments.
python3 "${BIN_DIR}/train.py" \
  --train-metadata=dump/train/norm/metadata.jsonl \
  --dev-metadata=dump/dev/norm/metadata.jsonl \
  --config="${config_path}" \
  --output-dir="${train_output_path}" \
  --nprocs=1 \
  --phones-dict=dump/phone_id_map.txt
|
@ -0,0 +1,13 @@
|
||||
#!/bin/bash
# Environment setup: exports MAIN_ROOT, PATH, LC_ALL, PYTHONDONTWRITEBYTECODE,
# PYTHONIOENCODING, PYTHONPATH and BIN_DIR.
# No `set` options are changed here, so the file stays safe to `source`.

# Three directory levels above the current working directory.
# Assigned separately from `export` so a failing realpath is not hidden behind
# the (always successful) exit status of `export`.
MAIN_ROOT=$(realpath "${PWD}/../../../")
export MAIN_ROOT

export PATH="${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}"
export LC_ALL=C

export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
# ${PYTHONPATH:-} yields the same value as ${PYTHONPATH} but also works when
# the sourcing shell runs with `set -u` and PYTHONPATH is unset.
export PYTHONPATH="${MAIN_ROOT}:${PYTHONPATH:-}"

MODEL=fastspeech2
export BIN_DIR="${MAIN_ROOT}/parakeet/exps/${MODEL}"
|
@ -0,0 +1,32 @@
|
||||
#!/bin/bash
# Staged driver: every stage whose index lies in [stage, stop_stage] is run.
#   0: ./local/preprocess.sh (via bash)   1: ./local/train.sh
#   2: ./local/synthesize.sh              3: ./local/synthesize_e2e.sh

set -e
source path.sh

gpus=0,1
stage=0
stop_stage=100

conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_153.pdz

# Expansions are quoted so that values containing spaces or glob characters
# reach the stage scripts as single, unmodified arguments.

if [[ ${stage} -le 0 && ${stop_stage} -ge 0 ]]; then
  # prepare data
  bash ./local/preprocess.sh "${conf_path}" || exit -1
fi

if [[ ${stage} -le 1 && ${stop_stage} -ge 1 ]]; then
  # train model, all `ckpt` under `train_output_path/checkpoints/` dir
  CUDA_VISIBLE_DEVICES="${gpus}" ./local/train.sh "${conf_path}" "${train_output_path}" || exit -1
fi

if [[ ${stage} -le 2 && ${stop_stage} -ge 2 ]]; then
  # synthesize, vocoder is pwgan
  CUDA_VISIBLE_DEVICES="${gpus}" ./local/synthesize.sh "${conf_path}" "${train_output_path}" "${ckpt_name}" || exit -1
fi

if [[ ${stage} -le 3 && ${stop_stage} -ge 3 ]]; then
  # synthesize_e2e, vocoder is pwgan
  CUDA_VISIBLE_DEVICES="${gpus}" ./local/synthesize_e2e.sh "${conf_path}" "${train_output_path}" "${ckpt_name}" || exit -1
fi
|
@ -0,0 +1,13 @@
|
||||
#!/bin/bash
# Launches ${BIN_DIR}/synthesize.py on dump/test/norm/metadata.jsonl and writes
# to <train_output_path>/test.
# Arguments:
#   $1 - config file (forwarded as --config)
#   $2 - training output directory
#   $3 - checkpoint file name inside <train_output_path>/checkpoints/
# Environment:
#   BIN_DIR - directory holding synthesize.py

config_path=${1:?usage: ${0##*/} <config_path> <train_output_path> <ckpt_name>}
train_output_path=${2:?missing argument 2: train_output_path}
ckpt_name=${3:?missing argument 3: ckpt_name}
: "${BIN_DIR:?BIN_DIR is not set}"

# The two FLAGS_* assignments are set only in the environment of this one
# python3 invocation. Expansions are quoted so paths with spaces survive.
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 "${BIN_DIR}/synthesize.py" \
  --config="${config_path}" \
  --checkpoint="${train_output_path}/checkpoints/${ckpt_name}" \
  --test-metadata=dump/test/norm/metadata.jsonl \
  --output-dir="${train_output_path}/test"
|
@ -1,10 +1,13 @@
|
||||
#!/bin/bash
|
||||
|
||||
config_path=$1
|
||||
train_output_path=$2
|
||||
|
||||
FLAGS_cudnn_exhaustive_search=true \
|
||||
FLAGS_conv_workspace_size_limit=4000 \
|
||||
python ../train.py \
|
||||
python ${BIN_DIR}/train.py \
|
||||
--train-metadata=dump/train/norm/metadata.jsonl \
|
||||
--dev-metadata=dump/dev/norm/metadata.jsonl \
|
||||
--config=conf/default.yaml \
|
||||
--output-dir=exp/default \
|
||||
--config=${config_path} \
|
||||
--output-dir=${train_output_path} \
|
||||
--nprocs=1
|
@ -0,0 +1,13 @@
|
||||
#!/bin/bash
# Environment setup: exports MAIN_ROOT, PATH, LC_ALL, PYTHONDONTWRITEBYTECODE,
# PYTHONIOENCODING, PYTHONPATH and BIN_DIR.
# No `set` options are changed here, so the file stays safe to `source`.

# Three directory levels above the current working directory.
# Assigned separately from `export` so a failing realpath is not hidden behind
# the (always successful) exit status of `export`.
MAIN_ROOT=$(realpath "${PWD}/../../../")
export MAIN_ROOT

export PATH="${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}"
export LC_ALL=C

export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
# ${PYTHONPATH:-} yields the same value as ${PYTHONPATH} but also works when
# the sourcing shell runs with `set -u` and PYTHONPATH is unset.
export PYTHONPATH="${MAIN_ROOT}:${PYTHONPATH:-}"

MODEL=parallelwave_gan
# Unlike the TTS examples, the entry points live under exps/gan_vocoder/.
export BIN_DIR="${MAIN_ROOT}/parakeet/exps/gan_vocoder/${MODEL}"
|
@ -0,0 +1,27 @@
|
||||
#!/bin/bash
# Staged driver: every stage whose index lies in [stage, stop_stage] is run.
#   0: ./local/preprocess.sh   1: ./local/train.sh   2: ./local/synthesize.sh

set -e
source path.sh

gpus=0,1
stage=0
stop_stage=100

conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_5000.pdz

# Expansions are quoted so that values containing spaces or glob characters
# reach the stage scripts as single, unmodified arguments.

if [[ ${stage} -le 0 && ${stop_stage} -ge 0 ]]; then
  # prepare data
  ./local/preprocess.sh "${conf_path}" || exit -1
fi

if [[ ${stage} -le 1 && ${stop_stage} -ge 1 ]]; then
  # train model, all `ckpt` under `train_output_path/checkpoints/` dir
  CUDA_VISIBLE_DEVICES="${gpus}" ./local/train.sh "${conf_path}" "${train_output_path}" || exit -1
fi

if [[ ${stage} -le 2 && ${stop_stage} -ge 2 ]]; then
  # synthesize
  CUDA_VISIBLE_DEVICES="${gpus}" ./local/synthesize.sh "${conf_path}" "${train_output_path}" "${ckpt_name}" || exit -1
fi
|
@ -0,0 +1,87 @@
|
||||
# Tacotron2 with LJSpeech
|
||||
PaddlePaddle dynamic graph implementation of Tacotron2, a neural network architecture for speech synthesis directly from text. The implementation is based on [Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions](https://arxiv.org/abs/1712.05884).
|
||||
|
||||
## Dataset
|
||||
We experiment with the LJSpeech dataset. Download and unzip [LJSpeech](https://keithito.com/LJ-Speech-Dataset/).
|
||||
|
||||
```bash
|
||||
wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
|
||||
tar xjvf LJSpeech-1.1.tar.bz2
|
||||
```
|
||||
## Get Started
|
||||
Assume the path to the dataset is `~/datasets/LJSpeech-1.1`.
|
||||
Run the command below to
|
||||
1. **source path**.
|
||||
2. preprocess the dataset,
|
||||
3. train the model.
|
||||
4. synthesize mels.
|
||||
```bash
|
||||
./run.sh
|
||||
```
|
||||
### Preprocess the dataset
|
||||
```bash
|
||||
./local/preprocess.sh ${conf_path}
|
||||
```
|
||||
### Train the model
|
||||
`./local/train.sh` calls `${BIN_DIR}/train.py`.
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
|
||||
```
|
||||
Here's the complete help message.
|
||||
```text
|
||||
usage: train.py [-h] [--config FILE] [--data DATA_DIR] [--output OUTPUT_DIR]
|
||||
[--checkpoint_path CHECKPOINT_PATH] [--device {cpu,gpu}]
|
||||
[--nprocs NPROCS] [--opts ...]
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
--config FILE path of the config file to overwrite to default config
|
||||
with.
|
||||
--data DATA_DIR path to the datatset.
|
||||
--output OUTPUT_DIR path to save checkpoint and logs.
|
||||
--checkpoint_path CHECKPOINT_PATH
|
||||
path of the checkpoint to load
|
||||
--device {cpu,gpu} device type to use, cpu and gpu are supported.
|
||||
--nprocs NPROCS number of parallel processes to use.
|
||||
--opts ... options to overwrite --config file and the default
|
||||
config, passing in KEY VALUE pairs
|
||||
```
|
||||
|
||||
If you want to train on CPU, just set ``--device=cpu``.
|
||||
If you want to train on multiple GPUs, just set ``--nprocs`` as num of GPU.
|
||||
By default, training is resumed from the latest checkpoint in ``--output``. To start a new training run, use a new ``${train_output_path}`` that contains no checkpoint.
|
||||
If you want to resume from another existing model, set ``--checkpoint_path`` to the checkpoint you want to load.
|
||||
**Note: The checkpoint path cannot contain the file extension.**
|
||||
|
||||
### Synthesize
|
||||
`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which synthesizes **mels** from the sentences in a text file.
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${train_output_path} ${ckpt_name}
|
||||
```
|
||||
```text
|
||||
usage: synthesize.py [-h] [--config FILE] [--checkpoint_path CHECKPOINT_PATH]
|
||||
[--input INPUT] [--output OUTPUT] [--device DEVICE]
|
||||
[--opts ...] [-v]
|
||||
|
||||
generate mel spectrogram with TransformerTTS.
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
--config FILE extra config to overwrite the default config
|
||||
--checkpoint_path CHECKPOINT_PATH
|
||||
path of the checkpoint to load.
|
||||
--input INPUT path of the text sentences
|
||||
--output OUTPUT path to save outputs
|
||||
--device DEVICE device type to use.
|
||||
--opts ... options to overwrite --config file and the default
|
||||
config, passing in KEY VALUE pairs
|
||||
-v, --verbose print msg
|
||||
```
|
||||
**Ps.** You can use [waveflow](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/waveflow) as the neural vocoder to synthesize mels to wavs. (Please refer to `synthesize.sh` in our LJSpeech waveflow example)
|
||||
|
||||
## Pretrained Models
|
||||
Pretrained Models can be downloaded from links below. We provide 2 models with different configurations.
|
||||
|
||||
1. This model uses a binary classifier to predict the stop token. [tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3.zip)
|
||||
|
||||
2. This model does not have a stop token predictor. It uses the attention peak position to decide whether all the contents have been uttered. Guided attention loss is also used to speed up training. This model is trained with `configs/alternative.yaml`. [tacotron2_ljspeech_ckpt_0.3_alternative.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3_alternative.zip)
|
@ -0,0 +1,8 @@
|
||||
#!/bin/bash
# Launches ${BIN_DIR}/preprocess.py on the LJSpeech-1.1 dataset under
# $HOME/datasets and writes the result to <preprocess_path>.
# Arguments:
#   $1 - output directory for the preprocessed data (forwarded as --output)
# Environment:
#   BIN_DIR - directory holding preprocess.py

preprocess_path=${1:?usage: ${0##*/} <preprocess_path>}
: "${BIN_DIR:?BIN_DIR is not set}"

# ${HOME} replaces the original `~`: bash only tilde-expands after `=` when the
# word is a valid variable assignment, and `--input` is not a valid name, so
# the program used to receive a literal "~/datasets/LJSpeech-1.1".
# The last option carries no trailing backslash, so nothing appended to this
# file later can be swallowed into the command line.
python3 "${BIN_DIR}/preprocess.py" \
  --input="${HOME}/datasets/LJSpeech-1.1" \
  --output="${preprocess_path}" \
  -v
|
@ -0,0 +1,11 @@
|
||||
#!/bin/bash
# Launches ${BIN_DIR}/synthesize.py on ${BIN_DIR}/../sentences_en.txt and
# writes the output to <train_output_path>/test.
# Arguments:
#   $1 - training output directory (its config.yaml is passed as --config)
#   $2 - checkpoint name inside <train_output_path>/checkpoints/
# Environment:
#   BIN_DIR - directory holding synthesize.py

train_output_path=${1:?usage: ${0##*/} <train_output_path> <ckpt_name>}
ckpt_name=${2:?missing argument 2: ckpt_name}
: "${BIN_DIR:?BIN_DIR is not set}"

# The --output line needs its trailing backslash: without it the command ended
# there and `--device=gpu` was executed as a separate command, so the option
# never reached synthesize.py and the script exited with "command not found".
python3 "${BIN_DIR}/synthesize.py" \
  --config="${train_output_path}/config.yaml" \
  --checkpoint_path="${train_output_path}/checkpoints/${ckpt_name}" \
  --input="${BIN_DIR}/../sentences_en.txt" \
  --output="${train_output_path}/test" \
  --device=gpu
|
@ -1,92 +0,0 @@
|
||||
# Tacotron2
|
||||
|
||||
PaddlePaddle dynamic graph implementation of Tacotron2, a neural network architecture for speech synthesis directly from text. The implementation is based on [Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions](https://arxiv.org/abs/1712.05884).
|
||||
|
||||
## Project Structure
|
||||
|
||||
```text
|
||||
├── config.py # default configuration file
|
||||
├── ljspeech.py # dataset and dataloader settings for LJSpeech
|
||||
├── preprocess.py # script to preprocess LJSpeech dataset
|
||||
├── synthesize.py # script to synthesize spectrogram from text
|
||||
├── train.py # script for tacotron2 model training
|
||||
├── synthesize.ipynb # notebook example for end-to-end TTS
|
||||
```
|
||||
|
||||
## Dataset
|
||||
|
||||
We experiment with the LJSpeech dataset. Download and unzip [LJSpeech](https://keithito.com/LJ-Speech-Dataset/).
|
||||
|
||||
```bash
|
||||
wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
|
||||
tar xjvf LJSpeech-1.1.tar.bz2
|
||||
```
|
||||
|
||||
Then you need to preprocess the data by running ``preprocess.py``, the preprocessed data will be placed in ``--output`` directory.
|
||||
|
||||
```bash
|
||||
python preprocess.py \
|
||||
--input=${DATAPATH} \
|
||||
--output=${PREPROCESSEDDATAPATH} \
|
||||
-v \
|
||||
```
|
||||
|
||||
For more help on arguments
|
||||
|
||||
``python preprocess.py --help``.
|
||||
|
||||
## Train the model
|
||||
|
||||
Tacotron2 model can be trained by running ``train.py``.
|
||||
|
||||
```bash
|
||||
python train.py \
|
||||
--data=${PREPROCESSEDDATAPATH} \
|
||||
--output=${OUTPUTPATH} \
|
||||
--device=gpu \
|
||||
```
|
||||
|
||||
If you want to train on CPU, just set ``--device=cpu``.
|
||||
If you want to train on multiple GPUs, just set ``--nprocs`` as num of GPU.
|
||||
By default, training will be resumed from the latest checkpoint in ``--output``, if you want to start a new training, please use a new ``${OUTPUTPATH}`` with no checkpoint. And if you want to resume from an other existing model, you should set ``checkpoint_path`` to be the checkpoint path you want to load.
|
||||
|
||||
**Note: The checkpoint path cannot contain the file extension.**
|
||||
|
||||
For more help on arguments
|
||||
|
||||
``python train.py --help``.
|
||||
|
||||
## Synthesize
|
||||
|
||||
After training the Tacotron2, spectrogram can be synthesized by running ``synthesize.py``.
|
||||
|
||||
```bash
|
||||
python synthesize.py \
|
||||
--config=${CONFIGPATH} \
|
||||
--checkpoint_path=${CHECKPOINTPATH} \
|
||||
--input=${TEXTPATH} \
|
||||
--output=${OUTPUTPATH}
|
||||
--device=gpu
|
||||
```
|
||||
|
||||
The ``${CONFIGPATH}`` needs to be matched with ``${CHECKPOINTPATH}``.
|
||||
|
||||
For more help on arguments
|
||||
|
||||
``python synthesize.py --help``.
|
||||
|
||||
Then you can find the spectrogram files in ``${OUTPUTPATH}``, and then they can be the input of vocoder like [waveflow](../waveflow/README.md#Synthesis) to get audio files.
|
||||
|
||||
|
||||
## Pretrained Models
|
||||
|
||||
Pretrained Models can be downloaded from links below. We provide 2 models with different configurations.
|
||||
|
||||
1. This model uses a binary classifier to predict the stop token. [tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3.zip)
|
||||
|
||||
2. This model does not have a stop token predictor. It uses the attention peak position to decide whether all the contents have been uttered. Guided attention loss is also used to speed up training. This model is trained with `configs/alternative.yaml`. [tacotron2_ljspeech_ckpt_0.3_alternative.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3_alternative.zip)
|
||||
|
||||
|
||||
## Notebook: End-to-end TTS
|
||||
|
||||
See [synthesize.ipynb](./synthesize.ipynb) for details about end-to-end TTS with tacotron2 and waveflow.
|
@ -0,0 +1,9 @@
|
||||
#!/bin/bash
# Launches ${BIN_DIR}/train.py on the GPU.
# Arguments:
#   $1 - preprocessed data directory (forwarded as --data)
#   $2 - training output directory (forwarded as --output)
# Environment:
#   BIN_DIR - directory holding train.py

preprocess_path=${1:?usage: ${0##*/} <preprocess_path> <train_output_path>}
train_output_path=${2:?missing argument 2: train_output_path}
: "${BIN_DIR:?BIN_DIR is not set}"

# The last option carries no trailing backslash, so nothing appended to this
# file later can be swallowed into the command line.
python3 "${BIN_DIR}/train.py" \
  --data="${preprocess_path}" \
  --output="${train_output_path}" \
  --device=gpu
|
@ -0,0 +1,13 @@
|
||||
#!/bin/bash
# Environment setup: exports MAIN_ROOT, PATH, LC_ALL, PYTHONDONTWRITEBYTECODE,
# PYTHONIOENCODING, PYTHONPATH and BIN_DIR.
# No `set` options are changed here, so the file stays safe to `source`.

# Three directory levels above the current working directory.
# Assigned separately from `export` so a failing realpath is not hidden behind
# the (always successful) exit status of `export`.
MAIN_ROOT=$(realpath "${PWD}/../../../")
export MAIN_ROOT

export PATH="${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}"
export LC_ALL=C

export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
# ${PYTHONPATH:-} yields the same value as ${PYTHONPATH} but also works when
# the sourcing shell runs with `set -u` and PYTHONPATH is unset.
export PYTHONPATH="${MAIN_ROOT}:${PYTHONPATH:-}"

MODEL=tacotron2
export BIN_DIR="${MAIN_ROOT}/parakeet/exps/${MODEL}"
|
@ -0,0 +1,28 @@
|
||||
#!/bin/bash
# Staged driver: every stage whose index lies in [stage, stop_stage] is run.
#   0: ./local/preprocess.sh   1: ./local/train.sh   2: ./local/synthesize.sh

set -e
source path.sh

gpus=0
stage=0
stop_stage=100

preprocess_path=preprocessed_ljspeech
train_output_path=output
ckpt_name=step-35000

# Expansions are quoted so that values containing spaces or glob characters
# reach the stage scripts as single, unmodified arguments.

if [[ ${stage} -le 0 && ${stop_stage} -ge 0 ]]; then
  # prepare data
  ./local/preprocess.sh "${preprocess_path}" || exit -1
fi

if [[ ${stage} -le 1 && ${stop_stage} -ge 1 ]]; then
  # train model, all `ckpt` under `train_output_path/checkpoints/` dir
  CUDA_VISIBLE_DEVICES="${gpus}" ./local/train.sh "${preprocess_path}" "${train_output_path}" || exit -1
fi

if [[ ${stage} -le 2 && ${stop_stage} -ge 2 ]]; then
  # synthesize with checkpoint `ckpt_name` from `train_output_path/checkpoints/`
  CUDA_VISIBLE_DEVICES="${gpus}" ./local/synthesize.sh "${train_output_path}" "${ckpt_name}" || exit -1
fi
|
||||
|
@ -1,13 +1,18 @@
|
||||
#!/bin/bash
|
||||
|
||||
config_path=$1
|
||||
train_output_path=$2
|
||||
ckpt_name=$3
|
||||
|
||||
FLAGS_allocator_strategy=naive_best_fit \
|
||||
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
|
||||
python3 ../synthesize.py \
|
||||
--transformer-tts-config=conf/default.yaml \
|
||||
--transformer-tts-checkpoint=exp/default/checkpoints/snapshot_iter_201500.pdz \
|
||||
python3 ${BIN_DIR}/synthesize.py \
|
||||
--transformer-tts-config=${config_path} \
|
||||
--transformer-tts-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
|
||||
--transformer-tts-stat=dump/train/speech_stats.npy \
|
||||
--waveflow-config=waveflow_ljspeech_ckpt_0.3/config.yaml \
|
||||
--waveflow-checkpoint=waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams \
|
||||
--test-metadata=dump/test/norm/metadata.jsonl \
|
||||
--output-dir=exp/default/test \
|
||||
--output-dir=${train_output_path}/test \
|
||||
--device="gpu" \
|
||||
--phones-dict=dump/phone_id_map.txt
|
@ -1,13 +1,18 @@
|
||||
#!/bin/bash
|
||||
|
||||
config_path=$1
|
||||
train_output_path=$2
|
||||
ckpt_name=$3
|
||||
|
||||
FLAGS_allocator_strategy=naive_best_fit \
|
||||
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
|
||||
python3 synthesize_e2e.py \
|
||||
--transformer-tts-config=conf/default.yaml \
|
||||
--transformer-tts-checkpoint=exp/default/checkpoints/snapshot_iter_201500.pdz \
|
||||
python3 ${BIN_DIR}/synthesize_e2e.py \
|
||||
--transformer-tts-config=${config_path} \
|
||||
--transformer-tts-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
|
||||
--transformer-tts-stat=dump/train/speech_stats.npy \
|
||||
--waveflow-config=waveflow_ljspeech_ckpt_0.3/config.yaml \
|
||||
--waveflow-checkpoint=waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams \
|
||||
--text=../sentences.txt \
|
||||
--output-dir=exp/default/test_e2e \
|
||||
--text=${BIN_DIR}/../sentences_en.txt \
|
||||
--output-dir=${train_output_path}/test_e2e \
|
||||
--device="gpu" \
|
||||
--phones-dict=dump/phone_id_map.txt
|
@ -1,9 +1,12 @@
|
||||
#!/bin/bash
|
||||
|
||||
python3 ../train.py \
|
||||
config_path=$1
|
||||
train_output_path=$2
|
||||
|
||||
python3 ${BIN_DIR}/train.py \
|
||||
--train-metadata=dump/train/norm/metadata.jsonl \
|
||||
--dev-metadata=dump/dev/norm/metadata.jsonl \
|
||||
--config=conf/default.yaml \
|
||||
--output-dir=exp/default \
|
||||
--config=${config_path} \
|
||||
--output-dir=${train_output_path} \
|
||||
--nprocs=2 \
|
||||
--phones-dict=dump/phone_id_map.txt
|
@ -0,0 +1,13 @@
|
||||
#!/bin/bash
# Environment setup for this example; meant to be sourced (see run.sh).
# MAIN_ROOT is resolved three levels above ${PWD}, so source this file from
# the example directory itself.

# Assign first and export second: `export VAR=$(cmd)` would hide a realpath
# failure behind the exit status of `export`.
MAIN_ROOT="$(realpath "${PWD}/../../../")"
export MAIN_ROOT

export PATH="${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}"
export LC_ALL=C

export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
# Append the previous PYTHONPATH only when it is non-empty, so that no empty
# entry (trailing ':') is left behind.
export PYTHONPATH="${MAIN_ROOT}${PYTHONPATH:+:${PYTHONPATH}}"

MODEL=transformer_tts
# Directory holding the Python entry points that the local/*.sh scripts call.
export BIN_DIR="${MAIN_ROOT}/parakeet/exps/${MODEL}"
|
@ -0,0 +1,32 @@
|
||||
#!/bin/bash
# Pipeline driver for this example:
# preprocess -> train -> synthesize -> synthesize_e2e.
# Run it from the example directory so that path.sh and ./local/ resolve.
# Stages numbered from ${stage} to ${stop_stage} (inclusive) are executed.

set -e
source path.sh

gpus=0,1
stage=0
stop_stage=100

conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_403.pdz

if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
    # prepare data
    ./local/preprocess.sh "${conf_path}" || exit 1
fi

if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    # train model, all `ckpt` under `train_output_path/checkpoints/` dir
    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh "${conf_path}" "${train_output_path}" || exit 1
fi

if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
    # synthesize, vocoder is waveflow
    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh "${conf_path}" "${train_output_path}" "${ckpt_name}" || exit 1
fi

if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
    # synthesize_e2e, vocoder is waveflow
    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh "${conf_path}" "${train_output_path}" "${ckpt_name}" || exit 1
fi
|
@ -1,15 +1,19 @@
|
||||
|
||||
#!/bin/bash
|
||||
|
||||
config_path=$1
|
||||
train_output_path=$2
|
||||
ckpt_name=$3
|
||||
|
||||
FLAGS_allocator_strategy=naive_best_fit \
|
||||
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
|
||||
python3 ../synthesize.py \
|
||||
--fastspeech2-config=conf/default.yaml \
|
||||
--fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_100000.pdz \
|
||||
python3 ${BIN_DIR}/synthesize.py \
|
||||
--fastspeech2-config=${config_path} \
|
||||
--fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
|
||||
--fastspeech2-stat=dump/train/speech_stats.npy \
|
||||
--pwg-config=pwg_ljspeech_ckpt_0.5/pwg_default.yaml \
|
||||
--pwg-checkpoint=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \
|
||||
--pwg-stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
|
||||
--test-metadata=dump/test/norm/metadata.jsonl \
|
||||
--output-dir=exp/default/test \
|
||||
--output-dir=${train_output_path}/test \
|
||||
--device="gpu" \
|
||||
--phones-dict=dump/phone_id_map.txt
|
@ -1,15 +1,19 @@
|
||||
|
||||
#!/bin/bash
|
||||
|
||||
config_path=$1
|
||||
train_output_path=$2
|
||||
ckpt_name=$3
|
||||
|
||||
FLAGS_allocator_strategy=naive_best_fit \
|
||||
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
|
||||
python3 synthesize_e2e.py \
|
||||
--fastspeech2-config=conf/default.yaml \
|
||||
--fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_100000.pdz \
|
||||
python3 ${BIN_DIR}/synthesize_e2e_en.py \
|
||||
--fastspeech2-config=${config_path} \
|
||||
--fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
|
||||
--fastspeech2-stat=dump/train/speech_stats.npy \
|
||||
--pwg-config=pwg_ljspeech_ckpt_0.5/pwg_default.yaml \
|
||||
--pwg-checkpoint=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \
|
||||
--pwg-stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
|
||||
--text=../sentences_en.txt \
|
||||
--output-dir=exp/default/test_e2e \
|
||||
--text=${BIN_DIR}/../sentences_en.txt \
|
||||
--output-dir=${train_output_path}/test_e2e \
|
||||
--device="gpu" \
|
||||
--phones-dict=dump/phone_id_map.txt
|
@ -1,9 +1,12 @@
|
||||
#!/bin/bash
|
||||
|
||||
python3 ../train.py \
|
||||
config_path=$1
|
||||
train_output_path=$2
|
||||
|
||||
python3 ${BIN_DIR}/train.py \
|
||||
--train-metadata=dump/train/norm/metadata.jsonl \
|
||||
--dev-metadata=dump/dev/norm/metadata.jsonl \
|
||||
--config=conf/default.yaml \
|
||||
--output-dir=exp/default \
|
||||
--config=${config_path} \
|
||||
--output-dir=${train_output_path} \
|
||||
--nprocs=1 \
|
||||
--phones-dict=dump/phone_id_map.txt
|
@ -0,0 +1,13 @@
|
||||
#!/bin/bash
# Environment setup for this example; meant to be sourced (see run.sh).
# MAIN_ROOT is resolved three levels above ${PWD}, so source this file from
# the example directory itself.

# Assign first and export second: `export VAR=$(cmd)` would hide a realpath
# failure behind the exit status of `export`.
MAIN_ROOT="$(realpath "${PWD}/../../../")"
export MAIN_ROOT

export PATH="${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}"
export LC_ALL=C

export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
# Append the previous PYTHONPATH only when it is non-empty, so that no empty
# entry (trailing ':') is left behind.
export PYTHONPATH="${MAIN_ROOT}${PYTHONPATH:+:${PYTHONPATH}}"

MODEL=fastspeech2
# Directory holding the Python entry points that the local/*.sh scripts call.
export BIN_DIR="${MAIN_ROOT}/parakeet/exps/${MODEL}"
|
@ -0,0 +1,32 @@
|
||||
#!/bin/bash
# Pipeline driver for this example:
# preprocess -> train -> synthesize -> synthesize_e2e.
# Run it from the example directory so that path.sh and ./local/ resolve.
# Stages numbered from ${stage} to ${stop_stage} (inclusive) are executed.

set -e
source path.sh

gpus=0,1
stage=0
stop_stage=100

conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_201.pdz

if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
    # prepare data
    ./local/preprocess.sh "${conf_path}" || exit 1
fi

if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    # train model, all `ckpt` under `train_output_path/checkpoints/` dir
    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh "${conf_path}" "${train_output_path}" || exit 1
fi

if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
    # synthesize, vocoder is pwgan
    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh "${conf_path}" "${train_output_path}" "${ckpt_name}" || exit 1
fi

if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
    # synthesize_e2e, vocoder is pwgan
    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh "${conf_path}" "${train_output_path}" "${ckpt_name}" || exit 1
fi
|
@ -0,0 +1,52 @@
|
||||
# WaveFlow with LJSpeech
|
||||
## Dataset
|
||||
### Download the dataset.
|
||||
```bash
|
||||
wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
|
||||
```
|
||||
### Extract the dataset.
|
||||
```bash
|
||||
tar xjvf LJSpeech-1.1.tar.bz2
|
||||
```
|
||||
## Get Started
|
||||
Assume the path to the dataset is `~/datasets/LJSpeech-1.1`.
|
||||
Assume the path to the Tacotron2 generated mels is `../tts0/output/test`.
|
||||
Run the command below to
|
||||
1. **source path**.
|
||||
2. preprocess the dataset,
|
||||
3. train the model.
|
||||
4. synthesize wavs from mels.
|
||||
```bash
|
||||
./run.sh
|
||||
```
|
||||
### Preprocess the dataset.
|
||||
```bash
|
||||
./local/preprocess.sh ${preprocess_path}
|
||||
```
|
||||
### Train the model
|
||||
`./local/train.sh` calls `${BIN_DIR}/train.py`.
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path}
|
||||
```
|
||||
The training script requires 4 command line arguments.
|
||||
1. `--data` is the path of the training dataset.
|
||||
2. `--output` is the path of the output directory.
|
||||
3. `--device` should be "cpu" or "gpu"
|
||||
4. `--nprocs` is the number of processes to train the model in parallel.
|
||||
|
||||
If you want distributed training, set a larger `--nprocs` (e.g. 4). Note that distributed training with cpu is not supported yet.
|
||||
|
||||
### Synthesize
|
||||
`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from mels.
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${input_mel_path} ${train_output_path} ${ckpt_name}
|
||||
```
|
||||
|
||||
Synthesize waveform.
|
||||
1. We assume that `--input` is a directory containing several mel spectrograms (log magnitude) in `.npy` format.
|
||||
2. The output is saved in the `--output` directory as several `.wav` files, each with the same name as its mel spectrogram.
|
||||
3. `--checkpoint_path` should be the path of the parameter file (`.pdparams`) to load. Note that the extension name `.pdparams` is not included here.
|
||||
4. `--device` specifies the device to run synthesis on.
|
||||
|
||||
## Pretrained Model
|
||||
A pretrained model with 128 residual channels can be downloaded here: [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip).
|
@ -0,0 +1,7 @@
|
||||
#!/bin/bash
# Run ${BIN_DIR}/preprocess.py on the LJSpeech-1.1 dataset stored under the
# user's home directory.
#
# Usage: ./local/preprocess.sh <preprocess_path>
#   $1  preprocess_path - passed to preprocess.py as --output
#
# BIN_DIR must already be set in the environment (path.sh exports it).

preprocess_path=$1

# bash does not tilde-expand `~` after `--input=` (the word is not a variable
# assignment), so the home directory is spelled out with ${HOME}.
python3 "${BIN_DIR}/preprocess.py" \
    --input="${HOME}/datasets/LJSpeech-1.1" \
    --output="${preprocess_path}"
|
@ -0,0 +1,12 @@
|
||||
#!/bin/bash
# Run ${BIN_DIR}/synthesize.py on a directory of mels using a trained
# checkpoint.
#
# Usage: ./local/synthesize.sh <input_mel_path> <train_output_path> <ckpt_name>
#   $1  input_mel_path     - passed as --input
#   $2  train_output_path  - training output dir; results go to <dir>/wavs/
#   $3  ckpt_name          - checkpoint name under <dir>/checkpoints/
#
# BIN_DIR must already be set in the environment (path.sh exports it).

input_mel_path=$1
train_output_path=$2
ckpt_name=$3

# python3 (not bare `python`) to match the other local/*.sh scripts.
python3 "${BIN_DIR}/synthesize.py" \
    --input="${input_mel_path}" \
    --output="${train_output_path}/wavs/" \
    --checkpoint_path="${train_output_path}/checkpoints/${ckpt_name}" \
    --device="gpu" \
    --verbose
|
@ -0,0 +1,10 @@
|
||||
#!/bin/bash
# Launch ${BIN_DIR}/train.py on the GPU with a single process.
#
# Usage: ./local/train.sh <preprocess_path> <train_output_path>
#   $1  preprocess_path    - passed to train.py as --data
#   $2  train_output_path  - passed to train.py as --output
#
# BIN_DIR must already be set in the environment (path.sh exports it).

preprocess_path=$1
train_output_path=$2

# Expansions are quoted so paths containing spaces survive word splitting.
python3 "${BIN_DIR}/train.py" \
    --data="${preprocess_path}" \
    --output="${train_output_path}" \
    --device="gpu" \
    --nprocs=1
|
@ -1,52 +0,0 @@
|
||||
# WaveFlow with LJSpeech
|
||||
|
||||
## Dataset
|
||||
|
||||
### Download the dataset.
|
||||
|
||||
```bash
|
||||
wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
|
||||
```
|
||||
|
||||
### Extract the dataset.
|
||||
|
||||
```bash
|
||||
tar xjvf LJSpeech-1.1.tar.bz2
|
||||
```
|
||||
|
||||
### Preprocess the dataset.
|
||||
|
||||
Assume the path to save the preprocessed dataset is `ljspeech_waveflow`. Run the command below to preprocess the dataset.
|
||||
|
||||
```bash
|
||||
python preprocess.py --input=LJSpeech-1.1/ --output=ljspeech_waveflow
|
||||
```
|
||||
|
||||
## Train the model
|
||||
|
||||
The training script requires 4 command line arguments.
|
||||
`--data` is the path of the training dataset, `--output` is the path of the output directory (we recommend to use a subdirectory in `runs` to manage different experiments.)
|
||||
|
||||
`--device` should be "cpu" or "gpu", `--nprocs` is the number of processes to train the model in parallel.
|
||||
|
||||
```bash
|
||||
python train.py --data=ljspeech_waveflow/ --output=runs/test --device="gpu" --nprocs=1
|
||||
```
|
||||
|
||||
If you want distributed training, set a larger `--nprocs` (e.g. 4). Note that distributed training with cpu is not supported yet.
|
||||
|
||||
## Synthesize
|
||||
|
||||
Synthesize waveform. We assume the `--input` is a directory containing several mel spectrograms(log magnitude) in `.npy` format. The output would be saved in `--output` directory, containing several `.wav` files, each with the same name as the mel spectrogram does.
|
||||
|
||||
`--checkpoint_path` should be the path of the parameter file (`.pdparams`) to load. Note that the extension name `.pdparams` is not included here.
|
||||
|
||||
`--device` specifies the device to run synthesis on.
|
||||
|
||||
```bash
|
||||
python synthesize.py --input=mels/ --output=wavs/ --checkpoint_path='step-2000000' --device="gpu" --verbose
|
||||
```
|
||||
|
||||
## Pretrained Model
|
||||
|
||||
Pretrained Model with residual channel equals 128 can be downloaded here. [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip).
|
@ -0,0 +1,13 @@
|
||||
#!/bin/bash
# Environment setup for this example; meant to be sourced (see run.sh).
# MAIN_ROOT is resolved three levels above ${PWD}, so source this file from
# the example directory itself.

# Assign first and export second: `export VAR=$(cmd)` would hide a realpath
# failure behind the exit status of `export`.
MAIN_ROOT="$(realpath "${PWD}/../../../")"
export MAIN_ROOT

export PATH="${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}"
export LC_ALL=C

export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
# Append the previous PYTHONPATH only when it is non-empty, so that no empty
# entry (trailing ':') is left behind.
export PYTHONPATH="${MAIN_ROOT}${PYTHONPATH:+:${PYTHONPATH}}"

MODEL=waveflow
# Directory holding the Python entry points that the local/*.sh scripts call.
export BIN_DIR="${MAIN_ROOT}/parakeet/exps/${MODEL}"
|
@ -0,0 +1,27 @@
|
||||
#!/bin/bash
# Pipeline driver for this example: preprocess -> train -> synthesize.
# Run it from the example directory so that path.sh and ./local/ resolve.
# Stages numbered from ${stage} to ${stop_stage} (inclusive) are executed.

set -e
source path.sh

gpus=0,1
stage=0
stop_stage=100

preprocess_path=preprocessed_ljspeech
train_output_path=output
# mel generated by Tacotron2
input_mel_path=../tts0/output/test
ckpt_name=step-10000

if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
    # prepare data
    ./local/preprocess.sh "${preprocess_path}" || exit 1
fi

if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    # train model
    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh "${preprocess_path}" "${train_output_path}" || exit 1
fi

if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
    # synthesize from the mels under `input_mel_path`
    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh "${input_mel_path}" "${train_output_path}" "${ckpt_name}" || exit 1
fi
|
@ -0,0 +1,13 @@
|
||||
#!/bin/bash
# Run ${BIN_DIR}/synthesize.py on the normalized test metadata using a trained
# checkpoint.
#
# Usage: ./local/synthesize.sh <config_path> <train_output_path> <ckpt_name>
#   $1  config_path        - passed as --config
#   $2  train_output_path  - training output dir; results go to <dir>/test
#   $3  ckpt_name          - checkpoint file name under <dir>/checkpoints/
#
# BIN_DIR must already be set in the environment (path.sh exports it).

config_path=$1
train_output_path=$2
ckpt_name=$3

# The FLAGS_* assignments apply to this python3 invocation only.
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 "${BIN_DIR}/synthesize.py" \
    --config="${config_path}" \
    --checkpoint="${train_output_path}/checkpoints/${ckpt_name}" \
    --test-metadata=dump/test/norm/metadata.jsonl \
    --output-dir="${train_output_path}/test"
|
@ -1,10 +1,13 @@
|
||||
#!/bin/bash
|
||||
|
||||
config_path=$1
|
||||
train_output_path=$2
|
||||
|
||||
FLAGS_cudnn_exhaustive_search=true \
|
||||
FLAGS_conv_workspace_size_limit=4000 \
|
||||
python ../train.py \
|
||||
python ${BIN_DIR}/train.py \
|
||||
--train-metadata=dump/train/norm/metadata.jsonl \
|
||||
--dev-metadata=dump/dev/norm/metadata.jsonl \
|
||||
--config=conf/default.yaml \
|
||||
--output-dir=exp/default \
|
||||
--config=${config_path} \
|
||||
--output-dir=${train_output_path} \
|
||||
--nprocs=1
|
@ -0,0 +1,13 @@
|
||||
#!/bin/bash
# Environment setup for this example; meant to be sourced (see run.sh).
# MAIN_ROOT is resolved three levels above ${PWD}, so source this file from
# the example directory itself.

# Assign first and export second: `export VAR=$(cmd)` would hide a realpath
# failure behind the exit status of `export`.
MAIN_ROOT="$(realpath "${PWD}/../../../")"
export MAIN_ROOT

export PATH="${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}"
export LC_ALL=C

export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
# Append the previous PYTHONPATH only when it is non-empty, so that no empty
# entry (trailing ':') is left behind.
export PYTHONPATH="${MAIN_ROOT}${PYTHONPATH:+:${PYTHONPATH}}"

MODEL=parallelwave_gan
# Directory holding the Python entry points that the local/*.sh scripts call.
export BIN_DIR="${MAIN_ROOT}/parakeet/exps/gan_vocoder/${MODEL}"
|
@ -0,0 +1,27 @@
|
||||
#!/bin/bash
# Pipeline driver for this example: preprocess -> train -> synthesize.
# Run it from the example directory so that path.sh and ./local/ resolve.
# Stages numbered from ${stage} to ${stop_stage} (inclusive) are executed.

set -e
source path.sh

gpus=0,1
stage=0
stop_stage=100

conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_5000.pdz

if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
    # prepare data
    ./local/preprocess.sh "${conf_path}" || exit 1
fi

if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    # train model, all `ckpt` under `train_output_path/checkpoints/` dir
    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh "${conf_path}" "${train_output_path}" || exit 1
fi

if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
    # synthesize
    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh "${conf_path}" "${train_output_path}" "${ckpt_name}" || exit 1
fi
|
@ -0,0 +1,14 @@
|
||||
#!/bin/bash
# Generate an utterance embedding for each utterance in a dataset by running
# ${BIN_DIR}/inference.py on the GPU.
#
# Usage: ./local/inference.sh <infer_input> <infer_output> <train_output_path> <ckpt_name>
#   $1  infer_input        - passed as --input
#   $2  infer_output       - passed as --output
#   $3  train_output_path  - training output dir that contains checkpoints/
#   $4  ckpt_name          - checkpoint name under <train_output_path>/checkpoints/
#
# BIN_DIR must already be set in the environment (path.sh exports it).

infer_input=$1
infer_output=$2
train_output_path=$3
ckpt_name=$4

# Expansions are quoted so paths containing spaces survive word splitting.
python3 "${BIN_DIR}/inference.py" \
    --input="${infer_input}" \
    --output="${infer_output}" \
    --checkpoint_path="${train_output_path}/checkpoints/${ckpt_name}" \
    --device="gpu"
|
||||
|
@ -0,0 +1,9 @@
|
||||
#!/bin/bash
# Run ${BIN_DIR}/preprocess.py on the requested dataset(s).
#
# Usage: ./local/preprocess.sh <datasets_root> <preprocess_path> <dataset_names>
#   $1  datasets_root    - passed as --datasets_root
#   $2  preprocess_path  - passed as --output_dir
#   $3  dataset_names    - passed as --dataset_names
#
# BIN_DIR must already be set in the environment (path.sh exports it).

datasets_root=$1
preprocess_path=$2
dataset_names=$3

# Expansions are quoted so paths containing spaces survive word splitting.
python3 "${BIN_DIR}/preprocess.py" \
    --datasets_root="${datasets_root}" \
    --output_dir="${preprocess_path}" \
    --dataset_names="${dataset_names}"
|
@ -0,0 +1,10 @@
|
||||
#!/bin/bash
# Launch ${BIN_DIR}/train.py on the GPU with a single process.
#
# Usage: ./local/train.sh <preprocess_path> <train_output_path>
#   $1  preprocess_path    - passed to train.py as --data
#   $2  train_output_path  - passed to train.py as --output
#
# BIN_DIR must already be set in the environment (path.sh exports it).

preprocess_path=$1
train_output_path=$2

# Expansions are quoted so paths containing spaces survive word splitting.
python3 "${BIN_DIR}/train.py" \
    --data="${preprocess_path}" \
    --output="${train_output_path}" \
    --device="gpu" \
    --nprocs=1
|
@ -0,0 +1,13 @@
|
||||
#!/bin/bash
# Environment setup for this example; meant to be sourced (see run.sh).
# MAIN_ROOT is resolved three levels above ${PWD}, so source this file from
# the example directory itself.

# Assign first and export second: `export VAR=$(cmd)` would hide a realpath
# failure behind the exit status of `export`.
MAIN_ROOT="$(realpath "${PWD}/../../../")"
export MAIN_ROOT

export PATH="${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}"
export LC_ALL=C

export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
# Append the previous PYTHONPATH only when it is non-empty, so that no empty
# entry (trailing ':') is left behind.
export PYTHONPATH="${MAIN_ROOT}${PYTHONPATH:+:${PYTHONPATH}}"

MODEL=ge2e
# Directory holding the Python entry points that the local/*.sh scripts call.
export BIN_DIR="${MAIN_ROOT}/parakeet/exps/${MODEL}"
|
@ -0,0 +1,30 @@
|
||||
#!/bin/bash
# Pipeline driver for this example: preprocess -> train -> inference.
# Run it from the example directory so that path.sh and ./local/ resolve.
# Stages numbered from ${stage} to ${stop_stage} (inclusive) are executed.

set -e
source path.sh

gpus=0
stage=0
stop_stage=100

# The unquoted `~` is expanded here because this is a variable assignment.
datasets_root=~/datasets/GE2E
preprocess_path=dump
dataset_names=librispeech_other
train_output_path=output
infer_input=infer_input
infer_output=infer_output
ckpt_name=step-10000

if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
    # prepare data
    ./local/preprocess.sh "${datasets_root}" "${preprocess_path}" "${dataset_names}" || exit 1
fi

if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    # train model
    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh "${preprocess_path}" "${train_output_path}" || exit 1
fi

if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
    # run inference with the checkpoint named by `ckpt_name`
    CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh "${infer_input}" "${infer_output}" "${train_output_path}" "${ckpt_name}" || exit 1
fi
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue