diff --git a/examples/canton/tts3/README.md b/examples/canton/tts3/README.md index 10c36e227..f4acd131c 100644 --- a/examples/canton/tts3/README.md +++ b/examples/canton/tts3/README.md @@ -14,7 +14,7 @@ cp -r ~/datasets/Guangzhou_Cantonese_Scripted_Speech_Corpus_in_Vehicle/WAV/* ~/d After that, it should be look like: ``` -~/datasets/canton_all_ +~/datasets/canton_all │ └── WAV │ └──G0001 │ └──G0002 @@ -72,7 +72,7 @@ The dataset is split into 3 parts, namely `train`, `dev`, and` test`, each of wh Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains phones, text_lengths, speech_lengths, durations, the path of speech features, the path of pitch features, a path of energy features, speaker, and id of each utterance. -### Training details can refer to the script of examples/aishell3/tts3. +### For training details, refer to the script of [examples/aishell3/tts3](../../aishell3/tts3). ## Pretrained Model(Waiting========) Pretrained FastSpeech2 model with no silence in the edge of audios: diff --git a/examples/canton/tts3/conf/default.yaml b/examples/canton/tts3/conf/default.yaml index c1921b790..a101e6eea 100644 --- a/examples/canton/tts3/conf/default.yaml +++ b/examples/canton/tts3/conf/default.yaml @@ -16,6 +16,9 @@ fmax: 7600 # Maximum frequency of Mel basis. n_mels: 80 # The number of mel basis. # Only used for the model using pitch features (e.g. FastSpeech2) +# The canton datasets we use are different from others like Databaker or LJSpeech, +# so we set f0min to 110 to avoid too many zero-pitch problems. +# Reference: https://github.com/JeremyCCHsu/Python-Wrapper-for-World-Vocoder/issues/38 f0min: 110 # Minimum f0 for pitch extraction. f0max: 400 # Maximum f0 for pitch extraction. 
diff --git a/examples/canton/tts3/local/preprocess.sh b/examples/canton/tts3/local/preprocess.sh index a7afaa1ad..f70b1c028 100755 --- a/examples/canton/tts3/local/preprocess.sh +++ b/examples/canton/tts3/local/preprocess.sh @@ -1,6 +1,6 @@ #!/bin/bash -stage=1 +stage=0 stop_stage=100 config_path=$1 diff --git a/examples/canton/tts3/local/synthesize.sh b/examples/canton/tts3/local/synthesize.sh deleted file mode 100755 index dbbe7fdac..000000000 --- a/examples/canton/tts3/local/synthesize.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -stage=0 -stop_stage=0 - -# pwgan -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - FLAGS_allocator_strategy=naive_best_fit \ - FLAGS_fraction_of_gpu_memory_to_use=0.01 \ - python3 ${BIN_DIR}/../synthesize.py \ - --am=fastspeech2_aishell3 \ - --am_config=${config_path} \ - --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ - --am_stat=dump/train/speech_stats.npy \ - --voc=pwgan_aishell3 \ - --voc_config=pwg_aishell3_ckpt_0.5/default.yaml \ - --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \ - --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \ - --test_metadata=dump/test/norm/metadata.jsonl \ - --output_dir=${train_output_path}/test_new \ - --phones_dict=dump/phone_id_map.txt \ - --speaker_dict=dump/speaker_id_map.txt -fi - -# hifigan -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - FLAGS_allocator_strategy=naive_best_fit \ - FLAGS_fraction_of_gpu_memory_to_use=0.01 \ - python3 ${BIN_DIR}/../synthesize.py \ - --am=fastspeech2_aishell3 \ - --am_config=${config_path} \ - --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ - --am_stat=dump/train/speech_stats.npy \ - --voc=hifigan_aishell3 \ - --voc_config=hifigan_aishell3_ckpt_0.2.0/default.yaml \ - --voc_ckpt=hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \ - --voc_stat=hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \ - --test_metadata=dump/test/norm/metadata.jsonl \ - 
--output_dir=${train_output_path}/test \ - --phones_dict=dump/phone_id_map.txt \ - --speaker_dict=dump/speaker_id_map.txt -fi - diff --git a/examples/canton/tts3/local/synthesize.sh b/examples/canton/tts3/local/synthesize.sh new file mode 120000 index 000000000..5f8bb91fc --- /dev/null +++ b/examples/canton/tts3/local/synthesize.sh @@ -0,0 +1 @@ +../../../csmsc/tts3/local/synthesize.sh \ No newline at end of file diff --git a/examples/canton/tts3/local/train.sh b/examples/canton/tts3/local/train.sh deleted file mode 100755 index 1da72f117..000000000 --- a/examples/canton/tts3/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=2 \ - --phones-dict=dump/phone_id_map.txt \ - --speaker-dict=dump/speaker_id_map.txt diff --git a/examples/canton/tts3/local/train.sh b/examples/canton/tts3/local/train.sh new file mode 120000 index 000000000..d7b05058e --- /dev/null +++ b/examples/canton/tts3/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/tts3/local/train.sh \ No newline at end of file diff --git a/examples/canton/tts3/path.sh b/examples/canton/tts3/path.sh deleted file mode 100755 index fb7e8411c..000000000 --- a/examples/canton/tts3/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=fastspeech2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/canton/tts3/path.sh b/examples/canton/tts3/path.sh new file mode 120000 index 000000000..4785b9095 --- /dev/null +++ b/examples/canton/tts3/path.sh @@ 
-0,0 +1 @@ +../../csmsc/tts3/path.sh \ No newline at end of file diff --git a/paddlespeech/t2s/datasets/get_feats.py b/paddlespeech/t2s/datasets/get_feats.py index 21458f152..a90f1a417 100644 --- a/paddlespeech/t2s/datasets/get_feats.py +++ b/paddlespeech/t2s/datasets/get_feats.py @@ -102,7 +102,7 @@ class Pitch(): def _convert_to_continuous_f0(self, f0: np.ndarray) -> np.ndarray: if (f0 == 0).all(): - print("All frames seems to be unvoiced.") + print("All frames seems to be unvoiced, this utt will be removed.") return f0 # padding start and end of f0 sequence diff --git a/paddlespeech/t2s/exps/fastspeech2/preprocess.py b/paddlespeech/t2s/exps/fastspeech2/preprocess.py index 04f79b11a..521b9a880 100644 --- a/paddlespeech/t2s/exps/fastspeech2/preprocess.py +++ b/paddlespeech/t2s/exps/fastspeech2/preprocess.py @@ -109,6 +109,8 @@ def process_sentence(config: Dict[str, Any], np.save(mel_path, logmel) # extract pitch and energy f0 = pitch_extractor.get_pitch(wav, duration=np.array(durations)) + if (f0 == 0).all(): + return None assert f0.shape[0] == len(durations) f0_dir = output_dir / "data_pitch" f0_dir.mkdir(parents=True, exist_ok=True)