Cantonese FastSpeech2 Training, test=tts

pull/2907/head
WongLaw 3 years ago
parent 599f3253b7
commit 54629d340a

@@ -14,7 +14,7 @@ cp -r ~/datasets/Guangzhou_Cantonese_Scripted_Speech_Corpus_in_Vehicle/WAV/* ~/d
 After that, it should look like:
 ```
-~/datasets/canton_all_
+~/datasets/canton_all
 │   └── WAV
 │       └── G0001
 │       └── G0002
@@ -72,7 +72,7 @@ The dataset is split into 3 parts, namely `train`, `dev`, and `test`, each of wh
 Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains phones, text_lengths, speech_lengths, durations, the path of speech features, the path of pitch features, the path of energy features, the speaker, and the id of each utterance.
-### Training details can refer to the script of examples/aishell3/tts3.
+### For training details, refer to the script of [examples/aishell3/tts3](../../aishell3/tts3).
 ## Pretrained Model (to be released)
 Pretrained FastSpeech2 model with no silence at the edges of audios:
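As a side note, `metadata.jsonl` holds one JSON object per line, so it can be read with any JSON-lines reader. A minimal sketch, not the recipe's own loader; the `jsonlines` package and the `utt_id` key name are assumptions here:

```
# Hedged sketch: iterate over the per-utterance records in metadata.jsonl.
import jsonlines

with jsonlines.open("dump/train/norm/metadata.jsonl") as reader:
    for record in reader:
        # Keys assumed from the description above; "utt_id" is a guess.
        print(record["utt_id"], record["speaker"], record["speech_lengths"])
```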

@@ -16,6 +16,9 @@ fmax: 7600 # Maximum frequency of Mel basis.
 n_mels: 80 # The number of mel basis.
 # Only used for the model using pitch features (e.g. FastSpeech2)
+# The Cantonese dataset we use differs from others such as Databaker or LJSpeech,
+# so we set f0min to 110 to avoid the zero-pitch problem (too many frames with f0 == 0).
+# Reference: https://github.com/JeremyCCHsu/Python-Wrapper-for-World-Vocoder/issues/38
 f0min: 110 # Minimum f0 for pitch extraction.
 f0max: 400 # Maximum f0 for pitch extraction.
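For context on the f0 floor: the WORLD extractor behind the linked issue returns 0 for frames it judges unvoiced, and a floor far below the speakers' actual range makes that happen on many voiced frames. A hedged sketch with `pyworld`; the sample rate is an assumption, not taken from this config:

```
# Hedged sketch: WORLD (pyworld) pitch extraction with the bounds above.
# Frames that DIO judges unvoiced come back as f0 == 0; a too-low
# f0_floor inflates their number, which is the zero-pitch problem.
import numpy as np
import pyworld as pw

def extract_f0(wav: np.ndarray, sr: int = 24000) -> np.ndarray:
    x = wav.astype(np.float64)
    f0, t = pw.dio(x, sr, f0_floor=110.0, f0_ceil=400.0)  # f0min / f0max
    return pw.stonemask(x, f0, t, sr)  # refine the coarse DIO estimate
```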

@@ -1,6 +1,6 @@
 #!/bin/bash
-stage=1
+stage=0
 stop_stage=100
 config_path=$1

@@ -1,47 +0,0 @@
-#!/bin/bash
-config_path=$1
-train_output_path=$2
-ckpt_name=$3
-stage=0
-stop_stage=0
-# pwgan
-if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
-    FLAGS_allocator_strategy=naive_best_fit \
-    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-    python3 ${BIN_DIR}/../synthesize.py \
-        --am=fastspeech2_aishell3 \
-        --am_config=${config_path} \
-        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
-        --am_stat=dump/train/speech_stats.npy \
-        --voc=pwgan_aishell3 \
-        --voc_config=pwg_aishell3_ckpt_0.5/default.yaml \
-        --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
-        --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
-        --test_metadata=dump/test/norm/metadata.jsonl \
-        --output_dir=${train_output_path}/test_new \
-        --phones_dict=dump/phone_id_map.txt \
-        --speaker_dict=dump/speaker_id_map.txt
-fi
-# hifigan
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-    FLAGS_allocator_strategy=naive_best_fit \
-    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-    python3 ${BIN_DIR}/../synthesize.py \
-        --am=fastspeech2_aishell3 \
-        --am_config=${config_path} \
-        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
-        --am_stat=dump/train/speech_stats.npy \
-        --voc=hifigan_aishell3 \
-        --voc_config=hifigan_aishell3_ckpt_0.2.0/default.yaml \
-        --voc_ckpt=hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \
-        --voc_stat=hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
-        --test_metadata=dump/test/norm/metadata.jsonl \
-        --output_dir=${train_output_path}/test \
-        --phones_dict=dump/phone_id_map.txt \
-        --speaker_dict=dump/speaker_id_map.txt
-fi

@@ -0,0 +1 @@
+../../../csmsc/tts3/local/synthesize.sh

@@ -1,13 +0,0 @@
-#!/bin/bash
-config_path=$1
-train_output_path=$2
-python3 ${BIN_DIR}/train.py \
-    --train-metadata=dump/train/norm/metadata.jsonl \
-    --dev-metadata=dump/dev/norm/metadata.jsonl \
-    --config=${config_path} \
-    --output-dir=${train_output_path} \
-    --ngpu=2 \
-    --phones-dict=dump/phone_id_map.txt \
-    --speaker-dict=dump/speaker_id_map.txt

@@ -0,0 +1 @@
+../../../csmsc/tts3/local/train.sh

@@ -1,13 +0,0 @@
-#!/bin/bash
-export MAIN_ROOT=`realpath ${PWD}/../../../`
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
-export LC_ALL=C
-export PYTHONDONTWRITEBYTECODE=1
-# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8
-export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
-MODEL=fastspeech2
-export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}

@@ -0,0 +1 @@
+../../csmsc/tts3/path.sh

@@ -102,7 +102,7 @@ class Pitch():
     def _convert_to_continuous_f0(self, f0: np.ndarray) -> np.ndarray:
         if (f0 == 0).all():
-            print("All frames seems to be unvoiced.")
+            print("All frames seem to be unvoiced; this utt will be removed.")
             return f0
         # padding start and end of f0 sequence
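For readers unfamiliar with this helper: it turns the zero-valued unvoiced frames into a continuous contour by interpolating between voiced frames, and an all-zero track is returned unchanged (such utterances are dropped later, per the new message). A minimal sketch of the idea, not the class's exact implementation:

```
# Hedged sketch: fill unvoiced frames (f0 == 0) by linear interpolation.
import numpy as np

def continuous_f0(f0: np.ndarray) -> np.ndarray:
    if (f0 == 0).all():
        return f0  # nothing to interpolate; the caller drops this utt
    voiced = np.nonzero(f0)[0]
    # np.interp holds the first/last voiced value at the edges, which
    # plays the role of the start/end padding mentioned above.
    return np.interp(np.arange(len(f0)), voiced, f0[voiced])
```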

@@ -109,6 +109,8 @@ def process_sentence(config: Dict[str, Any],
     np.save(mel_path, logmel)
     # extract pitch and energy
     f0 = pitch_extractor.get_pitch(wav, duration=np.array(durations))
+    if (f0 == 0).all():
+        return None
     assert f0.shape[0] == len(durations)
     f0_dir = output_dir / "data_pitch"
     f0_dir.mkdir(parents=True, exist_ok=True)
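With the new early return, a fully unvoiced utterance yields `None` instead of features, so the caller only has to filter `None` out of its results. A self-contained toy illustrating that pattern; all names here are illustrative, not the recipe's:

```
# Hedged sketch of the filtering pattern the new guard enables.
import numpy as np

def process(f0: np.ndarray):
    if (f0 == 0).all():
        return None  # mirrors the early return in process_sentence
    return {"f0_mean": float(f0[f0 > 0].mean())}

tracks = [np.zeros(3), np.array([0.0, 120.0, 125.0])]
records = [r for r in (process(t) for t in tracks) if r is not None]
print(records)  # only the voiced utterance survives
```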
