From 80b180217df310b8738c06577c88965bab38f160 Mon Sep 17 00:00:00 2001
From: TianYuan <white-sky@qq.com>
Date: Wed, 14 Sep 2022 10:37:03 +0800
Subject: [PATCH] [TTS] fix some bugs of ERNIE-SAT (#2378)

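* fix ernie_sat, test=tts

  - examples: drop the trailing punctuation from the old_str/new_str
    sample sentences and add the missing space before the line
    continuation after the wav_path argument
  - align.py: append 'spn' as a single phone (list.extend on a bare
    string added 's', 'p', 'n' as separate items); fix the comment typo
  - synthesize_e2e.py: remove the debug mask_wav.wav dump and the
    new_str print; add a task_name argument to get_wav so that only
    'edit' splices the generated audio back into the original wav,
    while other tasks return the generated audio alone; restrict the
    task_name option to 'edit' / 'synthesize'; join old_str and
    new_str with a space

* fix for comments, test=tts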
---
 .../ernie_sat/local/synthesize_e2e.sh         |  6 ++--
 .../ernie_sat/local/synthesize_e2e.sh         |  6 ++--
 .../vctk/ernie_sat/local/synthesize_e2e.sh    |  6 ++--
 paddlespeech/t2s/exps/ernie_sat/align.py      |  4 +--
 .../t2s/exps/ernie_sat/synthesize_e2e.py      | 28 +++++++++++--------
 5 files changed, 27 insertions(+), 23 deletions(-)

diff --git a/examples/aishell3/ernie_sat/local/synthesize_e2e.sh b/examples/aishell3/ernie_sat/local/synthesize_e2e.sh
index b33e8ca0..77b353b5 100755
--- a/examples/aishell3/ernie_sat/local/synthesize_e2e.sh
+++ b/examples/aishell3/ernie_sat/local/synthesize_e2e.sh
@@ -13,9 +13,9 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     FLAGS_fraction_of_gpu_memory_to_use=0.01 \
     python3 ${BIN_DIR}/synthesize_e2e.py \
         --task_name=synthesize \
-        --wav_path=source/SSB03540307.wav\
-        --old_str='请播放歌曲小苹果。' \
-        --new_str='歌曲真好听。' \
+        --wav_path=source/SSB03540307.wav \
+        --old_str='请播放歌曲小苹果' \
+        --new_str='歌曲真好听' \
         --source_lang=zh \
         --target_lang=zh \
         --erniesat_config=${config_path} \
diff --git a/examples/aishell3_vctk/ernie_sat/local/synthesize_e2e.sh b/examples/aishell3_vctk/ernie_sat/local/synthesize_e2e.sh
index c30af6e8..446ac879 100755
--- a/examples/aishell3_vctk/ernie_sat/local/synthesize_e2e.sh
+++ b/examples/aishell3_vctk/ernie_sat/local/synthesize_e2e.sh
@@ -15,7 +15,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     python3 ${BIN_DIR}/synthesize_e2e.py \
         --task_name=synthesize \
         --wav_path=source/p243_313.wav \
-        --old_str='For that reason cover should not be given.' \
+        --old_str='For that reason cover should not be given' \
         --new_str='今天天气很好' \
         --source_lang=en \
         --target_lang=zh \
@@ -36,8 +36,8 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     python3 ${BIN_DIR}/synthesize_e2e.py \
         --task_name=synthesize \
         --wav_path=source/SSB03540307.wav \
-        --old_str='请播放歌曲小苹果。' \
-        --new_str="Thank you!" \
+        --old_str='请播放歌曲小苹果' \
+        --new_str="Thank you" \
         --source_lang=zh \
         --target_lang=en \
         --erniesat_config=${config_path} \
diff --git a/examples/vctk/ernie_sat/local/synthesize_e2e.sh b/examples/vctk/ernie_sat/local/synthesize_e2e.sh
index fee54016..dcc71044 100755
--- a/examples/vctk/ernie_sat/local/synthesize_e2e.sh
+++ b/examples/vctk/ernie_sat/local/synthesize_e2e.sh
@@ -14,7 +14,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     python3 ${BIN_DIR}/synthesize_e2e.py \
         --task_name=synthesize \
         --wav_path=source/p243_313.wav \
-        --old_str='For that reason cover should not be given.' \
+        --old_str='For that reason cover should not be given' \
         --new_str='I love you very much do you love me' \
         --source_lang=en \
         --target_lang=en \
@@ -36,8 +36,8 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     python3 ${BIN_DIR}/synthesize_e2e.py \
         --task_name=edit \
         --wav_path=source/p243_313.wav \
-        --old_str='For that reason cover should not be given.' \
-        --new_str='For that reason cover is not impossible to be given.' \
+        --old_str='For that reason cover should not be given' \
+        --new_str='For that reason cover is not impossible to be given' \
         --source_lang=en \
         --target_lang=en \
         --erniesat_config=${config_path} \
diff --git a/paddlespeech/t2s/exps/ernie_sat/align.py b/paddlespeech/t2s/exps/ernie_sat/align.py
index 464f51a3..8dbe685f 100755
--- a/paddlespeech/t2s/exps/ernie_sat/align.py
+++ b/paddlespeech/t2s/exps/ernie_sat/align.py
@@ -58,7 +58,7 @@ def _readtg(tg_path: str, lang: str='en', fs: int=24000, n_shift: int=300):
         durations[-2] += durations[-1]
         durations = durations[:-1]
 
-    # replace ' and 'sil' with 'sp'
+    # replace '' and 'sil' with 'sp'
     phones = ['sp' if (phn == '' or phn == 'sil') else phn for phn in phones]
 
     if lang == 'en':
@@ -195,7 +195,7 @@ def words2phns(text: str, lang='en'):
             wrd = wrd.upper()
         if (wrd not in ds):
             wrd2phns[str(index) + '_' + wrd] = 'spn'
-            phns.extend('spn')
+            phns.extend(['spn'])
         else:
             wrd2phns[str(index) + '_' + wrd] = word2phns_dict[wrd].split()
             phns.extend(word2phns_dict[wrd].split())
diff --git a/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py b/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py
index 21c9ae04..e450aa1a 100644
--- a/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py
@@ -137,9 +137,6 @@ def prep_feats_with_dur(wav_path: str,
     new_wav = np.concatenate(
         [wav_org[:wav_left_idx], blank_wav, wav_org[wav_right_idx:]])
 
-    # 音频是正常遮住了
-    sf.write(str("mask_wav.wav"), new_wav, samplerate=fs)
-
     # 4. get old and new mel span to be mask
     old_span_bdy = get_span_bdy(
         mfa_start=mfa_start, mfa_end=mfa_end, span_to_repl=span_to_repl)
@@ -274,7 +271,8 @@ def get_wav(wav_path: str,
             new_str: str='',
             duration_adjust: bool=True,
             fs: int=24000,
-            n_shift: int=300):
+            n_shift: int=300,
+            task_name: str='synthesize'):
 
     outs = get_mlm_output(
         wav_path=wav_path,
@@ -298,9 +296,11 @@ def get_wav(wav_path: str,
     alt_wav = np.squeeze(alt_wav)
 
     old_time_bdy = [n_shift * x for x in old_span_bdy]
-    wav_replaced = np.concatenate(
-        [wav_org[:old_time_bdy[0]], alt_wav, wav_org[old_time_bdy[1]:]])
-
+    if task_name == 'edit':
+        wav_replaced = np.concatenate(
+            [wav_org[:old_time_bdy[0]], alt_wav, wav_org[old_time_bdy[1]:]])
+    else:
+        wav_replaced = alt_wav
     wav_dict = {"origin": wav_org, "output": wav_replaced}
     return wav_dict
 
@@ -356,7 +356,11 @@ def parse_args():
         "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
 
     # ernie sat related
-    parser.add_argument("--task_name", type=str, help="task name")
+    parser.add_argument(
+        "--task_name",
+        type=str,
+        choices=['edit', 'synthesize'],
+        help="task name.")
     parser.add_argument("--wav_path", type=str, help="path of old wav")
     parser.add_argument("--old_str", type=str, help="old string")
     parser.add_argument("--new_str", type=str, help="new string")
@@ -410,10 +414,9 @@ if __name__ == '__main__':
     if args.task_name == 'edit':
         new_str = new_str
     elif args.task_name == 'synthesize':
-        new_str = old_str + new_str
+        new_str = old_str + ' ' + new_str
     else:
-        new_str = old_str + new_str
-    print("new_str:", new_str)
+        new_str = old_str + ' ' + new_str
 
     # Extractor
     mel_extractor = LogMelFBank(
@@ -467,7 +470,8 @@ if __name__ == '__main__':
         new_str=new_str,
         duration_adjust=args.duration_adjust,
         fs=erniesat_config.fs,
-        n_shift=erniesat_config.n_shift)
+        n_shift=erniesat_config.n_shift,
+        task_name=args.task_name)
 
     sf.write(
         args.output_name, wav_dict['output'], samplerate=erniesat_config.fs)