From 80b180217df310b8738c06577c88965bab38f160 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Wed, 14 Sep 2022 10:37:03 +0800 Subject: [PATCH] [TTS] fix some bugs of ERNIE-SAT (#2378) * fix ernie_sat, test=tts * fix for comments, test=tts --- .../ernie_sat/local/synthesize_e2e.sh | 6 ++-- .../ernie_sat/local/synthesize_e2e.sh | 6 ++-- .../vctk/ernie_sat/local/synthesize_e2e.sh | 6 ++-- paddlespeech/t2s/exps/ernie_sat/align.py | 4 +-- .../t2s/exps/ernie_sat/synthesize_e2e.py | 28 +++++++++++-------- 5 files changed, 27 insertions(+), 23 deletions(-) diff --git a/examples/aishell3/ernie_sat/local/synthesize_e2e.sh b/examples/aishell3/ernie_sat/local/synthesize_e2e.sh index b33e8ca0..77b353b5 100755 --- a/examples/aishell3/ernie_sat/local/synthesize_e2e.sh +++ b/examples/aishell3/ernie_sat/local/synthesize_e2e.sh @@ -13,9 +13,9 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then FLAGS_fraction_of_gpu_memory_to_use=0.01 \ python3 ${BIN_DIR}/synthesize_e2e.py \ --task_name=synthesize \ - --wav_path=source/SSB03540307.wav\ - --old_str='请播放歌曲小苹果。' \ - --new_str='歌曲真好听。' \ + --wav_path=source/SSB03540307.wav \ + --old_str='请播放歌曲小苹果' \ + --new_str='歌曲真好听' \ --source_lang=zh \ --target_lang=zh \ --erniesat_config=${config_path} \ diff --git a/examples/aishell3_vctk/ernie_sat/local/synthesize_e2e.sh b/examples/aishell3_vctk/ernie_sat/local/synthesize_e2e.sh index c30af6e8..446ac879 100755 --- a/examples/aishell3_vctk/ernie_sat/local/synthesize_e2e.sh +++ b/examples/aishell3_vctk/ernie_sat/local/synthesize_e2e.sh @@ -15,7 +15,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then python3 ${BIN_DIR}/synthesize_e2e.py \ --task_name=synthesize \ --wav_path=source/p243_313.wav \ - --old_str='For that reason cover should not be given.' \ + --old_str='For that reason cover should not be given' \ --new_str='今天天气很好' \ --source_lang=en \ --target_lang=zh \ @@ -36,8 +36,8 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 ${BIN_DIR}/synthesize_e2e.py \ --task_name=synthesize \ --wav_path=source/SSB03540307.wav \ - --old_str='请播放歌曲小苹果。' \ - --new_str="Thank you!" \ + --old_str='请播放歌曲小苹果' \ + --new_str="Thank you" \ --source_lang=zh \ --target_lang=en \ --erniesat_config=${config_path} \ diff --git a/examples/vctk/ernie_sat/local/synthesize_e2e.sh b/examples/vctk/ernie_sat/local/synthesize_e2e.sh index fee54016..dcc71044 100755 --- a/examples/vctk/ernie_sat/local/synthesize_e2e.sh +++ b/examples/vctk/ernie_sat/local/synthesize_e2e.sh @@ -14,7 +14,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then python3 ${BIN_DIR}/synthesize_e2e.py \ --task_name=synthesize \ --wav_path=source/p243_313.wav \ - --old_str='For that reason cover should not be given.' \ + --old_str='For that reason cover should not be given' \ --new_str='I love you very much do you love me' \ --source_lang=en \ --target_lang=en \ @@ -36,8 +36,8 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 ${BIN_DIR}/synthesize_e2e.py \ --task_name=edit \ --wav_path=source/p243_313.wav \ - --old_str='For that reason cover should not be given.' \ - --new_str='For that reason cover is not impossible to be given.' \ + --old_str='For that reason cover should not be given' \ + --new_str='For that reason cover is not impossible to be given' \ --source_lang=en \ --target_lang=en \ --erniesat_config=${config_path} \ diff --git a/paddlespeech/t2s/exps/ernie_sat/align.py b/paddlespeech/t2s/exps/ernie_sat/align.py index 464f51a3..8dbe685f 100755 --- a/paddlespeech/t2s/exps/ernie_sat/align.py +++ b/paddlespeech/t2s/exps/ernie_sat/align.py @@ -58,7 +58,7 @@ def _readtg(tg_path: str, lang: str='en', fs: int=24000, n_shift: int=300): durations[-2] += durations[-1] durations = durations[:-1] - # replace ' and 'sil' with 'sp' + # replace '' and 'sil' with 'sp' phones = ['sp' if (phn == '' or phn == 'sil') else phn for phn in phones] if lang == 'en': @@ -195,7 +195,7 @@ def words2phns(text: str, lang='en'): wrd = wrd.upper() if (wrd not in ds): wrd2phns[str(index) + '_' + wrd] = 'spn' - phns.extend('spn') + phns.extend(['spn']) else: wrd2phns[str(index) + '_' + wrd] = word2phns_dict[wrd].split() phns.extend(word2phns_dict[wrd].split()) diff --git a/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py b/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py index 21c9ae04..e450aa1a 100644 --- a/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py @@ -137,9 +137,6 @@ def prep_feats_with_dur(wav_path: str, new_wav = np.concatenate( [wav_org[:wav_left_idx], blank_wav, wav_org[wav_right_idx:]]) - # 音频是正常遮住了 - sf.write(str("mask_wav.wav"), new_wav, samplerate=fs) - # 4. get old and new mel span to be mask old_span_bdy = get_span_bdy( mfa_start=mfa_start, mfa_end=mfa_end, span_to_repl=span_to_repl) @@ -274,7 +271,8 @@ def get_wav(wav_path: str, new_str: str='', duration_adjust: bool=True, fs: int=24000, - n_shift: int=300): + n_shift: int=300, + task_name: str='synthesize'): outs = get_mlm_output( wav_path=wav_path, @@ -298,9 +296,11 @@ def get_wav(wav_path: str, alt_wav = np.squeeze(alt_wav) old_time_bdy = [n_shift * x for x in old_span_bdy] - wav_replaced = np.concatenate( - [wav_org[:old_time_bdy[0]], alt_wav, wav_org[old_time_bdy[1]:]]) - + if task_name == 'edit': + wav_replaced = np.concatenate( + [wav_org[:old_time_bdy[0]], alt_wav, wav_org[old_time_bdy[1]:]]) + else: + wav_replaced = alt_wav wav_dict = {"origin": wav_org, "output": wav_replaced} return wav_dict @@ -356,7 +356,11 @@ def parse_args(): "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") # ernie sat related - parser.add_argument("--task_name", type=str, help="task name") + parser.add_argument( + "--task_name", + type=str, + choices=['edit', 'synthesize'], + help="task name.") parser.add_argument("--wav_path", type=str, help="path of old wav") parser.add_argument("--old_str", type=str, help="old string") parser.add_argument("--new_str", type=str, help="new string") @@ -410,10 +414,9 @@ if __name__ == '__main__': if args.task_name == 'edit': new_str = new_str elif args.task_name == 'synthesize': - new_str = old_str + new_str + new_str = old_str + ' ' + new_str else: - new_str = old_str + new_str - print("new_str:", new_str) + new_str = old_str + ' ' + new_str # Extractor mel_extractor = LogMelFBank( @@ -467,7 +470,8 @@ if __name__ == '__main__': new_str=new_str, duration_adjust=args.duration_adjust, fs=erniesat_config.fs, - n_shift=erniesat_config.n_shift) + n_shift=erniesat_config.n_shift, + task_name=args.task_name) sf.write( args.output_name, wav_dict['output'], samplerate=erniesat_config.fs)