[TTS] fix some bugs of ERNIE-SAT (#2378)

* fix ernie_sat, test=tts

* fix for comments, test=tts
pull/2390/head
TianYuan 2 years ago committed by GitHub
parent ec571bb0d1
commit 80b180217d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@@ -13,9 +13,9 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
  FLAGS_fraction_of_gpu_memory_to_use=0.01 \
  python3 ${BIN_DIR}/synthesize_e2e.py \
  --task_name=synthesize \
- --wav_path=source/SSB03540307.wav\
- --old_str='请播放歌曲小苹果' \
- --new_str='歌曲真好听' \
+ --wav_path=source/SSB03540307.wav \
+ --old_str='请播放歌曲小苹果' \
+ --new_str='歌曲真好听' \
  --source_lang=zh \
  --target_lang=zh \
  --erniesat_config=${config_path} \

@@ -15,7 +15,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
  python3 ${BIN_DIR}/synthesize_e2e.py \
  --task_name=synthesize \
  --wav_path=source/p243_313.wav \
- --old_str='For that reason cover should not be given.' \
+ --old_str='For that reason cover should not be given' \
  --new_str='今天天气很好' \
  --source_lang=en \
  --target_lang=zh \
@@ -36,8 +36,8 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
  python3 ${BIN_DIR}/synthesize_e2e.py \
  --task_name=synthesize \
  --wav_path=source/SSB03540307.wav \
- --old_str='请播放歌曲小苹果' \
- --new_str="Thank you!" \
+ --old_str='请播放歌曲小苹果' \
+ --new_str="Thank you" \
  --source_lang=zh \
  --target_lang=en \
  --erniesat_config=${config_path} \

@@ -14,7 +14,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
  python3 ${BIN_DIR}/synthesize_e2e.py \
  --task_name=synthesize \
  --wav_path=source/p243_313.wav \
- --old_str='For that reason cover should not be given.' \
+ --old_str='For that reason cover should not be given' \
  --new_str='I love you very much do you love me' \
  --source_lang=en \
  --target_lang=en \
@@ -36,8 +36,8 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
  python3 ${BIN_DIR}/synthesize_e2e.py \
  --task_name=edit \
  --wav_path=source/p243_313.wav \
- --old_str='For that reason cover should not be given.' \
- --new_str='For that reason cover is not impossible to be given.' \
+ --old_str='For that reason cover should not be given' \
+ --new_str='For that reason cover is not impossible to be given' \
  --source_lang=en \
  --target_lang=en \
  --erniesat_config=${config_path} \

@@ -58,7 +58,7 @@ def _readtg(tg_path: str, lang: str='en', fs: int=24000, n_shift: int=300):
  durations[-2] += durations[-1]
  durations = durations[:-1]
- # replace ' and 'sil' with 'sp'
+ # replace '' and 'sil' with 'sp'
  phones = ['sp' if (phn == '' or phn == 'sil') else phn for phn in phones]
  if lang == 'en':
@@ -195,7 +195,7 @@ def words2phns(text: str, lang='en'):
  wrd = wrd.upper()
  if (wrd not in ds):
  wrd2phns[str(index) + '_' + wrd] = 'spn'
- phns.extend('spn')
+ phns.extend(['spn'])
  else:
  wrd2phns[str(index) + '_' + wrd] = word2phns_dict[wrd].split()
  phns.extend(word2phns_dict[wrd].split())

@@ -137,9 +137,6 @@ def prep_feats_with_dur(wav_path: str,
  new_wav = np.concatenate(
  [wav_org[:wav_left_idx], blank_wav, wav_org[wav_right_idx:]])
- # 音频是正常遮住了
- sf.write(str("mask_wav.wav"), new_wav, samplerate=fs)
  # 4. get old and new mel span to be mask
  old_span_bdy = get_span_bdy(
  mfa_start=mfa_start, mfa_end=mfa_end, span_to_repl=span_to_repl)
@@ -274,7 +271,8 @@ def get_wav(wav_path: str,
  new_str: str='',
  duration_adjust: bool=True,
  fs: int=24000,
- n_shift: int=300):
+ n_shift: int=300,
+ task_name: str='synthesize'):
  outs = get_mlm_output(
  wav_path=wav_path,
@@ -298,9 +296,11 @@ def get_wav(wav_path: str,
  alt_wav = np.squeeze(alt_wav)
  old_time_bdy = [n_shift * x for x in old_span_bdy]
- wav_replaced = np.concatenate(
- [wav_org[:old_time_bdy[0]], alt_wav, wav_org[old_time_bdy[1]:]])
+ if task_name == 'edit':
+ wav_replaced = np.concatenate(
+ [wav_org[:old_time_bdy[0]], alt_wav, wav_org[old_time_bdy[1]:]])
+ else:
+ wav_replaced = alt_wav
  wav_dict = {"origin": wav_org, "output": wav_replaced}
  return wav_dict
@@ -356,7 +356,11 @@ def parse_args():
  "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
  # ernie sat related
- parser.add_argument("--task_name", type=str, help="task name")
+ parser.add_argument(
+ "--task_name",
+ type=str,
+ choices=['edit', 'synthesize'],
+ help="task name.")
  parser.add_argument("--wav_path", type=str, help="path of old wav")
  parser.add_argument("--old_str", type=str, help="old string")
  parser.add_argument("--new_str", type=str, help="new string")
@@ -410,10 +414,9 @@ if __name__ == '__main__':
  if args.task_name == 'edit':
  new_str = new_str
  elif args.task_name == 'synthesize':
- new_str = old_str + new_str
+ new_str = old_str + ' ' + new_str
  else:
- new_str = old_str + new_str
- print("new_str:", new_str)
+ new_str = old_str + ' ' + new_str
  # Extractor
  mel_extractor = LogMelFBank(
@@ -467,7 +470,8 @@ if __name__ == '__main__':
  new_str=new_str,
  duration_adjust=args.duration_adjust,
  fs=erniesat_config.fs,
- n_shift=erniesat_config.n_shift)
+ n_shift=erniesat_config.n_shift,
+ task_name=args.task_name)
  sf.write(
  args.output_name, wav_dict['output'], samplerate=erniesat_config.fs)

Loading…
Cancel
Save