fix voice cloning of vits.

pull/2268/head
艾梦 3 years ago
parent 2ebe04f9d3
commit 1450e74b4f

@@ -122,13 +122,13 @@ ref_audio
 `./local/voice_cloning.sh` calls `${BIN_DIR}/voice_cloning.py`
 ```bash
-CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${ref_audio_dir}
+CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${add_blank} ${ref_audio_dir}
 ```
 If you want to convert the voice of a source audio file to that of the reference speaker, run:
 ```bash
-CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${ref_audio_dir} ${src_audio_path}
+CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${add_blank} ${ref_audio_dir} ${src_audio_path}
 ```
 ## Pretrained Model
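The README change reorders the positional arguments so that `add_blank` comes before `ref_audio_dir`, keeping the optional `src_audio_path` in last position. A minimal sketch of assembling the documented call from Python, just to make the new order explicit; the values mirror the defaults in the run script below, except `ge2e_params_path`, which is a hypothetical placeholder:

```python
import os
import subprocess

conf_path = "conf/default.yaml"
train_output_path = "exp/default"
ckpt_name = "snapshot_iter_153.pdz"
ge2e_params_path = "ge2e_ckpt/step-3000000.pdparams"  # placeholder path
add_blank = "true"
ref_audio_dir = "ref_audio"
src_audio_path = ""  # empty: plain voice cloning, no conversion

args = [
    "./local/voice_cloning.sh",
    conf_path, train_output_path, ckpt_name,
    ge2e_params_path,
    add_blank,       # $5: now passed before the reference dir
    ref_audio_dir,   # $6
]
if src_audio_path:   # $7 is optional and stays last
    args.append(src_audio_path)

subprocess.run(args, check=True,
               env={**os.environ, "CUDA_VISIBLE_DEVICES": "0"})
```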

@@ -4,8 +4,8 @@ config_path=$1
 train_output_path=$2
 ckpt_name=$3
 ge2e_params_path=$4
-ref_audio_dir=$5
-add_blank=$6
+add_blank=$5
+ref_audio_dir=$6
 src_audio_path=$7
 FLAGS_allocator_strategy=naive_best_fit \
@@ -14,9 +14,9 @@ python3 ${BIN_DIR}/voice_cloning.py \
     --config=${config_path} \
     --ckpt=${train_output_path}/checkpoints/${ckpt_name} \
     --ge2e_params_path=${ge2e_params_path} \
-    --phones_dict=dump/phone_id_map.txt \
     --text="凯莫瑞安联合体的经济崩溃迫在眉睫。" \
     --audio-path=${src_audio_path} \
     --input-dir=${ref_audio_dir} \
     --output-dir=${train_output_path}/vc_syn \
+    --phones-dict=dump/phone_id_map.txt \
     --add-blank=${add_blank}
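Besides moving the option, this hunk renames `--phones_dict` to `--phones-dict`, matching the dashed spelling that the script's argument parser accepts; argparse does not treat dashes and underscores as interchangeable on the command line, but it does convert dashes to underscores in the resulting attribute name. A small self-contained demonstration (the boolean conversion lambda is an assumption, not the project's actual helper):

```python
import argparse

parser = argparse.ArgumentParser()
# The dashed flag is what the CLI accepts; "--phones_dict" would be rejected.
parser.add_argument("--phones-dict", type=str, help="phone vocabulary file")
parser.add_argument("--add-blank", type=lambda s: s.lower() == "true",
                    default=True)

args = parser.parse_args(
    ["--phones-dict", "dump/phone_id_map.txt", "--add-blank", "true"])

# ...but argparse exposes the value with underscores in Python code.
print(args.phones_dict)  # dump/phone_id_map.txt
print(args.add_blank)    # True
```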

@@ -11,6 +11,7 @@ conf_path=conf/default.yaml
 train_output_path=exp/default
 ckpt_name=snapshot_iter_153.pdz
 add_blank=true
+ref_audio_dir=ref_audio
 src_audio_path=''
 # do not include ".pdparams" here
@@ -40,5 +41,5 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} \
-        ${ge2e_params_path} ${ref_audio_dir} ${add_blank} ${src_audio_path} || exit -1
+        ${ge2e_params_path} ${add_blank} ${ref_audio_dir} ${src_audio_path} || exit -1
 fi

@@ -52,9 +52,8 @@ def evaluate(args):
         spk_num = len(spk_id)
         fields += ["spk_id"]
     elif args.voice_cloning:
-        print("Training voice cloning!")
+        print("Evaluating voice cloning!")
         fields += ["spk_emb"]
-        converters["spk_emb"] = np.load
     else:
         print("single speaker vits!")
     print("spk_num:", spk_num)

@@ -102,7 +102,7 @@ def voice_cloning(args):
         phone_ids = input_ids["phone_ids"][0]
     else:
         wav, _ = librosa.load(str(args.audio_path), sr=config.fs)
-        feats = spec_extractor.get_linear_spectrogram(wav)
+        feats = paddle.to_tensor(spec_extractor.get_linear_spectrogram(wav))
         mel_sequences = p.extract_mel_partials(
             p.preprocess_wav(args.audio_path))
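`get_linear_spectrogram` returns a numpy array, but `feats` is later fed into `vits.voice_conversion`, whose tensor ops and layers expect a `paddle.Tensor`, hence the wrapping in `paddle.to_tensor`. The conversion preserves shape and dtype; a self-contained sketch with a random array standing in for the real spectrogram (the `(T, 513)` shape is an illustrative assumption):

```python
import numpy as np
import paddle

# Stand-in for spec_extractor.get_linear_spectrogram(wav): (frames, bins).
np_feats = np.random.randn(100, 513).astype(np.float32)

feats = paddle.to_tensor(np_feats)   # numpy -> paddle.Tensor, same shape/dtype
print(feats.shape, feats.dtype)      # [100, 513] paddle.float32

# The model-side code then batches and reorders the axes like this:
batched = feats[None].transpose([0, 2, 1])
print(batched.shape)                 # [1, 513, 100]
```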
@@ -122,10 +122,11 @@ def voice_cloning(args):
         with paddle.no_grad():
             if args.audio_path is None:
-                wav = vits.inference(text=phone_ids, spembs=spk_emb)
+                out = vits.inference(text=phone_ids, spembs=spk_emb)
             else:
-                wav = vits.voice_conversion(
+                out = vits.voice_conversion(
                     feats=feats, spembs_src=spk_emb_src, spembs_tgt=spk_emb)
+            wav = out["wav"]
         sf.write(
             str(output_dir / (utt_id + ".wav")),
@@ -138,10 +139,11 @@ def voice_cloning(args):
     utt_id = "random_spk_emb"
     with paddle.no_grad():
         if args.audio_path is None:
-            wav = vits.inference(text=phone_ids, spembs=random_spk_emb)
+            out = vits.inference(text=phone_ids, spembs=random_spk_emb)
         else:
-            wav = vits.voice_conversion(
+            out = vits.voice_conversion(
                 feats=feats, spembs_src=spk_emb_src, spembs_tgt=random_spk_emb)
+        wav = out["wav"]
     sf.write(
         str(output_dir / (utt_id + ".wav")), wav.numpy(), samplerate=config.fs)
     print(f"{utt_id} done!")

@@ -524,8 +524,8 @@ class VITSGenerator(nn.Layer):
     def voice_conversion(
             self,
-            feats: Optional[paddle.Tensor]=None,
-            feats_lengths: Optional[paddle.Tensor]=None,
+            feats: paddle.Tensor=None,
+            feats_lengths: paddle.Tensor=None,
             sids_src: Optional[paddle.Tensor]=None,
             sids_tgt: Optional[paddle.Tensor]=None,
             spembs_src: Optional[paddle.Tensor]=None,

@@ -381,7 +381,7 @@ class VITS(nn.Layer):
         if use_teacher_forcing:
             assert feats is not None
             feats = feats[None].transpose([0, 2, 1])
-            feats_lengths = paddle.to_tensor([paddle.shape(feats)[2]])
+            feats_lengths = paddle.to_tensor(paddle.shape(feats)[2])
             wav, att_w, dur = self.generator.inference(
                 text=text,
                 text_lengths=text_lengths,
@@ -409,7 +409,7 @@ class VITS(nn.Layer):
     def voice_conversion(
             self,
-            feats: Optional[paddle.Tensor]=None,
+            feats: paddle.Tensor,
             sids_src: Optional[paddle.Tensor]=None,
             sids_tgt: Optional[paddle.Tensor]=None,
             spembs_src: Optional[paddle.Tensor]=None,
@@ -429,7 +429,7 @@ class VITS(nn.Layer):
         """
         assert feats is not None
         feats = feats[None].transpose([0, 2, 1])
-        feats_lengths = paddle.to_tensor([paddle.shape(feats)[2]])
+        feats_lengths = paddle.to_tensor(paddle.shape(feats)[2])
         sids_none = sids_src is None and sids_tgt is None
         spembs_none = spembs_src is None and spembs_tgt is None
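The `feats_lengths` fix in both methods drops the extra list wrapping: `paddle.shape(feats)` is already a 1-D integer tensor, so indexing it yields a tensor, and wrapping that tensor in a Python list before `paddle.to_tensor` nests a spurious dimension. A quick shape check under these assumptions (exact shapes may vary slightly across Paddle versions):

```python
import paddle

feats = paddle.randn([1, 513, 100])   # (batch, channels, time) after transpose

time_steps = paddle.shape(feats)[2]   # already a Tensor holding 100
feats_lengths = paddle.to_tensor(time_steps)
print(feats_lengths)                  # one length entry per batch item

# The old code wrapped the tensor in a Python list first:
#     paddle.to_tensor([time_steps])
# which adds an extra dimension and breaks length handling downstream.
```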
