fix voice cloning of vits.

pull/2268/head
艾梦 3 years ago
parent 2ebe04f9d3
commit 1450e74b4f

@@ -122,13 +122,13 @@ ref_audio
 `./local/voice_cloning.sh` calls `${BIN_DIR}/voice_cloning.py`
 ```bash
-CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${ref_audio_dir}
+CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${add_blank} ${ref_audio_dir}
 ```
 If you want to convert a speaker audio file to the reference speaker, run:
 ```bash
-CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${ref_audio_dir} ${src_audio_path}
+CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${add_blank} ${ref_audio_dir} ${src_audio_path}
 ```
 ## Pretrained Model

@@ -1,22 +1,22 @@
 #!/bin/bash
 config_path=$1
 train_output_path=$2
 ckpt_name=$3
 ge2e_params_path=$4
-ref_audio_dir=$5
-add_blank=$6
+add_blank=$5
+ref_audio_dir=$6
 src_audio_path=$7
 FLAGS_allocator_strategy=naive_best_fit \
 FLAGS_fraction_of_gpu_memory_to_use=0.01 \
 python3 ${BIN_DIR}/voice_cloning.py \
     --config=${config_path} \
     --ckpt=${train_output_path}/checkpoints/${ckpt_name} \
     --ge2e_params_path=${ge2e_params_path} \
+    --phones_dict=dump/phone_id_map.txt \
     --text="凯莫瑞安联合体的经济崩溃迫在眉睫。" \
     --audio-path=${src_audio_path} \
     --input-dir=${ref_audio_dir} \
     --output-dir=${train_output_path}/vc_syn \
-    --phones-dict=dump/phone_id_map.txt \
     --add-blank=${add_blank}

@@ -1,44 +1,45 @@
 #!/bin/bash
 set -e
 source path.sh
 
 gpus=0,1
 stage=0
 stop_stage=100
 
 conf_path=conf/default.yaml
 train_output_path=exp/default
 ckpt_name=snapshot_iter_153.pdz
 add_blank=true
+ref_audio_dir=ref_audio
 src_audio_path=''
 
 # not include ".pdparams" here
 ge2e_ckpt_path=./ge2e_ckpt_0.3/step-3000000
 
 # include ".pdparams" here
 ge2e_params_path=${ge2e_ckpt_path}.pdparams
 
 # with the following command, you can choose the stage range you want to run
 # such as `./run.sh --stage 0 --stop-stage 0`
 # this can not be mixed with `$1`, `$2` ...
 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
 
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     # prepare data
     CUDA_VISIBLE_DEVICES=${gpus} ./local/preprocess.sh ${conf_path} ${add_blank} ${ge2e_ckpt_path} || exit -1
 fi
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # train model, all `ckpt` under `train_output_path/checkpoints/` dir
     CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
 fi
 
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
 fi
 
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} \
-        ${ge2e_params_path} ${ref_audio_dir} ${add_blank} ${src_audio_path} || exit -1
+        ${ge2e_params_path} ${add_blank} ${ref_audio_dir} ${src_audio_path} || exit -1
 fi

@@ -52,9 +52,8 @@ def evaluate(args):
         spk_num = len(spk_id)
         fields += ["spk_id"]
     elif args.voice_cloning:
-        print("Training voice cloning!")
+        print("Evaluating voice cloning!")
         fields += ["spk_emb"]
-        converters["spk_emb"] = np.load
     else:
         print("single speaker vits!")
     print("spk_num:", spk_num)

@@ -102,7 +102,7 @@ def voice_cloning(args):
         phone_ids = input_ids["phone_ids"][0]
     else:
         wav, _ = librosa.load(str(args.audio_path), sr=config.fs)
-        feats = spec_extractor.get_linear_spectrogram(wav)
+        feats = paddle.to_tensor(spec_extractor.get_linear_spectrogram(wav))
         mel_sequences = p.extract_mel_partials(
             p.preprocess_wav(args.audio_path))
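
A note on the `paddle.to_tensor` wrap above: `get_linear_spectrogram` returns a numpy array, while `voice_conversion` later slices and transposes `feats` with paddle ops, so it has to be a tensor first. A minimal sketch of the shape flow, assuming a `[T, n_fft // 2 + 1]` spectrogram layout; the concrete sizes below are illustrative, not taken from the config:

```python
# Sketch only: random data stands in for spec_extractor.get_linear_spectrogram(wav).
import numpy as np
import paddle

spec = np.random.randn(200, 513).astype("float32")  # assumed [T, n_fft // 2 + 1]

feats = paddle.to_tensor(spec)            # the fix: numpy array -> paddle.Tensor
feats = feats[None].transpose([0, 2, 1])  # the slicing voice_conversion applies later
print(feats.shape)                        # [1, 513, 200]
```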
@@ -122,10 +122,11 @@ def voice_cloning(args):
         with paddle.no_grad():
             if args.audio_path is None:
-                wav = vits.inference(text=phone_ids, spembs=spk_emb)
+                out = vits.inference(text=phone_ids, spembs=spk_emb)
             else:
-                wav = vits.voice_conversion(
+                out = vits.voice_conversion(
                     feats=feats, spembs_src=spk_emb_src, spembs_tgt=spk_emb)
+            wav = out["wav"]
         sf.write(
             str(output_dir / (utt_id + ".wav")),
@@ -138,10 +139,11 @@ def voice_cloning(args):
     utt_id = "random_spk_emb"
     with paddle.no_grad():
         if args.audio_path is None:
-            wav = vits.inference(text=phone_ids, spembs=random_spk_emb)
+            out = vits.inference(text=phone_ids, spembs=random_spk_emb)
         else:
-            wav = vits.voice_conversion(
+            out = vits.voice_conversion(
                 feats=feats, spembs_src=spk_emb_src, spembs_tgt=random_spk_emb)
+        wav = out["wav"]
     sf.write(
         str(output_dir / (utt_id + ".wav")), wav.numpy(), samplerate=config.fs)
     print(f"{utt_id} done!")

@@ -524,8 +524,8 @@ class VITSGenerator(nn.Layer):
     def voice_conversion(
             self,
-            feats: Optional[paddle.Tensor]=None,
-            feats_lengths: Optional[paddle.Tensor]=None,
+            feats: paddle.Tensor=None,
+            feats_lengths: paddle.Tensor=None,
             sids_src: Optional[paddle.Tensor]=None,
             sids_tgt: Optional[paddle.Tensor]=None,
             spembs_src: Optional[paddle.Tensor]=None,

@@ -381,7 +381,7 @@ class VITS(nn.Layer):
         if use_teacher_forcing:
             assert feats is not None
             feats = feats[None].transpose([0, 2, 1])
-            feats_lengths = paddle.to_tensor([paddle.shape(feats)[2]])
+            feats_lengths = paddle.to_tensor(paddle.shape(feats)[2])
             wav, att_w, dur = self.generator.inference(
                 text=text,
                 text_lengths=text_lengths,
@@ -409,7 +409,7 @@ class VITS(nn.Layer):
     def voice_conversion(
             self,
-            feats: Optional[paddle.Tensor]=None,
+            feats: paddle.Tensor,
             sids_src: Optional[paddle.Tensor]=None,
             sids_tgt: Optional[paddle.Tensor]=None,
             spembs_src: Optional[paddle.Tensor]=None,
@@ -429,7 +429,7 @@ class VITS(nn.Layer):
         """
         assert feats is not None
         feats = feats[None].transpose([0, 2, 1])
-        feats_lengths = paddle.to_tensor([paddle.shape(feats)[2]])
+        feats_lengths = paddle.to_tensor(paddle.shape(feats)[2])
         sids_none = sids_src is None and sids_tgt is None
         spembs_none = spembs_src is None and spembs_tgt is None
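
On the `feats_lengths` fix in the last two hunks: `paddle.shape(feats)` already returns a 1-D integer tensor, so indexing it yields a tensor-like value, and wrapping that in a Python list before `paddle.to_tensor` produced an extra dimension (`[1, 1]` instead of `[1]`), which is presumably what broke the downstream length masks. A numpy analogue of the shape bug; exact paddle indexing semantics vary by version, so this is only an illustration:

```python
import numpy as np

shape = np.array([1, 513, 200])  # ~ paddle.shape(feats), a 1-D int tensor
old = np.array([shape[2:3]])     # ~ to_tensor([shape[2]]) -> shape (1, 1)
new = np.array(shape[2:3])       # ~ to_tensor(shape[2])   -> shape (1,)
assert old.shape == (1, 1)       # spurious nesting, the bug
assert new.shape == (1,)         # what the generator's masks expect
```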
