From 1450e74b4f2846bb6da5d7e7ee0f19548601eda5 Mon Sep 17 00:00:00 2001
From: 艾梦
Date: Tue, 16 Aug 2022 22:31:44 +0800
Subject: [PATCH] fix voice cloning of vits.

---
 examples/aishell3/vits-vc/README.md          |  4 +-
 .../aishell3/vits-vc/local/voice_cloning.sh  | 44 ++++-----
 examples/aishell3/vits-vc/run.sh             | 89 ++++++++++---------
 paddlespeech/t2s/exps/vits/synthesize.py     |  3 +-
 paddlespeech/t2s/exps/vits/voice_cloning.py  | 12 +--
 paddlespeech/t2s/models/vits/generator.py    |  4 +-
 paddlespeech/t2s/models/vits/vits.py         |  6 +-
 7 files changed, 82 insertions(+), 80 deletions(-)

diff --git a/examples/aishell3/vits-vc/README.md b/examples/aishell3/vits-vc/README.md
index c47bbdd52..2e1ae21db 100644
--- a/examples/aishell3/vits-vc/README.md
+++ b/examples/aishell3/vits-vc/README.md
@@ -122,13 +122,13 @@ ref_audio
 `./local/voice_cloning.sh` calls `${BIN_DIR}/voice_cloning.py`
 
 ```bash
-CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${ref_audio_dir}
+CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${add_blank} ${ref_audio_dir}
 ```
 
 If you want to convert a speaker audio file to the reference speaker's voice, run:
 
 ```bash
-CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${ref_audio_dir} ${src_audio_path}
+CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${add_blank} ${ref_audio_dir} ${src_audio_path}
 ```
 
 ## Pretrained Model
diff --git a/examples/aishell3/vits-vc/local/voice_cloning.sh b/examples/aishell3/vits-vc/local/voice_cloning.sh
index 429bbfd34..3c113da8b 100755
--- a/examples/aishell3/vits-vc/local/voice_cloning.sh
+++ b/examples/aishell3/vits-vc/local/voice_cloning.sh
@@ -1,22 +1,22 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-ckpt_name=$3
-ge2e_params_path=$4
-ref_audio_dir=$5
-add_blank=$6
-src_audio_path=$7
-
-FLAGS_allocator_strategy=naive_best_fit \
-FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python3 ${BIN_DIR}/voice_cloning.py \
-    --config=${config_path} \
-    --ckpt=${train_output_path}/checkpoints/${ckpt_name} \
-    --ge2e_params_path=${ge2e_params_path} \
-    --text="凯莫瑞安联合体的经济崩溃迫在眉睫。" \
-    --audio-path=${src_audio_path} \
-    --input-dir=${ref_audio_dir} \
-    --output-dir=${train_output_path}/vc_syn \
-    --phones-dict=dump/phone_id_map.txt \
-    --add-blank=${add_blank}
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+ge2e_params_path=$4
+add_blank=$5
+ref_audio_dir=$6
+src_audio_path=$7
+
+FLAGS_allocator_strategy=naive_best_fit \
+FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+python3 ${BIN_DIR}/voice_cloning.py \
+    --config=${config_path} \
+    --ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+    --ge2e_params_path=${ge2e_params_path} \
+    --phones_dict=dump/phone_id_map.txt \
+    --text="凯莫瑞安联合体的经济崩溃迫在眉睫。" \
+    --audio-path=${src_audio_path} \
+    --input-dir=${ref_audio_dir} \
+    --output-dir=${train_output_path}/vc_syn \
+    --add-blank=${add_blank}
diff --git a/examples/aishell3/vits-vc/run.sh b/examples/aishell3/vits-vc/run.sh
index 9ebec2127..2cc378016 100755
--- a/examples/aishell3/vits-vc/run.sh
+++ b/examples/aishell3/vits-vc/run.sh
@@ -1,44 +1,45 @@
-#!/bin/bash
-
-set -e
-source path.sh
-
-gpus=0,1
-stage=0
-stop_stage=100
-
-conf_path=conf/default.yaml
-train_output_path=exp/default
-ckpt_name=snapshot_iter_153.pdz
-add_blank=true
-src_audio_path=''
-
-# not include ".pdparams" here
".pdparams" here -ge2e_ckpt_path=./ge2e_ckpt_0.3/step-3000000 - -# include ".pdparams" here -ge2e_params_path=${ge2e_ckpt_path}.pdparams - -# with the following command, you can choose the stage range you want to run -# such as `./run.sh --stage 0 --stop-stage 0` -# this can not be mixed use with `$1`, `$2` ... -source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # prepare data - CUDA_VISIBLE_DEVICES=${gpus} ./local/preprocess.sh ${conf_path} ${add_blank} ${ge2e_ckpt_path} || exit -1 -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # train model, all `ckpt` under `train_output_path/checkpoints/` dir - CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1 -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} \ - ${ge2e_params_path} ${ref_audio_dir} ${add_blank} ${src_audio_path} || exit -1 -fi +#!/bin/bash + +set -e +source path.sh + +gpus=0,1 +stage=0 +stop_stage=100 + +conf_path=conf/default.yaml +train_output_path=exp/default +ckpt_name=snapshot_iter_153.pdz +add_blank=true +ref_audio_dir=ref_audio +src_audio_path='' + +# not include ".pdparams" here +ge2e_ckpt_path=./ge2e_ckpt_0.3/step-3000000 + +# include ".pdparams" here +ge2e_params_path=${ge2e_ckpt_path}.pdparams + +# with the following command, you can choose the stage range you want to run +# such as `./run.sh --stage 0 --stop-stage 0` +# this can not be mixed use with `$1`, `$2` ... +source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + CUDA_VISIBLE_DEVICES=${gpus} ./local/preprocess.sh ${conf_path} ${add_blank} ${ge2e_ckpt_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} \ + ${ge2e_params_path} ${add_blank} ${ref_audio_dir} ${src_audio_path} || exit -1 +fi diff --git a/paddlespeech/t2s/exps/vits/synthesize.py b/paddlespeech/t2s/exps/vits/synthesize.py index f58e38874..968684b25 100644 --- a/paddlespeech/t2s/exps/vits/synthesize.py +++ b/paddlespeech/t2s/exps/vits/synthesize.py @@ -52,9 +52,8 @@ def evaluate(args): spk_num = len(spk_id) fields += ["spk_id"] elif args.voice_cloning: - print("Training voice cloning!") + print("Evaluating voice cloning!") fields += ["spk_emb"] - converters["spk_emb"] = np.load else: print("single speaker vits!") print("spk_num:", spk_num) diff --git a/paddlespeech/t2s/exps/vits/voice_cloning.py b/paddlespeech/t2s/exps/vits/voice_cloning.py index 2874e97aa..bdda4d687 100644 --- a/paddlespeech/t2s/exps/vits/voice_cloning.py +++ b/paddlespeech/t2s/exps/vits/voice_cloning.py @@ -102,7 +102,7 @@ def voice_cloning(args): phone_ids = input_ids["phone_ids"][0] else: wav, _ = librosa.load(str(args.audio_path), 
-        feats = spec_extractor.get_linear_spectrogram(wav)
+        feats = paddle.to_tensor(spec_extractor.get_linear_spectrogram(wav))
 
         mel_sequences = p.extract_mel_partials(
             p.preprocess_wav(args.audio_path))
@@ -122,10 +122,11 @@
 
         with paddle.no_grad():
             if args.audio_path is None:
-                wav = vits.inference(text=phone_ids, spembs=spk_emb)
+                out = vits.inference(text=phone_ids, spembs=spk_emb)
             else:
-                wav = vits.voice_conversion(
+                out = vits.voice_conversion(
                     feats=feats, spembs_src=spk_emb_src, spembs_tgt=spk_emb)
+            wav = out["wav"]
 
         sf.write(
             str(output_dir / (utt_id + ".wav")),
@@ -138,10 +139,11 @@
     utt_id = "random_spk_emb"
     with paddle.no_grad():
         if args.audio_path is None:
-            wav = vits.inference(text=phone_ids, spembs=random_spk_emb)
+            out = vits.inference(text=phone_ids, spembs=random_spk_emb)
         else:
-            wav = vits.voice_conversion(
+            out = vits.voice_conversion(
                 feats=feats, spembs_src=spk_emb_src, spembs_tgt=random_spk_emb)
+        wav = out["wav"]
     sf.write(
         str(output_dir / (utt_id + ".wav")), wav.numpy(), samplerate=config.fs)
     print(f"{utt_id} done!")
diff --git a/paddlespeech/t2s/models/vits/generator.py b/paddlespeech/t2s/models/vits/generator.py
index 69134bd27..359b66258 100644
--- a/paddlespeech/t2s/models/vits/generator.py
+++ b/paddlespeech/t2s/models/vits/generator.py
@@ -524,8 +524,8 @@ class VITSGenerator(nn.Layer):
 
     def voice_conversion(
             self,
-            feats: Optional[paddle.Tensor]=None,
-            feats_lengths: Optional[paddle.Tensor]=None,
+            feats: paddle.Tensor=None,
+            feats_lengths: paddle.Tensor=None,
             sids_src: Optional[paddle.Tensor]=None,
             sids_tgt: Optional[paddle.Tensor]=None,
             spembs_src: Optional[paddle.Tensor]=None,
diff --git a/paddlespeech/t2s/models/vits/vits.py b/paddlespeech/t2s/models/vits/vits.py
index 68c324bec..983bf0a36 100644
--- a/paddlespeech/t2s/models/vits/vits.py
+++ b/paddlespeech/t2s/models/vits/vits.py
@@ -381,7 +381,7 @@ class VITS(nn.Layer):
         if use_teacher_forcing:
             assert feats is not None
             feats = feats[None].transpose([0, 2, 1])
-            feats_lengths = paddle.to_tensor([paddle.shape(feats)[2]])
+            feats_lengths = paddle.to_tensor(paddle.shape(feats)[2])
             wav, att_w, dur = self.generator.inference(
                 text=text,
                 text_lengths=text_lengths,
@@ -409,7 +409,7 @@
 
     def voice_conversion(
             self,
-            feats: Optional[paddle.Tensor]=None,
+            feats: paddle.Tensor,
             sids_src: Optional[paddle.Tensor]=None,
             sids_tgt: Optional[paddle.Tensor]=None,
             spembs_src: Optional[paddle.Tensor]=None,
@@ -429,7 +429,7 @@
         """
         assert feats is not None
         feats = feats[None].transpose([0, 2, 1])
-        feats_lengths = paddle.to_tensor([paddle.shape(feats)[2]])
+        feats_lengths = paddle.to_tensor(paddle.shape(feats)[2])
 
         sids_none = sids_src is None and sids_tgt is None
         spembs_none = spembs_src is None and spembs_tgt is None
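
Usage note: with this patch, `VITS.inference()` and `VITS.voice_conversion()` return a dict whose `"wav"` entry holds the waveform tensor, and `voice_conversion()` expects `feats` as a `paddle.Tensor` rather than a raw numpy array. The sketch below shows the updated calling convention only; `vits` (a trained VITS model), `spec_extractor` (a linear-spectrogram extractor matching the training config), the GE2E speaker embeddings `spk_emb_src`/`spk_emb_tgt`, and the sample rate value are placeholders assumed to be prepared as in `${BIN_DIR}/voice_cloning.py`.

```python
# Sketch of the post-patch API, not part of the patch itself.
import librosa
import paddle
import soundfile as sf

fs = 22050  # placeholder; use config.fs from the experiment config

# load the source-speaker audio at the model's sample rate
wav_src, _ = librosa.load("source.wav", sr=fs)

# voice_conversion() now requires feats as a paddle.Tensor
feats = paddle.to_tensor(spec_extractor.get_linear_spectrogram(wav_src))

with paddle.no_grad():
    out = vits.voice_conversion(
        feats=feats, spembs_src=spk_emb_src, spembs_tgt=spk_emb_tgt)

# both inference() and voice_conversion() return a dict; the waveform is out["wav"]
sf.write("converted.wav", out["wav"].numpy(), samplerate=fs)
```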