fix voice cloning of vits.

pull/2268/head
艾梦 3 years ago
parent 2ebe04f9d3
commit 1450e74b4f

@@ -122,13 +122,13 @@ ref_audio
 `./local/voice_cloning.sh` calls `${BIN_DIR}/voice_cloning.py`
 ```bash
-CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${ref_audio_dir}
+CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${add_blank} ${ref_audio_dir}
 ```
 If you want to convert a speaker audio file to the reference speaker, run:
 ```bash
-CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${ref_audio_dir} ${src_audio_path}
+CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${add_blank} ${ref_audio_dir} ${src_audio_path}
 ```
 ## Pretrained Model

@@ -1,22 +1,22 @@
 #!/bin/bash
 config_path=$1
 train_output_path=$2
 ckpt_name=$3
 ge2e_params_path=$4
-ref_audio_dir=$5
-add_blank=$6
+add_blank=$5
+ref_audio_dir=$6
 src_audio_path=$7
 FLAGS_allocator_strategy=naive_best_fit \
 FLAGS_fraction_of_gpu_memory_to_use=0.01 \
 python3 ${BIN_DIR}/voice_cloning.py \
     --config=${config_path} \
     --ckpt=${train_output_path}/checkpoints/${ckpt_name} \
     --ge2e_params_path=${ge2e_params_path} \
+    --phones_dict=dump/phone_id_map.txt \
     --text="凯莫瑞安联合体的经济崩溃迫在眉睫。" \
     --audio-path=${src_audio_path} \
     --input-dir=${ref_audio_dir} \
     --output-dir=${train_output_path}/vc_syn \
-    --phones-dict=dump/phone_id_map.txt \
     --add-blank=${add_blank}

@@ -1,44 +1,45 @@
 #!/bin/bash
 set -e
 source path.sh
 
 gpus=0,1
 stage=0
 stop_stage=100
 
 conf_path=conf/default.yaml
 train_output_path=exp/default
 ckpt_name=snapshot_iter_153.pdz
 add_blank=true
+ref_audio_dir=ref_audio
 src_audio_path=''
 
 # not include ".pdparams" here
 ge2e_ckpt_path=./ge2e_ckpt_0.3/step-3000000
 
 # include ".pdparams" here
 ge2e_params_path=${ge2e_ckpt_path}.pdparams
 
 # with the following command, you can choose the stage range you want to run
 # such as `./run.sh --stage 0 --stop-stage 0`
 # this can not be mixed with `$1`, `$2` ...
 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
 
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     # prepare data
     CUDA_VISIBLE_DEVICES=${gpus} ./local/preprocess.sh ${conf_path} ${add_blank} ${ge2e_ckpt_path} || exit -1
 fi
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # train model, all `ckpt` under `train_output_path/checkpoints/` dir
     CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
 fi
 
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
 fi
 
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} \
-        ${ge2e_params_path} ${ref_audio_dir} ${add_blank} ${src_audio_path} || exit -1
+        ${ge2e_params_path} ${add_blank} ${ref_audio_dir} ${src_audio_path} || exit -1
 fi

@@ -52,9 +52,8 @@ def evaluate(args):
         spk_num = len(spk_id)
         fields += ["spk_id"]
     elif args.voice_cloning:
-        print("Training voice cloning!")
+        print("Evaluating voice cloning!")
         fields += ["spk_emb"]
-        converters["spk_emb"] = np.load
     else:
         print("single speaker vits!")
     print("spk_num:", spk_num)

@@ -102,7 +102,7 @@ def voice_cloning(args):
         phone_ids = input_ids["phone_ids"][0]
     else:
         wav, _ = librosa.load(str(args.audio_path), sr=config.fs)
-        feats = spec_extractor.get_linear_spectrogram(wav)
+        feats = paddle.to_tensor(spec_extractor.get_linear_spectrogram(wav))
         mel_sequences = p.extract_mel_partials(
             p.preprocess_wav(args.audio_path))
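
A note on the `paddle.to_tensor` wrap above: `get_linear_spectrogram` returns a numpy array, while `voice_conversion` later slices and transposes `feats` with paddle ops, so it has to be a tensor first. A minimal sketch of the shape flow, assuming a `[T, n_fft // 2 + 1]` spectrogram layout; the concrete sizes below are illustrative, not taken from the config:

```python
# Sketch only: random data stands in for spec_extractor.get_linear_spectrogram(wav).
import numpy as np
import paddle

spec = np.random.randn(200, 513).astype("float32")  # assumed [T, n_fft // 2 + 1]

feats = paddle.to_tensor(spec)            # the fix: numpy array -> paddle.Tensor
feats = feats[None].transpose([0, 2, 1])  # the slicing voice_conversion applies later
print(feats.shape)                        # [1, 513, 200]
```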
@@ -122,10 +122,11 @@ def voice_cloning(args):
         with paddle.no_grad():
             if args.audio_path is None:
-                wav = vits.inference(text=phone_ids, spembs=spk_emb)
+                out = vits.inference(text=phone_ids, spembs=spk_emb)
             else:
-                wav = vits.voice_conversion(
+                out = vits.voice_conversion(
                     feats=feats, spembs_src=spk_emb_src, spembs_tgt=spk_emb)
+            wav = out["wav"]
         sf.write(
             str(output_dir / (utt_id + ".wav")),
@@ -138,10 +139,11 @@ def voice_cloning(args):
     utt_id = "random_spk_emb"
     with paddle.no_grad():
         if args.audio_path is None:
-            wav = vits.inference(text=phone_ids, spembs=random_spk_emb)
+            out = vits.inference(text=phone_ids, spembs=random_spk_emb)
         else:
-            wav = vits.voice_conversion(
+            out = vits.voice_conversion(
                 feats=feats, spembs_src=spk_emb_src, spembs_tgt=random_spk_emb)
+        wav = out["wav"]
     sf.write(
         str(output_dir / (utt_id + ".wav")), wav.numpy(), samplerate=config.fs)
     print(f"{utt_id} done!")

@@ -524,8 +524,8 @@ class VITSGenerator(nn.Layer):
     def voice_conversion(
             self,
-            feats: Optional[paddle.Tensor]=None,
-            feats_lengths: Optional[paddle.Tensor]=None,
+            feats: paddle.Tensor=None,
+            feats_lengths: paddle.Tensor=None,
             sids_src: Optional[paddle.Tensor]=None,
             sids_tgt: Optional[paddle.Tensor]=None,
             spembs_src: Optional[paddle.Tensor]=None,

@@ -381,7 +381,7 @@ class VITS(nn.Layer):
         if use_teacher_forcing:
             assert feats is not None
             feats = feats[None].transpose([0, 2, 1])
-            feats_lengths = paddle.to_tensor([paddle.shape(feats)[2]])
+            feats_lengths = paddle.to_tensor(paddle.shape(feats)[2])
             wav, att_w, dur = self.generator.inference(
                 text=text,
                 text_lengths=text_lengths,
@@ -409,7 +409,7 @@ class VITS(nn.Layer):
     def voice_conversion(
             self,
-            feats: Optional[paddle.Tensor]=None,
+            feats: paddle.Tensor,
             sids_src: Optional[paddle.Tensor]=None,
             sids_tgt: Optional[paddle.Tensor]=None,
             spembs_src: Optional[paddle.Tensor]=None,
@@ -429,7 +429,7 @@ class VITS(nn.Layer):
         """
         assert feats is not None
         feats = feats[None].transpose([0, 2, 1])
-        feats_lengths = paddle.to_tensor([paddle.shape(feats)[2]])
+        feats_lengths = paddle.to_tensor(paddle.shape(feats)[2])
         sids_none = sids_src is None and sids_tgt is None
         spembs_none = spembs_src is None and spembs_tgt is None
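
On the `feats_lengths` fix in the last two hunks: `paddle.shape(feats)` already returns a 1-D integer tensor, so indexing it yields a tensor-like value, and wrapping that in a Python list before `paddle.to_tensor` produced an extra dimension (`[1, 1]` instead of `[1]`), which is presumably what broke the downstream length masks. A numpy analogue of the shape bug; exact paddle indexing semantics vary by version, so this is only an illustration:

```python
import numpy as np

shape = np.array([1, 513, 200])  # ~ paddle.shape(feats), a 1-D int tensor
old = np.array([shape[2:3]])     # ~ to_tensor([shape[2]]) -> shape (1, 1)
new = np.array(shape[2:3])       # ~ to_tensor(shape[2])   -> shape (1,)
assert old.shape == (1, 1)       # spurious nesting, the bug
assert new.shape == (1,)         # what the generator's masks expect
```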
