From 1450e74b4f2846bb6da5d7e7ee0f19548601eda5 Mon Sep 17 00:00:00 2001
From: 艾梦
Date: Tue, 16 Aug 2022 22:31:44 +0800
Subject: [PATCH] fix voice cloning of vits.

---
 examples/aishell3/vits-vc/README.md          |  4 +-
 .../aishell3/vits-vc/local/voice_cloning.sh  | 44 ++++-----
 examples/aishell3/vits-vc/run.sh             | 89 ++++++++++---------
 paddlespeech/t2s/exps/vits/synthesize.py     |  3 +-
 paddlespeech/t2s/exps/vits/voice_cloning.py  | 12 +--
 paddlespeech/t2s/models/vits/generator.py    |  4 +-
 paddlespeech/t2s/models/vits/vits.py         |  6 +-
 7 files changed, 82 insertions(+), 80 deletions(-)

diff --git a/examples/aishell3/vits-vc/README.md b/examples/aishell3/vits-vc/README.md
index c47bbdd52..2e1ae21db 100644
--- a/examples/aishell3/vits-vc/README.md
+++ b/examples/aishell3/vits-vc/README.md
@@ -122,13 +122,13 @@ ref_audio
 `./local/voice_cloning.sh` calls `${BIN_DIR}/voice_cloning.py`
 
 ```bash
-CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${ref_audio_dir}
+CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${add_blank} ${ref_audio_dir}
 ```
 
 If you want to convert a speaker audio file to the reference speaker's voice, run:
 
 ```bash
-CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${ref_audio_dir} ${src_audio_path}
+CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${add_blank} ${ref_audio_dir} ${src_audio_path}
 ```
 
 ## Pretrained Model
diff --git a/examples/aishell3/vits-vc/local/voice_cloning.sh b/examples/aishell3/vits-vc/local/voice_cloning.sh
index 429bbfd34..3c113da8b 100755
--- a/examples/aishell3/vits-vc/local/voice_cloning.sh
+++ b/examples/aishell3/vits-vc/local/voice_cloning.sh
@@ -1,22 +1,22 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-ckpt_name=$3
-ge2e_params_path=$4
-ref_audio_dir=$5
-add_blank=$6
-src_audio_path=$7
-
-FLAGS_allocator_strategy=naive_best_fit \
-FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python3 ${BIN_DIR}/voice_cloning.py \
-    --config=${config_path} \
-    --ckpt=${train_output_path}/checkpoints/${ckpt_name} \
-    --ge2e_params_path=${ge2e_params_path} \
-    --text="凯莫瑞安联合体的经济崩溃迫在眉睫。" \
-    --audio-path=${src_audio_path} \
-    --input-dir=${ref_audio_dir} \
-    --output-dir=${train_output_path}/vc_syn \
-    --phones-dict=dump/phone_id_map.txt \
-    --add-blank=${add_blank}
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+ge2e_params_path=$4
+add_blank=$5
+ref_audio_dir=$6
+src_audio_path=$7
+
+FLAGS_allocator_strategy=naive_best_fit \
+FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+python3 ${BIN_DIR}/voice_cloning.py \
+    --config=${config_path} \
+    --ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+    --ge2e_params_path=${ge2e_params_path} \
+    --phones_dict=dump/phone_id_map.txt \
+    --text="凯莫瑞安联合体的经济崩溃迫在眉睫。" \
+    --audio-path=${src_audio_path} \
+    --input-dir=${ref_audio_dir} \
+    --output-dir=${train_output_path}/vc_syn \
+    --add-blank=${add_blank}
diff --git a/examples/aishell3/vits-vc/run.sh b/examples/aishell3/vits-vc/run.sh
index 9ebec2127..2cc378016 100755
--- a/examples/aishell3/vits-vc/run.sh
+++ b/examples/aishell3/vits-vc/run.sh
@@ -1,44 +1,45 @@
-#!/bin/bash
-
-set -e
-source path.sh
-
-gpus=0,1
-stage=0
-stop_stage=100
-
-conf_path=conf/default.yaml
-train_output_path=exp/default
-ckpt_name=snapshot_iter_153.pdz
-add_blank=true
-src_audio_path=''
-
-# not include ".pdparams" here
".pdparams" here -ge2e_ckpt_path=./ge2e_ckpt_0.3/step-3000000 - -# include ".pdparams" here -ge2e_params_path=${ge2e_ckpt_path}.pdparams - -# with the following command, you can choose the stage range you want to run -# such as `./run.sh --stage 0 --stop-stage 0` -# this can not be mixed use with `$1`, `$2` ... -source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # prepare data - CUDA_VISIBLE_DEVICES=${gpus} ./local/preprocess.sh ${conf_path} ${add_blank} ${ge2e_ckpt_path} || exit -1 -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # train model, all `ckpt` under `train_output_path/checkpoints/` dir - CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1 -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} \ - ${ge2e_params_path} ${ref_audio_dir} ${add_blank} ${src_audio_path} || exit -1 -fi +#!/bin/bash + +set -e +source path.sh + +gpus=0,1 +stage=0 +stop_stage=100 + +conf_path=conf/default.yaml +train_output_path=exp/default +ckpt_name=snapshot_iter_153.pdz +add_blank=true +ref_audio_dir=ref_audio +src_audio_path='' + +# not include ".pdparams" here +ge2e_ckpt_path=./ge2e_ckpt_0.3/step-3000000 + +# include ".pdparams" here +ge2e_params_path=${ge2e_ckpt_path}.pdparams + +# with the following command, you can choose the stage range you want to run +# such as `./run.sh --stage 0 --stop-stage 0` +# this can not be mixed use with `$1`, `$2` ... +source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + CUDA_VISIBLE_DEVICES=${gpus} ./local/preprocess.sh ${conf_path} ${add_blank} ${ge2e_ckpt_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} \ + ${ge2e_params_path} ${add_blank} ${ref_audio_dir} ${src_audio_path} || exit -1 +fi diff --git a/paddlespeech/t2s/exps/vits/synthesize.py b/paddlespeech/t2s/exps/vits/synthesize.py index f58e38874..968684b25 100644 --- a/paddlespeech/t2s/exps/vits/synthesize.py +++ b/paddlespeech/t2s/exps/vits/synthesize.py @@ -52,9 +52,8 @@ def evaluate(args): spk_num = len(spk_id) fields += ["spk_id"] elif args.voice_cloning: - print("Training voice cloning!") + print("Evaluating voice cloning!") fields += ["spk_emb"] - converters["spk_emb"] = np.load else: print("single speaker vits!") print("spk_num:", spk_num) diff --git a/paddlespeech/t2s/exps/vits/voice_cloning.py b/paddlespeech/t2s/exps/vits/voice_cloning.py index 2874e97aa..bdda4d687 100644 --- a/paddlespeech/t2s/exps/vits/voice_cloning.py +++ b/paddlespeech/t2s/exps/vits/voice_cloning.py @@ -102,7 +102,7 @@ def voice_cloning(args): phone_ids = input_ids["phone_ids"][0] else: wav, _ = librosa.load(str(args.audio_path), 
-        feats = spec_extractor.get_linear_spectrogram(wav)
+        feats = paddle.to_tensor(spec_extractor.get_linear_spectrogram(wav))
 
         mel_sequences = p.extract_mel_partials(
             p.preprocess_wav(args.audio_path))
@@ -122,10 +122,11 @@
 
         with paddle.no_grad():
             if args.audio_path is None:
-                wav = vits.inference(text=phone_ids, spembs=spk_emb)
+                out = vits.inference(text=phone_ids, spembs=spk_emb)
             else:
-                wav = vits.voice_conversion(
+                out = vits.voice_conversion(
                     feats=feats, spembs_src=spk_emb_src, spembs_tgt=spk_emb)
+            wav = out["wav"]
 
         sf.write(
             str(output_dir / (utt_id + ".wav")),
@@ -138,10 +139,11 @@
     utt_id = "random_spk_emb"
     with paddle.no_grad():
         if args.audio_path is None:
-            wav = vits.inference(text=phone_ids, spembs=random_spk_emb)
+            out = vits.inference(text=phone_ids, spembs=random_spk_emb)
         else:
-            wav = vits.voice_conversion(
+            out = vits.voice_conversion(
                 feats=feats, spembs_src=spk_emb_src, spembs_tgt=random_spk_emb)
+        wav = out["wav"]
     sf.write(
         str(output_dir / (utt_id + ".wav")), wav.numpy(), samplerate=config.fs)
     print(f"{utt_id} done!")
diff --git a/paddlespeech/t2s/models/vits/generator.py b/paddlespeech/t2s/models/vits/generator.py
index 69134bd27..359b66258 100644
--- a/paddlespeech/t2s/models/vits/generator.py
+++ b/paddlespeech/t2s/models/vits/generator.py
@@ -524,8 +524,8 @@ class VITSGenerator(nn.Layer):
 
     def voice_conversion(
             self,
-            feats: Optional[paddle.Tensor]=None,
-            feats_lengths: Optional[paddle.Tensor]=None,
+            feats: paddle.Tensor=None,
+            feats_lengths: paddle.Tensor=None,
             sids_src: Optional[paddle.Tensor]=None,
             sids_tgt: Optional[paddle.Tensor]=None,
             spembs_src: Optional[paddle.Tensor]=None,
diff --git a/paddlespeech/t2s/models/vits/vits.py b/paddlespeech/t2s/models/vits/vits.py
index 68c324bec..983bf0a36 100644
--- a/paddlespeech/t2s/models/vits/vits.py
+++ b/paddlespeech/t2s/models/vits/vits.py
@@ -381,7 +381,7 @@ class VITS(nn.Layer):
         if use_teacher_forcing:
             assert feats is not None
             feats = feats[None].transpose([0, 2, 1])
-            feats_lengths = paddle.to_tensor([paddle.shape(feats)[2]])
+            feats_lengths = paddle.to_tensor(paddle.shape(feats)[2])
             wav, att_w, dur = self.generator.inference(
                 text=text,
                 text_lengths=text_lengths,
@@ -409,7 +409,7 @@
 
     def voice_conversion(
             self,
-            feats: Optional[paddle.Tensor]=None,
+            feats: paddle.Tensor,
             sids_src: Optional[paddle.Tensor]=None,
             sids_tgt: Optional[paddle.Tensor]=None,
             spembs_src: Optional[paddle.Tensor]=None,
@@ -429,7 +429,7 @@
         """
         assert feats is not None
         feats = feats[None].transpose([0, 2, 1])
-        feats_lengths = paddle.to_tensor([paddle.shape(feats)[2]])
+        feats_lengths = paddle.to_tensor(paddle.shape(feats)[2])
 
         sids_none = sids_src is None and sids_tgt is None
         spembs_none = spembs_src is None and spembs_tgt is None
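
Usage note: with this patch, `VITS.inference()` and `VITS.voice_conversion()` return a dict whose `"wav"` entry holds the waveform tensor, and `voice_conversion()` expects `feats` as a `paddle.Tensor` rather than a raw numpy array. The sketch below shows the updated calling convention only; `vits` (a trained VITS model), `spec_extractor` (a linear-spectrogram extractor matching the training config), the GE2E speaker embeddings `spk_emb_src`/`spk_emb_tgt`, and the sample rate value are placeholders assumed to be prepared as in `${BIN_DIR}/voice_cloning.py`.

```python
# Sketch of the post-patch API, not part of the patch itself.
import librosa
import paddle
import soundfile as sf

fs = 22050  # placeholder; use config.fs from the experiment config

# load the source-speaker audio at the model's sample rate
wav_src, _ = librosa.load("source.wav", sr=fs)

# voice_conversion() now requires feats as a paddle.Tensor
feats = paddle.to_tensor(spec_extractor.get_linear_spectrogram(wav_src))

with paddle.no_grad():
    out = vits.voice_conversion(
        feats=feats, spembs_src=spk_emb_src, spembs_tgt=spk_emb_tgt)

# both inference() and voice_conversion() return a dict; the waveform is out["wav"]
sf.write("converted.wav", out["wav"].numpy(), samplerate=fs)
```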