[TTS]add StarGANv2-VC model scripts (#2842)

TianYuan 1 year ago committed by GitHub
parent fb31a0f050
commit f7fd111647

@ -10,3 +10,4 @@
* voc2 - MelGAN
* voc3 - MultiBand MelGAN
* ernie_sat - ERNIE-SAT
* vc3 - StarGANv2-VC

@ -0,0 +1,10 @@
You can download the test source audio files from [test_wav.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/starganv2vc/test_wav.zip).
Test Voice Conversion:
```bash
wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/starganv2vc/test_wav.zip
unzip test_wav.zip
./run.sh --stage 2 --stop-stage 2 --gpus 0
```

@ -0,0 +1,22 @@
generator_params:
dim_in: 64
style_dim: 64
max_conv_dim: 512
w_hpf: 0
F0_channel: 256
mapping_network_params:
num_domains: 20 # num of speakers in StarGANv2
latent_dim: 16
style_dim: 64 # same as style_dim in generator_params
hidden_dim: 512 # same as max_conv_dim in generator_params
style_encoder_params:
dim_in: 64 # same as dim_in in generator_params
style_dim: 64 # same as style_dim in generator_params
num_domains: 20 # same as num_domains in generator_params
max_conv_dim: 512 # same as max_conv_dim in generator_params
discriminator_params:
dim_in: 64 # same as dim_in in generator_params
num_domains: 20 # same as num_domains in mapping_network_params
max_conv_dim: 512 # same as max_conv_dim in generator_params
n_repeat: 4
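For reference, the inference entry point (`vc.py`, shown later in this diff) unpacks each of these sub-dicts directly into the matching model constructor. A minimal sketch mirroring `get_models()` in `vc.py` (the local config path is an assumption):

```python
import yaml
from yacs.config import CfgNode

from paddlespeech.t2s.models.starganv2_vc import Generator
from paddlespeech.t2s.models.starganv2_vc import MappingNetwork
from paddlespeech.t2s.models.starganv2_vc import StyleEncoder

# hypothetical local path to this config
with open('conf/default.yaml') as f:
    config = CfgNode(yaml.safe_load(f))

# each *_params block unpacks straight into the corresponding constructor
generator = Generator(**config['generator_params'])
mapping_network = MappingNetwork(**config['mapping_network_params'])
style_encoder = StyleEncoder(**config['style_encoder_params'])
```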

@ -0,0 +1,18 @@
#!/bin/bash
stage=0
stop_stage=100
config_path=$1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # data preprocessing is not ready yet; `:` keeps the empty block valid bash
    :
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    :
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    :
fi

@ -0,0 +1,13 @@
#!/bin/bash
config_path=$1
train_output_path=$2
python3 ${BIN_DIR}/train.py \
--train-metadata=dump/train/norm/metadata.jsonl \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--config=${config_path} \
--output-dir=${train_output_path} \
--ngpu=1 \
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt

@ -0,0 +1,10 @@
#!/bin/bash
config_path=$1
source_path=$2
output_dir=$3
python3 ${BIN_DIR}/vc.py \
--config_path=${config_path} \
--source_path=${source_path} \
--output_dir=${output_dir}

@ -0,0 +1,13 @@
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=starganv2_vc
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}

@ -0,0 +1,36 @@
#!/bin/bash
set -e
source path.sh
gpus=0,1
stage=0
stop_stage=100
conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_331.pdz
source_path=test_wav/goat_01.wav
output_dir=vc_output
# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this cannot be mixed with positional arguments such as `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
# not ready now
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
./local/preprocess.sh ${conf_path} || exit -1
fi
# not ready now
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train the model; all `ckpt` files are saved under the `train_output_path/checkpoints/` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# voice conversion; the vocoder is pwgan by default
CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_conversion.sh ${conf_path} ${source_path} ${output_dir}|| exit -1
fi

@ -2003,7 +2003,7 @@ g2pw_onnx_models = {
}
# ---------------------------------
-# ------------- Rhy_frontend ---------------
+# ---------- Rhy_frontend ---------
# ---------------------------------
rhy_frontend_models = {
'rhy_e2e': {
@ -2014,3 +2014,16 @@ rhy_frontend_models = {
},
},
}
# ---------------------------------
# ---------- StarGANv2VC ----------
# ---------------------------------
StarGANv2VC_source = {
'1.0' :{
'url': 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/starganv2vc/StarGANv2VC_source.zip',
'md5': '195e169419163f5648030ba84c71f866',
}
}

@ -17,6 +17,11 @@ import numpy as np
import pyworld
from scipy.interpolate import interp1d
from typing import Optional
from typing import Union
from typing_extensions import Literal
class LogMelFBank():
def __init__(self,
@ -27,7 +32,10 @@ class LogMelFBank():
window: str="hann",
n_mels: int=80,
fmin: int=80,
-    fmax: int=7600):
+    fmax: int=7600,
+    norm: Optional[Union[Literal["slaney"], float]]="slaney",
+    htk: bool=False,
+    power: float=1.0):
self.sr = sr
# stft
self.n_fft = n_fft
@ -36,11 +44,14 @@ class LogMelFBank():
self.window = window
self.center = True
self.pad_mode = "reflect"
self.norm = norm
self.htk = htk
# mel
self.n_mels = n_mels
self.fmin = 0 if fmin is None else fmin
self.fmax = sr / 2 if fmax is None else fmax
self.power = power
self.mel_filter = self._create_mel_filter()
@ -50,7 +61,9 @@ class LogMelFBank():
n_fft=self.n_fft,
n_mels=self.n_mels,
fmin=self.fmin,
-    fmax=self.fmax)
+    fmax=self.fmax,
+    norm=self.norm,
+    htk=self.htk)
return mel_filter
def _stft(self, wav: np.ndarray):
@ -66,7 +79,7 @@ class LogMelFBank():
def _spectrogram(self, wav: np.ndarray):
D = self._stft(wav)
-    return np.abs(D)
+    return np.abs(D) ** self.power
def _mel_spectrogram(self, wav: np.ndarray):
S = self._spectrogram(wav)
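The new `norm`, `htk` and `power` arguments let `LogMelFBank` reproduce the HTK-style power mel that StarGANv2-VC expects. A sketch matching `get_mel_extractor()` in `vc.py` below:

```python
from paddlespeech.t2s.datasets.get_feats import LogMelFBank

mel_extractor = LogMelFBank(
    sr=16000,
    n_fft=2048,
    hop_length=300,
    win_length=1200,
    n_mels=80,
    fmin=0,
    fmax=8000,
    norm=None,   # skip slaney area normalization
    htk=True,    # HTK mel scale instead of slaney
    power=2.0)   # power spectrogram instead of magnitude
```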

@ -0,0 +1,13 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

@ -0,0 +1,253 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import time
from pathlib import Path
import librosa
import paddle
import soundfile as sf
import yaml
from yacs.config import CfgNode
from paddlespeech.cli.utils import download_and_decompress
from paddlespeech.resource.pretrained_models import StarGANv2VC_source
from paddlespeech.t2s.datasets.get_feats import LogMelFBank
from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator
from paddlespeech.t2s.models.starganv2_vc import Generator
from paddlespeech.t2s.models.starganv2_vc import JDCNet
from paddlespeech.t2s.models.starganv2_vc import MappingNetwork
from paddlespeech.t2s.models.starganv2_vc import StyleEncoder
from paddlespeech.utils.env import MODEL_HOME
def get_mel_extractor():
sr = 16000
n_fft = 2048
win_length = 1200
hop_length = 300
n_mels = 80
fmin = 0
fmax = sr // 2
mel_extractor = LogMelFBank(
sr=sr,
n_fft=n_fft,
hop_length=hop_length,
win_length=win_length,
n_mels=n_mels,
fmin=fmin,
fmax=fmax,
norm=None,
htk=True,
power=2.0)
return mel_extractor
def preprocess(wave, mel_extractor):
logmel = mel_extractor.get_log_mel_fbank(wave, base='e')
# mel_tensor: (1, n_mels, n_frames), e.g. [1, 80, 1011]
mean, std = -4, 4
mel_tensor = (paddle.to_tensor(logmel.T).unsqueeze(0) - mean) / std
return mel_tensor
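A usage sketch for the two helpers above (the wav path is the test file shipped with this example):

```python
import librosa

wave, sr = librosa.load('test_wav/goat_01.wav', sr=24000)
mel_extractor = get_mel_extractor()
# mel_tensor: (1, n_mels, n_frames), normalized with mean=-4, std=4
mel_tensor = preprocess(wave=wave, mel_extractor=mel_extractor)
```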
def compute_style(speaker_dicts, mel_extractor, style_encoder, mapping_network):
reference_embeddings = {}
for key, (path, speaker) in speaker_dicts.items():
if path == '':
label = paddle.to_tensor([speaker], dtype=paddle.int64)
latent_dim = mapping_network.shared[0].weight.shape[0]
ref = mapping_network(paddle.randn([1, latent_dim]), label)
else:
wave, sr = librosa.load(path, sr=24000)
# NOTE: the trimmed audio is computed but not used; kept from the
# reference implementation
audio, index = librosa.effects.trim(wave, top_db=30)
if sr != 24000:
    wave = librosa.resample(wave, orig_sr=sr, target_sr=24000)
mel_tensor = preprocess(wave=wave, mel_extractor=mel_extractor)
with paddle.no_grad():
label = paddle.to_tensor([speaker], dtype=paddle.int64)
ref = style_encoder(mel_tensor.unsqueeze(1), label)
reference_embeddings[key] = (ref, label)
return reference_embeddings
def get_models(args, uncompress_path):
model_dict = {}
jdc_model_dir = os.path.join(uncompress_path, 'jdcnet.pdz')
voc_model_dir = os.path.join(uncompress_path, 'Vocoder/')
starganv2vc_model_dir = os.path.join(uncompress_path, 'starganv2vc.pdz')
F0_model = JDCNet(num_class=1, seq_len=192)
F0_model.set_state_dict(paddle.load(jdc_model_dir)['main_params'])
F0_model.eval()
voc_config_path = os.path.join(voc_model_dir, 'config.yml')
with open(voc_config_path) as f:
voc_config = CfgNode(yaml.safe_load(f))
voc_config["generator_params"].pop("upsample_net")
voc_config["generator_params"]["upsample_scales"] = voc_config[
"generator_params"].pop("upsample_params")["upsample_scales"]
vocoder = PWGGenerator(**voc_config["generator_params"])
vocoder.remove_weight_norm()
vocoder.eval()
voc_model_path = os.path.join(voc_model_dir, 'checkpoint-400000steps.pd')
vocoder.set_state_dict(paddle.load(voc_model_path))
with open(args.config_path) as f:
config = CfgNode(yaml.safe_load(f))
generator = Generator(**config['generator_params'])
mapping_network = MappingNetwork(**config['mapping_network_params'])
style_encoder = StyleEncoder(**config['style_encoder_params'])
starganv2vc_model_param = paddle.load(starganv2vc_model_dir)
generator.set_state_dict(starganv2vc_model_param['generator_params'])
mapping_network.set_state_dict(
starganv2vc_model_param['mapping_network_params'])
style_encoder.set_state_dict(
starganv2vc_model_param['style_encoder_params'])
generator.eval()
mapping_network.eval()
style_encoder.eval()
model_dict['F0_model'] = F0_model
model_dict['vocoder'] = vocoder
model_dict['generator'] = generator
model_dict['mapping_network'] = mapping_network
model_dict['style_encoder'] = style_encoder
return model_dict
def voice_conversion(args, uncompress_path):
speakers = [
225, 228, 229, 230, 231, 233, 236, 239, 240, 244, 226, 227, 232, 243,
254, 256, 258, 259, 270, 273
]
demo_dir = os.path.join(uncompress_path, 'Demo/VCTK-corpus/')
model_dict = get_models(args, uncompress_path=uncompress_path)
style_encoder = model_dict['style_encoder']
mapping_network = model_dict['mapping_network']
generator = model_dict['generator']
vocoder = model_dict['vocoder']
F0_model = model_dict['F0_model']
# compute the style embeddings of the speakers under the Demo folder
speaker_dicts = {}
selected_speakers = [273, 259, 258, 243, 254, 244, 236, 233, 230, 228]
for s in selected_speakers:
k = s
speaker_dicts['p' + str(s)] = (
demo_dir + 'p' + str(k) + '/p' + str(k) + '_023.wav',
speakers.index(s))
mel_extractor = get_mel_extractor()
reference_embeddings = compute_style(
speaker_dicts=speaker_dicts,
mel_extractor=mel_extractor,
style_encoder=style_encoder,
mapping_network=mapping_network)
wave, sr = librosa.load(args.source_path, sr=24000)
source = preprocess(wave=wave, mel_extractor=mel_extractor)
output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
orig_wav_name = str(output_dir / 'orig_voc.wav')
print('Original audio (decoded with the vocoder): %s' % orig_wav_name)
c = source.transpose([0, 2, 1]).squeeze()
with paddle.no_grad():
recon = vocoder.inference(c)
recon = recon.reshape([-1]).numpy()
sf.write(orig_wav_name, recon, samplerate=24000)
keys = []
converted_samples = {}
reconstructed_samples = {}
converted_mels = {}
start = time.time()
for key, (ref, _) in reference_embeddings.items():
with paddle.no_grad():
# Open question: can the features fed to F0_model skip the norm, or must
# the norm match the original StarGAN implementation?
# !! The ASR model and F0_model must use the same data preprocessing.
# If we don't want to retrain the ASR and F0 models, our preprocessing has
# to match the original StarGAN implementation, but then the vocoder
# cannot be reused.
# Is it because the ASR input is 16 kHz that the torchaudio parameters
# are also 16 kHz?
f0_feat = F0_model.get_feature_GAN(source.unsqueeze(1))
# the output is a normalized mel, so vocoder.inference can be used directly
out = generator(source.unsqueeze(1), ref, F0=f0_feat)
c = out.transpose([0, 1, 3, 2]).squeeze()
y_out = vocoder.inference(c)
y_out = y_out.reshape([-1])
if key not in speaker_dicts or speaker_dicts[key][0] == "":
recon = None
else:
wave, sr = librosa.load(speaker_dicts[key][0], sr=24000)
mel = preprocess(wave=wave, mel_extractor=mel_extractor)
c = mel.transpose([0, 2, 1]).squeeze()
recon = vocoder.inference(c)
recon = recon.reshape([-1]).numpy()
converted_samples[key] = y_out.numpy()
reconstructed_samples[key] = recon
converted_mels[key] = out
keys.append(key)
end = time.time()
print('Total time cost: %.3f sec' % (end - start))
for key, wave in converted_samples.items():
wav_name = str(output_dir / ('vc_result_' + key + '.wav'))
print('Voice conversion result: %s' % wav_name)
sf.write(wav_name, wave, samplerate=24000)
ref_wav_name = str(output_dir / ('ref_voc_' + key + '.wav'))
print('Reference speaker (decoded with the vocoder): %s' % ref_wav_name)
if reconstructed_samples[key] is not None:
sf.write(ref_wav_name, reconstructed_samples[key], samplerate=24000)
def parse_args():
# parse args and config
parser = argparse.ArgumentParser(
description="StarGANv2-VC Voice Conversion.")
parser.add_argument("--source_path", type=str, help="source audio's path.")
parser.add_argument("--output_dir", type=str, help="output dir.")
parser.add_argument(
'--config_path',
type=str,
default=None,
help='Config of StarGANv2-VC model.')
parser.add_argument(
"--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
args = parser.parse_args()
return args
def main():
args = parse_args()
if args.ngpu == 0:
paddle.set_device("cpu")
elif args.ngpu > 0:
paddle.set_device("gpu")
else:
print("ngpu should >= 0 !")
model_version = '1.0'
uncompress_path = download_and_decompress(StarGANv2VC_source[model_version],
MODEL_HOME)
voice_conversion(args, uncompress_path=uncompress_path)
if __name__ == "__main__":
main()
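A hedged sketch of driving the converter programmatically instead of via `local/voice_conversion.sh` (paths taken from the defaults in `run.sh`):

```python
import argparse

import paddle

from paddlespeech.cli.utils import download_and_decompress
from paddlespeech.resource.pretrained_models import StarGANv2VC_source
from paddlespeech.utils.env import MODEL_HOME

args = argparse.Namespace(
    config_path='conf/default.yaml',
    source_path='test_wav/goat_01.wav',
    output_dir='vc_output',
    ngpu=1)
paddle.set_device('gpu')
uncompress_path = download_and_decompress(StarGANv2VC_source['1.0'],
                                          MODEL_HOME)
voice_conversion(args, uncompress_path=uncompress_path)
```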

@ -0,0 +1,13 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

@ -0,0 +1,29 @@
log_dir: "logs"
save_freq: 20
device: "cuda"
epochs: 180
batch_size: 48
pretrained_model: ""
train_data: "asr_train_list.txt"
val_data: "asr_val_list.txt"
dataset_params:
data_augmentation: true
preprocess_params:
sr: 24000
spect_params:
n_fft: 2048
win_length: 1200
hop_length: 300
mel_params:
n_mels: 80
model_params:
input_dim: 80
hidden_dim: 256
n_token: 80
token_embedding_dim: 256
optimizer_params:
lr: 0.0005

@ -0,0 +1,480 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
import paddle
import paddle.nn.functional as F
import paddleaudio.functional as audio_F
from paddle import nn
from paddlespeech.utils.initialize import _calculate_gain
from paddlespeech.utils.initialize import xavier_uniform_
def _get_activation_fn(activ):
if activ == 'relu':
return nn.ReLU()
elif activ == 'lrelu':
return nn.LeakyReLU(0.2)
elif activ == 'swish':
return nn.Swish()
else:
raise RuntimeError(
'Unexpected activ type %s, expected [relu, lrelu, swish]' % activ)
class LinearNorm(nn.Layer):
def __init__(self,
in_dim: int,
out_dim: int,
bias: bool=True,
w_init_gain: str='linear'):
super().__init__()
self.linear_layer = nn.Linear(in_dim, out_dim, bias_attr=bias)
xavier_uniform_(
self.linear_layer.weight, gain=_calculate_gain(w_init_gain))
def forward(self, x: paddle.Tensor):
return self.linear_layer(x)
class ConvNorm(nn.Layer):
def __init__(self,
in_channels: int,
out_channels: int,
kernel_size: int=1,
stride: int=1,
padding: int=None,
dilation: int=1,
bias: bool=True,
w_init_gain: str='linear',
param=None):
super().__init__()
if padding is None:
assert (kernel_size % 2 == 1)
padding = int(dilation * (kernel_size - 1) / 2)
self.conv = nn.Conv1D(
in_channels,
out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
bias_attr=bias)
xavier_uniform_(
self.conv.weight, gain=_calculate_gain(w_init_gain, param=param))
def forward(self, signal: paddle.Tensor):
conv_signal = self.conv(signal)
return conv_signal
class CausualConv(nn.Layer):
def __init__(self,
in_channels: int,
out_channels: int,
kernel_size: int=1,
stride: int=1,
padding: int=1,
dilation: int=1,
bias: bool=True,
w_init_gain: str='linear',
param=None):
super().__init__()
if padding is None:
    assert (kernel_size % 2 == 1)
    padding = int(dilation * (kernel_size - 1) / 2)
# total causal padding; the excess is trimmed off in forward()
self.padding = padding * 2
self.conv = nn.Conv1D(
in_channels,
out_channels,
kernel_size=kernel_size,
stride=stride,
padding=self.padding,
dilation=dilation,
bias_attr=bias)
xavier_uniform_(
self.conv.weight, gain=_calculate_gain(w_init_gain, param=param))
def forward(self, x: paddle.Tensor):
x = self.conv(x)
x = x[:, :, :-self.padding]
return x
class CausualBlock(nn.Layer):
def __init__(self,
hidden_dim: int,
n_conv: int=3,
dropout_p: float=0.2,
activ: str='lrelu'):
super().__init__()
self.blocks = nn.LayerList([
self._get_conv(
hidden_dim=hidden_dim,
dilation=3**i,
activ=activ,
dropout_p=dropout_p) for i in range(n_conv)
])
def forward(self, x):
for block in self.blocks:
res = x
x = block(x)
x += res
return x
def _get_conv(self,
hidden_dim: int,
dilation: int,
activ: str='lrelu',
dropout_p: float=0.2):
layers = [
CausualConv(
in_channels=hidden_dim,
out_channels=hidden_dim,
kernel_size=3,
padding=dilation,
dilation=dilation), _get_activation_fn(activ),
nn.BatchNorm1D(hidden_dim), nn.Dropout(p=dropout_p), CausualConv(
in_channels=hidden_dim,
out_channels=hidden_dim,
kernel_size=3,
padding=1,
dilation=1), _get_activation_fn(activ), nn.Dropout(p=dropout_p)
]
return nn.Sequential(*layers)
class ConvBlock(nn.Layer):
def __init__(self,
hidden_dim: int,
n_conv: int=3,
dropout_p: float=0.2,
activ: str='relu'):
super().__init__()
self._n_groups = 8
self.blocks = nn.LayerList([
self._get_conv(
hidden_dim=hidden_dim,
dilation=3**i,
activ=activ,
dropout_p=dropout_p) for i in range(n_conv)
])
def forward(self, x: paddle.Tensor):
for block in self.blocks:
res = x
x = block(x)
x += res
return x
def _get_conv(self,
hidden_dim: int,
dilation: int,
activ: str='relu',
dropout_p: float=0.2):
layers = [
ConvNorm(
in_channels=hidden_dim,
out_channels=hidden_dim,
kernel_size=3,
padding=dilation,
dilation=dilation), _get_activation_fn(activ),
nn.GroupNorm(num_groups=self._n_groups, num_channels=hidden_dim),
nn.Dropout(p=dropout_p), ConvNorm(
hidden_dim, hidden_dim, kernel_size=3, padding=1,
dilation=1), _get_activation_fn(activ), nn.Dropout(p=dropout_p)
]
return nn.Sequential(*layers)
class LocationLayer(nn.Layer):
def __init__(self,
attention_n_filters: int,
attention_kernel_size: int,
attention_dim: int):
super().__init__()
padding = int((attention_kernel_size - 1) / 2)
self.location_conv = ConvNorm(
in_channels=2,
out_channels=attention_n_filters,
kernel_size=attention_kernel_size,
padding=padding,
bias=False,
stride=1,
dilation=1)
self.location_dense = LinearNorm(
in_dim=attention_n_filters,
out_dim=attention_dim,
bias=False,
w_init_gain='tanh')
def forward(self, attention_weights_cat: paddle.Tensor):
processed_attention = self.location_conv(attention_weights_cat)
processed_attention = processed_attention.transpose([0, 2, 1])
processed_attention = self.location_dense(processed_attention)
return processed_attention
class Attention(nn.Layer):
def __init__(self,
attention_rnn_dim: int,
embedding_dim: int,
attention_dim: int,
attention_location_n_filters: int,
attention_location_kernel_size: int):
super().__init__()
self.query_layer = LinearNorm(
in_dim=attention_rnn_dim,
out_dim=attention_dim,
bias=False,
w_init_gain='tanh')
self.memory_layer = LinearNorm(
in_dim=embedding_dim,
out_dim=attention_dim,
bias=False,
w_init_gain='tanh')
self.v = LinearNorm(in_dim=attention_dim, out_dim=1, bias=False)
self.location_layer = LocationLayer(
attention_n_filters=attention_location_n_filters,
attention_kernel_size=attention_location_kernel_size,
attention_dim=attention_dim)
self.score_mask_value = -float("inf")
def get_alignment_energies(self,
query: paddle.Tensor,
processed_memory: paddle.Tensor,
attention_weights_cat: paddle.Tensor):
"""
Args:
query:
decoder output (batch, n_mel_channels * n_frames_per_step)
processed_memory:
processed encoder outputs (B, T_in, attention_dim)
attention_weights_cat:
cumulative and prev. att weights (B, 2, max_time)
Returns:
Tensor: alignment (batch, max_time)
"""
processed_query = self.query_layer(query.unsqueeze(1))
processed_attention_weights = self.location_layer(attention_weights_cat)
energies = self.v(
paddle.tanh(processed_query + processed_attention_weights +
processed_memory))
energies = energies.squeeze(-1)
return energies
def forward(self,
attention_hidden_state: paddle.Tensor,
memory: paddle.Tensor,
processed_memory: paddle.Tensor,
attention_weights_cat: paddle.Tensor,
mask: paddle.Tensor):
"""
Args:
attention_hidden_state:
attention rnn last output
memory:
encoder outputs
processed_memory:
processed encoder outputs
attention_weights_cat:
previous and cumulative attention weights
mask:
binary mask for padded data
"""
alignment = self.get_alignment_energies(
query=attention_hidden_state,
processed_memory=processed_memory,
attention_weights_cat=attention_weights_cat)
if mask is not None:
    # paddle tensors have no masked_fill_ (a torch API); mask with
    # where() as in ForwardAttentionV2 below
    alignment = paddle.where(
        mask,
        paddle.full(alignment.shape, self.score_mask_value,
                    alignment.dtype), alignment)
attention_weights = F.softmax(alignment, axis=1)
attention_context = paddle.bmm(attention_weights.unsqueeze(1), memory)
attention_context = attention_context.squeeze(1)
return attention_context, attention_weights
class ForwardAttentionV2(nn.Layer):
def __init__(self,
attention_rnn_dim: int,
embedding_dim: int,
attention_dim: int,
attention_location_n_filters: int,
attention_location_kernel_size: int):
super().__init__()
self.query_layer = LinearNorm(
in_dim=attention_rnn_dim,
out_dim=attention_dim,
bias=False,
w_init_gain='tanh')
self.memory_layer = LinearNorm(
in_dim=embedding_dim,
out_dim=attention_dim,
bias=False,
w_init_gain='tanh')
self.v = LinearNorm(in_dim=attention_dim, out_dim=1, bias=False)
self.location_layer = LocationLayer(
attention_n_filters=attention_location_n_filters,
attention_kernel_size=attention_location_kernel_size,
attention_dim=attention_dim)
self.score_mask_value = -float(1e20)
def get_alignment_energies(self,
query: paddle.Tensor,
processed_memory: paddle.Tensor,
attention_weights_cat: paddle.Tensor):
"""
Args:
query:
decoder output (batch, n_mel_channels * n_frames_per_step)
processed_memory:
processed encoder outputs (B, T_in, attention_dim)
attention_weights_cat:
prev. and cumulative att weights (B, 2, max_time)
Returns:
Tensor: alignment (batch, max_time)
"""
processed_query = self.query_layer(query.unsqueeze(1))
processed_attention_weights = self.location_layer(attention_weights_cat)
energies = self.v(
paddle.tanh(processed_query + processed_attention_weights +
processed_memory))
energies = energies.squeeze(-1)
return energies
def forward(self,
attention_hidden_state: paddle.Tensor,
memory: paddle.Tensor,
processed_memory: paddle.Tensor,
attention_weights_cat: paddle.Tensor,
mask: paddle.Tensor,
log_alpha: paddle.Tensor):
"""
Args:
attention_hidden_state:
attention rnn last output
memory:
encoder outputs
processed_memory:
processed encoder outputs
attention_weights_cat:
previous and cumulative attention weights
mask:
binary mask for padded data
"""
log_energy = self.get_alignment_energies(
query=attention_hidden_state,
processed_memory=processed_memory,
attention_weights_cat=attention_weights_cat)
if mask is not None:
log_energy[:] = paddle.where(
mask,
paddle.full(log_energy.shape, self.score_mask_value,
log_energy.dtype), log_energy)
log_alpha_shift_padded = []
max_time = log_energy.shape[1]
for sft in range(2):
shifted = log_alpha[:, :max_time - sft]
shift_padded = F.pad(shifted, (sft, 0), 'constant',
self.score_mask_value)
log_alpha_shift_padded.append(shift_padded.unsqueeze(2))
biased = paddle.logsumexp(paddle.concat(log_alpha_shift_padded, 2), 2)
log_alpha_new = biased + log_energy
attention_weights = F.softmax(log_alpha_new, axis=1)
attention_context = paddle.bmm(attention_weights.unsqueeze(1), memory)
attention_context = attention_context.squeeze(1)
return attention_context, attention_weights, log_alpha_new
class PhaseShuffle2D(nn.Layer):
def __init__(self, n: int=2):
super().__init__()
self.n = n
self.random = random.Random(1)
def forward(self, x: paddle.Tensor, move: int=None):
# x.size = (B, C, M, L)
if move is None:
move = self.random.randint(-self.n, self.n)
if move == 0:
return x
else:
left = x[:, :, :, :move]
right = x[:, :, :, move:]
shuffled = paddle.concat([right, left], axis=3)
return shuffled
class PhaseShuffle1D(nn.Layer):
def __init__(self, n: int=2):
super().__init__()
self.n = n
self.random = random.Random(1)
def forward(self, x: paddle.Tensor, move: int=None):
# x.size = (B, C, M, L)
if move is None:
move = self.random.randint(-self.n, self.n)
if move == 0:
return x
else:
left = x[:, :, :move]
right = x[:, :, move:]
shuffled = paddle.concat([right, left], axis=2)
return shuffled
class MFCC(nn.Layer):
def __init__(self, n_mfcc: int=40, n_mels: int=80):
super().__init__()
self.n_mfcc = n_mfcc
self.n_mels = n_mels
self.norm = 'ortho'
dct_mat = audio_F.create_dct(self.n_mfcc, self.n_mels, self.norm)
self.register_buffer('dct_mat', dct_mat)
def forward(self, mel_specgram: paddle.Tensor):
if len(mel_specgram.shape) == 2:
mel_specgram = mel_specgram.unsqueeze(0)
unsqueezed = True
else:
unsqueezed = False
# (channel, n_mels, time).transpose(...) dot (n_mels, n_mfcc)
# -> (channel, time, n_mfcc).transpose(...)
mfcc = paddle.matmul(mel_specgram.transpose([0, 2, 1]),
self.dct_mat).transpose([0, 2, 1])
# unpack batch
if unsqueezed:
mfcc = mfcc.squeeze(0)
return mfcc
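A shape sketch for `MFCC`: both unbatched `(n_mels, time)` and batched `(channel, n_mels, time)` inputs are supported, mapping 80 mel bins to 40 coefficients with the defaults:

```python
import paddle

mfcc = MFCC(n_mfcc=40, n_mels=80)
coeffs = mfcc(paddle.randn([80, 100]))      # -> [40, 100]
batched = mfcc(paddle.randn([2, 80, 100]))  # -> [2, 40, 100]
```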

@ -0,0 +1,239 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle
import paddle.nn.functional as F
from paddle import nn
from .layers import Attention
from .layers import ConvBlock
from .layers import ConvNorm
from .layers import LinearNorm
from .layers import MFCC
from paddlespeech.utils.initialize import uniform_
class ASRCNN(nn.Layer):
def __init__(
self,
input_dim: int=80,
hidden_dim: int=256,
n_token: int=35,
n_layers: int=6,
token_embedding_dim: int=256, ):
super().__init__()
self.n_token = n_token
self.n_down = 1
self.to_mfcc = MFCC()
self.init_cnn = ConvNorm(
in_channels=input_dim // 2,
out_channels=hidden_dim,
kernel_size=7,
padding=3,
stride=2)
self.cnns = nn.Sequential(* [
nn.Sequential(
ConvBlock(hidden_dim),
nn.GroupNorm(num_groups=1, num_channels=hidden_dim))
for n in range(n_layers)
])
self.projection = ConvNorm(
in_channels=hidden_dim, out_channels=hidden_dim // 2)
self.ctc_linear = nn.Sequential(
LinearNorm(in_dim=hidden_dim // 2, out_dim=hidden_dim),
nn.ReLU(), LinearNorm(in_dim=hidden_dim, out_dim=n_token))
self.asr_s2s = ASRS2S(
embedding_dim=token_embedding_dim,
hidden_dim=hidden_dim // 2,
n_token=n_token)
def forward(self,
x: paddle.Tensor,
src_key_padding_mask: paddle.Tensor=None,
text_input: paddle.Tensor=None):
x = self.to_mfcc(x)
x = self.init_cnn(x)
x = self.cnns(x)
x = self.projection(x)
x = x.transpose([0, 2, 1])
ctc_logit = self.ctc_linear(x)
if text_input is not None:
_, s2s_logit, s2s_attn = self.asr_s2s(
memory=x,
memory_mask=src_key_padding_mask,
text_input=text_input)
return ctc_logit, s2s_logit, s2s_attn
else:
return ctc_logit
def get_feature(self, x: paddle.Tensor):
x = self.to_mfcc(x.squeeze(1))
x = self.init_cnn(x)
x = self.cnns(x)
x = self.projection(x)
return x
def length_to_mask(self, lengths: paddle.Tensor):
mask = paddle.arange(lengths.max()).unsqueeze(0).expand(
(lengths.shape[0], -1)).astype(lengths.dtype)
mask = paddle.greater_than(mask + 1, lengths.unsqueeze(1))
return mask
def get_future_mask(self, out_length: int, unmask_future_steps: int=0):
"""
Args:
out_length (int):
returned mask shape is (out_length, out_length).
unmask_future_steps (int):
    unmasking future step size.
Return:
    mask (paddle.BoolTensor):
        mask of future timesteps; mask[i, j] = True if j > i + unmask_future_steps else False
"""
index_tensor = paddle.arange(out_length).unsqueeze(0).expand(
[out_length, -1])
mask = paddle.greater_than(index_tensor,
index_tensor.T + unmask_future_steps)
return mask
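A worked example of `get_future_mask`: the mask is True exactly where the key index `j` lies more than `unmask_future_steps` ahead of the query index `i`:

```python
import paddle

model = ASRCNN()
print(model.get_future_mask(4))
# [[False,  True,  True,  True],
#  [False, False,  True,  True],
#  [False, False, False,  True],
#  [False, False, False, False]]
```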
class ASRS2S(nn.Layer):
def __init__(self,
embedding_dim: int=256,
hidden_dim: int=512,
n_location_filters: int=32,
location_kernel_size: int=63,
n_token: int=40):
super().__init__()
self.embedding = nn.Embedding(n_token, embedding_dim)
val_range = math.sqrt(6 / hidden_dim)
uniform_(self.embedding.weight, -val_range, val_range)
self.decoder_rnn_dim = hidden_dim
self.project_to_n_symbols = nn.Linear(self.decoder_rnn_dim, n_token)
self.attention_layer = Attention(
attention_rnn_dim=self.decoder_rnn_dim,
embedding_dim=hidden_dim,
attention_dim=hidden_dim,
attention_location_n_filters=n_location_filters,
attention_location_kernel_size=location_kernel_size)
self.decoder_rnn = nn.LSTMCell(self.decoder_rnn_dim + embedding_dim,
self.decoder_rnn_dim)
self.project_to_hidden = nn.Sequential(
LinearNorm(in_dim=self.decoder_rnn_dim * 2, out_dim=hidden_dim),
nn.Tanh())
self.sos = 1
self.eos = 2
def initialize_decoder_states(self,
memory: paddle.Tensor,
mask: paddle.Tensor):
"""
memory.shape = (B, L, H) = (Batchsize, Maxtimestep, Hiddendim)
"""
B, L, H = memory.shape
dtype = memory.dtype
self.decoder_hidden = paddle.zeros(
(B, self.decoder_rnn_dim)).astype(dtype)
self.decoder_cell = paddle.zeros(
(B, self.decoder_rnn_dim)).astype(dtype)
self.attention_weights = paddle.zeros((B, L)).astype(dtype)
self.attention_weights_cum = paddle.zeros((B, L)).astype(dtype)
self.attention_context = paddle.zeros((B, H)).astype(dtype)
self.memory = memory
self.processed_memory = self.attention_layer.memory_layer(memory)
self.mask = mask
self.unk_index = 3
self.random_mask = 0.1
def forward(self,
memory: paddle.Tensor,
memory_mask: paddle.Tensor,
text_input: paddle.Tensor):
"""
memory.shape = (B, L, H) = (Batchsize, Maxtimestep, Hiddendim)
memory_mask.shape = (B, L)
text_input.shape = (B, T)
"""
self.initialize_decoder_states(memory, memory_mask)
# text random mask
random_mask = (paddle.rand(text_input.shape) < self.random_mask)
_text_input = text_input.clone()
_text_input[:] = paddle.where(
condition=random_mask,
x=paddle.full(
shape=_text_input.shape,
fill_value=self.unk_index,
dtype=_text_input.dtype),
y=_text_input)
decoder_inputs = self.embedding(_text_input).transpose(
[1, 0, 2]) # -> [T, B, channel]
start_embedding = self.embedding(
paddle.to_tensor(
[self.sos] * decoder_inputs.shape[1], dtype=paddle.int64))
decoder_inputs = paddle.concat(
(start_embedding.unsqueeze(0), decoder_inputs), axis=0)
hidden_outputs, logit_outputs, alignments = [], [], []
while len(hidden_outputs) < decoder_inputs.shape[0]:
decoder_input = decoder_inputs[len(hidden_outputs)]
hidden, logit, attention_weights = self.decode(decoder_input)
hidden_outputs += [hidden]
logit_outputs += [logit]
alignments += [attention_weights]
hidden_outputs, logit_outputs, alignments = \
self.parse_decoder_outputs(
hidden_outputs, logit_outputs, alignments)
return hidden_outputs, logit_outputs, alignments
def decode(self, decoder_input: paddle.Tensor):
cell_input = paddle.concat((decoder_input, self.attention_context), -1)
# paddle's LSTMCell has no flatten_parameters (a torch.nn API) and
# returns (outputs, (h, c)); unpack the new states from the tuple
_, (self.decoder_hidden, self.decoder_cell) = self.decoder_rnn(
    cell_input, (self.decoder_hidden, self.decoder_cell))
attention_weights_cat = paddle.concat(
(self.attention_weights.unsqueeze(1),
self.attention_weights_cum.unsqueeze(1)),
axis=1)
self.attention_context, self.attention_weights = self.attention_layer(
self.decoder_hidden, self.memory, self.processed_memory,
attention_weights_cat, self.mask)
self.attention_weights_cum += self.attention_weights
hidden_and_context = paddle.concat(
(self.decoder_hidden, self.attention_context), -1)
hidden = self.project_to_hidden(hidden_and_context)
# dropout to improve generalization
logit = self.project_to_n_symbols(F.dropout(hidden, 0.5, self.training))
return hidden, logit, self.attention_weights
def parse_decoder_outputs(self,
hidden: paddle.Tensor,
logit: paddle.Tensor,
alignments: paddle.Tensor):
# -> [B, T_out + 1, max_time]
alignments = paddle.stack(alignments).transpose([1, 0, 2])
# [T_out + 1, B, n_symbols] -> [B, T_out + 1, n_symbols]
logit = paddle.stack(logit).transpose([1, 0, 2])
hidden = paddle.stack(hidden).transpose([1, 0, 2])
return hidden, logit, alignments

@ -0,0 +1,13 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

@ -0,0 +1,234 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Implementation of model from:
Kum et al. - "Joint Detection and Classification of Singing Voice Melody Using
Convolutional Recurrent Neural Networks" (2019)
Link: https://www.semanticscholar.org/paper/Joint-Detection-and-Classification-of-Singing-Voice-Kum-Nam/60a2ad4c7db43bace75805054603747fcd062c0d
"""
import paddle
from paddle import nn
class JDCNet(nn.Layer):
"""
Joint Detection and Classification Network model for singing voice melody.
"""
def __init__(self,
num_class: int=722,
seq_len: int=31,
leaky_relu_slope: float=0.01):
super().__init__()
self.seq_len = seq_len
self.num_class = num_class
# input = (b, 1, 31, 513), b = batch size
self.conv_block = nn.Sequential(
# out: (b, 64, 31, 513)
nn.Conv2D(
in_channels=1,
out_channels=64,
kernel_size=3,
padding=1,
bias_attr=False),
nn.BatchNorm2D(num_features=64),
nn.LeakyReLU(leaky_relu_slope),
# (b, 64, 31, 513)
nn.Conv2D(64, 64, 3, padding=1, bias_attr=False), )
# res blocks
# (b, 128, 31, 128)
self.res_block1 = ResBlock(in_channels=64, out_channels=128)
# (b, 192, 31, 32)
self.res_block2 = ResBlock(in_channels=128, out_channels=192)
# (b, 256, 31, 8)
self.res_block3 = ResBlock(in_channels=192, out_channels=256)
# pool block
self.pool_block = nn.Sequential(
nn.BatchNorm2D(num_features=256),
nn.LeakyReLU(leaky_relu_slope),
# (b, 256, 31, 2)
nn.MaxPool2D(kernel_size=(1, 4)),
nn.Dropout(p=0.5), )
# maxpool layers (for auxiliary network inputs)
# in = (b, 64, 31, 513) from conv_block, out = (b, 64, 31, 2)
self.maxpool1 = nn.MaxPool2D(kernel_size=(1, 40))
# in = (b, 128, 31, 128) from res_block1, out = (b, 128, 31, 2)
self.maxpool2 = nn.MaxPool2D(kernel_size=(1, 20))
# in = (b, 192, 31, 32) from res_block2, out = (b, 192, 31, 2)
self.maxpool3 = nn.MaxPool2D(kernel_size=(1, 10))
# in = (b, 640, 31, 2), out = (b, 256, 31, 2)
self.detector_conv = nn.Sequential(
nn.Conv2D(
in_channels=640,
out_channels=256,
kernel_size=1,
bias_attr=False),
nn.BatchNorm2D(256),
nn.LeakyReLU(leaky_relu_slope),
nn.Dropout(p=0.5), )
# input: (b, 31, 512) - resized from (b, 256, 31, 2)
# output: (b, 31, 512)
self.bilstm_classifier = nn.LSTM(
input_size=512,
hidden_size=256,
time_major=False,
direction='bidirectional')
# input: (b, 31, 512) - resized from (b, 256, 31, 2)
# output: (b, 31, 512)
self.bilstm_detector = nn.LSTM(
input_size=512,
hidden_size=256,
time_major=False,
direction='bidirectional')
# input: (b * 31, 512)
# output: (b * 31, num_class)
self.classifier = nn.Linear(
in_features=512, out_features=self.num_class)
# input: (b * 31, 512)
# output: (b * 31, 2) - binary classifier
self.detector = nn.Linear(in_features=512, out_features=2)
# initialize weights
self.apply(self.init_weights)
def get_feature_GAN(self, x: paddle.Tensor):
seq_len = x.shape[-2]
x = x.astype(paddle.float32).transpose([0, 1, 3, 2] if len(x.shape) == 4
else [0, 2, 1])
convblock_out = self.conv_block(x)
resblock1_out = self.res_block1(convblock_out)
resblock2_out = self.res_block2(resblock1_out)
resblock3_out = self.res_block3(resblock2_out)
poolblock_out = self.pool_block[0](resblock3_out)
poolblock_out = self.pool_block[1](poolblock_out)
return poolblock_out.transpose([0, 1, 3, 2] if len(poolblock_out.shape)
== 4 else [0, 2, 1])
def forward(self, x: paddle.Tensor):
"""
Returns:
classification_prediction, detection_prediction
sizes: (b, 31, 722), (b, 31, 2)
"""
###############################
# forward pass for classifier #
###############################
x = x.transpose([0, 1, 3, 2] if len(x.shape) == 4 else
[0, 2, 1]).astype(paddle.float32)
convblock_out = self.conv_block(x)
resblock1_out = self.res_block1(convblock_out)
resblock2_out = self.res_block2(resblock1_out)
resblock3_out = self.res_block3(resblock2_out)
poolblock_out = self.pool_block[0](resblock3_out)
poolblock_out = self.pool_block[1](poolblock_out)
GAN_feature = poolblock_out.transpose([0, 1, 3, 2] if len(
poolblock_out.shape) == 4 else [0, 2, 1])
poolblock_out = self.pool_block[2](poolblock_out)
# (b, 256, 31, 2) => (b, 31, 256, 2) => (b, 31, 512)
classifier_out = poolblock_out.transpose([0, 2, 1, 3]).reshape(
(-1, self.seq_len, 512))
self.bilstm_classifier.flatten_parameters()
classifier_out, _ = self.bilstm_classifier(
classifier_out) # ignore the hidden states
classifier_out = classifier_out.reshape((-1, 512)) # (b * 31, 512)
classifier_out = self.classifier(classifier_out)
classifier_out = classifier_out.reshape(
(-1, self.seq_len, self.num_class)) # (b, 31, num_class)
# sizes: (b, 31, 722), (b, 31, 2)
# classifier output consists of predicted pitch classes per frame
# detector output consists of: (isvoice, notvoice) estimates per frame
return paddle.abs(classifier_out.squeeze()), GAN_feature, poolblock_out
@staticmethod
def init_weights(m):
if isinstance(m, nn.Linear):
nn.initializer.KaimingUniform()(m.weight)
if m.bias is not None:
nn.initializer.Constant(0)(m.bias)
elif isinstance(m, nn.Conv2D):
nn.initializer.XavierNormal()(m.weight)
elif isinstance(m, nn.LSTM) or isinstance(m, nn.LSTMCell):
for p in m.parameters():
if len(p.shape) >= 2:
nn.initializer.Orthogonal()(p)
else:
nn.initializer.Normal()(p)
class ResBlock(nn.Layer):
def __init__(self,
in_channels: int,
out_channels: int,
leaky_relu_slope=0.01):
super().__init__()
self.downsample = in_channels != out_channels
# BN / LReLU / MaxPool layer before the conv layer - see Figure 1b in the paper
self.pre_conv = nn.Sequential(
nn.BatchNorm2D(num_features=in_channels),
nn.LeakyReLU(leaky_relu_slope),
# apply downsampling on the y axis only
nn.MaxPool2D(kernel_size=(1, 2)), )
# conv layers
self.conv = nn.Sequential(
nn.Conv2D(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=3,
padding=1,
bias_attr=False),
nn.BatchNorm2D(out_channels),
nn.LeakyReLU(leaky_relu_slope),
nn.Conv2D(
in_channels=out_channels,
out_channels=out_channels,
kernel_size=3,
padding=1,
bias_attr=False), )
# 1 x 1 convolution layer to match the feature dimensions
self.conv1by1 = None
if self.downsample:
self.conv1by1 = nn.Conv2D(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=1,
bias_attr=False)
def forward(self, x: paddle.Tensor):
x = self.pre_conv(x)
if self.downsample:
x = self.conv(x) + self.conv1by1(x)
else:
x = self.conv(x) + x
return x
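A smoke-test sketch mirroring how `vc.py` uses `JDCNet`: the F0 feature for the GAN is extracted from a normalized log-mel of shape `(B, 1, n_mels, n_frames)`:

```python
import paddle

f0_model = JDCNet(num_class=1, seq_len=192)
mel = paddle.randn([1, 1, 80, 192])  # (B, 1, n_mels, n_frames), as in vc.py
f0_feat = f0_model.get_feature_GAN(mel)
print(f0_feat.shape)  # channel-rich feature map fed to the generator
```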

@ -0,0 +1,17 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .starganv2_vc import *
from .starganv2_vc_updater import *
from .AuxiliaryASR.model import *
from .JDCNet.model import *

@ -0,0 +1,255 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn.functional as F
from munch import Munch
# external dependency; only needed when use_con_reg=True
from starganv2vc_paddle.transforms import build_transforms
# TODO: move these loss functions into the updater
def compute_d_loss(nets,
args,
x_real,
y_org,
y_trg,
z_trg=None,
x_ref=None,
use_r1_reg=True,
use_adv_cls=False,
use_con_reg=False):
args = Munch(args)
assert (z_trg is None) != (x_ref is None)
# with real audios
x_real.stop_gradient = False
out = nets.discriminator(x_real, y_org)
loss_real = adv_loss(out, 1)
# R1 regularizaition (https://arxiv.org/abs/1801.04406v4)
if use_r1_reg:
loss_reg = r1_reg(out, x_real)
else:
loss_reg = paddle.to_tensor([0.], dtype=paddle.float32)
# consistency regularization (bCR-GAN: https://arxiv.org/abs/2002.04724)
loss_con_reg = paddle.to_tensor([0.], dtype=paddle.float32)
if use_con_reg:
t = build_transforms()
out_aug = nets.discriminator(t(x_real).detach(), y_org)
loss_con_reg += F.smooth_l1_loss(out, out_aug)
# with fake audios
with paddle.no_grad():
if z_trg is not None:
s_trg = nets.mapping_network(z_trg, y_trg)
else: # x_ref is not None
s_trg = nets.style_encoder(x_ref, y_trg)
F0 = nets.f0_model.get_feature_GAN(x_real)
x_fake = nets.generator(x_real, s_trg, masks=None, F0=F0)
out = nets.discriminator(x_fake, y_trg)
loss_fake = adv_loss(out, 0)
if use_con_reg:
out_aug = nets.discriminator(t(x_fake).detach(), y_trg)
loss_con_reg += F.smooth_l1_loss(out, out_aug)
# adversarial classifier loss
if use_adv_cls:
out_de = nets.discriminator.classifier(x_fake)
loss_real_adv_cls = F.cross_entropy(out_de[y_org != y_trg],
y_org[y_org != y_trg])
if use_con_reg:
out_de_aug = nets.discriminator.classifier(t(x_fake).detach())
loss_con_reg += F.smooth_l1_loss(out_de, out_de_aug)
else:
loss_real_adv_cls = paddle.zeros([1]).mean()
loss = loss_real + loss_fake + args.lambda_reg * loss_reg + \
args.lambda_adv_cls * loss_real_adv_cls + \
args.lambda_con_reg * loss_con_reg
return loss, Munch(
real=loss_real.item(),
fake=loss_fake.item(),
reg=loss_reg.item(),
real_adv_cls=loss_real_adv_cls.item(),
con_reg=loss_con_reg.item())
def compute_g_loss(nets,
args,
x_real,
y_org,
y_trg,
z_trgs=None,
x_refs=None,
use_adv_cls=False):
args = Munch(args)
assert (z_trgs is None) != (x_refs is None)
if z_trgs is not None:
z_trg, z_trg2 = z_trgs
if x_refs is not None:
x_ref, x_ref2 = x_refs
# compute style vectors
if z_trgs is not None:
s_trg = nets.mapping_network(z_trg, y_trg)
else:
s_trg = nets.style_encoder(x_ref, y_trg)
# compute ASR/F0 features (real)
with paddle.no_grad():
F0_real, GAN_F0_real, cyc_F0_real = nets.f0_model(x_real)
ASR_real = nets.asr_model.get_feature(x_real)
# adversarial loss
x_fake = nets.generator(x_real, s_trg, masks=None, F0=GAN_F0_real)
out = nets.discriminator(x_fake, y_trg)
loss_adv = adv_loss(out, 1)
# compute ASR/F0 features (fake)
F0_fake, GAN_F0_fake, _ = nets.f0_model(x_fake)
ASR_fake = nets.asr_model.get_feature(x_fake)
# norm consistency loss
x_fake_norm = log_norm(x_fake)
x_real_norm = log_norm(x_real)
loss_norm = ((
paddle.nn.ReLU()(paddle.abs(x_fake_norm - x_real_norm) - args.norm_bias)
)**2).mean()
# F0 loss
loss_f0 = f0_loss(F0_fake, F0_real)
# style F0 loss (style initialization)
if x_refs is not None and args.lambda_f0_sty > 0 and not use_adv_cls:
F0_sty, _, _ = nets.f0_model(x_ref)
loss_f0_sty = F.l1_loss(
compute_mean_f0(F0_fake), compute_mean_f0(F0_sty))
else:
loss_f0_sty = paddle.zeros([1]).mean()
# ASR loss
loss_asr = F.smooth_l1_loss(ASR_fake, ASR_real)
# style reconstruction loss
s_pred = nets.style_encoder(x_fake, y_trg)
loss_sty = paddle.mean(paddle.abs(s_pred - s_trg))
# diversity sensitive loss
if z_trgs is not None:
s_trg2 = nets.mapping_network(z_trg2, y_trg)
else:
s_trg2 = nets.style_encoder(x_ref2, y_trg)
x_fake2 = nets.generator(x_real, s_trg2, masks=None, F0=GAN_F0_real)
x_fake2 = x_fake2.detach()
_, GAN_F0_fake2, _ = nets.f0_model(x_fake2)
loss_ds = paddle.mean(paddle.abs(x_fake - x_fake2))
loss_ds += F.smooth_l1_loss(GAN_F0_fake, GAN_F0_fake2.detach())
# cycle-consistency loss
s_org = nets.style_encoder(x_real, y_org)
x_rec = nets.generator(x_fake, s_org, masks=None, F0=GAN_F0_fake)
loss_cyc = paddle.mean(paddle.abs(x_rec - x_real))
# F0 loss in cycle-consistency loss
if args.lambda_f0 > 0:
_, _, cyc_F0_rec = nets.f0_model(x_rec)
loss_cyc += F.smooth_l1_loss(cyc_F0_rec, cyc_F0_real)
if args.lambda_asr > 0:
ASR_recon = nets.asr_model.get_feature(x_rec)
loss_cyc += F.smooth_l1_loss(ASR_recon, ASR_real)
# adversarial classifier loss
if use_adv_cls:
out_de = nets.discriminator.classifier(x_fake)
loss_adv_cls = F.cross_entropy(out_de[y_org != y_trg],
y_trg[y_org != y_trg])
else:
loss_adv_cls = paddle.zeros([1]).mean()
loss = args.lambda_adv * loss_adv + args.lambda_sty * loss_sty \
- args.lambda_ds * loss_ds + args.lambda_cyc * loss_cyc\
+ args.lambda_norm * loss_norm \
+ args.lambda_asr * loss_asr \
+ args.lambda_f0 * loss_f0 \
+ args.lambda_f0_sty * loss_f0_sty \
+ args.lambda_adv_cls * loss_adv_cls
return loss, Munch(
adv=loss_adv.item(),
sty=loss_sty.item(),
ds=loss_ds.item(),
cyc=loss_cyc.item(),
norm=loss_norm.item(),
asr=loss_asr.item(),
f0=loss_f0.item(),
adv_cls=loss_adv_cls.item())
# for norm consistency loss
def log_norm(x, mean=-4, std=4, axis=2):
"""
normalized log mel -> mel -> norm -> log(norm)
"""
x = paddle.log(paddle.exp(x * std + mean).norm(axis=axis))
return x
# for adversarial loss
def adv_loss(logits, target):
assert target in [1, 0]
if len(logits.shape) > 1:
logits = logits.reshape([-1])
targets = paddle.full_like(logits, fill_value=target)
logits = logits.clip(min=-10, max=10) # prevent nan
loss = F.binary_cross_entropy_with_logits(logits, targets)
return loss
# for R1 regularization loss
def r1_reg(d_out, x_in):
# zero-centered gradient penalty for real images
batch_size = x_in.shape[0]
grad_dout = paddle.grad(
outputs=d_out.sum(),
inputs=x_in,
create_graph=True,
retain_graph=True,
only_inputs=True)[0]
grad_dout2 = grad_dout.pow(2)
assert (grad_dout2.shape == x_in.shape)
reg = 0.5 * grad_dout2.reshape((batch_size, -1)).sum(1).mean(0)
return reg
# for F0 consistency loss
def compute_mean_f0(f0):
f0_mean = f0.mean(-1)
f0_mean = f0_mean.expand((f0.shape[-1], f0_mean.shape[0])).transpose(
(1, 0)) # (B, M)
return f0_mean
def f0_loss(x_f0, y_f0):
"""
x.shape = (B, L): predicted frame-level F0
y.shape = (B, L): target frame-level F0
"""
# compute the mean
x_mean = compute_mean_f0(x_f0)
y_mean = compute_mean_f0(y_f0)
loss = F.l1_loss(x_f0 / x_mean, y_f0 / y_mean)
return loss
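Quick numeric sanity checks for the helper losses. Note that `nets.f0_model(...)` returns frame-level F0 of shape `(B, L)` after the squeeze in `JDCNet.forward`, so `f0_loss` is exercised on 2-D inputs here:

```python
import paddle

logits = paddle.to_tensor([2.0, -1.0, 0.5])
print(adv_loss(logits, 1))  # BCE against all-ones targets
print(adv_loss(logits, 0))  # BCE against all-zeros targets

f0 = paddle.rand([2, 10]) + 0.5  # (B, L) positive F0 contours
print(f0_loss(f0, f0))           # 0.0: identical mean-normalized contours
```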

@ -0,0 +1,616 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
StarGAN v2
Copyright (c) 2020-present NAVER Corp.
This work is licensed under the Creative Commons Attribution-NonCommercial
4.0 International License. To view a copy of this license, visit
http://creativecommons.org/licenses/by-nc/4.0/ or send a letter to
Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.
"""
# import copy
import math
import paddle
import paddle.nn.functional as F
from paddle import nn
from paddlespeech.utils.initialize import _calculate_gain
from paddlespeech.utils.initialize import xavier_uniform_
# from munch import Munch
class DownSample(nn.Layer):
def __init__(self, layer_type: str):
super().__init__()
self.layer_type = layer_type
def forward(self, x):
if self.layer_type == 'none':
return x
elif self.layer_type == 'timepreserve':
return F.avg_pool2d(x, (2, 1))
elif self.layer_type == 'half':
return F.avg_pool2d(x, 2)
else:
raise RuntimeError(
'Got unexpected downsample type %s, expected one of [none, timepreserve, half]'
% self.layer_type)
class UpSample(nn.Layer):
def __init__(self, layer_type: str):
super().__init__()
self.layer_type = layer_type
def forward(self, x):
if self.layer_type == 'none':
return x
elif self.layer_type == 'timepreserve':
return F.interpolate(x, scale_factor=(2, 1), mode='nearest')
elif self.layer_type == 'half':
return F.interpolate(x, scale_factor=2, mode='nearest')
else:
raise RuntimeError(
'Got unexpected upsample type %s, expected one of [none, timepreserve, half]'
% self.layer_type)
class ResBlk(nn.Layer):
def __init__(self,
dim_in: int,
dim_out: int,
actv: nn.LeakyReLU=nn.LeakyReLU(0.2),
normalize: bool=False,
downsample: str='none'):
super().__init__()
self.actv = actv
self.normalize = normalize
self.downsample = DownSample(layer_type=downsample)
self.learned_sc = dim_in != dim_out
self._build_weights(dim_in, dim_out)
def _build_weights(self, dim_in: int, dim_out: int):
self.conv1 = nn.Conv2D(
in_channels=dim_in,
out_channels=dim_in,
kernel_size=3,
stride=1,
padding=1)
self.conv2 = nn.Conv2D(
in_channels=dim_in,
out_channels=dim_out,
kernel_size=3,
stride=1,
padding=1)
if self.normalize:
self.norm1 = nn.InstanceNorm2D(dim_in)
self.norm2 = nn.InstanceNorm2D(dim_in)
if self.learned_sc:
self.conv1x1 = nn.Conv2D(
in_channels=dim_in,
out_channels=dim_out,
kernel_size=1,
stride=1,
padding=0,
bias_attr=False)
def _shortcut(self, x: paddle.Tensor):
if self.learned_sc:
x = self.conv1x1(x)
if self.downsample:
x = self.downsample(x)
return x
def _residual(self, x: paddle.Tensor):
if self.normalize:
x = self.norm1(x)
x = self.actv(x)
x = self.conv1(x)
x = self.downsample(x)
if self.normalize:
x = self.norm2(x)
x = self.actv(x)
x = self.conv2(x)
return x
def forward(self, x: paddle.Tensor):
x = self._shortcut(x) + self._residual(x)
# unit variance
return x / math.sqrt(2)
class AdaIN(nn.Layer):
def __init__(self, style_dim: int, num_features: int):
super().__init__()
self.norm = nn.InstanceNorm2D(
num_features=num_features, weight_attr=False, bias_attr=False)
self.fc = nn.Linear(style_dim, num_features * 2)
def forward(self, x: paddle.Tensor, s: paddle.Tensor):
if len(s.shape) == 1:
s = s[None]
h = self.fc(s)
h = h.reshape((h.shape[0], h.shape[1], 1, 1))
gamma, beta = paddle.split(h, 2, axis=1)
return (1 + gamma) * self.norm(x) + beta
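A shape sketch for `AdaIN`: the style vector is mapped to per-channel `(gamma, beta)` that modulate the instance-normalized input:

```python
import paddle

adain = AdaIN(style_dim=64, num_features=32)
x = paddle.randn([2, 32, 16, 16])  # (B, C, H, W)
s = paddle.randn([2, 64])          # (B, style_dim)
y = adain(x, s)                    # same shape as x
```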
class AdainResBlk(nn.Layer):
def __init__(self,
dim_in: int,
dim_out: int,
style_dim: int=64,
w_hpf: int=0,
actv: nn.Layer=nn.LeakyReLU(0.2),
upsample: str='none'):
super().__init__()
self.w_hpf = w_hpf
self.actv = actv
self.upsample = UpSample(layer_type=upsample)
self.learned_sc = dim_in != dim_out
self._build_weights(dim_in, dim_out, style_dim)
def _build_weights(self, dim_in: int, dim_out: int, style_dim: int=64):
self.conv1 = nn.Conv2D(
in_channels=dim_in,
out_channels=dim_out,
kernel_size=3,
stride=1,
padding=1)
self.conv2 = nn.Conv2D(
in_channels=dim_out,
out_channels=dim_out,
kernel_size=3,
stride=1,
padding=1)
self.norm1 = AdaIN(style_dim=style_dim, num_features=dim_in)
self.norm2 = AdaIN(style_dim=style_dim, num_features=dim_out)
if self.learned_sc:
self.conv1x1 = nn.Conv2D(
in_channels=dim_in,
out_channels=dim_out,
kernel_size=1,
stride=1,
padding=0,
bias_attr=False)
def _shortcut(self, x: paddle.Tensor):
x = self.upsample(x)
if self.learned_sc:
x = self.conv1x1(x)
return x
def _residual(self, x: paddle.Tensor, s: paddle.Tensor):
x = self.norm1(x, s)
x = self.actv(x)
x = self.upsample(x)
x = self.conv1(x)
x = self.norm2(x, s)
x = self.actv(x)
x = self.conv2(x)
return x
def forward(self, x: paddle.Tensor, s: paddle.Tensor):
out = self._residual(x, s)
if self.w_hpf == 0:
out = (out + self._shortcut(x)) / math.sqrt(2)
return out
class HighPass(nn.Layer):
def __init__(self, w_hpf: int):
super().__init__()
self.filter = paddle.to_tensor([[-1, -1, -1], [-1, 8., -1],
[-1, -1, -1]]) / w_hpf
def forward(self, x: paddle.Tensor):
filter = self.filter.unsqueeze(0).unsqueeze(1).tile(
[x.shape[1], 1, 1, 1])
return F.conv2d(x, filter, padding=1, groups=x.shape[1])
class Generator(nn.Layer):
def __init__(self,
dim_in: int=48,
style_dim: int=48,
max_conv_dim: int=48 * 8,
w_hpf: int=1,
F0_channel: int=0):
super().__init__()
self.stem = nn.Conv2D(
in_channels=1,
out_channels=dim_in,
kernel_size=3,
stride=1,
padding=1)
self.encode = nn.LayerList()
self.decode = nn.LayerList()
self.to_out = nn.Sequential(
nn.InstanceNorm2D(dim_in),
nn.LeakyReLU(0.2),
nn.Conv2D(
in_channels=dim_in,
out_channels=1,
kernel_size=1,
stride=1,
padding=0))
self.F0_channel = F0_channel
# down/up-sampling blocks
# int(np.log2(img_size)) - 4
repeat_num = 4
if w_hpf > 0:
repeat_num += 1
for lid in range(repeat_num):
if lid in [1, 3]:
_downtype = 'timepreserve'
else:
_downtype = 'half'
dim_out = min(dim_in * 2, max_conv_dim)
self.encode.append(
ResBlk(
dim_in=dim_in,
dim_out=dim_out,
normalize=True,
downsample=_downtype))
block = AdainResBlk(
    dim_in=dim_out,
    dim_out=dim_in,
    style_dim=style_dim,
    w_hpf=w_hpf,
    upsample=_downtype)
# stack-like: the first block is appended, later ones are pushed to the front
if lid:
    self.decode.insert(0, block)
else:
    self.decode.append(block)
dim_in = dim_out
# bottleneck blocks (encoder)
for _ in range(2):
self.encode.append(
ResBlk(dim_in=dim_out, dim_out=dim_out, normalize=True))
# F0 blocks
if F0_channel != 0:
self.decode.insert(0,
AdainResBlk(
dim_in=dim_out + int(F0_channel / 2),
dim_out=dim_out,
style_dim=style_dim,
w_hpf=w_hpf))
# bottleneck blocks (decoder)
for _ in range(2):
self.decode.insert(0,
AdainResBlk(
dim_in=dim_out + int(F0_channel / 2),
dim_out=dim_out + int(F0_channel / 2),
style_dim=style_dim,
w_hpf=w_hpf))
if F0_channel != 0:
self.F0_conv = nn.Sequential(
ResBlk(
dim_in=F0_channel,
dim_out=int(F0_channel / 2),
normalize=True,
downsample="half"), )
if w_hpf > 0:
self.hpf = HighPass(w_hpf)
def forward(self,
x: paddle.Tensor,
s: paddle.Tensor,
masks: paddle.Tensor=None,
F0: paddle.Tensor=None):
x = self.stem(x)
cache = {}
for block in self.encode:
if (masks is not None) and (x.shape[2] in [32, 64, 128]):
cache[x.shape[2]] = x
x = block(x)
if F0 is not None:
F0 = self.F0_conv(F0)
F0 = F.adaptive_avg_pool2d(F0, [x.shape[-2], x.shape[-1]])
x = paddle.concat([x, F0], axis=1)
for block in self.decode:
x = block(x, s)
if (masks is not None) and (x.shape[2] in [32, 64, 128]):
mask = masks[0] if x.shape[2] in [32] else masks[1]
mask = F.interpolate(mask, size=x.shape[2], mode='bilinear')
x = x + self.hpf(mask * cache[x.shape[2]])
return self.to_out(x)
class MappingNetwork(nn.Layer):
def __init__(self,
latent_dim: int=16,
style_dim: int=48,
num_domains: int=2,
hidden_dim: int=384):
super().__init__()
layers = []
layers += [nn.Linear(latent_dim, hidden_dim)]
layers += [nn.ReLU()]
for _ in range(3):
layers += [nn.Linear(hidden_dim, hidden_dim)]
layers += [nn.ReLU()]
self.shared = nn.Sequential(*layers)
self.unshared = nn.LayerList()
for _ in range(num_domains):
self.unshared.extend([
nn.Sequential(
nn.Linear(hidden_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, hidden_dim),
nn.ReLU(), nn.Linear(hidden_dim, style_dim))
])
def forward(self, z: paddle.Tensor, y: paddle.Tensor):
h = self.shared(z)
out = []
for layer in self.unshared:
out += [layer(h)]
# (batch, num_domains, style_dim)
out = paddle.stack(out, axis=1)
idx = paddle.arange(y.shape[0])
# (batch, style_dim)
s = out[idx, y]
return s
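A sampling sketch matching how `compute_style()` in `vc.py` uses the mapping network when no reference audio is given (dimensions from `conf/default.yaml`):

```python
import paddle

net = MappingNetwork(
    latent_dim=16, style_dim=64, num_domains=20, hidden_dim=512)
z = paddle.randn([1, 16])                      # random latent
y = paddle.to_tensor([3], dtype=paddle.int64)  # target domain (speaker) id
s = net(z, y)                                  # style vector, shape (1, 64)
```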
class StyleEncoder(nn.Layer):
def __init__(self,
dim_in: int=48,
style_dim: int=48,
num_domains: int=2,
max_conv_dim: int=384):
super().__init__()
blocks = []
blocks += [
nn.Conv2D(
in_channels=1,
out_channels=dim_in,
kernel_size=3,
stride=1,
padding=1)
]
repeat_num = 4
for _ in range(repeat_num):
dim_out = min(dim_in * 2, max_conv_dim)
blocks += [
ResBlk(dim_in=dim_in, dim_out=dim_out, downsample='half')
]
dim_in = dim_out
blocks += [nn.LeakyReLU(0.2)]
blocks += [
nn.Conv2D(
in_channels=dim_out,
out_channels=dim_out,
kernel_size=5,
stride=1,
padding=0)
]
blocks += [nn.AdaptiveAvgPool2D(1)]
blocks += [nn.LeakyReLU(0.2)]
self.shared = nn.Sequential(*blocks)
self.unshared = nn.LayerList()
for _ in range(num_domains):
self.unshared.append(nn.Linear(dim_out, style_dim))
def forward(self, x: paddle.Tensor, y: paddle.Tensor):
h = self.shared(x)
h = h.reshape((h.shape[0], -1))
out = []
for layer in self.unshared:
out += [layer(h)]
# (batch, num_domains, style_dim)
out = paddle.stack(out, axis=1)
idx = paddle.arange(y.shape[0])
# (batch, style_dim)
s = out[idx, y]
return s
class Discriminator(nn.Layer):
def __init__(self,
dim_in: int=48,
num_domains: int=2,
max_conv_dim: int=384,
repeat_num: int=4):
super().__init__()
# real/fake discriminator
self.dis = Discriminator2D(
dim_in=dim_in,
num_domains=num_domains,
max_conv_dim=max_conv_dim,
repeat_num=repeat_num)
# adversarial classifier
self.cls = Discriminator2D(
dim_in=dim_in,
num_domains=num_domains,
max_conv_dim=max_conv_dim,
repeat_num=repeat_num)
self.num_domains = num_domains
def forward(self, x: paddle.Tensor, y: paddle.Tensor):
return self.dis(x, y)
def classifier(self, x: paddle.Tensor):
return self.cls.get_feature(x)
class LinearNorm(nn.Layer):
def __init__(self,
in_dim: int,
out_dim: int,
bias: bool=True,
w_init_gain: str='linear'):
super().__init__()
self.linear_layer = nn.Linear(in_dim, out_dim, bias_attr=bias)
xavier_uniform_(
self.linear_layer.weight, gain=_calculate_gain(w_init_gain))
def forward(self, x):
return self.linear_layer(x)
class Discriminator2D(nn.Layer):
def __init__(self,
dim_in: int=48,
num_domains: int=2,
max_conv_dim: int=384,
repeat_num: int=4):
super().__init__()
blocks = []
blocks += [
nn.Conv2D(
in_channels=1,
out_channels=dim_in,
kernel_size=3,
stride=1,
padding=1)
]
for lid in range(repeat_num):
dim_out = min(dim_in * 2, max_conv_dim)
blocks += [ResBlk(dim_in, dim_out, downsample='half')]
dim_in = dim_out
blocks += [nn.LeakyReLU(0.2)]
blocks += [
nn.Conv2D(
in_channels=dim_out,
out_channels=dim_out,
kernel_size=5,
stride=1,
padding=0)
]
blocks += [nn.LeakyReLU(0.2)]
blocks += [nn.AdaptiveAvgPool2D(1)]
blocks += [
nn.Conv2D(
in_channels=dim_out,
out_channels=num_domains,
kernel_size=1,
stride=1,
padding=0)
]
self.main = nn.Sequential(*blocks)
def get_feature(self, x: paddle.Tensor):
out = self.main(x)
# (batch, num_domains)
out = out.reshape((out.shape[0], -1))
return out
def forward(self, x: paddle.Tensor, y: paddle.Tensor):
out = self.get_feature(x)
idx = paddle.arange(y.shape[0])
# (batch)
out = out[idx, y]
return out
'''
def build_model(args, F0_model: nn.Layer, ASR_model: nn.Layer):
generator = Generator(
dim_in=args.dim_in,
style_dim=args.style_dim,
max_conv_dim=args.max_conv_dim,
w_hpf=args.w_hpf,
F0_channel=args.F0_channel)
mapping_network = MappingNetwork(
latent_dim=args.latent_dim,
style_dim=args.style_dim,
num_domains=args.num_domains,
hidden_dim=args.max_conv_dim)
style_encoder = StyleEncoder(
dim_in=args.dim_in,
style_dim=args.style_dim,
num_domains=args.num_domains,
max_conv_dim=args.max_conv_dim)
discriminator = Discriminator(
dim_in=args.dim_in,
num_domains=args.num_domains,
max_conv_dim=args.max_conv_dim,
n_repeat=args.n_repeat)
generator_ema = copy.deepcopy(generator)
mapping_network_ema = copy.deepcopy(mapping_network)
style_encoder_ema = copy.deepcopy(style_encoder)
nets = Munch(
generator=generator,
mapping_network=mapping_network,
style_encoder=style_encoder,
discriminator=discriminator,
f0_model=F0_model,
asr_model=ASR_model)
nets_ema = Munch(
generator=generator_ema,
mapping_network=mapping_network_ema,
style_encoder=style_encoder_ema)
return nets, nets_ema
class StarGANv2VC(nn.Layer):
def __init__(
self,
# spk_num
num_domains: int=20,
dim_in: int=64,
style_dim: int=64,
latent_dim: int=16,
max_conv_dim: int=512,
n_repeat: int=4,
w_hpf: int=0,
F0_channel: int=256):
super().__init__()
self.generator = Generator(
dim_in=dim_in,
style_dim=style_dim,
max_conv_dim=max_conv_dim,
w_hpf=w_hpf,
F0_channel=F0_channel)
# MappingNetwork and StyleEncoder are used to generate reference_embeddings
self.mapping_network = MappingNetwork(
latent_dim=latent_dim,
style_dim=style_dim,
num_domains=num_domains,
hidden_dim=max_conv_dim)
self.style_encoder = StyleEncoder(
dim_in=dim_in,
style_dim=style_dim,
num_domains=num_domains,
max_conv_dim=max_conv_dim)
self.discriminator = Discriminator(
dim_in=dim_in,
num_domains=num_domains,
max_conv_dim=max_conv_dim,
repeat_num=n_repeat)
'''

@ -0,0 +1,13 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.