diff --git a/examples/vctk/vc3/README.md b/examples/vctk/vc3/README.md
new file mode 100644
index 000000000..2de0a0add
--- /dev/null
+++ b/examples/vctk/vc3/README.md
@@ -0,0 +1 @@
+You can download test source audio files from [test_wav.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/starganv2vc/test_wav.zip).
\ No newline at end of file
diff --git a/paddlespeech/resource/pretrained_models.py b/paddlespeech/resource/pretrained_models.py
index ff0b30f6d..d27d51676 100644
--- a/paddlespeech/resource/pretrained_models.py
+++ b/paddlespeech/resource/pretrained_models.py
@@ -1705,7 +1705,7 @@ g2pw_onnx_models = {
 }
 
 # ---------------------------------
-# ------------- Rhy_frontend ---------------
+# ---------- Rhy_frontend ---------
 # ---------------------------------
 rhy_frontend_models = {
     'rhy_e2e': {
@@ -1716,3 +1716,16 @@ rhy_frontend_models = {
         },
     },
 }
+
+# ---------------------------------
+# ---------- StarGANv2VC ----------
+# ---------------------------------
+
+StarGANv2VC_source = {
+    '1.0': {
+        'url':
+        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/starganv2vc/StarGANv2VC_source.zip',
+        'md5': '195e169419163f5648030ba84c71f866',
+    }
+}
+
diff --git a/paddlespeech/t2s/exps/starganv2_vc/vc.py b/paddlespeech/t2s/exps/starganv2_vc/vc.py
index 0f0e5080d..daa72158d 100644
--- a/paddlespeech/t2s/exps/starganv2_vc/vc.py
+++ b/paddlespeech/t2s/exps/starganv2_vc/vc.py
@@ -22,14 +22,15 @@ import soundfile as sf
 import yaml
 from yacs.config import CfgNode
 
+from paddlespeech.cli.utils import download_and_decompress
+from paddlespeech.resource.pretrained_models import StarGANv2VC_source
 from paddlespeech.t2s.datasets.get_feats import LogMelFBank
 from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator
 from paddlespeech.t2s.models.starganv2_vc import Generator
 from paddlespeech.t2s.models.starganv2_vc import JDCNet
 from paddlespeech.t2s.models.starganv2_vc import MappingNetwork
 from paddlespeech.t2s.models.starganv2_vc import StyleEncoder
-
-uncompress_path = '/home/yuantian01/PaddleSpeech_stargan/PaddleSpeech/stargan_models/'
+from paddlespeech.utils.env import MODEL_HOME
 
 
 def get_mel_extractor():
@@ -86,7 +87,7 @@ def compute_style(speaker_dicts, mel_extractor, style_encoder, mapping_network):
     return reference_embeddings
 
 
-def get_models():
+def get_models(uncompress_path):
     model_dict = {}
     jdc_model_dir = os.path.join(uncompress_path, 'jdcnet.pdz')
     voc_model_dir = os.path.join(uncompress_path, 'Vocoder/')
@@ -153,13 +154,13 @@ def get_models():
     return model_dict
 
 
-def voice_conversion(args):
+def voice_conversion(source_path, output_dir, uncompress_path):
     speakers = [
         225, 228, 229, 230, 231, 233, 236, 239, 240, 244, 226, 227, 232, 243,
         254, 256, 258, 259, 270, 273
     ]
     demo_dir = os.path.join(uncompress_path, 'Demo/VCTK-corpus/')
-    model_dict = get_models()
+    model_dict = get_models(uncompress_path)
     style_encoder = model_dict['style_encoder']
     mapping_network = model_dict['mapping_network']
     generator = model_dict['generator']
@@ -174,14 +175,16 @@ def voice_conversion(args):
         speaker_dicts['p' + str(s)] = (
             demo_dir + 'p' + str(k) + '/p' + str(k) + '_023.wav',
             speakers.index(s))
-    print("speaker_dicts:", speaker_dicts)
     mel_extractor = get_mel_extractor()
-    reference_embeddings = compute_style(speaker_dicts, mel_extractor,
-                                         style_encoder, mapping_network)
+    reference_embeddings = compute_style(
+        speaker_dicts=speaker_dicts,
+        mel_extractor=mel_extractor,
+        style_encoder=style_encoder,
+        mapping_network=mapping_network)
 
-    wave, sr = librosa.load(args.source_path, sr=24000)
+    wave, sr = librosa.load(source_path, sr=24000)
     source = preprocess(wave, mel_extractor)
-    output_dir = Path(args.output_dir)
+    output_dir = Path(output_dir)
     output_dir.mkdir(parents=True, exist_ok=True)
     orig_wav_name = str(output_dir / 'orig_voc.wav')
     print('原始语音 (使用声码器解码): %s' % orig_wav_name)
@@ -255,7 +258,13 @@ def main():
         paddle.set_device("gpu")
     else:
         print("ngpu should >= 0 !")
-    voice_conversion(args)
+    model_version = '1.0'
+    uncompress_path = download_and_decompress(StarGANv2VC_source[model_version],
+                                              MODEL_HOME)
+    voice_conversion(
+        source_path=args.source_path,
+        output_dir=args.output_dir,
+        uncompress_path=uncompress_path)
 
 
 if __name__ == "__main__":
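
For context on the last two hunks: the patch drops the author's hard-coded `/home/yuantian01/...` checkpoint directory and instead resolves the models at run time from the new `StarGANv2VC_source` registry entry. Below is a minimal sketch of that download-and-cache flow, assuming an installed PaddleSpeech that exposes these modules; everything in it mirrors the three imports and the `'1.0'` tag the diff adds, and is not new API surface.

```python
# Sketch only: exercises the same model-resolution flow vc.py now uses.
# Assumes an installed PaddleSpeech exposing these modules; the '1.0'
# version tag mirrors the StarGANv2VC_source entry added above.
from paddlespeech.cli.utils import download_and_decompress
from paddlespeech.resource.pretrained_models import StarGANv2VC_source
from paddlespeech.utils.env import MODEL_HOME

# First call downloads the archive into MODEL_HOME, verifies it against
# the registered md5 and unpacks it; later calls reuse the cached copy.
uncompress_path = download_and_decompress(StarGANv2VC_source['1.0'], MODEL_HOME)

# The unpacked tree is what get_models()/voice_conversion() read from:
# jdcnet.pdz, Vocoder/, Demo/VCTK-corpus/, and so on.
print(uncompress_path)
```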