diff --git a/examples/vctk/vc3/conf/default.yaml b/examples/vctk/vc3/conf/default.yaml
index 50d564fbf..0acc2a569 100644
--- a/examples/vctk/vc3/conf/default.yaml
+++ b/examples/vctk/vc3/conf/default.yaml
@@ -8,7 +8,7 @@
     num_domains: 20 # num of speakers in StarGANv2
     latent_dim: 16
     style_dim: 64 # same as style_dim in generator_params
-    max_conv_dim: 512 # same as max_conv_dim in generator_params
+    hidden_dim: 512 # same as max_conv_dim in generator_params
 style_encoder_params:
     dim_in: 64 # same as dim_in in generator_params
     style_dim: 64 # same as style_dim in generator_params
diff --git a/examples/vctk/vc3/local/voice_conversion.sh b/examples/vctk/vc3/local/voice_conversion.sh
index 9fa132a4e..edf8f7ef0 100755
--- a/examples/vctk/vc3/local/voice_conversion.sh
+++ b/examples/vctk/vc3/local/voice_conversion.sh
@@ -1,8 +1,10 @@
 #!/bin/bash
 
-source_path=$1
-output_dir=$2
+config_path=$1
+source_path=$2
+output_dir=$3
 
 python3 ${BIN_DIR}/vc.py \
+    --config_path=${config_path} \
     --source_path=${source_path}\
-    --output_dir=${output_dir}
\ No newline at end of file
+    --output_dir=${output_dir}
\ No newline at end of file
diff --git a/examples/vctk/vc3/run.sh b/examples/vctk/vc3/run.sh
index dced9d7ad..602a593d0 100755
--- a/examples/vctk/vc3/run.sh
+++ b/examples/vctk/vc3/run.sh
@@ -31,6 +31,6 @@ fi
 
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     # synthesize, vocoder is pwgan by default
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_conversion.sh ${source_path} ${output_dir}|| exit -1
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_conversion.sh ${conf_path} ${source_path} ${output_dir}|| exit -1
 fi
 
diff --git a/paddlespeech/t2s/exps/starganv2_vc/vc.py b/paddlespeech/t2s/exps/starganv2_vc/vc.py
index daa72158d..d0bc04e39 100644
--- a/paddlespeech/t2s/exps/starganv2_vc/vc.py
+++ b/paddlespeech/t2s/exps/starganv2_vc/vc.py
@@ -87,7 +87,7 @@ def compute_style(speaker_dicts, mel_extractor, style_encoder, mapping_network):
     return reference_embeddings
 
 
-def get_models(uncompress_path):
+def get_models(args, uncompress_path):
     model_dict = {}
     jdc_model_dir = os.path.join(uncompress_path, 'jdcnet.pdz')
     voc_model_dir = os.path.join(uncompress_path, 'Vocoder/')
@@ -109,43 +109,25 @@
     voc_model_path = os.path.join(voc_model_dir, 'checkpoint-400000steps.pd')
     vocoder.set_state_dict(paddle.load(voc_model_path))
 
-    dim_in = 64
-    style_dim = 64
-    latent_dim = 16
-    num_domains = 20
-    max_conv_dim = 512
-    n_repeat = 4
-    w_hpf = 0
-    F0_channel = 256
-
-    generator = Generator(
-        dim_in=dim_in,
-        style_dim=style_dim,
-        max_conv_dim=max_conv_dim,
-        w_hpf=w_hpf,
-        F0_channel=F0_channel)
-
-    mapping_network = MappingNetwork(
-        latent_dim=latent_dim,
-        style_dim=style_dim,
-        num_domains=num_domains,
-        hidden_dim=max_conv_dim)
-
-    style_encoder = StyleEncoder(
-        dim_in=dim_in,
-        style_dim=style_dim,
-        num_domains=num_domains,
-        max_conv_dim=max_conv_dim)
+    with open(args.config_path) as f:
+        config = CfgNode(yaml.safe_load(f))
+
+    generator = Generator(**config['generator_params'])
+    mapping_network = MappingNetwork(**config['mapping_network_params'])
+    style_encoder = StyleEncoder(**config['style_encoder_params'])
 
     starganv2vc_model_param = paddle.load(starganv2vc_model_dir)
+
     generator.set_state_dict(starganv2vc_model_param['generator_params'])
     mapping_network.set_state_dict(
         starganv2vc_model_param['mapping_network_params'])
     style_encoder.set_state_dict(
         starganv2vc_model_param['style_encoder_params'])
+
     generator.eval()
     mapping_network.eval()
     style_encoder.eval()
+
     model_dict['F0_model'] = F0_model
     model_dict['vocoder'] = vocoder
     model_dict['generator'] = generator
@@ -154,13 +136,13 @@ def get_models(uncompress_path):
     return model_dict
 
 
-def voice_conversion(source_path, output_dir, uncompress_path):
+def voice_conversion(args, uncompress_path):
     speakers = [
         225, 228, 229, 230, 231, 233, 236, 239, 240, 244, 226, 227, 232, 243,
         254, 256, 258, 259, 270, 273
     ]
     demo_dir = os.path.join(uncompress_path, 'Demo/VCTK-corpus/')
-    model_dict = get_models(uncompress_path)
+    model_dict = get_models(args, uncompress_path=uncompress_path)
     style_encoder = model_dict['style_encoder']
     mapping_network = model_dict['mapping_network']
     generator = model_dict['generator']
@@ -182,9 +164,9 @@ def voice_conversion(source_path, output_dir, uncompress_path):
         style_encoder=style_encoder,
         mapping_network=mapping_network)
 
-    wave, sr = librosa.load(source_path, sr=24000)
+    wave, sr = librosa.load(args.source_path, sr=24000)
     source = preprocess(wave, mel_extractor)
-    output_dir = Path(output_dir)
+    output_dir = Path(args.output_dir)
     output_dir.mkdir(parents=True, exist_ok=True)
     orig_wav_name = str(output_dir / 'orig_voc.wav')
     print('原始语音 (使用声码器解码): %s' % orig_wav_name)
@@ -244,6 +226,11 @@ def parse_args():
         description="StarGANv2-VC Voice Conversion.")
     parser.add_argument("--source_path", type=str, help="source audio's path.")
     parser.add_argument("--output_dir", type=str, help="output dir.")
+    parser.add_argument(
+        '--config_path',
+        type=str,
+        default=None,
+        help='Config of StarGANv2-VC model.')
     parser.add_argument(
         "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
     args = parser.parse_args()
@@ -261,10 +248,7 @@ def main():
     model_version = '1.0'
     uncompress_path = download_and_decompress(StarGANv2VC_source[model_version],
                                               MODEL_HOME)
-    voice_conversion(
-        source_path=args.source_path,
-        output_dir=args.output_dir,
-        uncompress_path=uncompress_path)
+    voice_conversion(args, uncompress_path=uncompress_path)
 
 
 if __name__ == "__main__":
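
Note on the vc.py refactor: `get_models()` now builds the three networks by `**`-unpacking sections of the YAML config instead of hard-coding hyperparameters. That is also why `conf/default.yaml` renames `max_conv_dim` to `hidden_dim` under `mapping_network_params`: the old code translated the name by hand (`MappingNetwork(..., hidden_dim=max_conv_dim)`), while `**config['mapping_network_params']` requires the YAML keys to match the constructor's keyword arguments exactly. A minimal sketch of the new construction path, assuming `yaml` and `CfgNode` are importable in vc.py (PaddleSpeech conventionally uses yacs for its configs) and an import path that mirrors the repo layout:

```python
import yaml
from yacs.config import CfgNode

# Import path assumed from the repo layout; adjust if vc.py differs.
from paddlespeech.t2s.models.starganv2_vc import Generator
from paddlespeech.t2s.models.starganv2_vc import MappingNetwork
from paddlespeech.t2s.models.starganv2_vc import StyleEncoder

# In vc.py this path comes from args.config_path,
# e.g. examples/vctk/vc3/conf/default.yaml.
with open('conf/default.yaml') as f:
    config = CfgNode(yaml.safe_load(f))

# **-unpacking means every key in a YAML section must be a valid keyword
# argument of the constructor, e.g. MappingNetwork(latent_dim=16,
# style_dim=64, num_domains=20, hidden_dim=512).
generator = Generator(**config['generator_params'])
mapping_network = MappingNetwork(**config['mapping_network_params'])
style_encoder = StyleEncoder(**config['style_encoder_params'])
```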
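One caveat worth flagging: `--config_path` defaults to `None`, so running vc.py without the flag now fails inside `get_models()` with a `TypeError` from `open(None)` rather than a clear CLI error. A hypothetical guard, not part of this patch, that surfaces the requirement up front:

```python
import argparse

parser = argparse.ArgumentParser(description="StarGANv2-VC Voice Conversion.")
parser.add_argument(
    '--config_path',
    type=str,
    default=None,
    help='Config of StarGANv2-VC model.')
args = parser.parse_args()

# Fail fast instead of raising TypeError from open(None) deep inside
# get_models().
if args.config_path is None:
    parser.error("--config_path is required, e.g. conf/default.yaml")
```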