pull/2842/head
TianYuan 3 years ago
parent 1d5f7d4143
commit cd3af4d9d6

@ -8,7 +8,7 @@
num_domains: 20 # num of speakers in StarGANv2 num_domains: 20 # num of speakers in StarGANv2
latent_dim: 16 latent_dim: 16
style_dim: 64 # same as style_dim in generator_params style_dim: 64 # same as style_dim in generator_params
max_conv_dim: 512 # same as max_conv_dim in generator_params hidden_dim: 512 # same as max_conv_dim in generator_params
style_encoder_params: style_encoder_params:
dim_in: 64 # same as dim_in in generator_params dim_in: 64 # same as dim_in in generator_params
style_dim: 64 # same as style_dim in generator_params style_dim: 64 # same as style_dim in generator_params

@ -1,8 +1,10 @@
#!/bin/bash #!/bin/bash
source_path=$1 config_path=$1
output_dir=$2 source_path=$2
output_dir=$3
python3 ${BIN_DIR}/vc.py \ python3 ${BIN_DIR}/vc.py \
--config_path=${config_path} \
--source_path=${source_path}\ --source_path=${source_path}\
--output_dir=${output_dir} --output_dir=${output_dir}

@ -31,6 +31,6 @@ fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize, vocoder is pwgan by default # synthesize, vocoder is pwgan by default
CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_conversion.sh ${source_path} ${output_dir}|| exit -1 CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_conversion.sh ${conf_path} ${source_path} ${output_dir}|| exit -1
fi fi

@ -87,7 +87,7 @@ def compute_style(speaker_dicts, mel_extractor, style_encoder, mapping_network):
return reference_embeddings return reference_embeddings
def get_models(uncompress_path): def get_models(args, uncompress_path):
model_dict = {} model_dict = {}
jdc_model_dir = os.path.join(uncompress_path, 'jdcnet.pdz') jdc_model_dir = os.path.join(uncompress_path, 'jdcnet.pdz')
voc_model_dir = os.path.join(uncompress_path, 'Vocoder/') voc_model_dir = os.path.join(uncompress_path, 'Vocoder/')
@ -109,43 +109,25 @@ def get_models(uncompress_path):
voc_model_path = os.path.join(voc_model_dir, 'checkpoint-400000steps.pd') voc_model_path = os.path.join(voc_model_dir, 'checkpoint-400000steps.pd')
vocoder.set_state_dict(paddle.load(voc_model_path)) vocoder.set_state_dict(paddle.load(voc_model_path))
dim_in = 64 with open(args.config_path) as f:
style_dim = 64 config = CfgNode(yaml.safe_load(f))
latent_dim = 16
num_domains = 20 generator = Generator(**config['generator_params'])
max_conv_dim = 512 mapping_network = MappingNetwork(**config['mapping_network_params'])
n_repeat = 4 style_encoder = StyleEncoder(**config['style_encoder_params'])
w_hpf = 0
F0_channel = 256
generator = Generator(
dim_in=dim_in,
style_dim=style_dim,
max_conv_dim=max_conv_dim,
w_hpf=w_hpf,
F0_channel=F0_channel)
mapping_network = MappingNetwork(
latent_dim=latent_dim,
style_dim=style_dim,
num_domains=num_domains,
hidden_dim=max_conv_dim)
style_encoder = StyleEncoder(
dim_in=dim_in,
style_dim=style_dim,
num_domains=num_domains,
max_conv_dim=max_conv_dim)
starganv2vc_model_param = paddle.load(starganv2vc_model_dir) starganv2vc_model_param = paddle.load(starganv2vc_model_dir)
generator.set_state_dict(starganv2vc_model_param['generator_params']) generator.set_state_dict(starganv2vc_model_param['generator_params'])
mapping_network.set_state_dict( mapping_network.set_state_dict(
starganv2vc_model_param['mapping_network_params']) starganv2vc_model_param['mapping_network_params'])
style_encoder.set_state_dict( style_encoder.set_state_dict(
starganv2vc_model_param['style_encoder_params']) starganv2vc_model_param['style_encoder_params'])
generator.eval() generator.eval()
mapping_network.eval() mapping_network.eval()
style_encoder.eval() style_encoder.eval()
model_dict['F0_model'] = F0_model model_dict['F0_model'] = F0_model
model_dict['vocoder'] = vocoder model_dict['vocoder'] = vocoder
model_dict['generator'] = generator model_dict['generator'] = generator
@ -154,13 +136,13 @@ def get_models(uncompress_path):
return model_dict return model_dict
def voice_conversion(source_path, output_dir, uncompress_path): def voice_conversion(args, uncompress_path):
speakers = [ speakers = [
225, 228, 229, 230, 231, 233, 236, 239, 240, 244, 226, 227, 232, 243, 225, 228, 229, 230, 231, 233, 236, 239, 240, 244, 226, 227, 232, 243,
254, 256, 258, 259, 270, 273 254, 256, 258, 259, 270, 273
] ]
demo_dir = os.path.join(uncompress_path, 'Demo/VCTK-corpus/') demo_dir = os.path.join(uncompress_path, 'Demo/VCTK-corpus/')
model_dict = get_models(uncompress_path) model_dict = get_models(args, uncompress_path=uncompress_path)
style_encoder = model_dict['style_encoder'] style_encoder = model_dict['style_encoder']
mapping_network = model_dict['mapping_network'] mapping_network = model_dict['mapping_network']
generator = model_dict['generator'] generator = model_dict['generator']
@ -182,9 +164,9 @@ def voice_conversion(source_path, output_dir, uncompress_path):
style_encoder=style_encoder, style_encoder=style_encoder,
mapping_network=mapping_network) mapping_network=mapping_network)
wave, sr = librosa.load(source_path, sr=24000) wave, sr = librosa.load(args.source_path, sr=24000)
source = preprocess(wave, mel_extractor) source = preprocess(wave, mel_extractor)
output_dir = Path(output_dir) output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True) output_dir.mkdir(parents=True, exist_ok=True)
orig_wav_name = str(output_dir / 'orig_voc.wav') orig_wav_name = str(output_dir / 'orig_voc.wav')
print('原始语音 (使用声码器解码): %s' % orig_wav_name) print('原始语音 (使用声码器解码): %s' % orig_wav_name)
@ -244,6 +226,11 @@ def parse_args():
description="StarGANv2-VC Voice Conversion.") description="StarGANv2-VC Voice Conversion.")
parser.add_argument("--source_path", type=str, help="source audio's path.") parser.add_argument("--source_path", type=str, help="source audio's path.")
parser.add_argument("--output_dir", type=str, help="output dir.") parser.add_argument("--output_dir", type=str, help="output dir.")
parser.add_argument(
'--config_path',
type=str,
default=None,
help='Config of StarGANv2-VC model.')
parser.add_argument( parser.add_argument(
"--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
args = parser.parse_args() args = parser.parse_args()
@ -261,10 +248,7 @@ def main():
model_version = '1.0' model_version = '1.0'
uncompress_path = download_and_decompress(StarGANv2VC_source[model_version], uncompress_path = download_and_decompress(StarGANv2VC_source[model_version],
MODEL_HOME) MODEL_HOME)
voice_conversion( voice_conversion(args, uncompress_path=uncompress_path)
source_path=args.source_path,
output_dir=args.output_dir,
uncompress_path=uncompress_path)
if __name__ == "__main__": if __name__ == "__main__":

Loading…
Cancel
Save