pull/2842/head
TianYuan 3 years ago
parent d54a0967c9
commit d50c03e7fa

@@ -1,7 +1,8 @@
 #!/bin/bash
 
-config_path=$1
-train_output_path=$2
-ckpt_name=$3
+source_path=$1
+output_dir=$2
 
-python3 ${BIN_DIR}/vc.py
+python3 ${BIN_DIR}/vc.py \
+    --source_path=${source_path} \
+    --output_dir=${output_dir}
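The rewritten wrapper now takes just two positional arguments: the source wav and the output directory. A minimal usage sketch, assuming the recipe's path.sh has been sourced so that BIN_DIR is defined, and using the default paths from run.sh:

CUDA_VISIBLE_DEVICES=0 ./local/voice_conversion.sh test_wav/goat_01.wav vc_output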

@ -10,6 +10,8 @@ stop_stage=100
conf_path=conf/default.yaml conf_path=conf/default.yaml
train_output_path=exp/default train_output_path=exp/default
ckpt_name=snapshot_iter_331.pdz ckpt_name=snapshot_iter_331.pdz
source_path=test_wav/goat_01.wav
output_dir=vc_output
# with the following command, you can choose the stage range you want to run # with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0` # such as `./run.sh --stage 0 --stop-stage 0`
@@ -28,6 +30,6 @@ fi
 
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     # synthesize, vocoder is pwgan by default
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_conversion.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_conversion.sh ${source_path} ${output_dir} || exit -1
 fi
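Since the voice conversion call sits in stage 2, it can be run on its own with the stage-range flags already documented above; if run.sh sources parse_options.sh like the other recipes, the new source_path/output_dir variables can be overridden the same way (the values shown are the defaults):

./run.sh --stage 2 --stop-stage 2
./run.sh --stage 2 --stop-stage 2 --source_path test_wav/goat_01.wav --output_dir vc_output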

@@ -11,10 +11,12 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import argparse
+import os
 import time
+from pathlib import Path
 
 import librosa
-import numpy as np
 import paddle
 import soundfile as sf
 import yaml
@@ -27,20 +29,19 @@ from paddlespeech.t2s.models.starganv2_vc import JDCNet
 from paddlespeech.t2s.models.starganv2_vc import MappingNetwork
 from paddlespeech.t2s.models.starganv2_vc import StyleEncoder
 
-jdc_modeldir = '/home/yuantian01/PaddleSpeech_stargan/PaddleSpeech/stargan_models/jdcnet.pdz'
-# retrained for stargan
-voc_modeldir = '/home/yuantian01/PaddleSpeech_stargan/PaddleSpeech/stargan_models/Vocoder/'
-starganv2vc_modeldir = '/home/yuantian01/PaddleSpeech_stargan/PaddleSpeech/stargan_models/starganv2vc.pdz'
-
-sr = 16000
-n_fft = 2048
-win_length = 1200
-hop_length = 300
-n_mels = 80
-fmin = 0
-fmax = sr // 2
-
-mel_extractor = LogMelFBank(
+uncompress_path = '/home/yuantian01/PaddleSpeech_stargan/PaddleSpeech/stargan_models/'
+
+
+def get_mel_extractor():
+    sr = 16000
+    n_fft = 2048
+    win_length = 1200
+    hop_length = 300
+    n_mels = 80
+    fmin = 0
+    fmax = sr // 2
+    mel_extractor = LogMelFBank(
     sr=sr,
     n_fft=n_fft,
     hop_length=hop_length,
@@ -52,25 +53,21 @@ mel_extractor = LogMelFBank(
     htk=True,
     power=2.0)
-
-speakers = [
-    225, 228, 229, 230, 231, 233, 236, 239, 240, 244, 226, 227, 232, 243, 254,
-    256, 258, 259, 270, 273
-]
-
-mean, std = -4, 4
-
-
-def preprocess(wave):
+    return mel_extractor
+
+
+def preprocess(wave, mel_extractor):
     logmel = mel_extractor.get_log_mel_fbank(wave, base='e')
     # [1, 80, 1011]
+    mean, std = -4, 4
     mel_tensor = (paddle.to_tensor(logmel.T).unsqueeze(0) - mean) / std
     return mel_tensor
 
 
-def compute_style(speaker_dicts):
+def compute_style(speaker_dicts, mel_extractor, style_encoder, mapping_network):
     reference_embeddings = {}
     for key, (path, speaker) in speaker_dicts.items():
-        if path == "":
+        if path == '':
             label = paddle.to_tensor([speaker], dtype=paddle.int64)
             latent_dim = mapping_network.shared[0].weight.shape[0]
             ref = mapping_network(paddle.randn([1, latent_dim]), label)
@@ -89,86 +86,118 @@ def compute_style(speaker_dicts):
     return reference_embeddings
 
 
-F0_model = JDCNet(num_class=1, seq_len=192)
-i = 0
-F0_model.set_state_dict(paddle.load(jdc_modeldir)['main_params'])
-F0_model.eval()
-with open(voc_modeldir + 'config.yml') as f:
+def get_models():
+    model_dict = {}
+    jdc_model_dir = os.path.join(uncompress_path, 'jdcnet.pdz')
+    voc_model_dir = os.path.join(uncompress_path, 'Vocoder/')
+    starganv2vc_model_dir = os.path.join(uncompress_path, 'starganv2vc.pdz')
+
+    F0_model = JDCNet(num_class=1, seq_len=192)
+    F0_model.set_state_dict(paddle.load(jdc_model_dir)['main_params'])
+    F0_model.eval()
+
+    voc_config_path = os.path.join(voc_model_dir, 'config.yml')
+    with open(voc_config_path) as f:
         voc_config = CfgNode(yaml.safe_load(f))
     voc_config["generator_params"].pop("upsample_net")
     voc_config["generator_params"]["upsample_scales"] = voc_config[
         "generator_params"].pop("upsample_params")["upsample_scales"]
     vocoder = PWGGenerator(**voc_config["generator_params"])
     vocoder.remove_weight_norm()
     vocoder.eval()
-vocoder.set_state_dict(paddle.load(voc_modeldir + 'checkpoint-400000steps.pd'))
-dim_in = 64
-style_dim = 64
-latent_dim = 16
-num_domains = 20
-max_conv_dim = 512
-n_repeat = 4
-w_hpf = 0
-F0_channel = 256
-generator = Generator(
+    voc_model_path = os.path.join(voc_model_dir, 'checkpoint-400000steps.pd')
+    vocoder.set_state_dict(paddle.load(voc_model_path))
+
+    dim_in = 64
+    style_dim = 64
+    latent_dim = 16
+    num_domains = 20
+    max_conv_dim = 512
+    n_repeat = 4
+    w_hpf = 0
+    F0_channel = 256
+
+    generator = Generator(
         dim_in=dim_in,
         style_dim=style_dim,
         max_conv_dim=max_conv_dim,
         w_hpf=w_hpf,
         F0_channel=F0_channel)
-mapping_network = MappingNetwork(
+    mapping_network = MappingNetwork(
         latent_dim=latent_dim,
         style_dim=style_dim,
         num_domains=num_domains,
         hidden_dim=max_conv_dim)
-style_encoder = StyleEncoder(
+    style_encoder = StyleEncoder(
         dim_in=dim_in,
         style_dim=style_dim,
         num_domains=num_domains,
         max_conv_dim=max_conv_dim)
-starganv2vc_model_param = paddle.load(starganv2vc_modeldir)
+
+    starganv2vc_model_param = paddle.load(starganv2vc_model_dir)
     generator.set_state_dict(starganv2vc_model_param['generator_params'])
     mapping_network.set_state_dict(
         starganv2vc_model_param['mapping_network_params'])
-style_encoder.set_state_dict(starganv2vc_model_param['style_encoder_params'])
-generator.eval()
-mapping_network.eval()
-style_encoder.eval()
-# compute the styles of the speakers under the Demo folder
-speaker_dicts = {}
-selected_speakers = [273, 259, 258, 243, 254, 244, 236, 233, 230, 228]
-for s in selected_speakers:
+    style_encoder.set_state_dict(
+        starganv2vc_model_param['style_encoder_params'])
+    generator.eval()
+    mapping_network.eval()
+    style_encoder.eval()
+
+    model_dict['F0_model'] = F0_model
+    model_dict['vocoder'] = vocoder
+    model_dict['generator'] = generator
+    model_dict['mapping_network'] = mapping_network
+    model_dict['style_encoder'] = style_encoder
+    return model_dict
+
+
+def voice_conversion(args):
+    speakers = [
+        225, 228, 229, 230, 231, 233, 236, 239, 240, 244, 226, 227, 232, 243,
+        254, 256, 258, 259, 270, 273
+    ]
+    demo_dir = os.path.join(uncompress_path, 'Demo/VCTK-corpus/')
+    model_dict = get_models()
+    style_encoder = model_dict['style_encoder']
+    mapping_network = model_dict['mapping_network']
+    generator = model_dict['generator']
+    vocoder = model_dict['vocoder']
+    F0_model = model_dict['F0_model']
+
+    # compute the styles of the speakers under the Demo folder
+    speaker_dicts = {}
+    selected_speakers = [273, 259, 258, 243, 254, 244, 236, 233, 230, 228]
+    for s in selected_speakers:
         k = s
         speaker_dicts['p' + str(s)] = (
-        'Demo/VCTK-corpus/p' + str(k) + '/p' + str(k) + '_023.wav',
+            demo_dir + 'p' + str(k) + '/p' + str(k) + '_023.wav',
             speakers.index(s))
     print("speaker_dicts:", speaker_dicts)
-reference_embeddings = compute_style(speaker_dicts)
-# print("reference_embeddings:", reference_embeddings)
-# ============================================================================
-# change this to the clean, low-noise wav file you uploaded
-wav_path = 'goat_01.wav'
-audio, source_sr = librosa.load(wav_path, sr=24000)
-audio = audio / np.max(np.abs(audio))
-audio.dtype = np.float32
-start = time.time()
-source = preprocess(audio)
-keys = []
-converted_samples = {}
-reconstructed_samples = {}
-converted_mels = {}
+    mel_extractor = get_mel_extractor()
+    reference_embeddings = compute_style(speaker_dicts, mel_extractor,
+                                         style_encoder, mapping_network)
+
+    wave, sr = librosa.load(args.source_path, sr=24000)
+    source = preprocess(wave, mel_extractor)
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    orig_wav_name = str(output_dir / 'orig_voc.wav')
+    print('Original speech (decoded with the vocoder): %s' % orig_wav_name)
+    c = source.transpose([0, 2, 1]).squeeze()
+    with paddle.no_grad():
+        recon = vocoder.inference(c)
+        recon = recon.reshape([-1]).numpy()
+    sf.write(orig_wav_name, recon, samplerate=24000)
+
+    keys = []
+    converted_samples = {}
+    reconstructed_samples = {}
+    converted_mels = {}
+    start = time.time()
 
     for key, (ref, _) in reference_embeddings.items():
         with paddle.no_grad():
             # can the features fed to F0_model skip the norm, or must the norm match the original StarGAN recipe?
             # !! the ASR model and F0_model must use the same data preprocessing
@@ -178,16 +207,14 @@ for key, (ref, _) in reference_embeddings.items():
             f0_feat = F0_model.get_feature_GAN(source.unsqueeze(1))
             # the output is a normalized mel, so vocoder.inference can be used directly
             out = generator(source.unsqueeze(1), ref, F0=f0_feat)
             c = out.transpose([0, 1, 3, 2]).squeeze()
             y_out = vocoder.inference(c)
             y_out = y_out.reshape([-1])
             if key not in speaker_dicts or speaker_dicts[key][0] == "":
                 recon = None
             else:
                 wave, sr = librosa.load(speaker_dicts[key][0], sr=24000)
-                mel = preprocess(wave)
+                mel = preprocess(wave, mel_extractor)
                 c = mel.transpose([0, 2, 1]).squeeze()
                 recon = vocoder.inference(c)
                 recon = recon.reshape([-1]).numpy()
@@ -196,26 +223,40 @@ for key, (ref, _) in reference_embeddings.items():
         reconstructed_samples[key] = recon
         converted_mels[key] = out
         keys.append(key)
-end = time.time()
-print('Total time: %.3f sec' % (end - start))
-print('Original speech (decoded with the vocoder):')
-wave, sr = librosa.load(wav_path, sr=24000)
-mel = preprocess(wave)
-c = mel.transpose([0, 2, 1]).squeeze()
-with paddle.no_grad():
-    recon = vocoder.inference(c)
-    recon = recon.reshape([-1]).numpy()
-# display(ipd.Audio(recon, rate=24000))
-sf.write('orig_voc.wav', recon, samplerate=24000)
-for key, wave in converted_samples.items():
-    wav_name = 'vc_result_' + key + '.wav'
+    end = time.time()
+    print('Total time: %.3f sec' % (end - start))
+
+    for key, wave in converted_samples.items():
+        wav_name = str(output_dir / ('vc_result_' + key + '.wav'))
         print('Voice conversion result: %s' % wav_name)
         sf.write(wav_name, wave, samplerate=24000)
-    ref_wav_name = 'ref_voc_' + key + '.wav'
+        ref_wav_name = str(output_dir / ('ref_voc_' + key + '.wav'))
         print('Reference speaker (decoded with the vocoder): %s' % ref_wav_name)
         if reconstructed_samples[key] is not None:
             sf.write(ref_wav_name, reconstructed_samples[key], samplerate=24000)
+
+
+def parse_args():
+    # parse args and config
+    parser = argparse.ArgumentParser(
+        description="StarGANv2-VC Voice Conversion.")
+    parser.add_argument("--source_path", type=str, help="source audio's path.")
+    parser.add_argument("--output_dir", type=str, help="output dir.")
+    parser.add_argument(
+        "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = parse_args()
+    if args.ngpu == 0:
+        paddle.set_device("cpu")
+    elif args.ngpu > 0:
+        paddle.set_device("gpu")
+    else:
+        print("ngpu should >= 0 !")
+
+    voice_conversion(args)
+
+
+if __name__ == "__main__":
+    main()
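For reference, vc.py can also be invoked directly with the flags defined in parse_args above; ${BIN_DIR}/vc.py is the path used by local/voice_conversion.sh and stands in for wherever the script lives in the repo:

python3 ${BIN_DIR}/vc.py \
    --source_path=test_wav/goat_01.wav \
    --output_dir=vc_output \
    --ngpu=0  # 0 runs on CPU; the default of 1 runs on GPU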
