# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import argparse from pathlib import Path import numpy as np import paddle import soundfile as sf import yaml from yacs.config import CfgNode from paddlespeech.t2s.frontend.zh_frontend import Frontend from paddlespeech.t2s.models.fastspeech2 import FastSpeech2 from paddlespeech.t2s.models.fastspeech2 import StyleFastSpeech2Inference from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator from paddlespeech.t2s.models.parallel_wavegan import PWGInference from paddlespeech.t2s.modules.normalizer import ZScore def evaluate(args, fastspeech2_config, pwg_config): # construct dataset for evaluation sentences = [] with open(args.text, 'rt') as f: for line in f: items = line.strip().split() utt_id = items[0] sentence = "".join(items[1:]) sentences.append((utt_id, sentence)) with open(args.phones_dict, "r") as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) print("vocab_size:", vocab_size) odim = fastspeech2_config.n_mels model = FastSpeech2( idim=vocab_size, odim=odim, **fastspeech2_config["model"]) model.set_state_dict( paddle.load(args.fastspeech2_checkpoint)["main_params"]) model.eval() vocoder = PWGGenerator(**pwg_config["generator_params"]) vocoder.set_state_dict(paddle.load(args.pwg_checkpoint)["generator_params"]) vocoder.remove_weight_norm() vocoder.eval() print("model done!") frontend = Frontend(phone_vocab_path=args.phones_dict) print("frontend done!") stat = np.load(args.fastspeech2_stat) mu, std = stat mu = paddle.to_tensor(mu) std = paddle.to_tensor(std) fastspeech2_normalizer = ZScore(mu, std) stat = np.load(args.pwg_stat) mu, std = stat mu = paddle.to_tensor(mu) std = paddle.to_tensor(std) pwg_normalizer = ZScore(mu, std) fastspeech2_inference = StyleFastSpeech2Inference( fastspeech2_normalizer, model, args.fastspeech2_pitch_stat, args.fastspeech2_energy_stat) fastspeech2_inference.eval() pwg_inference = PWGInference(pwg_normalizer, vocoder) pwg_inference.eval() output_dir = Path(args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) styles = ["normal", "robot", "1.2xspeed", "0.8xspeed", "child_voice"] for style in styles: robot = False durations = None durations_scale = None durations_bias = None pitch = None pitch_scale = None pitch_bias = None energy = None energy_scale = None energy_bias = None if style == "robot": # all tones in phones be `1` # all pitch should be the same, we use mean here robot = True if style == "1.2xspeed": durations_scale = 1 / 1.2 if style == "0.8xspeed": durations_scale = 1 / 0.8 if style == "child_voice": pitch_scale = 1.3 sub_output_dir = output_dir / style sub_output_dir.mkdir(parents=True, exist_ok=True) for utt_id, sentence in sentences: input_ids = frontend.get_input_ids( sentence, merge_sentences=True, robot=robot) phone_ids = input_ids["phone_ids"][0] with paddle.no_grad(): mel = fastspeech2_inference( phone_ids, durations=durations, durations_scale=durations_scale, durations_bias=durations_bias, pitch=pitch, pitch_scale=pitch_scale, pitch_bias=pitch_bias, energy=energy, energy_scale=energy_scale, energy_bias=energy_bias, robot=robot) wav = pwg_inference(mel) sf.write( str(sub_output_dir / (utt_id + ".wav")), wav.numpy(), samplerate=fastspeech2_config.fs) print(f"{style}_{utt_id} done!") def main(): # parse args and config and redirect to train_sp parser = argparse.ArgumentParser( description="Synthesize with fastspeech2 & parallel wavegan.") parser.add_argument( "--fastspeech2-config", type=str, help="fastspeech2 config file.") parser.add_argument( "--fastspeech2-checkpoint", type=str, help="fastspeech2 checkpoint to load.") parser.add_argument( "--fastspeech2-stat", type=str, help="mean and standard deviation used to normalize spectrogram when training fastspeech2." ) parser.add_argument( "--fastspeech2-pitch-stat", type=str, help="mean and standard deviation used to normalize pitch when training fastspeech2" ) parser.add_argument( "--fastspeech2-energy-stat", type=str, help="mean and standard deviation used to normalize energy when training fastspeech2." ) parser.add_argument( "--pwg-config", type=str, help="parallel wavegan config file.") parser.add_argument( "--pwg-checkpoint", type=str, help="parallel wavegan generator parameters to load.") parser.add_argument( "--pwg-stat", type=str, help="mean and standard deviation used to normalize spectrogram when training parallel wavegan." ) parser.add_argument( "--phones-dict", type=str, default="phone_id_map.txt", help="phone vocabulary file.") parser.add_argument( "--text", type=str, help="text to synthesize, a 'utt_id sentence' pair per line.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() if args.ngpu == 0: paddle.set_device("cpu") elif args.ngpu > 0: paddle.set_device("gpu") else: print("ngpu should >= 0 !") with open(args.fastspeech2_config) as f: fastspeech2_config = CfgNode(yaml.safe_load(f)) with open(args.pwg_config) as f: pwg_config = CfgNode(yaml.safe_load(f)) print("========Args========") print(yaml.safe_dump(vars(args))) print("========Config========") print(fastspeech2_config) print(pwg_config) evaluate(args, fastspeech2_config, pwg_config) if __name__ == "__main__": main()