# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path
from typing import Dict
from typing import List
from typing import Union

import numpy as np
import paddle
import yaml
from sedit_arg_parser import parse_args
from yacs.config import CfgNode

from paddlespeech.t2s.exps.syn_utils import get_am_inference
from paddlespeech.t2s.exps.syn_utils import get_voc_inference


def read_2col_text(path: Union[Path, str]) -> Dict[str, str]:
    """Read a 2-column text file as a dict object.

    Examples:
        wav.scp:
            key1 /some/path/a.wav
            key2 /some/path/b.wav

        >>> read_2col_text('wav.scp')
        {'key1': '/some/path/a.wav', 'key2': '/some/path/b.wav'}
    """
    data = {}
    with Path(path).open("r", encoding="utf-8") as f:
        for linenum, line in enumerate(f, 1):
            sps = line.rstrip().split(maxsplit=1)
            if len(sps) == 1:
                k, v = sps[0], ""
            else:
                k, v = sps
            if k in data:
                raise RuntimeError(f"{k} is duplicated ({path}:{linenum})")
            data[k] = v
    return data


def load_num_sequence_text(path: Union[Path, str], loader_type: str="csv_int"
                           ) -> Dict[str, List[Union[float, int]]]:
    """Read a text file indicating sequences of numbers.

    Examples:
        text:
            key1 1 2 3
            key2 34 5 6

        >>> d = load_num_sequence_text('text')
        >>> np.testing.assert_array_equal(d["key1"], np.array([1, 2, 3]))
    """
    if loader_type == "text_int":
        delimiter = " "
        dtype = int
    elif loader_type == "text_float":
        delimiter = " "
        dtype = float
    elif loader_type == "csv_int":
        delimiter = ","
        dtype = int
    elif loader_type == "csv_float":
        delimiter = ","
        dtype = float
    else:
        raise ValueError(f"Not supported loader_type={loader_type}")
    # path looks like:
    #   utta 1,0
    #   uttb 3,4,5
    # -> return {'utta': [1, 0],
    #            'uttb': [3, 4, 5]}
    d = read_2col_text(path)
    # Using for-loop instead of dict-comprehension for debuggability
    retval = {}
    for k, v in d.items():
        try:
            retval[k] = [dtype(i) for i in v.split(delimiter)]
        except (TypeError, ValueError):
            print(f'Error happened with path="{path}", id="{k}", value="{v}"')
            raise
    return retval


def is_chinese(ch):
    """Return True if the character lies in the CJK Unified Ideographs block."""
    return u'\u4e00' <= ch <= u'\u9fff'


def get_voc_out(mel):
    # vocoder: synthesize a waveform from the given mel spectrogram
    args = parse_args()
    with open(args.voc_config) as f:
        voc_config = CfgNode(yaml.safe_load(f))

    voc_inference = get_voc_inference(
        voc=args.voc,
        voc_config=voc_config,
        voc_ckpt=args.voc_ckpt,
        voc_stat=args.voc_stat)

    with paddle.no_grad():
        wav = voc_inference(mel)
    return np.squeeze(wav)


def eval_durs(phns, target_lang="chinese", fs=24000, hop_length=300):
    # Use a pretrained FastSpeech2 acoustic model to predict phone durations.
    args = parse_args()

    if target_lang == 'english':
        args.am = "fastspeech2_ljspeech"
        args.am_config = "download/fastspeech2_nosil_ljspeech_ckpt_0.5/default.yaml"
        args.am_ckpt = "download/fastspeech2_nosil_ljspeech_ckpt_0.5/snapshot_iter_100000.pdz"
        args.am_stat = "download/fastspeech2_nosil_ljspeech_ckpt_0.5/speech_stats.npy"
        args.phones_dict = "download/fastspeech2_nosil_ljspeech_ckpt_0.5/phone_id_map.txt"

    elif target_lang == 'chinese':
        args.am = "fastspeech2_csmsc"
        args.am_config = "download/fastspeech2_conformer_baker_ckpt_0.5/conformer.yaml"
        args.am_ckpt = "download/fastspeech2_conformer_baker_ckpt_0.5/snapshot_iter_76000.pdz"
        args.am_stat = "download/fastspeech2_conformer_baker_ckpt_0.5/speech_stats.npy"
        args.phones_dict = "download/fastspeech2_conformer_baker_ckpt_0.5/phone_id_map.txt"

    if args.ngpu == 0:
        paddle.set_device("cpu")
    elif args.ngpu > 0:
        paddle.set_device("gpu")
    else:
        print("ngpu should be >= 0 !")

    # Init body.
    with open(args.am_config) as f:
        am_config = CfgNode(yaml.safe_load(f))

    am_inference, am = get_am_inference(
        am=args.am,
        am_config=am_config,
        am_ckpt=args.am_ckpt,
        am_stat=args.am_stat,
        phones_dict=args.phones_dict,
        tones_dict=args.tones_dict,
        speaker_dict=args.speaker_dict,
        return_am=True)

    # Build the phone -> id vocabulary from phone_id_map.txt.
    vocab_phones = {}
    with open(args.phones_dict, "r") as f:
        phn_id = [line.strip().split() for line in f.readlines()]
    for phn, phone_id in phn_id:
        vocab_phones[phn] = int(phone_id)
    vocab_size = len(vocab_phones)
    # Map out-of-vocabulary phones to "sp" (silence).
    phonemes = [phn if phn in vocab_phones else "sp" for phn in phns]

    phone_ids = [vocab_phones[item] for item in phonemes]
    # Append the last vocabulary id as a trailing token; it is dropped again below.
    phone_ids.append(vocab_size - 1)
    phone_ids = paddle.to_tensor(np.array(phone_ids, np.int64))
    # d_outs holds the predicted per-phone durations in frames.
    _, d_outs, _, _ = am.inference(phone_ids, spk_id=None, spk_emb=None)
    pre_d_outs = d_outs
    # Convert frame counts to seconds and drop the appended trailing token.
    phn_durs_new = pre_d_outs * hop_length / fs
    phn_durs_new = phn_durs_new.tolist()[:-1]
    return phn_durs_new
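

# The block below is a minimal usage sketch, not part of the original module.
# It assumes the pretrained FastSpeech2 checkpoints referenced above have been
# downloaded into ./download and that sedit_arg_parser.parse_args() supplies
# the remaining arguments (ngpu, tones_dict, speaker_dict, ...). The phone
# sequence is purely illustrative; unknown phones are mapped to "sp".
if __name__ == "__main__":
    example_phns = ["sp", "h", "ao3", "sp"]
    durs = eval_durs(
        example_phns, target_lang="chinese", fs=24000, hop_length=300)
    print("predicted phone durations (s):", durs)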