diff --git a/examples/csmsc/tts1/local/preprocess.sh b/examples/csmsc/tts1/local/preprocess.sh
index e1acc8e83..f92664cef 100644
--- a/examples/csmsc/tts1/local/preprocess.sh
+++ b/examples/csmsc/tts1/local/preprocess.sh
@@ -10,7 +10,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     echo "Extract features ..."
     python3 ${BIN_DIR}/preprocess.py \
         --dataset=ljspeech \
-        --rootdir=~/datasets/LJSpeech-1.1/ \
+        --rootdir=~/datasets/BZNSYP/ \
         --dumpdir=dump \
         --config-path=conf/default.yaml \
         --num-cpu=8
diff --git a/paddlespeech/t2s/exps/transformer_tts/preprocess_new.py b/paddlespeech/t2s/exps/transformer_tts/preprocess_new.py
new file mode 100644
index 000000000..87f6c7cff
--- /dev/null
+++ b/paddlespeech/t2s/exps/transformer_tts/preprocess_new.py
@@ -0,0 +1,307 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import re
+from concurrent.futures import ThreadPoolExecutor
+from operator import itemgetter
+from pathlib import Path
+from typing import Any
+from typing import Dict
+from typing import List
+
+import jsonlines
+import librosa
+import numpy as np
+import tqdm
+import yaml
+from yacs.config import CfgNode as Configuration
+
+from paddlespeech.t2s.datasets.get_feats import LogMelFBank
+from paddlespeech.t2s.frontend import Chinese
+from paddlespeech.t2s.frontend import English
+
+
+def get_lj_sentences(file_name, frontend):
+    '''read LJSpeech metadata.csv and phoneticize the raw text
+
+    Args:
+        file_name (str or Path)
+    Returns:
+        Dict: sentence: {'utt': ([phone], speaker)}
+        set: speaker_set
+    '''
+    f = open(file_name, 'r')
+    sentence = {}
+    speaker_set = set()
+    for line in f:
+        line_list = line.strip().split('|')
+        utt = line_list[0]
+        speaker = utt.split("-")[0][:2]
+        speaker_set.add(speaker)
+        raw_text = line_list[-1]
+        phonemes = frontend.phoneticize(raw_text)
+        phonemes = phonemes[1:-1]
+        phonemes = [phn for phn in phonemes if not phn.isspace()]
+        sentence[utt] = (phonemes, speaker)
+    f.close()
+    return sentence, speaker_set
+
+
+def get_csmsc_sentences(file_name, frontend):
+    '''read the CSMSC (BZNSYP) label file and phoneticize the raw text
+
+    The label file interleaves two lines per utterance: an
+    "id<TAB>prosody-annotated text" line followed by a pinyin line,
+    so only even-indexed lines are consumed here.
+
+    Args:
+        file_name (str or Path)
+    Returns:
+        Dict: sentence: {'utt': ([phone], speaker)}
+        set: speaker_set
+    '''
+    sentence = {}
+    speaker_set = set()
+    # CSMSC is a single-speaker (female) corpus
+    speaker = 'girl'
+    with open(file_name, mode='r', encoding='utf-8') as f:
+        lines = f.readlines()
+    for i, ann in enumerate(lines):
+        if i % 2 == 0:
+            head = ann.strip('\n|\t').split('\t')
+            # drop prosody marks (#1 ~ #4) from the raw text
+            body = re.sub(r'[0-9]|#', '', head[-1])
+            phonemes = frontend.phoneticize(body)
+            phonemes = phonemes[1:-1]
+            phonemes = [phn for phn in phonemes if not phn.isspace()]
+            sentence[head[0]] = (phonemes, speaker)
+            speaker_set.add(speaker)
+    return sentence, speaker_set
+
+
+def get_input_token(sentence, output_path):
+    '''get phone set from training data and save it
+
+    Args:
+        sentence (Dict): sentence: {'utt': ([phone], speaker)}
+        output_path (str or path): path to save phone_id_map
+    '''
+    phn_token = set()
+    for utt in sentence:
+        for phn in sentence[utt][0]:
+            if phn != "<eos>":
+                phn_token.add(phn)
+    phn_token = list(phn_token)
+    phn_token.sort()
+    phn_token = ["<pad>", "<unk>"] + phn_token
+    phn_token += ["<eos>"]
+
+    with open(output_path, 'w') as f:
+        for i, phn in enumerate(phn_token):
+            f.write(phn + ' ' + str(i) + '\n')
+
+
+def get_spk_id_map(speaker_set, output_path):
+    speakers = sorted(list(speaker_set))
+    with open(output_path, 'w') as f:
+        for i, spk in enumerate(speakers):
+            f.write(spk + ' ' + str(i) + '\n')
+
+
+def process_sentence(config: Dict[str, Any],
+                     fp: Path,
+                     sentences: Dict,
+                     output_dir: Path,
+                     mel_extractor=None):
+    utt_id = fp.stem
+    record = None
+    if utt_id in sentences:
+        # reading, resampling may occur
+        wav, _ = librosa.load(str(fp), sr=config.fs)
+        # skip utterances that are not mono-channel or look different
+        # from normalized 16 bit PCM, instead of aborting the whole run
+        if len(wav.shape) != 1 or np.abs(wav).max() > 1.0:
+            return record
+        phones = sentences[utt_id][0]
+        speaker = sentences[utt_id][1]
+        logmel = mel_extractor.get_log_mel_fbank(wav, base='e')
+        num_frames = logmel.shape[0]
+        mel_dir = output_dir / "data_speech"
+        mel_dir.mkdir(parents=True, exist_ok=True)
+        mel_path = mel_dir / (utt_id + "_speech.npy")
+        np.save(mel_path, logmel)
+        record = {
+            "utt_id": utt_id,
+            "phones": phones,
+            "text_lengths": len(phones),
+            "speech_lengths": num_frames,
+            "speech": str(mel_path),
+            "speaker": speaker
+        }
+    return record
+
+
+def process_sentences(config,
+                      fps: List[Path],
+                      sentences: Dict,
+                      output_dir: Path,
+                      mel_extractor=None,
+                      nprocs: int=1):
+    if nprocs == 1:
+        results = []
+        for fp in tqdm.tqdm(fps, total=len(fps)):
+            record = process_sentence(
+                config=config,
+                fp=fp,
+                sentences=sentences,
+                output_dir=output_dir,
+                mel_extractor=mel_extractor)
+            if record:
+                results.append(record)
+    else:
+        with ThreadPoolExecutor(nprocs) as pool:
+            futures = []
+            with tqdm.tqdm(total=len(fps)) as progress:
+                for fp in fps:
+                    future = pool.submit(process_sentence, config, fp,
+                                         sentences, output_dir, mel_extractor)
+                    future.add_done_callback(lambda p: progress.update())
+                    futures.append(future)
+
+                results = []
+                for ft in futures:
+                    record = ft.result()
+                    if record:
+                        results.append(record)
+
+    results.sort(key=itemgetter("utt_id"))
+    with jsonlines.open(output_dir / "metadata.jsonl", 'w') as writer:
+        for item in results:
+            writer.write(item)
+    print("Done")
+
+
+def main():
+    # parse config and args
+    parser = argparse.ArgumentParser(
+        description="Preprocess audio and then extract features.")
+
+    parser.add_argument(
+        "--dataset",
+        default="csmsc",
+        type=str,
+        help="name of dataset, should be in {ljspeech, csmsc} now")
+
+    parser.add_argument(
+        "--rootdir",
+        default='./BZNSYP/',
+        type=str,
+        help="directory to dataset.")
+
+    parser.add_argument(
+        "--dumpdir",
+        type=str,
+        default='./dump/',
+        help="directory to dump feature files.")
+
+    parser.add_argument(
+        "--config-path",
+        default="./default.yaml",
+        type=str,
+        help="yaml format configuration file.")
+
+    parser.add_argument(
+        "--num-cpu", type=int, default=1, help="number of process.")
+
+    args = parser.parse_args()
+    config_path = Path(args.config_path).resolve()
+    root_dir = Path(args.rootdir).expanduser()
+    dumpdir = Path(args.dumpdir).expanduser()
+    # use absolute path
+    dumpdir = dumpdir.resolve()
+    dumpdir.mkdir(parents=True, exist_ok=True)
+
+    assert root_dir.is_dir()
+
+    with open(config_path, 'rt') as f:
+        _C = yaml.safe_load(f)
+        _C = Configuration(_C)
+        config = _C.clone()
+
+    phone_id_map_path = dumpdir / "phone_id_map.txt"
+    speaker_id_map_path = dumpdir / "speaker_id_map.txt"
+
+    if args.dataset == "csmsc":
+        wav_files = sorted(list((root_dir / "Wave").rglob("*.wav")))
+        frontend = Chinese()
+        sentences, speaker_set = get_csmsc_sentences(
+            root_dir / "000001-010000.txt", frontend)
+        get_input_token(sentences, phone_id_map_path)
+        get_spk_id_map(speaker_set, speaker_id_map_path)
+        # split data into 3 sections
+        num_train = 9000
+        num_dev = 100
+        train_wav_files = wav_files[:num_train]
+        dev_wav_files = wav_files[num_train:num_train + num_dev]
+        test_wav_files = wav_files[num_train + num_dev:]
+    elif args.dataset == "ljspeech":
+        wav_files = sorted(list((root_dir / "wavs").rglob("*.wav")))
+        frontend = English()
+        sentences, speaker_set = get_lj_sentences(
+            root_dir / "metadata.csv", frontend)
+        get_input_token(sentences, phone_id_map_path)
+        get_spk_id_map(speaker_set, speaker_id_map_path)
+        # split data into 3 sections
+        num_train = 12900
+        num_dev = 100
+        train_wav_files = wav_files[:num_train]
+        dev_wav_files = wav_files[num_train:num_train + num_dev]
+        test_wav_files = wav_files[num_train + num_dev:]
+    else:
+        raise ValueError("--dataset should be in {ljspeech, csmsc} now.")
+
+    train_dump_dir = dumpdir / "train" / "raw"
+    train_dump_dir.mkdir(parents=True, exist_ok=True)
+    dev_dump_dir = dumpdir / "dev" / "raw"
+    dev_dump_dir.mkdir(parents=True, exist_ok=True)
+    test_dump_dir = dumpdir / "test" / "raw"
+    test_dump_dir.mkdir(parents=True, exist_ok=True)
+
+    # Extractor
+    mel_extractor = LogMelFBank(
+        sr=config.fs,
+        n_fft=config.n_fft,
+        hop_length=config.n_shift,
+        win_length=config.win_length,
+        window=config.window,
+        n_mels=config.n_mels,
+        fmin=config.fmin,
+        fmax=config.fmax)
+
+    # process for the 3 sections
+    if train_wav_files:
+        process_sentences(
+            config=config,
+            fps=train_wav_files,
+            sentences=sentences,
+            output_dir=train_dump_dir,
+            mel_extractor=mel_extractor,
+            nprocs=args.num_cpu)
+    if dev_wav_files:
+        process_sentences(
+            config=config,
+            fps=dev_wav_files,
+            sentences=sentences,
+            output_dir=dev_dump_dir,
+            mel_extractor=mel_extractor,
+            nprocs=args.num_cpu)
+    if test_wav_files:
+        process_sentences(
+            config=config,
+            fps=test_wav_files,
+            sentences=sentences,
+            output_dir=test_dump_dir,
+            mel_extractor=mel_extractor,
+            nprocs=args.num_cpu)
+
+
+if __name__ == "__main__":
+    main()