diff --git a/.mergify.yml b/.mergify.yml index 3347c6dc..6dae66d0 100644 --- a/.mergify.yml +++ b/.mergify.yml @@ -80,6 +80,12 @@ pull_request_rules: actions: label: add: ["CLI"] + - name: "auto add label=Server" + conditions: + - files~=^paddlespeech/server + actions: + label: + add: ["Server"] - name: "auto add label=Demo" conditions: - files~=^demos/ @@ -130,7 +136,7 @@ pull_request_rules: add: ["Docker"] - name: "auto add label=Deployment" conditions: - - files~=^speechnn/ + - files~=^speechx/ actions: label: add: ["Deployment"] diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2f80e46b..60f0b92f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,11 +1,12 @@ +repos: - repo: https://github.com/pre-commit/mirrors-yapf.git - sha: v0.16.0 + rev: v0.16.0 hooks: - id: yapf files: \.py$ exclude: (?=third_party).*(\.py)$ - repo: https://github.com/pre-commit/pre-commit-hooks - sha: a11d9314b22d8f8c7556443875b731ef05965464 + rev: a11d9314b22d8f8c7556443875b731ef05965464 hooks: - id: check-merge-conflict - id: check-symlinks @@ -31,7 +32,7 @@ - --jobs=1 exclude: (?=third_party).*(\.py)$ - repo : https://github.com/Lucas-C/pre-commit-hooks - sha: v1.0.1 + rev: v1.0.1 hooks: - id: forbid-crlf files: \.md$ diff --git a/CHANGELOG.md b/CHANGELOG.md index 5ffe8098..6e8315e7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,11 +1,46 @@ # Changelog +Date: 2022-1-29, Author: yt605155624. +Add features to: T2S: + - Update aishell3 vc0 with new Tacotron2. + - PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1419 + +Date: 2022-1-29, Author: yt605155624. +Add features to: T2S: + - Add ljspeech Tacotron2. + - PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1416 + +Date: 2022-1-24, Author: yt605155624. +Add features to: T2S: + - Add csmsc WaveRNN. + - PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1379 + +Date: 2022-1-19, Author: yt605155624. +Add features to: T2S: + - Add csmsc Tacotron2. 
+ - PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1314 + Date: 2022-1-10, Author: Jackwaterveg. -Add features to: CLI: - - Support English (librispeech/asr1/transformer). +Add features to: CLI: + - Support English (librispeech/asr1/transformer). - Support choosing `decode_method` for conformer and transformer models. - Refactor the config, using the unified config. - PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1297 *** + +Date: 2022-1-17, Author: Jackwaterveg. +Add features to: CLI: + - Support deepspeech2 online/offline model(aishell). + - PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1356 + +*** + +Date: 2022-1-24, Author: Jackwaterveg. +Add features to: ctc_decoders: + - Support online ctc prefix-beam search decoder. + - Unified ctc online decoder and ctc offline decoder. + - PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/821 + +*** diff --git a/README.md b/README.md index cca1cb53..9a2fe2aa 100644 --- a/README.md +++ b/README.md @@ -16,12 +16,15 @@

- + + + +

@@ -143,6 +146,8 @@ For more synthesized audios, please refer to [PaddleSpeech Text-to-Speech sample
+- [PaddleSpeech Demo Video](https://paddlespeech.readthedocs.io/en/latest/demo_video.html) + ### 🔥 Hot Activities - 2021.12.21~12.24 @@ -236,7 +241,7 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r - Speech Recogination + Speech Recogination Aishell DeepSpeech2 RNN + Conv based Models @@ -249,7 +254,7 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r u2.transformer.conformer-aishell - + Librispeech Transformer based Attention Models @@ -257,6 +262,13 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r + + TIMIT + Unified Streaming & Non-streaming Two-pass + + u2-timit + + Alignment THCHS30 @@ -266,20 +278,13 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r - Language Model + Language Model Ngram Language Model kenlm - - TIMIT - Unified Streaming & Non-streaming Two-pass - - u2-timit - - - + Speech Translation (English to Chinese) TED En-Zh Transformer + ASR MTL @@ -317,14 +322,15 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r Acoustic Model - Tacotron2 - LJSpeech + Tacotron2 + LJSpeech / CSMSC - tacotron2-ljspeech + tacotron2-ljspeech / tacotron2-csmsc Transformer TTS + LJSpeech transformer-ljspeech @@ -344,7 +350,7 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r - Vocoder + Vocoder WaveFlow LJSpeech @@ -378,7 +384,14 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r HiFiGAN-csmsc - + + + WaveRNN + CSMSC + + WaveRNN-csmsc + + Voice Cloning GE2E @@ -416,7 +429,6 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r - Audio Classification ESC-50 @@ -440,7 +452,6 @@ PaddleSpeech supports a series of most popular models. 
They are summarized in [r - Punctuation Restoration IWLST2012_zh @@ -463,7 +474,6 @@ Normally, [Speech SoTA](https://paperswithcode.com/area/speech), [Audio SoTA](ht - [Automatic Speech Recognition](./docs/source/asr/quick_start.md) - [Introduction](./docs/source/asr/models_introduction.md) - [Data Preparation](./docs/source/asr/data_preparation.md) - - [Data Augmentation](./docs/source/asr/augmentation.md) - [Ngram LM](./docs/source/asr/ngram_lm.md) - [Text-to-Speech](./docs/source/tts/quick_start.md) - [Introduction](./docs/source/tts/models_introduction.md) @@ -489,7 +499,17 @@ author={PaddlePaddle Authors}, howpublished = {\url{https://github.com/PaddlePaddle/PaddleSpeech}}, year={2021} } + +@inproceedings{zheng2021fused, + title={Fused acoustic and text encoding for multimodal bilingual pretraining and speech translation}, + author={Zheng, Renjie and Chen, Junkun and Ma, Mingbo and Huang, Liang}, + booktitle={International Conference on Machine Learning}, + pages={12736--12746}, + year={2021}, + organization={PMLR} +} ``` + ## Contribute to PaddleSpeech @@ -540,6 +560,7 @@ You are warmly welcome to submit questions in [discussions](https://github.com/P - Many thanks to [mymagicpower](https://github.com/mymagicpower) for the Java implementation of ASR upon [short](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk) and [long](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk) audio files. - Many thanks to [JiehangXie](https://github.com/JiehangXie)/[PaddleBoBo](https://github.com/JiehangXie/PaddleBoBo) for developing Virtual Uploader(VUP)/Virtual YouTuber(VTuber) with PaddleSpeech TTS function. - Many thanks to [745165806](https://github.com/745165806)/[PaddleSpeechTask](https://github.com/745165806/PaddleSpeechTask) for contributing Punctuation Restoration model. +- Many thanks to [kslz](https://github.com/kslz) for supplementary Chinese documents. 
Besides, PaddleSpeech depends on a lot of open source repositories. See [references](./docs/source/reference.md) for more information. diff --git a/README_cn.md b/README_cn.md index ddf189c3..409b7a25 100644 --- a/README_cn.md +++ b/README_cn.md @@ -147,6 +147,8 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme
+- [PaddleSpeech 示例视频](https://paddlespeech.readthedocs.io/en/latest/demo_video.html) + ### 🔥 热门活动 @@ -233,7 +235,7 @@ PaddleSpeech 的 **语音转文本** 包含语音识别声学模型、语音识 - 语音识别 + 语音识别 Aishell DeepSpeech2 RNN + Conv based Models @@ -254,6 +256,13 @@ PaddleSpeech 的 **语音转文本** 包含语音识别声学模型、语音识 + + TIMIT + Unified Streaming & Non-streaming Two-pass + + u2-timit + + 对齐 THCHS30 @@ -263,19 +272,12 @@ PaddleSpeech 的 **语音转文本** 包含语音识别声学模型、语音识 - 语言模型 + 语言模型 Ngram 语言模型 kenlm - - TIMIT - Unified Streaming & Non-streaming Two-pass - - u2-timit - - 语音翻译(英译中) TED En-Zh @@ -315,14 +317,15 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 声学模型 - Tacotron2 - LJSpeech + Tacotron2 + LJSpeech / CSMSC - tacotron2-ljspeech + tacotron2-ljspeech / tacotron2-csmsc Transformer TTS + LJSpeech transformer-ljspeech @@ -342,7 +345,7 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 - 声码器 + 声码器 WaveFlow LJSpeech @@ -376,7 +379,14 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 HiFiGAN-csmsc - + + + WaveRNN + CSMSC + + WaveRNN-csmsc + + 声音克隆 GE2E @@ -415,8 +425,6 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 - - 声音分类 ESC-50 @@ -440,7 +448,6 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 - 标点恢复 IWLST2012_zh @@ -468,7 +475,6 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 - [语音识别自定义训练](./docs/source/asr/quick_start.md) - [简介](./docs/source/asr/models_introduction.md) - [数据准备](./docs/source/asr/data_preparation.md) - - [数据增强](./docs/source/asr/augmentation.md) - [Ngram 语言模型](./docs/source/asr/ngram_lm.md) - [语音合成自定义训练](./docs/source/tts/quick_start.md) - [简介](./docs/source/tts/models_introduction.md) @@ -549,6 +555,7 @@ year={2021} - 非常感谢 [mymagicpower](https://github.com/mymagicpower) 采用PaddleSpeech 对 ASR 的[短语音](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk)及[长语音](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk)进行 Java 实现。 - 非常感谢 [JiehangXie](https://github.com/JiehangXie)/[PaddleBoBo](https://github.com/JiehangXie/PaddleBoBo) 采用 PaddleSpeech 语音合成功能实现 Virtual Uploader(VUP)/Virtual 
YouTuber(VTuber) 虚拟主播。 - 非常感谢 [745165806](https://github.com/745165806)/[PaddleSpeechTask](https://github.com/745165806/PaddleSpeechTask) 贡献标点重建相关模型。 +- 非常感谢 [kslz](https://github.com/kslz) 补充中文文档。 此外,PaddleSpeech 依赖于许多开源存储库。有关更多信息,请参阅 [references](./docs/source/reference.md)。 diff --git a/dataset/voxceleb/README.md b/dataset/voxceleb/README.md new file mode 100644 index 00000000..3efb3519 --- /dev/null +++ b/dataset/voxceleb/README.md @@ -0,0 +1,10 @@ +# [VoxCeleb](http://www.robots.ox.ac.uk/~vgg/data/voxceleb/) +VoxCeleb is an audio-visual dataset consisting of short clips of human speech, extracted from interview videos uploaded to YouTube. + +VoxCeleb contains speech from speakers spanning a wide range of different ethnicities, accents, professions and ages. +All speaking face-tracks are captured "in the wild", with background chatter, laughter, overlapping speech, pose variation and different lighting conditions. +VoxCeleb consists of both audio and video. Each segment is at least 3 seconds long. + +The dataset consists of two versions, VoxCeleb1 and VoxCeleb2. Each version has its own train/test split. For each we provide YouTube URLs, face detections and tracks, audio files, cropped face videos and speaker meta-data. There is no overlap between the two versions. + +For more details, refer to http://www.robots.ox.ac.uk/~vgg/data/voxceleb/ diff --git a/dataset/voxceleb/voxceleb1.py b/dataset/voxceleb/voxceleb1.py new file mode 100644 index 00000000..ce744751 --- /dev/null +++ b/dataset/voxceleb/voxceleb1.py @@ -0,0 +1,188 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Prepare VoxCeleb1 dataset
+
+create manifest files.
+Manifest file is a json-format file with each line containing the
+meta data (i.e. audio filepath, transcript and audio duration)
+of each audio file in the data set.
+
+researchers should download the voxceleb1 dataset yourselves
+through google form to get the username & password and unpack the data
+"""
+import argparse
+import codecs
+import glob
+import json
+import os
+import shlex
+import subprocess
+from pathlib import Path
+
+import soundfile
+
+from utils.utility import check_md5sum
+from utils.utility import download
+from utils.utility import unzip
+
+# all the data will be download in the current data/voxceleb directory default
+DATA_HOME = os.path.expanduser('.')
+
+# if you use the http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/ as the download base url
+# you need to get the username & password via the google form
+
+# if you use the https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a as the download base url,
+# you need use --no-check-certificate to connect the target download url
+
+BASE_URL = "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a"
+
+# dev data
+DEV_LIST = {
+    "vox1_dev_wav_partaa": "e395d020928bc15670b570a21695ed96",
+    "vox1_dev_wav_partab": "bbfaaccefab65d82b21903e81a8a8020",
+    "vox1_dev_wav_partac": "017d579a2a96a077f40042ec33e51512",
+    "vox1_dev_wav_partad": "7bb1e9f70fddc7a678fa998ea8b3ba19",
+}
+DEV_TARGET_DATA = "vox1_dev_wav_parta* vox1_dev_wav.zip ae63e55b951748cc486645f532ba230b"
+
+# test data
+TEST_LIST = {"vox1_test_wav.zip": "185fdc63c3c739954633d50379a3d102"}
+TEST_TARGET_DATA = "vox1_test_wav.zip vox1_test_wav.zip 185fdc63c3c739954633d50379a3d102"
+
+# kaldi trial
+# this trial file is organized by kaldi according the official file,
+# which is a little different with the official trial veri_test2.txt
+KALDI_BASE_URL = "http://www.openslr.org/resources/49/"
+TRIAL_LIST = {"voxceleb1_test_v2.txt": "29fc7cc1c5d59f0816dc15d6e8be60f7"}
+TRIAL_TARGET_DATA = "voxceleb1_test_v2.txt voxceleb1_test_v2.txt 29fc7cc1c5d59f0816dc15d6e8be60f7"
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    "--target_dir",
+    default=DATA_HOME + "/voxceleb1/",
+    type=str,
+    help="Directory to save the voxceleb1 dataset. (default: %(default)s)")
+parser.add_argument(
+    "--manifest_prefix",
+    default="manifest",
+    type=str,
+    help="Filepath prefix for output manifests. (default: %(default)s)")
+
+args = parser.parse_args()
+
+
+def create_manifest(data_dir, manifest_path_prefix):
+    """Write a JSON-lines manifest and a ``.meta`` summary for one data split.
+
+    Every ``*.wav`` found recursively below ``<data_dir>/wav`` becomes one
+    JSON line with the keys ``utt``, ``utt2spk``, ``feat``, ``feat_shape``
+    and ``text``. The utterance id joins the last three path components with
+    ``-``; the first of those three components is used as the speaker id.
+
+    The manifest is written to ``<manifest_path_prefix>.<name of data_dir>``
+    and the statistics to ``voxceleb1.<name of data_dir>.meta`` in the same
+    directory as the manifest.
+
+    Args:
+        data_dir: Directory that contains the unpacked ``wav`` folder.
+        manifest_path_prefix: Path prefix of the manifest file to create.
+    """
+    print("Creating manifest %s ..." % manifest_path_prefix)
+    json_lines = []
+    data_path = os.path.join(data_dir, "wav", "**", "*.wav")
+    total_sec = 0.0
+    total_text = 0.0
+    total_num = 0
+    speakers = set()
+    for audio_path in glob.glob(data_path, recursive=True):
+        # Split with pathlib rather than on a literal "/" so the id does not
+        # depend on the platform's path separator.
+        path_parts = Path(audio_path).parts
+        audio_id = "-".join(path_parts[-3:])
+        utt2spk = path_parts[-3]
+        duration = soundfile.info(audio_path).duration
+        text = ""
+        json_lines.append(
+            json.dumps(
+                {
+                    "utt": audio_id,
+                    "utt2spk": str(utt2spk),
+                    "feat": audio_path,
+                    "feat_shape": (duration, ),
+                    "text": text  # compatible with asr data format
+                },
+                ensure_ascii=False))
+
+        total_sec += duration
+        total_text += len(text)
+        total_num += 1
+        speakers.add(utt2spk)
+
+    # data_dir_name refer to dev or test
+    # voxceleb1 is given explicit in the path
+    data_dir_name = Path(data_dir).name
+    manifest_path_prefix = manifest_path_prefix + "." + data_dir_name
+    with codecs.open(manifest_path_prefix, 'w', encoding='utf-8') as f:
+        for line in json_lines:
+            f.write(line + "\n")
+
+    # An empty split (no wav files found) must not crash the summary with a
+    # ZeroDivisionError; report the rates as 0.0 instead.
+    text_per_sec = total_text / total_sec if total_sec else 0.0
+    sec_per_utt = total_sec / total_num if total_num else 0.0
+
+    manifest_dir = os.path.dirname(manifest_path_prefix)
+    meta_path = os.path.join(manifest_dir, "voxceleb1." +
+                             data_dir_name) + ".meta"
+    with codecs.open(meta_path, 'w', encoding='utf-8') as f:
+        print(f"{total_num} utts", file=f)
+        print(f"{len(speakers)} speakers", file=f)
+        print(f"{total_sec / (60 * 60)} h", file=f)
+        print(f"{total_text} text", file=f)
+        print(f"{text_per_sec} text/sec", file=f)
+        print(f"{sec_per_utt} sec/utt", file=f)
+
+
+def prepare_dataset(base_url, data_list, target_dir, manifest_path,
+                    target_data):
+    """Download, assemble, verify and unpack one split, then build its manifest.
+
+    When ``<target_dir>/wav`` already exists the download/unpack steps are
+    skipped and only the manifest is (re)created.
+
+    Args:
+        base_url: URL prefix every archive part name is appended to.
+        data_list: Mapping of archive part file name to its md5 checksum.
+        target_dir: Directory the parts are downloaded and unpacked into.
+        manifest_path: Manifest path prefix passed on to ``create_manifest``.
+        target_data: Whitespace-separated triple
+            ``"<part glob> <target file name> <target md5>"``.
+
+    Raises:
+        RuntimeError: If the assembled target file fails its md5 check.
+        subprocess.CalledProcessError: If concatenating the parts fails.
+    """
+    # target_dir is nested (e.g. ./voxceleb1/dev), so the parent directories
+    # may be missing as well; os.mkdir would fail in that case.
+    os.makedirs(target_dir, exist_ok=True)
+
+    # wav directory already exists, it need do nothing
+    if not os.path.exists(os.path.join(target_dir, "wav")):
+        # download all dataset part
+        for zip_part, part_md5sum in data_list.items():
+            # NOTE(review): the option is prepended to the URL, which
+            # presumably relies on download() splicing `url` into a wget
+            # command line -- confirm against utils.utility.download.
+            download_url = " --no-check-certificate " + base_url + "/" + zip_part
+            download(
+                url=download_url, md5sum=part_md5sum, target_dir=target_dir)
+
+        # pack the all part to target zip file
+        all_target_part, target_name, target_md5sum = target_data.split()
+        target_name = os.path.join(target_dir, target_name)
+        if not os.path.exists(target_name):
+            # The directory and target are quoted so paths with spaces work;
+            # the part pattern stays unquoted so the shell expands its glob.
+            pack_part_cmd = "cat {}/{} > {}".format(
+                shlex.quote(target_dir), all_target_part,
+                shlex.quote(target_name))
+            try:
+                subprocess.check_call(pack_part_cmd, shell=True)
+            except subprocess.CalledProcessError:
+                # Do not leave a truncated archive behind: the next run
+                # would take it for a finished one and skip the packing.
+                if os.path.exists(target_name):
+                    os.remove(target_name)
+                raise
+
+        # check the target zip file md5sum
+        if not check_md5sum(target_name, target_md5sum):
+            raise RuntimeError("{} MD5 checksum failed".format(target_name))
+        else:
+            print("Check {} md5sum successfully".format(target_name))
+
+        # unzip the all zip file
+        if target_name.endswith(".zip"):
+            unzip(target_name, target_dir)
+
+    # create the manifest file
+    create_manifest(data_dir=target_dir, manifest_path_prefix=manifest_path)
+
+
+def main():
+    """Prepare the ``dev`` and ``test`` splits below ``args.target_dir``."""
+    if args.target_dir.startswith('~'):
+        args.target_dir = os.path.expanduser(args.target_dir)
+
+    prepare_dataset(
+        base_url=BASE_URL,
+        data_list=DEV_LIST,
+        target_dir=os.path.join(args.target_dir, "dev"),
+        manifest_path=args.manifest_prefix,
+        target_data=DEV_TARGET_DATA)
+
+    prepare_dataset(
+        base_url=BASE_URL,
+        data_list=TEST_LIST,
+        target_dir=os.path.join(args.target_dir, "test"),
+        manifest_path=args.manifest_prefix,
+        target_data=TEST_TARGET_DATA)
+
+    print("Manifest prepare done!")
+
+
+if __name__ == '__main__':
+    main()
diff --git a/docs/images/arch/PaddleSpeech_Server_architecture_diagram.png b/docs/images/arch/PaddleSpeech_Server_architecture_diagram.png new file mode 100644 index 00000000..16f8ddcc Binary files /dev/null and b/docs/images/arch/PaddleSpeech_Server_architecture_diagram.png differ diff --git a/docs/images/arch/PaddleSpeech_Server_class_diagram.png b/docs/images/arch/PaddleSpeech_Server_class_diagram.png new file mode 100644 index 00000000..0c3daddd Binary files /dev/null and b/docs/images/arch/PaddleSpeech_Server_class_diagram.png differ diff --git a/docs/images/arch/paddlespeech_high_layout.jpg b/docs/images/arch/paddlespeech_high_layout.jpg new file mode 100644 index 00000000..f726aa2e Binary files /dev/null and b/docs/images/arch/paddlespeech_high_layout.jpg differ diff --git a/docs/source/asr/augmentation.md b/docs/source/asr/augmentation.md deleted file mode 100644 index 8e65cb19..00000000 --- a/docs/source/asr/augmentation.md +++ /dev/null @@ -1,40 +0,0 @@ -# Data Augmentation Pipeline - -Data augmentation has often been a highly effective technique to boost deep learning performance. We augment our speech data by synthesizing new audios with small random perturbation (label-invariant transformation) added upon raw audios. You don't have to do the syntheses on your own, as it is already embedded into the data provider and is done on the fly, randomly for each epoch during training. - -Six optional augmentation components are provided to be selected, configured, and inserted into the processing pipeline. 
- -* Audio - - Volume Perturbation - - Speed Perturbation - - Shifting Perturbation - - Online Bayesian normalization - - Noise Perturbation (need background noise audio files) - - Impulse Response (need impulse audio files) - -* Feature - - SpecAugment - - Adaptive SpecAugment - -To inform the trainer of what augmentation components are needed and what their processing orders are, it is required to prepare in advance an *augmentation configuration file* in [JSON](http://www.json.org/) format. For example: - -``` -[{ - "type": "speed", - "params": {"min_speed_rate": 0.95, - "max_speed_rate": 1.05}, - "prob": 0.6 -}, -{ - "type": "shift", - "params": {"min_shift_ms": -5, - "max_shift_ms": 5}, - "prob": 0.8 -}] -``` - -When the `augment_conf_file` argument is set to the path of the above example configuration file, every audio clip in every epoch will be processed: with 60% of chance, it will first be speed perturbed with a uniformly random sampled speed-rate between 0.95 and 1.05, and then with 80% of chance it will be shifted in time with a randomly sampled offset between -5 ms and 5 ms. Finally, this newly synthesized audio clip will be fed into the feature extractor for further training. - -For other configuration examples, please refer to `examples/conf/augmentation.example.json`. - -Be careful when utilizing the data augmentation technique, as improper augmentation will harm the training, due to the enlarged train-test gap. diff --git a/docs/source/asr/models_introduction.md b/docs/source/asr/models_introduction.md index d82e12c0..56d58197 100644 --- a/docs/source/asr/models_introduction.md +++ b/docs/source/asr/models_introduction.md @@ -38,7 +38,7 @@ vi examples/librispeech/s0/data/vocab.txt ``` #### CMVN -For CMVN, a subset of the full of the training set is selected and be used to compute the feature mean and std. +For CMVN, a subset of or full of the training set is selected and be used to compute the feature mean and std. 
``` # The code to compute the feature mean and std cd examples/aishell/s0 diff --git a/docs/source/demo_video.rst b/docs/source/demo_video.rst new file mode 100644 index 00000000..dc7e718a --- /dev/null +++ b/docs/source/demo_video.rst @@ -0,0 +1,13 @@ +Demo Video +================== + +.. raw:: html + + + diff --git a/docs/source/index.rst b/docs/source/index.rst index 5bbc9319..7f9c87bd 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -27,7 +27,6 @@ Contents asr/models_introduction asr/data_preparation - asr/augmentation asr/feature_list asr/ngram_lm @@ -42,6 +41,7 @@ Contents tts/gan_vocoder tts/demo tts/demo_2 + .. toctree:: :maxdepth: 1 @@ -51,12 +51,14 @@ Contents .. toctree:: :maxdepth: 1 - :caption: Acknowledgement - - asr/reference - - + :caption: Demos + demo_video + tts_demo_video +.. toctree:: + :maxdepth: 1 + :caption: Acknowledgement + asr/reference diff --git a/docs/source/released_model.md b/docs/source/released_model.md index 3310bfb2..8f855f7c 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -1,3 +1,4 @@ + # Released Models ## Speech-to-Text Models @@ -9,9 +10,10 @@ Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | [Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.064 |-| 151 h | [Ds2 Offline Aishell ASR0](../../examples/aishell/asr0) [Conformer Offline Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_0.1.1.model.tar.gz) | Aishell Dataset | Char-based | 284 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.056 |-| 151 h | [Conformer Offline Aishell ASR1](../../examples/aishell/asr1) [Transformer Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_transformer_aishell_ckpt_0.1.1.model.tar.gz) | 
Aishell Dataset | Char-based | 128 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0523 || 151 h | [Transformer Aishell ASR1](../../examples/aishell/asr1) -[Conformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_conformer_librispeech_ckpt_0.1.1.model.tar.gz) | Librispeech Dataset | subword-based | 191 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0337 | 960 h | [Conformer Librispeech ASR1](../../example/librispeech/asr1) -[Transformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_transformer_librispeech_ckpt_0.1.1.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0381 | 960 h | [Transformer Librispeech ASR1](../../example/librispeech/asr1) -[Transformer Librispeech ASR2 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr2/asr2_transformer_librispeech_ckpt_0.1.1.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: JoinCTC w/ LM |-| 0.0240 | 960 h | [Transformer Librispeech ASR2](../../example/librispeech/asr2) +[Ds2 Offline Librispeech ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr0/asr0_deepspeech2_librispeech_ckpt_0.1.1.model.tar.gz)| Librispeech Dataset | Char-based | 518 MB | 2 Conv + 3 bidirectional LSTM layers| - |0.0725| 960 h | [Ds2 Offline Librispeech ASR0](../../examples/librispeech/asr0) +[Conformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_conformer_librispeech_ckpt_0.1.1.model.tar.gz) | Librispeech Dataset | subword-based | 191 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0337 | 960 h | [Conformer Librispeech ASR1](../../examples/librispeech/asr1) +[Transformer Librispeech ASR1 
Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_transformer_librispeech_ckpt_0.1.1.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0381 | 960 h | [Transformer Librispeech ASR1](../../examples/librispeech/asr1) +[Transformer Librispeech ASR2 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr2/asr2_transformer_librispeech_ckpt_0.1.1.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: JoinCTC w/ LM |-| 0.0240 | 960 h | [Transformer Librispeech ASR2](../../examples/librispeech/asr2) ### Language Model based on NGram Language Model | Training Data | Token-based | Size | Descriptions @@ -31,14 +33,15 @@ Language Model | Training Data | Token-based | Size | Descriptions ### Acoustic Models Model Type | Dataset| Example Link | Pretrained Models|Static Models|Size (static) :-------------:| :------------:| :-----: | :-----:| :-----:| :-----: -Tacotron2|LJSpeech|[tacotron2-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.3.zip)||| +Tacotron2|LJSpeech|[tacotron2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.2.0.zip)||| +Tacotron2|CSMSC|[tacotron2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts0)|[tacotron2_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_ckpt_0.2.0.zip)|[tacotron2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_static_0.2.0.zip)|103MB| TransformerTTS| LJSpeech| 
[transformer-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts1)|[transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/transformer_tts/transformer_tts_ljspeech_ckpt_0.4.zip)||| SpeedySpeech| CSMSC | [speedyspeech-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts2) |[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_ckpt_0.5.zip)|[speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip)|12MB| FastSpeech2| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip)|[fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip)|157MB| FastSpeech2-Conformer| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_conformer_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_conformer_baker_ckpt_0.5.zip)||| FastSpeech2| AISHELL-3 |[fastspeech2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3)|[fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip)||| FastSpeech2| LJSpeech |[fastspeech2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts3)|[fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)||| 
-FastSpeech2| VCTK |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip)||| +FastSpeech2| VCTK |[fastspeech2-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip)||| ### Vocoders Model Type | Dataset| Example Link | Pretrained Models| Static Models|Size (static) @@ -51,12 +54,14 @@ Parallel WaveGAN| VCTK |[PWGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeec |Multi Band MelGAN | CSMSC |[MB MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | [mb_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip)
[mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip)|[mb_melgan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip) |8.2MB| Style MelGAN | CSMSC |[Style MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc4)|[style_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip)| | | HiFiGAN | CSMSC |[HiFiGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc5)|[hifigan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip)|[hifigan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip)|50MB| +WaveRNN | CSMSC |[WaveRNN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc6)|[wavernn_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_ckpt_0.2.0.zip)|[wavernn_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_static_0.2.0.zip)|18MB| + ### Voice Cloning Model Type | Dataset| Example Link | Pretrained Models :-------------:| :------------:| :-----: | :-----: GE2E| AISHELL-3, etc. 
|[ge2e](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/ge2e)|[ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ge2e/ge2e_ckpt_0.3.zip) -GE2E + Tactron2| AISHELL-3 |[ge2e-tactron2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/vc0)|[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_aishell3_ckpt_0.3.zip) +GE2E + Tactron2| AISHELL-3 |[ge2e-tactron2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/vc0)|[tacotron2_aishell3_ckpt_vc0_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_aishell3_ckpt_vc0_0.2.0.zip) GE2E + FastSpeech2 | AISHELL-3 |[ge2e-fastspeech2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/vc1)|[fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip) @@ -65,7 +70,7 @@ GE2E + FastSpeech2 | AISHELL-3 |[ge2e-fastspeech2-aishell3](https://github.com/ Model Type | Dataset| Example Link | Pretrained Models :-------------:| :------------:| :-----: | :-----: PANN | Audioset| [audioset_tagging_cnn](https://github.com/qiuqiangkong/audioset_tagging_cnn) | [panns_cnn6.pdparams](https://bj.bcebos.com/paddleaudio/models/panns_cnn6.pdparams), [panns_cnn10.pdparams](https://bj.bcebos.com/paddleaudio/models/panns_cnn10.pdparams), [panns_cnn14.pdparams](https://bj.bcebos.com/paddleaudio/models/panns_cnn14.pdparams) -PANN | ESC-50 |[pann-esc50]("./examples/esc50/cls0")|[esc50_cnn6.tar.gz](https://paddlespeech.bj.bcebos.com/cls/esc50/esc50_cnn6.tar.gz), [esc50_cnn10.tar.gz](https://paddlespeech.bj.bcebos.com/cls/esc50/esc50_cnn10.tar.gz), [esc50_cnn14.tar.gz](https://paddlespeech.bj.bcebos.com/cls/esc50/esc50_cnn14.tar.gz) +PANN | ESC-50 
|[pann-esc50](../../examples/esc50/cls0)|[esc50_cnn6.tar.gz](https://paddlespeech.bj.bcebos.com/cls/esc50/esc50_cnn6.tar.gz), [esc50_cnn10.tar.gz](https://paddlespeech.bj.bcebos.com/cls/esc50/esc50_cnn10.tar.gz), [esc50_cnn14.tar.gz](https://paddlespeech.bj.bcebos.com/cls/esc50/esc50_cnn14.tar.gz) ## Punctuation Restoration Models Model Type | Dataset| Example Link | Pretrained Models diff --git a/docs/source/tts/README.md b/docs/source/tts/README.md index 3de8901b..835db08e 100644 --- a/docs/source/tts/README.md +++ b/docs/source/tts/README.md @@ -71,7 +71,3 @@ Check our [website](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html) #### GE2E 1. [ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip) - -## License - -Parakeet is provided under the [Apache-2.0 license](LICENSE). diff --git a/docs/source/tts/quick_start.md b/docs/source/tts/quick_start.md index 3180d80a..bddee778 100644 --- a/docs/source/tts/quick_start.md +++ b/docs/source/tts/quick_start.md @@ -1,3 +1,4 @@ +([简体中文](./quick_start_cn.md)|English) # Quick Start of Text-to-Speech The examples in PaddleSpeech are mainly classified by datasets, the TTS datasets we mainly used are: * CSMCS (Mandarin single speaker) diff --git a/docs/source/tts/quick_start_cn.md b/docs/source/tts/quick_start_cn.md new file mode 100644 index 00000000..37246e84 --- /dev/null +++ b/docs/source/tts/quick_start_cn.md @@ -0,0 +1,205 @@ +(简体中文|[English](./quick_start.md)) +# 语音合成快速开始 +这些PaddleSpeech中的样例主要按数据集分类,我们主要使用的TTS数据集有: + +* CSMSC (普通话单发音人) +* AISHELL3 (普通话多发音人) +* LJSpeech (英文单发音人) +* VCTK (英文多发音人) + +PaddleSpeech 的 TTS 模型具有以下映射关系: + +* tts0 - Tacotron2 +* tts1 - TransformerTTS +* tts2 - SpeedySpeech +* tts3 - FastSpeech2 +* voc0 - WaveFlow +* voc1 - Parallel WaveGAN +* voc2 - MelGAN +* voc3 - MultiBand MelGAN +* voc4 - Style MelGAN +* voc5 - HiFiGAN +* vc0 - Tacotron2 Voice Clone with GE2E +* vc1 - FastSpeech2 Voice Clone with GE2E + +## 快速开始 + +让我们以 FastSpeech2 + Parallel WaveGAN 和
CSMSC 数据集 为例. [examples/csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc) + +### 用 CSMSC 数据集训练 Parallel WaveGAN + +- 进入目录 + ```bash + cd examples/csmsc/voc1 + ``` +- 设置环境变量 + ```bash + source path.sh + ``` + **在你开始做任何事情之前,必须先做这步** + 将 `MAIN_ROOT` 设置为项目目录. 使用 `parallelwave_gan` 模型作为 `MODEL`. + +- 运行 + ```bash + bash run.sh + ``` + 这只是一个演示,请确保源数据已经准备好,并且在下一个 `step` 之前每个 `step` 都运行正常. +### 用CSMSC数据集训练FastSpeech2 + +- 进入目录 + ```bash + cd examples/csmsc/tts3 + ``` + +- 设置环境变量 + ```bash + source path.sh + ``` + **在你开始做任何事情之前,必须先做这步** + 将 `MAIN_ROOT` 设置为项目目录. 使用 `fastspeech2` 模型作为 `MODEL` 。 + +- 运行 + ```bash + bash run.sh + ``` + 这只是一个演示,请确保源数据已经准备好,并且在下一个 `step` 之前每个 `step` 都运行正常。 + +`run.sh` 中主要包括以下步骤: + +- 设置路径。 +- 预处理数据集。 +- 训练模型。 +- 从 `metadata.jsonl` 中合成波形 +- 从文本文件合成波形。(在声学模型中) +- 使用静态模型进行推理。(可选) + +有关更多详细信息,请参见 examples 中的 `README.md` + +## TTS 流水线 +本节介绍如何使用 TTS 提供的预训练模型,并对其进行推理。 + +TTS中的预训练模型在压缩包中提供。将其解压缩以获得如下文件夹: +**Acoustic Models:** + +```text +checkpoint_name +├── default.yaml +├── snapshot_iter_*.pdz +├── speech_stats.npy +├── phone_id_map.txt +├── spk_id_map.txt (optional) +└── tone_id_map.txt (optional) +``` +**Vocoders:** +```text +checkpoint_name +├── default.yaml +├── snapshot_iter_*.pdz +└── stats.npy +``` +- `default.yaml` 存储用于训练模型的配置。 +- `snapshot_iter_*.pdz` 是检查点文件,其中`*`是它经过训练的步骤。 +- `*_stats.npy` 是特征的统计文件,如果它在训练前已被标准化。 +- `phone_id_map.txt` 是音素到音素 ID 的映射关系。 +- `tone_id_map.txt` 是在训练声学模型之前分割音调和拼音时,音调到音调 ID 的映射关系。(例如在 csmsc/speedyspeech 的示例中) +- `spk_id_map.txt` 是多发音人声学模型中 "发音人" 到 "spk_ids" 的映射关系。 + +下面的示例代码显示了如何使用模型进行预测。 +### Acoustic Models 声学模型(文本到频谱图) +下面的代码显示了如何使用 `FastSpeech2` 模型。加载预训练模型后,使用它和 normalizer 对象构建预测对象,然后使用 `fastspeech2_inference(phone_ids)` 生成频谱图,频谱图可进一步用于使用声码器合成原始音频。 + +```python +from pathlib import Path +import numpy as np +import paddle +import yaml +from yacs.config import CfgNode +from paddlespeech.t2s.models.fastspeech2 import FastSpeech2 +from paddlespeech.t2s.models.fastspeech2 import
FastSpeech2Inference +from paddlespeech.t2s.modules.normalizer import ZScore +# examples/fastspeech2/baker/frontend.py +from frontend import Frontend + +# 加载预训练模型 +checkpoint_dir = Path("fastspeech2_nosil_baker_ckpt_0.4") +with open(checkpoint_dir / "phone_id_map.txt", "r") as f: + phn_id = [line.strip().split() for line in f.readlines()] +vocab_size = len(phn_id) +with open(checkpoint_dir / "default.yaml") as f: + fastspeech2_config = CfgNode(yaml.safe_load(f)) +odim = fastspeech2_config.n_mels +model = FastSpeech2( + idim=vocab_size, odim=odim, **fastspeech2_config["model"]) +model.set_state_dict( + paddle.load(args.fastspeech2_checkpoint)["main_params"]) +model.eval() + +# 加载特征文件 +stat = np.load(checkpoint_dir / "speech_stats.npy") +mu, std = stat +mu = paddle.to_tensor(mu) +std = paddle.to_tensor(std) +fastspeech2_normalizer = ZScore(mu, std) + +# 构建预测对象 +fastspeech2_inference = FastSpeech2Inference(fastspeech2_normalizer, model) + +# load Chinese Frontend +frontend = Frontend(checkpoint_dir / "phone_id_map.txt") + +# 构建一个中文前端 +sentence = "你好吗?" 
+input_ids = frontend.get_input_ids(sentence, merge_sentences=True) +phone_ids = input_ids["phone_ids"] +flags = 0 +# 构建预测对象加载中文前端,对中文文本前端的输出进行分段 +for part_phone_ids in phone_ids: + with paddle.no_grad(): + temp_mel = fastspeech2_inference(part_phone_ids) + if flags == 0: + mel = temp_mel + flags = 1 + else: + mel = paddle.concat([mel, temp_mel]) +``` + +### Vocoder 声码器(谱图到波形) +下面的代码显示了如何使用 `Parallel WaveGAN` 模型。像上面的例子一样,加载预训练模型后,使用它和 normalizer 对象构建预测对象,然后使用 `pwg_inference(mel)` 生成原始音频( wav 格式)。 + +```python +from pathlib import Path +import numpy as np +import paddle +import soundfile as sf +import yaml +from yacs.config import CfgNode +from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator +from paddlespeech.t2s.models.parallel_wavegan import PWGInference +from paddlespeech.t2s.modules.normalizer import ZScore + +# 加载预训练模型 +checkpoint_dir = Path("parallel_wavegan_baker_ckpt_0.4") +with open(checkpoint_dir / "pwg_default.yaml") as f: + pwg_config = CfgNode(yaml.safe_load(f)) +vocoder = PWGGenerator(**pwg_config["generator_params"]) +vocoder.set_state_dict(paddle.load(args.pwg_params)) +vocoder.remove_weight_norm() +vocoder.eval() + +# 加载特征文件 +stat = np.load(checkpoint_dir / "pwg_stats.npy") +mu, std = stat +mu = paddle.to_tensor(mu) +std = paddle.to_tensor(std) +pwg_normalizer = ZScore(mu, std) + +# 加载预训练模型构造预测对象 +pwg_inference = PWGInference(pwg_normalizer, vocoder) + +# 频谱图到波形 +wav = pwg_inference(mel) +sf.write( + audio_path, + wav.numpy(), + samplerate=fastspeech2_config.fs) +``` diff --git a/docs/source/tts/tts_datasets.md b/docs/source/tts/tts_datasets.md new file mode 100644 index 00000000..a79981df --- /dev/null +++ b/docs/source/tts/tts_datasets.md @@ -0,0 +1,75 @@ +# TTS Datasets + +## Mandarin +- [CSMSC](https://www.data-baker.com/open_source.html): Chinese Standard Mandarin Speech Corpus + - Duration/h: 12 + - Number of Sentences: 10,000 + - Size: 2.14GB + - Speaker: 1 female, ages 20 ~30 + - Sample Rate: 48 kHz、16bit + - Mean Words per Clip:
16 +- [AISHELL-3](http://www.aishelltech.com/aishell_3) + - Duration/h: 85 + - Number of Sentences: 88,035 + - Size: 17.75GB + - Speaker: 218 + - Sample Rate: 44.1 kHz、16bit + +## English +- [LJSpeech](https://keithito.com/LJ-Speech-Dataset/) + - Duration/h: 24 + - Number of Sentences: 13,100 + - Size: 2.56GB + - Speaker: 1, age 20 ~30 + - Sample Rate: 22050 Hz、16bit + - Mean Words per Clip: 17.23 +- [VCTK](https://datashare.ed.ac.uk/handle/10283/3443) + - Number of Sentences: 44,583 + - Size: 10.94GB + - Speaker: 110 + - Sample Rate: 48 kHz、16bit + - Mean Words per Clip: 17.23 + +## Japanese + + +- [tri-jek](https://sites.google.com/site/shinnosuketakamichi/research-topics/tri-jek_corpus): Japanese-English-Korean tri-lingual corpus +- [JSSS-misc](https://sites.google.com/site/shinnosuketakamichi/research-topics/jsss-misc_corpus): misc tasks of JSSS corpus +- [JTubeSpeech](https://github.com/sarulab-speech/jtubespeech): Corpus of Japanese speech collected from YouTube +- [J-MAC](https://sites.google.com/site/shinnosuketakamichi/research-topics/j-mac_corpus): Japanese multi-speaker audiobook corpus +- [J-KAC](https://sites.google.com/site/shinnosuketakamichi/research-topics/j-kac_corpus): Japanese Kamishibai and audiobook corpus +- [JMD](https://sites.google.com/site/shinnosuketakamichi/research-topics/jmd_corpus): Japanese multi-dialect corpus +- [JSSS](https://sites.google.com/site/shinnosuketakamichi/research-topics/jsss_corpus): Japanese multi-style (summarization and simplification) corpus +- [RWCP-SSD-Onomatopoeia](https://www.ksuke.net/dataset/rwcp-ssd-onomatopoeia): onomatopoeic word dataset for environmental sounds +- [Life-m](https://sites.google.com/site/shinnosuketakamichi/research-topics/life-m_corpus): landmark image-themed music corpus +- [PJS](https://sites.google.com/site/shinnosuketakamichi/research-topics/pjs_corpus): Phoneme-balanced Japanese singing voice corpus +- 
[JVS-MuSiC](https://sites.google.com/site/shinnosuketakamichi/research-topics/jvs_music): Japanese multi-speaker singing-voice corpus +- [JVS](https://sites.google.com/site/shinnosuketakamichi/research-topics/jvs_corpus): Japanese multi-speaker voice corpus +- [JSUT-book](https://sites.google.com/site/shinnosuketakamichi/publication/jsut-book): audiobook corpus by a single Japanese speaker +- [JSUT-vi](https://sites.google.com/site/shinnosuketakamichi/publication/jsut-vi): vocal imitation corpus by a single Japanese speaker +- [JSUT-song](https://sites.google.com/site/shinnosuketakamichi/publication/jsut-song): singing voice corpus by a single Japanese singer +- [JSUT](https://sites.google.com/site/shinnosuketakamichi/publication/jsut): a large-scaled corpus of reading-style Japanese speech by a single speaker + +## Emotions +### English +- [CREMA-D](https://github.com/CheyneyComputerScience/CREMA-D) +- [Seen and Unseen emotional style transfer for voice conversion with a new emotional speech dataset](https://kunzhou9646.github.io/controllable-evc/) + - paper : [Seen and Unseen emotional style transfer for voice conversion with a new emotional speech dataset](https://arxiv.org/abs/2010.14794) +### Mandarin +- [EMOVIE Dataset](https://viem-ccy.github.io/EMOVIE/dataset_release ) + - paper: [EMOVIE: A Mandarin Emotion Speech Dataset with a Simple Emotional Text-to-Speech Model](https://arxiv.org/abs/2106.09317) +- MASC + - paper: [MASC: A Speech Corpus in Mandarin for Emotion Analysis and Affective Speaker Recognition](https://ieeexplore.ieee.org/document/4013501) +### English && Mandarin +- [Emotional Voice Conversion: Theory, Databases and ESD](https://github.com/HLTSingapore/Emotional-Speech-Data) + - paper: [Emotional Voice Conversion: Theory, Databases and ESD](https://arxiv.org/abs/2105.14762) + +## Music +- [GiantMIDI-Piano](https://github.com/bytedance/GiantMIDI-Piano) +- [MAESTRO Dataset](https://magenta.tensorflow.org/datasets/maestro) + - [tf 
code](https://www.tensorflow.org/tutorials/audio/music_generation) +- [Opencpop](https://wenet.org.cn/opencpop/) diff --git a/docs/source/tts_demo_video.rst b/docs/source/tts_demo_video.rst new file mode 100644 index 00000000..4f807165 --- /dev/null +++ b/docs/source/tts_demo_video.rst @@ -0,0 +1,12 @@ +TTS Demo Video +================== + +.. raw:: html + + + diff --git a/docs/tutorial/asr/tutorial_deepspeech2.ipynb b/docs/tutorial/asr/tutorial_deepspeech2.ipynb index 86790473..34c0090a 100644 --- a/docs/tutorial/asr/tutorial_deepspeech2.ipynb +++ b/docs/tutorial/asr/tutorial_deepspeech2.ipynb @@ -265,7 +265,7 @@ }, "outputs": [], "source": [ - "!pip install --upgrade pip && pip install paddlespeech" + "!pip install --upgrade pip && pip install paddlespeech==0.1.0" ] }, { diff --git a/docs/tutorial/asr/tutorial_transformer.ipynb b/docs/tutorial/asr/tutorial_transformer.ipynb index c9eb5ebb..dc303006 100644 --- a/docs/tutorial/asr/tutorial_transformer.ipynb +++ b/docs/tutorial/asr/tutorial_transformer.ipynb @@ -138,7 +138,7 @@ }, "outputs": [], "source": [ - "!pip install --upgrade pip && pip install paddlespeech" + "!pip install --upgrade pip && pip install paddlespeech==0.1.0" ] }, { diff --git a/examples/aishell/asr0/conf/tuning/chunk_decode.yaml b/examples/aishell/asr0/conf/tuning/chunk_decode.yaml index 9de06711..c07bc77e 100644 --- a/examples/aishell/asr0/conf/tuning/chunk_decode.yaml +++ b/examples/aishell/asr0/conf/tuning/chunk_decode.yaml @@ -1,10 +1,10 @@ -chunk_batch_size: 32 +decode_batch_size: 32 error_rate_type: cer decoding_method: ctc_beam_search lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm alpha: 2.2 #1.9 beta: 4.3 -beam_size: 300 +beam_size: 500 cutoff_prob: 0.99 cutoff_top_n: 40 num_proc_bsearch: 10 diff --git a/examples/aishell3/tts3/README.md b/examples/aishell3/tts3/README.md index 2538e8f9..281ad836 100644 --- a/examples/aishell3/tts3/README.md +++ b/examples/aishell3/tts3/README.md @@ -257,6 +257,7 @@ python3 
${BIN_DIR}/../synthesize_e2e.py \ --output_dir=exp/default/test_e2e \ --phones_dict=fastspeech2_nosil_aishell3_ckpt_0.4/phone_id_map.txt \ --speaker_dict=fastspeech2_nosil_aishell3_ckpt_0.4/speaker_id_map.txt \ - --spk_id=0 + --spk_id=0 \ + --inference_dir=exp/default/inference ``` diff --git a/examples/aishell3/tts3/conf/default.yaml b/examples/aishell3/tts3/conf/default.yaml index 3a57e902..ac495674 100644 --- a/examples/aishell3/tts3/conf/default.yaml +++ b/examples/aishell3/tts3/conf/default.yaml @@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis. n_mels: 80 # The number of mel basis. # Only used for the model using pitch features (e.g. FastSpeech2) -f0min: 80 # Maximum f0 for pitch extraction. -f0max: 400 # Minimum f0 for pitch extraction. +f0min: 80 # Minimum f0 for pitch extraction. +f0max: 400 # Maximum f0 for pitch extraction. ########################################################### @@ -64,14 +64,14 @@ model: pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch - stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder + stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder energy_predictor_layers: 2 # number of conv layers in energy predictor energy_predictor_chans: 256 # number of channels of conv layers in energy predictor energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor energy_predictor_dropout: 0.5 # dropout rate in energy predictor energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy - stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder + stop_gradient_from_energy_predictor: False # whether to 
stop the gradient from energy predictor to encoder spk_embed_dim: 256 # speaker embedding dimension spk_embed_integration_type: concat # speaker embedding integration type @@ -84,7 +84,6 @@ updater: use_masking: True # whether to apply masking for padded part in loss calculation - ########################################################### # OPTIMIZER SETTING # ########################################################### diff --git a/examples/aishell3/tts3/local/inference.sh b/examples/aishell3/tts3/local/inference.sh new file mode 100755 index 00000000..3b03b53c --- /dev/null +++ b/examples/aishell3/tts3/local/inference.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +train_output_path=$1 + +stage=0 +stop_stage=0 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + python3 ${BIN_DIR}/../inference.py \ + --inference_dir=${train_output_path}/inference \ + --am=fastspeech2_aishell3 \ + --voc=pwgan_aishell3 \ + --text=${BIN_DIR}/../sentences.txt \ + --output_dir=${train_output_path}/pd_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --speaker_dict=dump/speaker_id_map.txt \ + --spk_id=0 +fi + diff --git a/examples/aishell3/tts3/local/synthesize_e2e.sh b/examples/aishell3/tts3/local/synthesize_e2e.sh index d0d92585..60e1a5ce 100755 --- a/examples/aishell3/tts3/local/synthesize_e2e.sh +++ b/examples/aishell3/tts3/local/synthesize_e2e.sh @@ -20,4 +20,5 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \ --output_dir=${train_output_path}/test_e2e \ --phones_dict=dump/phone_id_map.txt \ --speaker_dict=dump/speaker_id_map.txt \ - --spk_id=0 + --spk_id=0 \ + --inference_dir=${train_output_path}/inference diff --git a/examples/aishell3/vc0/README.md b/examples/aishell3/vc0/README.md index 91d32619..664ec1ac 100644 --- a/examples/aishell3/vc0/README.md +++ b/examples/aishell3/vc0/README.md @@ -1,94 +1,140 @@ # Tacotron2 + AISHELL-3 Voice Cloning -This example contains code used to train a [Tacotron2 ](https://arxiv.org/abs/1712.05884) model with 
[AISHELL-3](http://www.aishelltech.com/aishell_3). The trained model can be used in Voice Cloning Task, We refer to the model structure of [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf). The general steps are as follows: -1. Speaker Encoder: We use Speaker Verification to train a speaker encoder. Datasets used in this task are different from those used in Tacotron2 because the transcriptions are not needed, we use more datasets, refer to [ge2e](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/ge2e). -2. Synthesizer: We use the trained speaker encoder to generate speaker embedding for each sentence in AISHELL-3. This embedding is an extra input of Tacotron2 which will be concated with encoder outputs. -3. Vocoder: We use WaveFlow as the neural Vocoder, refer to [waveflow](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0). +This example contains code used to train a [Tacotron2](https://arxiv.org/abs/1712.05884) model with [AISHELL-3](http://www.aishelltech.com/aishell_3). The trained model can be used in Voice Cloning Task, We refer to the model structure of [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf). The general steps are as follows: +1. Speaker Encoder: We use Speaker Verification to train a speaker encoder. Datasets used in this task are different from those used in `Tacotron2` because the transcriptions are not needed, we use more datasets, refer to [ge2e](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/ge2e). +2. Synthesizer: We use the trained speaker encoder to generate speaker embedding for each sentence in AISHELL-3. This embedding is an extra input of `Tacotron2` which will be concated with encoder outputs. +3. 
Vocoder: We use [Parallel Wave GAN](http://arxiv.org/abs/1910.11480) as the neural Vocoder, refer to [voc1](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1). + +## Dataset +### Download and Extract +Download AISHELL-3. +```bash +wget https://www.openslr.org/resources/93/data_aishell3.tgz +``` +Extract AISHELL-3. +```bash +mkdir data_aishell3 +tar zxvf data_aishell3.tgz -C data_aishell3 +``` +### Get MFA Result and Extract +We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get phonemes for Tacotron2, the durations of MFA are not needed here. +You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) (use MFA1.x now) of our repo. + +## Pretrained GE2E Model +We use pretrained GE2E model to generate speaker embedding for each sentence. + +Download pretrained GE2E model from here [ge2e_ckpt_0.3.zip](https://bj.bcebos.com/paddlespeech/Parakeet/released_models/ge2e/ge2e_ckpt_0.3.zip), and `unzip` it. ## Get Started Assume the path to the dataset is `~/datasets/data_aishell3`. -Assume the path to the MFA result of AISHELL-3 is `./alignment`. -Assume the path to the pretrained ge2e model is `ge2e_ckpt_path=./ge2e_ckpt_0.3/step-3000000` +Assume the path to the MFA result of AISHELL-3 is `./aishell3_alignment_tone`. +Assume the path to the pretrained ge2e model is `./ge2e_ckpt_0.3`. + Run the command below to 1. **source path**. 2. preprocess the dataset. 3. train the model. -4. start a voice cloning inference. +4. synthesize waveform from `metadata.jsonl`. +5. start a voice cloning inference. 
```bash ./run.sh ``` -You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage, for example, run the following command will only preprocess the dataset. +You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage, for example, running the following command will only preprocess the dataset. ```bash ./run.sh --stage 0 --stop-stage 0 ``` ### Data Preprocessing ```bash -CUDA_VISIBLE_DEVICES=${gpus} ./local/preprocess.sh ${input} ${preprocess_path} ${alignment} ${ge2e_ckpt_path} +CUDA_VISIBLE_DEVICES=${gpus} ./local/preprocess.sh ${conf_path} ${ge2e_ckpt_path} ``` -#### Generate Speaker Embedding - Use pretrained GE2E (speaker encoder) to generate speaker embedding for each sentence in AISHELL-3, which has the same file structure with wav files and the format is `.npy`. - -```bash -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - python3 ${BIN_DIR}/../ge2e/inference.py \ - --input=${input} \ - --output=${preprocess_path}/embed \ - --ngpu=1 \ - --checkpoint_path=${ge2e_ckpt_path} -fi +When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below. +```text +dump +├── dev +│ ├── norm +│ └── raw +├── embed +│ ├── SSB0005 +│ ├── SSB0009 +│ ├── ... +│ └── ... +├── phone_id_map.txt +├── speaker_id_map.txt +├── test +│ ├── norm +│ └── raw +└── train + ├── norm + ├── raw + └── speech_stats.npy ``` +The `embed` folder contains the generated speaker embedding for each sentence in AISHELL-3; it has the same file structure as the wav files, and the embeddings are stored in `.npy` format. Computing the utterance embeddings can take a long time. -#### Process Wav -There is silence in the edge of AISHELL-3's wavs, and the audio amplitude is very small, so, we need to remove the silence and normalize the audio.
You can the silence remove method based on volume or energy, but the effect is not very good, We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get the alignment of text and speech, then utilize the alignment results to remove the silence. -We use Montreal Force Aligner 1.0. The label in aishell3 includes pinyin,so the lexicon we provided to MFA is pinyin rather than Chinese characters. And the prosody marks(`$` and `%`) need to be removed. You should preprocess the dataset into the format which MFA needs, the texts have the same name with wavs and have the suffix `.lab`. +The dataset is split into 3 parts, namely `train`, `dev`, and` test`, each of which contains a `norm` and `raw` subfolder. The raw folder contains speech features of each utterance, while the norm folder contains normalized ones. The statistics used to normalize features are computed from the training set, which is located in `dump/train/*_stats.npy`. -We use [lexicon.txt](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/lexicon.txt) as the lexicon. - -You can download the alignment results from here [alignment_aishell3.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/alignment_aishell3.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) (use MFA1.x now) of our repo. +Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains phones, text_lengths, speech_lengths, durations, the path of speech features, speaker, and id of each utterance. +The preprocessing step is very similar to that one of [tts0](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts0), but there is one more `ge2e/inference` step here. +### Model Training +`./local/train.sh` calls `${BIN_DIR}/train.py`. ```bash -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - echo "Process wav ..." 
- python3 ${BIN_DIR}/process_wav.py \ - --input=${input}/wav \ - --output=${preprocess_path}/normalized_wav \ - --alignment=${alignment} -fi +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} ``` +The training step is very similar to that one of [tts0](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts0), but we should set `--voice-cloning=True` when calling `${BIN_DIR}/train.py`. -#### Preprocess Transcription -We revert the transcription into `phones` and `tones`. It is worth noting that our processing here is different from that used for MFA, we separated the tones. This is a processing method, of course, you can only segment initials and vowels. - +### Synthesizing +We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1) as the neural vocoder. +Download pretrained parallel wavegan model from [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip) and unzip it. ```bash -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - python3 ${BIN_DIR}/preprocess_transcription.py \ - --input=${input} \ - --output=${preprocess_path} -fi +unzip pwg_aishell3_ckpt_0.5.zip ``` -The default input is `~/datasets/data_aishell3/train`,which contains `label_train-set.txt`, the processed results are `metadata.yaml` and `metadata.pickle`. the former is a text format for easy viewing, and the latter is a binary format for direct reading. -#### Extract Mel -```python -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - python3 ${BIN_DIR}/extract_mel.py \ - --input=${preprocess_path}/normalized_wav \ - --output=${preprocess_path}/mel -fi +Parallel WaveGAN checkpoint contains files listed below. 
+```text +pwg_aishell3_ckpt_0.5 +├── default.yaml # default config used to train parallel wavegan +├── feats_stats.npy # statistics used to normalize spectrogram when training parallel wavegan +└── snapshot_iter_1000000.pdz # generator parameters of parallel wavegan ``` - -### Model Training +`./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`. ```bash -CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path} +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} ``` +The synthesizing step is very similar to that one of [tts0](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts0), but we should set `--voice-cloning=True` when calling `${BIN_DIR}/../synthesize.py`. -Our model removes stop token prediction in Tacotron2, because of the problem of the extremely unbalanced proportion of positive and negative samples of stop token prediction, and it's very sensitive to the clip of audio silence. We use the last symbol from the highest point of attention to the encoder side as the termination condition. - -In addition, to accelerate the convergence of the model, we add `guided attention loss` to induce the alignment between encoder and decoder to show diagonal lines faster. 
### Voice Cloning +Assume there are some reference audios in `./ref_audio` +```text +ref_audio +├── 001238.wav +├── LJ015-0254.wav +└── audio_self_test.mp3 +``` +`./local/voice_cloning.sh` calls `${BIN_DIR}/../voice_cloning.py` + ```bash -CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${ge2e_params_path} ${tacotron2_params_path} ${waveflow_params_path} ${vc_input} ${vc_output} +CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${ref_audio_dir} ``` + ## Pretrained Model -[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_aishell3_ckpt_0.3.zip). +[tacotron2_aishell3_ckpt_vc0_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_aishell3_ckpt_vc0_0.2.0.zip) + + +Model | Step | eval/loss | eval/l1_loss | eval/mse_loss | eval/bce_loss| eval/attn_loss +:-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------: +default| 2(gpu) x 37596|0.58704|0.39623|0.15073|0.039|1.9981e-04| + +Tacotron2 checkpoint contains files listed below. +(There is no need for `speaker_id_map.txt` here ) + +```text +tacotron2_aishell3_ckpt_vc0_0.2.0 +├── default.yaml # default config used to train tacotron2 +├── phone_id_map.txt # phone vocabulary file when training tacotron2 +├── snapshot_iter_37596.pdz # model parameters and optimizer states +└── speech_stats.npy # statistics used to normalize spectrogram when training tacotron2 +``` + +## More +We strongly recommend that you use [FastSpeech2 + AISHELL-3 Voice Cloning](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/vc1) which works better. 
diff --git a/examples/aishell3/vc0/conf/default.yaml b/examples/aishell3/vc0/conf/default.yaml new file mode 100644 index 00000000..26096eb2 --- /dev/null +++ b/examples/aishell3/vc0/conf/default.yaml @@ -0,0 +1,86 @@ +########################################################### +# FEATURE EXTRACTION SETTING # +########################################################### + +fs: 24000 # sr +n_fft: 2048 # FFT size (samples). +n_shift: 300 # Hop size (samples). 12.5ms +win_length: 1200 # Window length (samples). 50ms + # If set to null, it will be the same as fft_size. +window: "hann" # Window function. + +# Only used for feats_type != raw + +fmin: 80 # Minimum frequency of Mel basis. +fmax: 7600 # Maximum frequency of Mel basis. +n_mels: 80 # The number of mel basis. + +########################################################### +# DATA SETTING # +########################################################### +batch_size: 64 +num_workers: 2 + +########################################################### +# MODEL SETTING # +########################################################### +model: # keyword arguments for the selected model + embed_dim: 512 # char or phn embedding dimension + elayers: 1 # number of blstm layers in encoder + eunits: 512 # number of blstm units + econv_layers: 3 # number of convolutional layers in encoder + econv_chans: 512 # number of channels in convolutional layer + econv_filts: 5 # filter size of convolutional layer + atype: location # attention function type + adim: 512 # attention dimension + aconv_chans: 32 # number of channels in convolutional layer of attention + aconv_filts: 15 # filter size of convolutional layer of attention + cumulate_att_w: True # whether to cumulate attention weight + dlayers: 2 # number of lstm layers in decoder + dunits: 1024 # number of lstm units in decoder + prenet_layers: 2 # number of layers in prenet + prenet_units: 256 # number of units in prenet + postnet_layers: 5 # number of layers in postnet + 
postnet_chans: 512 # number of channels in postnet + postnet_filts: 5 # filter size of postnet layer + output_activation: null # activation function for the final output + use_batch_norm: True # whether to use batch normalization in encoder + use_concate: True # whether to concatenate encoder embedding with decoder outputs + use_residual: False # whether to use residual connection in encoder + dropout_rate: 0.5 # dropout rate + zoneout_rate: 0.1 # zoneout rate + reduction_factor: 1 # reduction factor + spk_embed_dim: 256 # speaker embedding dimension + spk_embed_integration_type: concat # how to integrate speaker embedding + + +########################################################### +# UPDATER SETTING # +########################################################### +updater: + use_masking: True # whether to apply masking for padded part in loss calculation + bce_pos_weight: 5.0 # weight of positive sample in binary cross entropy calculation + use_guided_attn_loss: True # whether to use guided attention loss + guided_attn_loss_sigma: 0.4 # sigma of guided attention loss + guided_attn_loss_lambda: 1.0 # strength of guided attention loss + + +########################################################## +# OPTIMIZER SETTING # +########################################################## +optimizer: + optim: adam # optimizer type + learning_rate: 1.0e-03 # learning rate + epsilon: 1.0e-06 # epsilon + weight_decay: 0.0 # weight decay coefficient + +########################################################### +# TRAINING SETTING # +########################################################### +max_epoch: 100 +num_snapshots: 5 + +########################################################### +# OTHER SETTING # +########################################################### +seed: 42 \ No newline at end of file diff --git a/examples/aishell3/vc0/local/preprocess.sh b/examples/aishell3/vc0/local/preprocess.sh index 5bf88066..069cf94c 100755 --- 
a/examples/aishell3/vc0/local/preprocess.sh +++ b/examples/aishell3/vc0/local/preprocess.sh @@ -1,36 +1,72 @@ #!/bin/bash -stage=0 +stage=0 stop_stage=100 -input=$1 -preprocess_path=$2 -alignment=$3 -ge2e_ckpt_path=$4 +config_path=$1 +ge2e_ckpt_path=$2 +# gen speaker embedding if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then python3 ${MAIN_ROOT}/paddlespeech/vector/exps/ge2e/inference.py \ - --input=${input}/wav \ - --output=${preprocess_path}/embed \ + --input=~/datasets/data_aishell3/train/wav/ \ + --output=dump/embed \ --checkpoint_path=${ge2e_ckpt_path} fi +# copy from tts3/preprocess if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - echo "Process wav ..." - python3 ${BIN_DIR}/process_wav.py \ - --input=${input}/wav \ - --output=${preprocess_path}/normalized_wav \ - --alignment=${alignment} + # get durations from MFA's result + echo "Generate durations.txt from MFA results ..." + python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ + --inputdir=./aishell3_alignment_tone \ + --output durations.txt \ + --config=${config_path} fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - python3 ${BIN_DIR}/preprocess_transcription.py \ - --input=${input} \ - --output=${preprocess_path} + # extract features + echo "Extract features ..." + python3 ${BIN_DIR}/preprocess.py \ + --dataset=aishell3 \ + --rootdir=~/datasets/data_aishell3/ \ + --dumpdir=dump \ + --dur-file=durations.txt \ + --config=${config_path} \ + --num-cpu=20 \ + --cut-sil=True \ + --spk_emb_dir=dump/embed fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - python3 ${BIN_DIR}/extract_mel.py \ - --input=${preprocess_path}/normalized_wav \ - --output=${preprocess_path}/mel + # get features' stats(mean and std) + echo "Get features' stats ..."
+ python3 ${MAIN_ROOT}/utils/compute_statistics.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --field-name="speech" +fi + +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + # normalize and convert phone to id, dev and test should use train's stats + echo "Normalize ..." + python3 ${BIN_DIR}/normalize.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --dumpdir=dump/train/norm \ + --speech-stats=dump/train/speech_stats.npy \ + --phones-dict=dump/phone_id_map.txt \ + --speaker-dict=dump/speaker_id_map.txt + + python3 ${BIN_DIR}/normalize.py \ + --metadata=dump/dev/raw/metadata.jsonl \ + --dumpdir=dump/dev/norm \ + --speech-stats=dump/train/speech_stats.npy \ + --phones-dict=dump/phone_id_map.txt \ + --speaker-dict=dump/speaker_id_map.txt + + python3 ${BIN_DIR}/normalize.py \ + --metadata=dump/test/raw/metadata.jsonl \ + --dumpdir=dump/test/norm \ + --speech-stats=dump/train/speech_stats.npy \ + --phones-dict=dump/phone_id_map.txt \ + --speaker-dict=dump/speaker_id_map.txt fi diff --git a/examples/aishell3/vc0/local/synthesize.sh b/examples/aishell3/vc0/local/synthesize.sh new file mode 100755 index 00000000..98430280 --- /dev/null +++ b/examples/aishell3/vc0/local/synthesize.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/../synthesize.py \ + --am=tacotron2_aishell3 \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=pwgan_aishell3 \ + --voc_config=pwg_aishell3_ckpt_0.5/default.yaml \ + --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \ + --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --speaker_dict=dump/speaker_id_map.txt \ + --voice-cloning=True diff --git
a/examples/aishell3/vc0/local/train.sh b/examples/aishell3/vc0/local/train.sh index f062cbbf..c775fcad 100755 --- a/examples/aishell3/vc0/local/train.sh +++ b/examples/aishell3/vc0/local/train.sh @@ -1,9 +1,13 @@ #!/bin/bash -preprocess_path=$1 +config_path=$1 train_output_path=$2 python3 ${BIN_DIR}/train.py \ - --data=${preprocess_path} \ - --output=${train_output_path} \ - --ngpu=1 \ No newline at end of file + --train-metadata=dump/train/norm/metadata.jsonl \ + --dev-metadata=dump/dev/norm/metadata.jsonl \ + --config=${config_path} \ + --output-dir=${train_output_path} \ + --ngpu=2 \ + --phones-dict=dump/phone_id_map.txt \ + --voice-cloning=True \ No newline at end of file diff --git a/examples/aishell3/vc0/local/voice_cloning.sh b/examples/aishell3/vc0/local/voice_cloning.sh index 3fe3de76..79831f3f 100755 --- a/examples/aishell3/vc0/local/voice_cloning.sh +++ b/examples/aishell3/vc0/local/voice_cloning.sh @@ -1,14 +1,24 @@ #!/bin/bash -ge2e_params_path=$1 -tacotron2_params_path=$2 -waveflow_params_path=$3 -vc_input=$4 -vc_output=$5 +config_path=$1 +train_output_path=$2 +ckpt_name=$3 +ge2e_params_path=$4 +ref_audio_dir=$5 -python3 ${BIN_DIR}/voice_cloning.py \ +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/../voice_cloning.py \ + --am=tacotron2_aishell3 \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=pwgan_aishell3 \ + --voc_config=pwg_aishell3_ckpt_0.5/default.yaml \ + --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \ + --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \ --ge2e_params_path=${ge2e_params_path} \ - --tacotron2_params_path=${tacotron2_params_path} \ - --waveflow_params_path=${waveflow_params_path} \ - --input-dir=${vc_input} \ - --output-dir=${vc_output} \ No newline at end of file + --text="凯莫瑞安联合体的经济崩溃迫在眉睫。" \ + --input-dir=${ref_audio_dir} \ + 
--output-dir=${train_output_path}/vc_syn \ + --phones-dict=dump/phone_id_map.txt diff --git a/examples/aishell3/vc0/path.sh b/examples/aishell3/vc0/path.sh index dfae49af..a37cd21e 100755 --- a/examples/aishell3/vc0/path.sh +++ b/examples/aishell3/vc0/path.sh @@ -9,5 +9,5 @@ export PYTHONDONTWRITEBYTECODE=1 export PYTHONIOENCODING=UTF-8 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} -MODEL=voice_cloning/tacotron2_ge2e +MODEL=tacotron2 export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/aishell3/vc0/run.sh b/examples/aishell3/vc0/run.sh index 870360c1..64f4ee3b 100755 --- a/examples/aishell3/vc0/run.sh +++ b/examples/aishell3/vc0/run.sh @@ -3,25 +3,20 @@ set -e source path.sh -gpus=0 +gpus=0,1 stage=0 stop_stage=100 -input=~/datasets/data_aishell3/train -preprocess_path=dump -alignment=./alignment +conf_path=conf/default.yaml +train_output_path=exp/default +ckpt_name=snapshot_iter_482.pdz +ref_audio_dir=ref_audio # not include ".pdparams" here ge2e_ckpt_path=./ge2e_ckpt_0.3/step-3000000 -train_output_path=output + # include ".pdparams" here ge2e_params_path=${ge2e_ckpt_path}.pdparams -tacotron2_params_path=${train_output_path}/checkpoints/step-1000.pdparams -# pretrained model -# tacotron2_params_path=./tacotron2_aishell3_ckpt_0.3/step-450000.pdparams -waveflow_params_path=./waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams -vc_input=ref_audio -vc_output=syn_audio # with the following command, you can choose the stage range you want to run # such as `./run.sh --stage 0 --stop-stage 0` @@ -30,15 +25,20 @@ source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # prepare data - CUDA_VISIBLE_DEVICES=${gpus} ./local/preprocess.sh ${input} ${preprocess_path} ${alignment} ${ge2e_ckpt_path} || exit -1 + CUDA_VISIBLE_DEVICES=${gpus} ./local/preprocess.sh ${conf_path} ${ge2e_ckpt_path} || exit -1 fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - CUDA_VISIBLE_DEVICES=${gpus} 
./local/train.sh ${preprocess_path} ${train_output_path} || exit -1 + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1 fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${ge2e_params_path} ${tacotron2_params_path} ${waveflow_params_path} ${vc_input} ${vc_output} || exit -1 + # synthesize, vocoder is pwgan + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 fi - +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # voice cloning, vocoder is pwgan + CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${ref_audio_dir} || exit -1 +fi diff --git a/examples/aishell3/vc1/README.md b/examples/aishell3/vc1/README.md index d5745bc3..04b83a5f 100644 --- a/examples/aishell3/vc1/README.md +++ b/examples/aishell3/vc1/README.md @@ -1,4 +1,3 @@ - # FastSpeech2 + AISHELL-3 Voice Cloning This example contains code used to train a [FastSpeech2](https://arxiv.org/abs/2006.04558) model with [AISHELL-3](http://www.aishelltech.com/aishell_3). The trained model can be used in Voice Cloning Task, We refer to the model structure of [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf). The general steps are as follows: 1. Speaker Encoder: We use Speaker Verification to train a speaker encoder. Datasets used in this task are different from those used in `FastSpeech2` because the transcriptions are not needed, we use more datasets, refer to [ge2e](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/ge2e).
@@ -114,7 +113,7 @@ ref_audio ├── LJ015-0254.wav └── audio_self_test.mp3 ``` -`./local/voice_cloning.sh` calls `${BIN_DIR}/voice_cloning.py` +`./local/voice_cloning.sh` calls `${BIN_DIR}/../voice_cloning.py` ```bash CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${ref_audio_dir} diff --git a/examples/aishell3/vc1/conf/default.yaml b/examples/aishell3/vc1/conf/default.yaml index 557a5a0a..ac495674 100644 --- a/examples/aishell3/vc1/conf/default.yaml +++ b/examples/aishell3/vc1/conf/default.yaml @@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis. n_mels: 80 # The number of mel basis. # Only used for the model using pitch features (e.g. FastSpeech2) -f0min: 80 # Maximum f0 for pitch extraction. -f0max: 400 # Minimum f0 for pitch extraction. +f0min: 80 # Minimum f0 for pitch extraction. +f0max: 400 # Maximum f0 for pitch extraction. ########################################################### @@ -64,14 +64,14 @@ model: pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch - stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder + stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder energy_predictor_layers: 2 # number of conv layers in energy predictor energy_predictor_chans: 256 # number of channels of conv layers in energy predictor energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor energy_predictor_dropout: 0.5 # dropout rate in energy predictor energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy - stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to 
encoder + stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder spk_embed_dim: 256 # speaker embedding dimension spk_embed_integration_type: concat # speaker embedding integration type diff --git a/examples/aishell3/vc1/local/voice_cloning.sh b/examples/aishell3/vc1/local/voice_cloning.sh index 6a50826e..2a8864ba 100755 --- a/examples/aishell3/vc1/local/voice_cloning.sh +++ b/examples/aishell3/vc1/local/voice_cloning.sh @@ -8,13 +8,15 @@ ref_audio_dir=$5 FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/voice_cloning.py \ - --fastspeech2-config=${config_path} \ - --fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ - --fastspeech2-stat=dump/train/speech_stats.npy \ - --pwg-config=pwg_aishell3_ckpt_0.5/default.yaml \ - --pwg-checkpoint=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \ - --pwg-stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \ +python3 ${BIN_DIR}/../voice_cloning.py \ + --am=fastspeech2_aishell3 \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=pwgan_aishell3 \ + --voc_config=pwg_aishell3_ckpt_0.5/default.yaml \ + --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \ + --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \ --ge2e_params_path=${ge2e_params_path} \ --text="凯莫瑞安联合体的经济崩溃迫在眉睫。" \ --input-dir=${ref_audio_dir} \ diff --git a/examples/aishell3/voc1/conf/default.yaml b/examples/aishell3/voc1/conf/default.yaml index 7fbffbdd..e2102d6e 100644 --- a/examples/aishell3/voc1/conf/default.yaml +++ b/examples/aishell3/voc1/conf/default.yaml @@ -33,7 +33,7 @@ generator_params: aux_context_window: 2 # Context window size for auxiliary feature. # If set to 2, previous 2 and future 2 frames will be considered. dropout: 0.0 # Dropout rate. 0.0 means no dropout applied. - use_weight_norm: true # Whether to use weight norm. 
+ use_weight_norm: True # Whether to use weight norm. # If set to true, it will be applied to all of the conv layers. upsample_scales: [4, 5, 3, 5] # Upsampling scales. prod(upsample_scales) == n_shift @@ -46,8 +46,8 @@ discriminator_params: kernel_size: 3 # Number of output channels. layers: 10 # Number of conv layers. conv_channels: 64 # Number of chnn layers. - bias: true # Whether to use bias parameter in conv. - use_weight_norm: true # Whether to use weight norm. + bias: True # Whether to use bias parameter in conv. + use_weight_norm: True # Whether to use weight norm. # If set to true, it will be applied to all of the conv layers. nonlinear_activation: "leakyrelu" # Nonlinear function after each conv. nonlinear_activation_params: # Nonlinear function parameters diff --git a/examples/ami/README.md b/examples/ami/README.md new file mode 100644 index 00000000..a038eaeb --- /dev/null +++ b/examples/ami/README.md @@ -0,0 +1,3 @@ +# Speaker Diarization on AMI corpus + +* sd0 - speaker diarization by AHC, SC based on x-vectors diff --git a/examples/ami/sd0/.gitignore b/examples/ami/sd0/.gitignore new file mode 100644 index 00000000..872aa273 --- /dev/null +++ b/examples/ami/sd0/.gitignore @@ -0,0 +1 @@ +results \ No newline at end of file diff --git a/examples/ami/sd0/README.md b/examples/ami/sd0/README.md new file mode 100644 index 00000000..ffe95741 --- /dev/null +++ b/examples/ami/sd0/README.md @@ -0,0 +1,13 @@ +# Speaker Diarization on AMI corpus + +## About the AMI corpus: +"The AMI Meeting Corpus consists of 100 hours of meeting recordings. The recordings use a range of signals synchronized to a common timeline. These include close-talking and far-field microphones, individual and room-view video cameras, and output from a slide projector and an electronic whiteboard. During the meetings, the participants also have unsynchronized pens available to them that record what is written.
The meetings were recorded in English using three different rooms with different acoustic properties, and include mostly non-native speakers." See [ami overview](http://groups.inf.ed.ac.uk/ami/corpus/overview.shtml) for more details. + +## About the example +The script performs diarization using x-vectors(TDNN,ECAPA-TDNN) on the AMI mix-headset data. We demonstrate the use of different clustering methods: AHC, spectral. + +## How to Run +Use the following command to run diarization on AMI corpus. +`bash ./run.sh` + +## Results (DER) coming soon! :) diff --git a/examples/ami/sd0/local/ami_prepare.py b/examples/ami/sd0/local/ami_prepare.py new file mode 100644 index 00000000..b7bb8e67 --- /dev/null +++ b/examples/ami/sd0/local/ami_prepare.py @@ -0,0 +1,572 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Data preparation. + +Download: http://groups.inf.ed.ac.uk/ami/download/ + +Prepares metadata files (JSON) from manual annotations "segments/" using RTTM format (Oracle VAD). 
+ +Authors + * qingenz123@126.com (Qingen ZHAO) 2022 + +""" + +import os +import logging +import argparse +import xml.etree.ElementTree as et +import glob +import json +from ami_splits import get_AMI_split +from distutils.util import strtobool + +from dataio import ( + load_pkl, + save_pkl, ) + +logger = logging.getLogger(__name__) +SAMPLERATE = 16000 + + +def prepare_ami( + data_folder, + manual_annot_folder, + save_folder, + ref_rttm_dir, + meta_data_dir, + split_type="full_corpus_asr", + skip_TNO=True, + mic_type="Mix-Headset", + vad_type="oracle", + max_subseg_dur=3.0, + overlap=1.5, ): + """ + Prepares reference RTTM and JSON files for the AMI dataset. + + Arguments + --------- + data_folder : str + Path to the folder where the original amicorpus is stored. + manual_annot_folder : str + Directory where the manual annotations are stored. + save_folder : str + The save directory in results. + ref_rttm_dir : str + Directory to store reference RTTM files. + meta_data_dir : str + Directory to store the meta data (json) files. + split_type : str + Standard dataset split. See ami_splits.py for more information. + Allowed split_type: "scenario_only", "full_corpus" or "full_corpus_asr" + skip_TNO: bool + Skips TNO meeting recordings if True. + mic_type : str + Type of microphone to be used. + vad_type : str + Type of VAD. Kept for future when VAD will be added. + max_subseg_dur : float + Duration in seconds of a subsegments to be prepared from larger segments. 
+ overlap : float + Overlap duration in seconds between adjacent subsegments + + Example + ------- + >>> from dataset.ami.ami_prepare import prepare_ami + >>> data_folder = '/home/data/ami/amicorpus/' + >>> manual_annot_folder = '/home/data/ami/ami_public_manual/' + >>> save_folder = './results/' + >>> ref_rttm_dir = './results/ref_rttms' + >>> meta_data_dir = './results/metadata' + >>> prepare_ami(data_folder, manual_annot_folder, save_folder, ref_rttm_dir, meta_data_dir) + """ + + # Meta files + meta_files = [ + os.path.join(meta_data_dir, "ami_train." + mic_type + ".subsegs.json"), + os.path.join(meta_data_dir, "ami_dev." + mic_type + ".subsegs.json"), + os.path.join(meta_data_dir, "ami_eval." + mic_type + ".subsegs.json"), + ] + + # Create configuration for easily skipping data_preparation stage + conf = { + "data_folder": data_folder, + "save_folder": save_folder, + "ref_rttm_dir": ref_rttm_dir, + "meta_data_dir": meta_data_dir, + "split_type": split_type, + "skip_TNO": skip_TNO, + "mic_type": mic_type, + "vad": vad_type, + "max_subseg_dur": max_subseg_dur, + "overlap": overlap, + "meta_files": meta_files, + } + + if not os.path.exists(save_folder): + os.makedirs(save_folder) + + # Setting output option files. + opt_file = "opt_ami_prepare." + mic_type + ".pkl" + + # Check if this phase is already done (if so, skip it) + if skip(save_folder, conf, meta_files, opt_file): + logger.info( + "Skipping data preparation, as it was completed in previous run.") + return + + msg = "\tCreating meta-data file for the AMI Dataset.."
+ logger.debug(msg) + + # Get the split + train_set, dev_set, eval_set = get_AMI_split(split_type) + + # Prepare RTTM from XML(manual annot) and store as groundtruth + # Create ref_RTTM directory + if not os.path.exists(ref_rttm_dir): + os.makedirs(ref_rttm_dir) + + # Create reference RTTM files + splits = ["train", "dev", "eval"] + for i in splits: + rttm_file = ref_rttm_dir + "/fullref_ami_" + i + ".rttm" + if i == "train": + prepare_segs_for_RTTM( + train_set, + rttm_file, + data_folder, + manual_annot_folder, + i, + skip_TNO, ) + if i == "dev": + prepare_segs_for_RTTM( + dev_set, + rttm_file, + data_folder, + manual_annot_folder, + i, + skip_TNO, ) + if i == "eval": + prepare_segs_for_RTTM( + eval_set, + rttm_file, + data_folder, + manual_annot_folder, + i, + skip_TNO, ) + + # Create meta_files for splits + meta_data_dir = meta_data_dir + if not os.path.exists(meta_data_dir): + os.makedirs(meta_data_dir) + + for i in splits: + rttm_file = ref_rttm_dir + "/fullref_ami_" + i + ".rttm" + meta_filename_prefix = "ami_" + i + prepare_metadata( + rttm_file, + meta_data_dir, + data_folder, + meta_filename_prefix, + max_subseg_dur, + overlap, + mic_type, ) + + save_opt_file = os.path.join(save_folder, opt_file) + save_pkl(conf, save_opt_file) + + +def get_RTTM_per_rec(segs, spkrs_list, rec_id): + """Prepares rttm for each recording + """ + + rttm = [] + + # Prepare header + for spkr_id in spkrs_list: + # e.g. SPKR-INFO ES2008c 0 unknown ES2008c.A_PM + line = ("SPKR-INFO " + rec_id + " 0 unknown " + spkr_id + + " ") + rttm.append(line) + + # Append remaining lines + for row in segs: + # e.g. SPEAKER ES2008c 0 37.880 0.590 ES2008c.A_PM + + if float(row[1]) < float(row[0]): + msg1 = ( + "Possibly Incorrect Annotation Found!!
transcriber_start (%s) > transcriber_end (%s)" + % (row[0], row[1])) + msg2 = ( + "Excluding this incorrect row from the RTTM : %s, %s, %s, %s" % + (rec_id, row[0], str(round(float(row[1]) - float(row[0]), 4)), + str(row[2]), )) + logger.info(msg1) + logger.info(msg2) + continue + + line = ("SPEAKER " + rec_id + " 0 " + str(round(float(row[0]), 4)) + " " + + str(round(float(row[1]) - float(row[0]), 4)) + " " + + str(row[2]) + " ") + rttm.append(line) + + return rttm + + +def prepare_segs_for_RTTM(list_ids, out_rttm_file, audio_dir, annot_dir, + split_type, skip_TNO): + + RTTM = [] # Stores all RTTMs clubbed together for a given dataset split + + for main_meet_id in list_ids: + + # Skip TNO meetings from dev and eval sets + if (main_meet_id.startswith("TS") and split_type != "train" and + skip_TNO is True): + msg = ("Skipping TNO meeting in AMI " + str(split_type) + " set : " + + str(main_meet_id)) + logger.info(msg) + continue + + list_sessions = glob.glob(audio_dir + "/" + main_meet_id + "*") + list_sessions.sort() + + for sess in list_sessions: + rec_id = os.path.basename(sess) + path = annot_dir + "/segments/" + rec_id + f = path + ".*.segments.xml" + list_spkr_xmls = glob.glob(f) + list_spkr_xmls.sort() # A, B, C, D, E etc (Speakers) + segs = [] + spkrs_list = ( + []) # Since non-scenario recordings contains 3-5 speakers + + for spkr_xml_file in list_spkr_xmls: + + # Speaker ID + spkr = os.path.basename(spkr_xml_file).split(".")[1] + spkr_ID = rec_id + "." 
+ spkr + spkrs_list.append(spkr_ID) + + # Parse xml tree + tree = et.parse(spkr_xml_file) + root = tree.getroot() + + # Start, end and speaker_ID from xml file + segs = segs + [[ + elem.attrib["transcriber_start"], + elem.attrib["transcriber_end"], + spkr_ID, + ] for elem in root.iter("segment")] + + # Sort rows as per the start time (per recording) + segs.sort(key=lambda x: float(x[0])) + + rttm_per_rec = get_RTTM_per_rec(segs, spkrs_list, rec_id) + RTTM = RTTM + rttm_per_rec + + # Write one RTTM as groundtruth. For example, "fullref_eval.rttm" + with open(out_rttm_file, "w") as f: + for item in RTTM: + f.write("%s\n" % item) + + +def is_overlapped(end1, start2): + """Returns True if the two segments overlap + + Arguments + --------- + end1 : float + End time of the first segment. + start2 : float + Start time of the second segment. + """ + + if start2 > end1: + return False + else: + return True + + +def merge_rttm_intervals(rttm_segs): + """Merges adjacent segments in rttm if they overlap. + """ + # For one recording + # rec_id = rttm_segs[0][1] + rttm_segs.sort(key=lambda x: float(x[3])) + + # first_seg = rttm_segs[0] # first interval.. as it is + merged_segs = [rttm_segs[0]] + strt = float(rttm_segs[0][3]) + end = float(rttm_segs[0][3]) + float(rttm_segs[0][4]) + + for row in rttm_segs[1:]: + s = float(row[3]) + e = float(row[3]) + float(row[4]) + + if is_overlapped(end, s): + # Update only end. 
The strt will be same as in last segment + # Just update last row in the merged_segs + end = max(end, e) + merged_segs[-1][3] = str(round(strt, 4)) + merged_segs[-1][4] = str(round((end - strt), 4)) + merged_segs[-1][7] = "overlap" # previous_row[7] + '-'+ row[7] + else: + # Add a new disjoint segment + strt = s + end = e + merged_segs.append(row) # this will have 1 spkr ID + + return merged_segs + + +def get_subsegments(merged_segs, max_subseg_dur=3.0, overlap=1.5): + """Divides bigger segments into smaller sub-segments + """ + + shift = max_subseg_dur - overlap + subsegments = [] + + # These rows are in RTTM format + for row in merged_segs: + seg_dur = float(row[4]) + rec_id = row[1] + + if seg_dur > max_subseg_dur: + num_subsegs = int(seg_dur / shift) + # Taking 0.01 sec as small step + seg_start = float(row[3]) + seg_end = seg_start + seg_dur + + # Now divide this segment (new_row) in smaller subsegments + for i in range(num_subsegs): + subseg_start = seg_start + i * shift + subseg_end = min(subseg_start + max_subseg_dur - 0.01, seg_end) + subseg_dur = subseg_end - subseg_start + + new_row = [ + "SPEAKER", + rec_id, + "0", + str(round(float(subseg_start), 4)), + str(round(float(subseg_dur), 4)), + "", + "", + row[7], + "", + "", + ] + + subsegments.append(new_row) + + # Break if exceeding the boundary + if subseg_end >= seg_end: + break + else: + subsegments.append(row) + + return subsegments + + +def prepare_metadata(rttm_file, save_dir, data_dir, filename, max_subseg_dur, + overlap, mic_type): + # Read RTTM, get unique meeting_IDs (from RTTM headers) + # For each MeetingID. 
select that meetID -> merge -> subsegment -> json -> append + + # Read RTTM + RTTM = [] + with open(rttm_file, "r") as f: + for line in f: + entry = line[:-1] + RTTM.append(entry) + + spkr_info = filter(lambda x: x.startswith("SPKR-INFO"), RTTM) + rec_ids = list(set([row.split(" ")[1] for row in spkr_info])) + rec_ids.sort() # sorting just to make JSON look in proper sequence + + # For each recording merge segments and then perform subsegmentation + MERGED_SEGMENTS = [] + SUBSEGMENTS = [] + for rec_id in rec_ids: + segs_iter = filter(lambda x: x.startswith("SPEAKER " + str(rec_id)), + RTTM) + gt_rttm_segs = [row.split(" ") for row in segs_iter] + + # Merge, subsegment and then convert to json format. + merged_segs = merge_rttm_intervals( + gt_rttm_segs) # We lose speaker_ID after merging + MERGED_SEGMENTS = MERGED_SEGMENTS + merged_segs + + # Divide segments into smaller sub-segments + subsegs = get_subsegments(merged_segs, max_subseg_dur, overlap) + SUBSEGMENTS = SUBSEGMENTS + subsegs + + # Write segment AND sub-segments (in RTTM format) + segs_file = save_dir + "/" + filename + ".segments.rttm" + subsegment_file = save_dir + "/" + filename + ".subsegments.rttm" + + with open(segs_file, "w") as f: + for row in MERGED_SEGMENTS: + line_str = " ".join(row) + f.write("%s\n" % line_str) + + with open(subsegment_file, "w") as f: + for row in SUBSEGMENTS: + line_str = " ".join(row) + f.write("%s\n" % line_str) + + # Create JSON from subsegments + json_dict = {} + for row in SUBSEGMENTS: + rec_id = row[1] + strt = str(round(float(row[3]), 4)) + end = str(round((float(row[3]) + float(row[4])), 4)) + subsegment_ID = rec_id + "_" + strt + "_" + end + dur = row[4] + start_sample = int(float(strt) * SAMPLERATE) + end_sample = int(float(end) * SAMPLERATE) + + # If multi-mic audio is selected + if mic_type == "Array1": + wav_file_base_path = (data_dir + "/" + rec_id + "/audio/" + rec_id + + "." 
+ mic_type + "-") + + f = [] # adding all 8 mics + for i in range(8): + f.append(wav_file_base_path + str(i + 1).zfill(2) + ".wav") + audio_files_path_list = f + + # Note: key "files" with 's' is used for multi-mic + json_dict[subsegment_ID] = { + "wav": { + "files": audio_files_path_list, + "duration": float(dur), + "start": int(start_sample), + "stop": int(end_sample), + }, + } + else: + # Single mic audio + wav_file_path = (data_dir + "/" + rec_id + "/audio/" + rec_id + "." + + mic_type + ".wav") + + # Note: key "file" without 's' is used for single-mic + json_dict[subsegment_ID] = { + "wav": { + "file": wav_file_path, + "duration": float(dur), + "start": int(start_sample), + "stop": int(end_sample), + }, + } + + out_json_file = save_dir + "/" + filename + "." + mic_type + ".subsegs.json" + with open(out_json_file, mode="w") as json_f: + json.dump(json_dict, json_f, indent=2) + + msg = "%s JSON prepared" % (out_json_file) + logger.debug(msg) + + +def skip(save_folder, conf, meta_files, opt_file): + """ + Detects if the AMI data_preparation has been already done. + If the preparation has been done, we can skip it. + + Returns + ------- + bool + if True, the preparation phase can be skipped. + if False, it must be done. 
+ """ + # Checking if meta (json) files are available + skip = True + for file_path in meta_files: + if not os.path.isfile(file_path): + skip = False + + # Checking saved options + save_opt_file = os.path.join(save_folder, opt_file) + if skip is True: + if os.path.isfile(save_opt_file): + opts_old = load_pkl(save_opt_file) + if opts_old == conf: + skip = True + else: + skip = False + else: + skip = False + + return skip + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser( + prog='python ami_prepare.py --data_folder /home/data/ami/amicorpus \ + --manual_annot_folder /home/data/ami/ami_public_manual_1.6.2 \ + --save_folder ./results/ --ref_rttm_dir ./results/ref_rttms \ + --meta_data_dir ./results/metadata', + description='AMI Data preparation') + parser.add_argument( + '--data_folder', + required=True, + help='Path to the folder where the original amicorpus is stored') + parser.add_argument( + '--manual_annot_folder', + required=True, + help='Directory where the manual annotations are stored') + parser.add_argument( + '--save_folder', required=True, help='The save directory in results') + parser.add_argument( + '--ref_rttm_dir', + required=True, + help='Directory to store reference RTTM files') + parser.add_argument( + '--meta_data_dir', + required=True, + help='Directory to store the meta data (json) files') + parser.add_argument( + '--split_type', + default="full_corpus_asr", + help='Standard dataset split. See ami_splits.py for more information') + parser.add_argument( + '--skip_TNO', + default=True, + type=strtobool, + help='Skips TNO meeting recordings if True') + parser.add_argument( + '--mic_type', + default="Mix-Headset", + help='Type of microphone to be used') + parser.add_argument( + '--vad_type', + default="oracle", + help='Type of VAD. 
Kept for future when VAD will be added') + parser.add_argument( + '--max_subseg_dur', + default=3.0, + type=float, + help='Duration in seconds of a subsegments to be prepared from larger segments' + ) + parser.add_argument( + '--overlap', + default=1.5, + type=float, + help='Overlap duration in seconds between adjacent subsegments') + + args = parser.parse_args() + + prepare_ami(args.data_folder, args.manual_annot_folder, args.save_folder, + args.ref_rttm_dir, args.meta_data_dir, split_type=args.split_type, skip_TNO=bool(args.skip_TNO), mic_type=args.mic_type, vad_type=args.vad_type, max_subseg_dur=args.max_subseg_dur, overlap=args.overlap) diff --git a/examples/ami/sd0/local/ami_splits.py b/examples/ami/sd0/local/ami_splits.py new file mode 100644 index 00000000..010638a3 --- /dev/null +++ b/examples/ami/sd0/local/ami_splits.py @@ -0,0 +1,234 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +The AMI corpus contains 100 hours of meeting recordings. +This script returns the standard train, dev and eval split for AMI corpus. +For more information on dataset please refer to http://groups.inf.ed.ac.uk/ami/corpus/datasets.shtml + +Authors + * qingenz123@126.com (Qingen ZHAO) 2022 + +""" + +ALLOWED_OPTIONS = ["scenario_only", "full_corpus", "full_corpus_asr"] + + +def get_AMI_split(split_option): + """ + Prepares train, dev, and test sets for given split_option + + Arguments + --------- + split_option: str + The standard split option.
+ Allowed options: "scenario_only", "full_corpus", "full_corpus_asr" + + Returns + ------- + Meeting IDs for train, dev, and test sets for given split_option + """ + + if split_option not in ALLOWED_OPTIONS: + print( + f'Invalid split "{split_option}" requested!\nValid split_options are: ', + ALLOWED_OPTIONS, ) + return + + if split_option == "scenario_only": + + train_set = [ + "ES2002", + "ES2005", + "ES2006", + "ES2007", + "ES2008", + "ES2009", + "ES2010", + "ES2012", + "ES2013", + "ES2015", + "ES2016", + "IS1000", + "IS1001", + "IS1002", + "IS1003", + "IS1004", + "IS1005", + "IS1006", + "IS1007", + "TS3005", + "TS3008", + "TS3009", + "TS3010", + "TS3011", + "TS3012", + ] + + dev_set = [ + "ES2003", + "ES2011", + "IS1008", + "TS3004", + "TS3006", + ] + + test_set = [ + "ES2004", + "ES2014", + "IS1009", + "TS3003", + "TS3007", + ] + + if split_option == "full_corpus": + # List of train: SA (TRAINING PART OF SEEN DATA) + train_set = [ + "ES2002", + "ES2005", + "ES2006", + "ES2007", + "ES2008", + "ES2009", + "ES2010", + "ES2012", + "ES2013", + "ES2015", + "ES2016", + "IS1000", + "IS1001", + "IS1002", + "IS1003", + "IS1004", + "IS1005", + "IS1006", + "IS1007", + "TS3005", + "TS3008", + "TS3009", + "TS3010", + "TS3011", + "TS3012", + "EN2001", + "EN2003", + "EN2004", + "EN2005", + "EN2006", + "EN2009", + "IN1001", + "IN1002", + "IN1005", + "IN1007", + "IN1008", + "IN1009", + "IN1012", + "IN1013", + "IN1014", + "IN1016", + ] + + # List of dev: SB (DEV PART OF SEEN DATA) + dev_set = [ + "ES2003", + "ES2011", + "IS1008", + "TS3004", + "TS3006", + "IB4001", + "IB4002", + "IB4003", + "IB4004", + "IB4010", + "IB4011", + ] + + # List of test: SC (UNSEEN DATA FOR EVALUATION) + # Note that IB4005 does not appear because it has speakers in common with two sets of data. 
+ test_set = [ + "ES2004", + "ES2014", + "IS1009", + "TS3003", + "TS3007", + "EN2002", + ] + + if split_option == "full_corpus_asr": + train_set = [ + "ES2002", + "ES2003", + "ES2005", + "ES2006", + "ES2007", + "ES2008", + "ES2009", + "ES2010", + "ES2012", + "ES2013", + "ES2014", + "ES2015", + "ES2016", + "IS1000", + "IS1001", + "IS1002", + "IS1003", + "IS1004", + "IS1005", + "IS1006", + "IS1007", + "TS3005", + "TS3006", + "TS3007", + "TS3008", + "TS3009", + "TS3010", + "TS3011", + "TS3012", + "EN2001", + "EN2003", + "EN2004", + "EN2005", + "EN2006", + "EN2009", + "IN1001", + "IN1002", + "IN1005", + "IN1007", + "IN1008", + "IN1009", + "IN1012", + "IN1013", + "IN1014", + "IN1016", + ] + + dev_set = [ + "ES2011", + "IS1008", + "TS3004", + "IB4001", + "IB4002", + "IB4003", + "IB4004", + "IB4010", + "IB4011", + ] + + test_set = [ + "ES2004", + "IS1009", + "TS3003", + "EN2002", + ] + + return train_set, dev_set, test_set diff --git a/examples/ami/sd0/local/data.sh b/examples/ami/sd0/local/data.sh new file mode 100755 index 00000000..478ec432 --- /dev/null +++ b/examples/ami/sd0/local/data.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +stage=1 + +TARGET_DIR=${MAIN_ROOT}/dataset/ami +data_folder=${TARGET_DIR}/amicorpus #e.g., /path/to/amicorpus/ +manual_annot_folder=${TARGET_DIR}/ami_public_manual_1.6.2 #e.g., /path/to/ami_public_manual_1.6.2/ + +save_folder=${MAIN_ROOT}/examples/ami/sd0/data +ref_rttm_dir=${save_folder}/ref_rttms +meta_data_dir=${save_folder}/metadata + +set=L + +. ${MAIN_ROOT}/utils/parse_options.sh || exit 1; +set -u +set -o pipefail + +mkdir -p ${save_folder} + +if [ ${stage} -le 0 ]; then + # Download AMI corpus, You need around 10GB of free space to get whole data + # The signals are too large to package in this way, + # so you need to use the chooser to indicate which ones you wish to download + echo "Please follow https://groups.inf.ed.ac.uk/ami/download/ to download the data." 
def save_pkl(obj, file):
    """Save an object in pkl format.

    Arguments
    ---------
    obj : object
        Object to save in pkl format
    file : str
        Path to the output file

    Example
    -------
    >>> tmpfile = os.path.join(getfixture('tmpdir'), "example.pkl")
    >>> save_pkl([1, 2, 3, 4, 5], tmpfile)
    >>> load_pkl(tmpfile)
    [1, 2, 3, 4, 5]
    """
    with open(file, "wb") as f:
        pickle.dump(obj, f)


def load_pickle(pickle_path):
    """Utility function for loading .pkl pickle files.

    Unlike `load_pkl`, this does not look at or create a ".lock" file.

    Arguments
    ---------
    pickle_path : str
        Path to pickle file.

    Returns
    -------
    out : object
        Python object loaded from pickle.
    """
    with open(pickle_path, "rb") as f:
        out = pickle.load(f)
    return out


def load_pkl(file):
    """Loads a pkl file, guarded by a simple "<file>.lock" marker.

    For an example, see `save_pkl`.

    NOTE: unpickling can execute arbitrary code; only load files that were
    produced by a trusted source.

    Arguments
    ---------
    file : str
        Path to the input pkl file.

    Returns
    -------
    The loaded object.
    """
    # Imported locally: the module header only brings in `os` and `pickle`,
    # so a bare `time.sleep` below would raise NameError as soon as a lock
    # file is actually present.
    import time

    lock_file = file + ".lock"

    # Deals with the situation where two processes are trying
    # to access the same label dictionary by creating a lock.
    # Poll once per second, at most 100 times, then proceed regardless.
    count = 100
    while count > 0:
        if os.path.isfile(lock_file):
            time.sleep(1)
            count -= 1
        else:
            break

    try:
        # Create our own lock marker for the duration of the read.
        open(lock_file, "w").close()
        with open(file, "rb") as f:
            return pickle.load(f)
    finally:
        # Always release the lock, even when opening/unpickling fails.
        if os.path.isfile(lock_file):
            os.remove(lock_file)
# Tacotron2 with CSMSC
This example contains code used to train a [Tacotron2](https://arxiv.org/abs/1712.05884) model with [Chinese Standard Mandarin Speech Corpus](https://www.data-baker.com/open_source.html).

## Dataset
### Download and Extract
Download CSMSC from its [Official Website](https://test.data-baker.com/data/index/source).
The dataset is split into 3 parts, namely `train`, `dev`, and `test`, each of which contains a `norm` and `raw` subfolder. The raw folder contains speech features of each utterance, while the norm folder contains normalized ones. The statistics used to normalize features are computed from the training set, which is located in `dump/train/*_stats.npy`.
+ +### Model Training +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} +``` +`./local/train.sh` calls `${BIN_DIR}/train.py`. +Here's the complete help message. +```text +usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] + [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] + [--ngpu NGPU] [--phones-dict PHONES_DICT] + +Train a Tacotron2 model. + +optional arguments: + -h, --help show this help message and exit + --config CONFIG tacotron2 config file. + --train-metadata TRAIN_METADATA + training data. + --dev-metadata DEV_METADATA + dev data. + --output-dir OUTPUT_DIR + output dir. + --ngpu NGPU if ngpu == 0, use cpu. + --phones-dict PHONES_DICT + phone vocabulary file. +``` +1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. +2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. +3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. +5. `--phones-dict` is the path of the phone vocabulary file. + +### Synthesizing +We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder. +Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip) and unzip it. +```bash +unzip pwg_baker_ckpt_0.4.zip +``` +Parallel WaveGAN checkpoint contains files listed below. 
+```text +pwg_baker_ckpt_0.4 +├── pwg_default.yaml # default config used to train parallel wavegan +├── pwg_snapshot_iter_400000.pdz # model parameters of parallel wavegan +└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan +``` +`./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`. +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} +``` +```text +usage: synthesize.py [-h] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}] + [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] + [--am_stat AM_STAT] [--phones_dict PHONES_DICT] + [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] + [--voice-cloning VOICE_CLONING] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] + [--voc_stat VOC_STAT] [--ngpu NGPU] + [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] + +Synthesize with acoustic model & vocoder + +optional arguments: + -h, --help show this help message and exit + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc} + Choose acoustic model type of tts task. + --am_config AM_CONFIG + Config of acoustic model. Use deault config when it is + None. + --am_ckpt AM_CKPT Checkpoint file of acoustic model. + --am_stat AM_STAT mean and standard deviation used to normalize + spectrogram when training acoustic model. + --phones_dict PHONES_DICT + phone vocabulary file. + --tones_dict TONES_DICT + tone vocabulary file. + --speaker_dict SPEAKER_DICT + speaker id map file. + --voice-cloning VOICE_CLONING + whether training voice cloning model. + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + Choose vocoder type of tts task. + --voc_config VOC_CONFIG + Config of voc. 
Use deault config when it is None. + --voc_ckpt VOC_CKPT Checkpoint file of voc. + --voc_stat VOC_STAT mean and standard deviation used to normalize + spectrogram when training voc. + --ngpu NGPU if ngpu == 0, use cpu. + --test_metadata TEST_METADATA + test metadata. + --output_dir OUTPUT_DIR + output dir. +``` +`./local/synthesize_e2e.sh` calls `${BIN_DIR}/../synthesize_e2e.py`, which can synthesize waveform from text file. +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} +``` +```text +usage: synthesize_e2e.py [-h] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}] + [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] + [--am_stat AM_STAT] [--phones_dict PHONES_DICT] + [--tones_dict TONES_DICT] + [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc}] + [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] + [--voc_stat VOC_STAT] [--lang LANG] + [--inference_dir INFERENCE_DIR] [--ngpu NGPU] + [--text TEXT] [--output_dir OUTPUT_DIR] + +Synthesize with acoustic model & vocoder + +optional arguments: + -h, --help show this help message and exit + --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc} + Choose acoustic model type of tts task. + --am_config AM_CONFIG + Config of acoustic model. Use deault config when it is + None. + --am_ckpt AM_CKPT Checkpoint file of acoustic model. + --am_stat AM_STAT mean and standard deviation used to normalize + spectrogram when training acoustic model. + --phones_dict PHONES_DICT + phone vocabulary file. + --tones_dict TONES_DICT + tone vocabulary file. + --speaker_dict SPEAKER_DICT + speaker id map file. 
1. `--am` is acoustic model type with the format {model_name}_{dataset}
2. `--am_config`, `--am_ckpt`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the Tacotron2 pretrained model.
3. `--voc` is vocoder type with the format {model_name}_{dataset}
4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
5. `--lang` is the model language, which can be `zh` or `en`.
6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder.
7. `--text` is the text file, which contains sentences to synthesize.
8. `--output_dir` is the directory to save synthesized audio files.
9. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
# This configuration is for Paddle to train Tacotron 2. Compared to the
# original paper, this configuration additionally uses the guided attention
# loss to accelerate the learning of the diagonal attention. It requires
# only a single GPU with 12 GB memory and it takes ~1 day to finish the
# training on Titan V.

###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################

fs: 24000          # Sampling rate (Hz).
n_fft: 2048        # FFT size (samples).
n_shift: 300       # Hop size (samples). 12.5ms
win_length: 1200   # Window length (samples). 50ms
                   # If set to null, it will be the same as n_fft.
window: "hann"     # Window function.

# Only used for feats_type != raw

fmin: 80           # Minimum frequency of Mel basis.
fmax: 7600         # Maximum frequency of Mel basis.
n_mels: 80         # The number of mel basis.

###########################################################
#                       DATA SETTING                      #
###########################################################
batch_size: 64
num_workers: 2

###########################################################
#                       MODEL SETTING                     #
###########################################################
model:                         # keyword arguments for the selected model
    embed_dim: 512             # char or phn embedding dimension
    elayers: 1                 # number of blstm layers in encoder
    eunits: 512                # number of blstm units
    econv_layers: 3            # number of convolutional layers in encoder
    econv_chans: 512           # number of channels in convolutional layer
    econv_filts: 5             # filter size of convolutional layer
    atype: location            # attention function type
    adim: 512                  # attention dimension
    aconv_chans: 32            # number of channels in convolutional layer of attention
    aconv_filts: 15            # filter size of convolutional layer of attention
    cumulate_att_w: true       # whether to cumulate attention weight
    dlayers: 2                 # number of lstm layers in decoder
    dunits: 1024               # number of lstm units in decoder
    prenet_layers: 2           # number of layers in prenet
    prenet_units: 256          # number of units in prenet
    postnet_layers: 5          # number of layers in postnet
    postnet_chans: 512         # number of channels in postnet
    postnet_filts: 5           # filter size of postnet layer
    output_activation: null    # activation function for the final output
    use_batch_norm: true       # whether to use batch normalization in encoder
    use_concate: true          # whether to concatenate encoder embedding with decoder outputs
    use_residual: false        # whether to use residual connection in encoder
    dropout_rate: 0.5          # dropout rate
    zoneout_rate: 0.1          # zoneout rate
    reduction_factor: 1        # reduction factor
    spk_embed_dim: null        # speaker embedding dimension


###########################################################
#                     UPDATER SETTING                     #
###########################################################
updater:
    use_masking: true               # whether to apply masking for padded part in loss calculation
    bce_pos_weight: 5.0             # weight of positive sample in binary cross entropy calculation
    use_guided_attn_loss: true      # whether to use guided attention loss
    guided_attn_loss_sigma: 0.4     # sigma of guided attention loss
    guided_attn_loss_lambda: 1.0    # strength of guided attention loss


##########################################################
#                    OPTIMIZER SETTING                   #
##########################################################
optimizer:
    optim: adam                # optimizer type
    learning_rate: 1.0e-03     # learning rate
    epsilon: 1.0e-06           # epsilon
    weight_decay: 0.0          # weight decay coefficient

###########################################################
#                     TRAINING SETTING                    #
###########################################################
max_epoch: 200
num_snapshots: 5

###########################################################
#                      OTHER SETTING                      #
###########################################################
seed: 42
#!/bin/bash

# Data preprocessing pipeline, run as numbered stages.
# Usage: ./local/preprocess.sh ${conf_path}
# Reads ${MAIN_ROOT} and ${BIN_DIR} from the environment; they are not set
# in this script.

# Only stages within [stage, stop_stage] are executed.
stage=0
stop_stage=100

# $1: path to the config file passed to the python tools below
config_path=$1

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # get durations from MFA's result
    echo "Generate durations.txt from MFA results ..."
    python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
        --inputdir=./baker_alignment_tone \
        --output=durations.txt \
        --config=${config_path}
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # extract features
    # NOTE(review): bash does not tilde-expand `~` in the middle of an
    # argument such as `--rootdir=~/...`; presumably preprocess.py expands
    # it itself - confirm.
    echo "Extract features ..."
    python3 ${BIN_DIR}/preprocess.py \
        --dataset=baker \
        --rootdir=~/datasets/BZNSYP/ \
        --dumpdir=dump \
        --dur-file=durations.txt \
        --config=${config_path} \
        --num-cpu=20 \
        --cut-sil=True
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # get features' stats(mean and std)
    echo "Get features' stats ..."
    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --field-name="speech"

fi

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # normalize and convert phone to id, dev and test should use train's stats
    echo "Normalize ..."
    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --dumpdir=dump/train/norm \
        --speech-stats=dump/train/speech_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt

    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/dev/raw/metadata.jsonl \
        --dumpdir=dump/dev/norm \
        --speech-stats=dump/train/speech_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt

    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/test/raw/metadata.jsonl \
        --dumpdir=dump/test/norm \
        --speech-stats=dump/train/speech_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt
fi
#!/bin/bash

# End-to-end synthesis with a trained Tacotron2 acoustic model; each stage
# pairs it with a different vocoder.
# Usage: ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name}
# Reads ${BIN_DIR} from the environment; it is not set in this script.

config_path=$1
train_output_path=$2
ckpt_name=$3

# Only stages within [stage, stop_stage] run; by default just stage 0.
stage=0
stop_stage=0

# TODO: the output of the dygraph-to-static converted tacotron2 is not as loud
# as the reference graph's; possibly some function used during decode still
# behaves differently between dynamic and static graph.
# parallel wavegan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    FLAGS_allocator_strategy=naive_best_fit \
    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
    python3 ${BIN_DIR}/../synthesize_e2e.py \
        --am=tacotron2_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/speech_stats.npy \
        --voc=pwgan_csmsc \
        --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
        --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
        --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
        --lang=zh \
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --inference_dir=${train_output_path}/inference

fi

# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    FLAGS_allocator_strategy=naive_best_fit \
    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
    python3 ${BIN_DIR}/../synthesize_e2e.py \
        --am=tacotron2_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/speech_stats.npy \
        --voc=mb_melgan_csmsc \
        --voc_config=mb_melgan_baker_finetune_ckpt_0.5/finetune.yaml \
        --voc_ckpt=mb_melgan_baker_finetune_ckpt_0.5/snapshot_iter_2000000.pdz\
        --voc_stat=mb_melgan_baker_finetune_ckpt_0.5/feats_stats.npy \
        --lang=zh \
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --inference_dir=${train_output_path}/inference \
        --phones_dict=dump/phone_id_map.txt
fi

# the pretrained models have not been released yet
# style melgan
# style melgan's Dygraph to Static Graph is not ready now
# (which is why --inference_dir is left commented out below)
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    FLAGS_allocator_strategy=naive_best_fit \
    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
    python3 ${BIN_DIR}/../synthesize_e2e.py \
        --am=tacotron2_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/speech_stats.npy \
        --voc=style_melgan_csmsc \
        --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
        --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
        --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --lang=zh \
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt
        # --inference_dir=${train_output_path}/inference
fi

# hifigan
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    echo "in hifigan syn_e2e"
    FLAGS_allocator_strategy=naive_best_fit \
    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
    python3 ${BIN_DIR}/../synthesize_e2e.py \
        --am=tacotron2_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/speech_stats.npy \
        --voc=hifigan_csmsc \
        --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
        --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
        --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --lang=zh \
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --inference_dir=${train_output_path}/inference \
        --phones_dict=dump/phone_id_map.txt
fi

# wavernn
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    echo "in wavernn syn_e2e"
    FLAGS_allocator_strategy=naive_best_fit \
    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
    python3 ${BIN_DIR}/../synthesize_e2e.py \
        --am=tacotron2_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/speech_stats.npy \
        --voc=wavernn_csmsc \
        --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
        --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
        --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
        --lang=zh \
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --inference_dir=${train_output_path}/inference
fi
BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/csmsc/tts0/run.sh b/examples/csmsc/tts0/run.sh new file mode 100755 index 00000000..8f06e933 --- /dev/null +++ b/examples/csmsc/tts0/run.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +set -e +source path.sh + +gpus=0,1 +stage=0 +stop_stage=100 + +conf_path=conf/default.yaml +train_output_path=exp/default +ckpt_name=snapshot_iter_153.pdz + +# with the following command, you can choose the stage range you want to run +# such as `./run.sh --stage 0 --stop-stage 0` +# this can not be mixed use with `$1`, `$2` ... +source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + ./local/preprocess.sh ${conf_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # synthesize, vocoder is pwgan + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # synthesize_e2e, vocoder is pwgan + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + # inference with static model + CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} || exit -1 +fi diff --git a/examples/csmsc/tts2/local/synthesize_e2e.sh b/examples/csmsc/tts2/local/synthesize_e2e.sh index 0a4cf69b..35fcf251 100755 --- a/examples/csmsc/tts2/local/synthesize_e2e.sh +++ b/examples/csmsc/tts2/local/synthesize_e2e.sh @@ -92,3 +92,26 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --phones_dict=dump/phone_id_map.txt \ --tones_dict=dump/tone_id_map.txt fi + + +# wavernn +if [ ${stage} 
-le 4 ] && [ ${stop_stage} -ge 4 ]; then + echo "in wavernn syn_e2e" + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=wavernn_csmsc \ + --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \ + --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \ + --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --inference_dir=${train_output_path}/inference +fi diff --git a/examples/csmsc/tts3/README.md b/examples/csmsc/tts3/README.md index 13d291b5..7b803526 100644 --- a/examples/csmsc/tts3/README.md +++ b/examples/csmsc/tts3/README.md @@ -1,3 +1,4 @@ +([简体中文](./README_cn.md)|English) # FastSpeech2 with CSMSC This example contains code used to train a [Fastspeech2](https://arxiv.org/abs/2006.04558) model with [Chinese Standard Mandarin Speech Copus](https://www.data-baker.com/open_source.html). @@ -242,6 +243,8 @@ fastspeech2_nosil_baker_ckpt_0.4 └── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2 ``` You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained fastspeech2 and parallel wavegan models. + +If you want to use fastspeech2_conformer, you must delete the line `--inference_dir=exp/default/inference \` to skip the dygraph-to-static-graph conversion step, because that conversion has not been tested for fastspeech2_conformer yet.
```bash source path.sh diff --git a/examples/csmsc/tts3/README_cn.md b/examples/csmsc/tts3/README_cn.md new file mode 100644 index 00000000..25931ecb --- /dev/null +++ b/examples/csmsc/tts3/README_cn.md @@ -0,0 +1,273 @@ +(简体中文|[English](./README.md)) +# 用 CSMSC 数据集训练 FastSpeech2 模型 + +本用例包含用于训练 [Fastspeech2](https://arxiv.org/abs/2006.04558) 模型的代码,使用 [Chinese Standard Mandarin Speech Copus](https://www.data-baker.com/open_source.html) 数据集。 + +## 数据集 +### 下载并解压 +从 [官方网站](https://test.data-baker.com/data/index/source) 下载数据集 + +### 获取MFA结果并解压 +我们使用 [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) 去获得 fastspeech2 的音素持续时间。 +你们可以从这里下载 [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), 或参考 [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) 训练你自己的模型。 + +## 开始 +假设数据集的路径是 `~/datasets/BZNSYP`. +假设CSMSC的MFA结果路径为 `./baker_alignment_tone`. +运行下面的命令会进行如下操作: + +1. **设置原路径**。 +2. 对数据集进行预处理。 +3. 训练模型 +4. 合成波形 + - 从 `metadata.jsonl` 合成波形。 + - 从文本文件合成波形。 +5. 
使用静态模型进行推理。 +```bash +./run.sh +``` +您可以选择要运行的一系列阶段,或者将 `stage` 设置为 `stop-stage` 以仅使用一个阶段,例如,运行以下命令只会预处理数据集。 +```bash +./run.sh --stage 0 --stop-stage 0 +``` +### 数据预处理 +```bash +./local/preprocess.sh ${conf_path} +``` +当它完成时。将在当前目录中创建 `dump` 文件夹。转储文件夹的结构如下所示。 + +```text +dump +├── dev +│ ├── norm +│ └── raw +├── phone_id_map.txt +├── speaker_id_map.txt +├── test +│ ├── norm +│ └── raw +└── train + ├── energy_stats.npy + ├── norm + ├── pitch_stats.npy + ├── raw + └── speech_stats.npy +``` + +数据集分为三个部分,即 `train` 、 `dev` 和 `test` ,每个部分都包含一个 `norm` 和 `raw` 子文件夹。原始文件夹包含每个话语的语音、音调和能量特征,而 `norm` 文件夹包含规范化的特征。用于规范化特征的统计数据是从 `dump/train/*_stats.npy` 中的训练集计算出来的。 + +此外,还有一个 `metadata.jsonl` 在每个子文件夹中。它是一个类似表格的文件,包含音素、文本长度、语音长度、持续时间、语音特征路径、音调特征路径、能量特征路径、说话人和每个话语的 id。 + +### 模型训练 +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} +``` +`./local/train.sh` 调用 `${BIN_DIR}/train.py` 。 +以下是完整的帮助信息。 + +```text +usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] + [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] + [--ngpu NGPU] [--phones-dict PHONES_DICT] + [--speaker-dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] + +Train a FastSpeech2 model. + +optional arguments: + -h, --help show this help message and exit + --config CONFIG fastspeech2 config file. + --train-metadata TRAIN_METADATA + training data. + --dev-metadata DEV_METADATA + dev data. + --output-dir OUTPUT_DIR + output dir. + --ngpu NGPU if ngpu=0, use cpu. + --phones-dict PHONES_DICT + phone vocabulary file. + --speaker-dict SPEAKER_DICT + speaker id map file for multiple speaker model. + --voice-cloning VOICE_CLONING + whether training voice cloning model. +``` +1. `--config` 是一个 yaml 格式的配置文件,用于覆盖默认配置,位于 `conf/default.yaml`. +2. `--train-metadata` 和 `--dev-metadata` 应为 `dump` 文件夹中 `train` 和 `dev` 下的规范化元数据文件 +3. `--output-dir` 是保存结果的目录。 检查点保存在此目录中的 `checkpoints/` 目录下。 +4. `--ngpu` 要使用的 GPU 数,如果 ngpu==0,则使用 cpu 。 +5. 
`--phones-dict` 是音素词汇表文件的路径。 + +### 合成 +我们使用 [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1) 作为神经声码器(vocoder)。 +从 [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip) 下载预训练的 parallel wavegan 模型并将其解压。 + +```bash +unzip pwg_baker_ckpt_0.4.zip +``` +Parallel WaveGAN 检查点包含如下文件。 +```text +pwg_baker_ckpt_0.4 +├── pwg_default.yaml # 用于训练 parallel wavegan 的默认配置 +├── pwg_snapshot_iter_400000.pdz # parallel wavegan 的模型参数 +└── pwg_stats.npy # 训练平行波形时用于规范化谱图的统计数据 +``` +`./local/synthesize.sh` 调用 `${BIN_DIR}/../synthesize.py` 即可从 `metadata.jsonl`中合成波形。 + +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} +``` +```text +usage: synthesize.py [-h] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] + [--am_stat AM_STAT] [--phones_dict PHONES_DICT] + [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] + [--voice-cloning VOICE_CLONING] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] + [--voc_stat VOC_STAT] [--ngpu NGPU] + [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] + +Synthesize with acoustic model & vocoder + +optional arguments: + -h, --help show this help message and exit + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + Choose acoustic model type of tts task. + --am_config AM_CONFIG + Config of acoustic model. Use deault config when it is + None. + --am_ckpt AM_CKPT Checkpoint file of acoustic model. + --am_stat AM_STAT mean and standard deviation used to normalize + spectrogram when training acoustic model. + --phones_dict PHONES_DICT + phone vocabulary file. + --tones_dict TONES_DICT + tone vocabulary file. 
+ --speaker_dict SPEAKER_DICT + speaker id map file. + --voice-cloning VOICE_CLONING + whether training voice cloning model. + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + Choose vocoder type of tts task. + --voc_config VOC_CONFIG + Config of voc. Use deault config when it is None. + --voc_ckpt VOC_CKPT Checkpoint file of voc. + --voc_stat VOC_STAT mean and standard deviation used to normalize + spectrogram when training voc. + --ngpu NGPU if ngpu == 0, use cpu. + --test_metadata TEST_METADATA + test metadata. + --output_dir OUTPUT_DIR + output dir. +``` +`./local/synthesize_e2e.sh` 调用 `${BIN_DIR}/../synthesize_e2e.py`,即可从文本文件中合成波形。 + +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} +``` +```text +usage: synthesize_e2e.py [-h] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] + [--am_stat AM_STAT] [--phones_dict PHONES_DICT] + [--tones_dict TONES_DICT] + [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] + [--voc_stat VOC_STAT] [--lang LANG] + [--inference_dir INFERENCE_DIR] [--ngpu NGPU] + [--text TEXT] [--output_dir OUTPUT_DIR] + +Synthesize with acoustic model & vocoder + +optional arguments: + -h, --help show this help message and exit + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + Choose acoustic model type of tts task. + --am_config AM_CONFIG + Config of acoustic model. Use deault config when it is + None. + --am_ckpt AM_CKPT Checkpoint file of acoustic model. + --am_stat AM_STAT mean and standard deviation used to normalize + spectrogram when training acoustic model. + --phones_dict PHONES_DICT + phone vocabulary file. + --tones_dict TONES_DICT + tone vocabulary file. 
+ --speaker_dict SPEAKER_DICT + speaker id map file. + --spk_id SPK_ID spk id for multi speaker acoustic model + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + Choose vocoder type of tts task. + --voc_config VOC_CONFIG + Config of voc. Use deault config when it is None. + --voc_ckpt VOC_CKPT Checkpoint file of voc. + --voc_stat VOC_STAT mean and standard deviation used to normalize + spectrogram when training voc. + --lang LANG Choose model language. zh or en + --inference_dir INFERENCE_DIR + dir to save inference models + --ngpu NGPU if ngpu == 0, use cpu. + --text TEXT text to synthesize, a 'utt_id sentence' pair per line. + --output_dir OUTPUT_DIR + output dir. +``` +1. `--am` 是声学模型的类型,格式为 {model_name}_{dataset}。 +2. `--am_config`, `--am_ckpt`, `--am_stat` 和 `--phones_dict` 是声学模型的参数,对应于 fastspeech2 预训练模型中的 4 个文件。 +3. `--voc` 是声码器(vocoder)的类型,格式为 {model_name}_{dataset}。 +4. `--voc_config`, `--voc_ckpt`, `--voc_stat` 是声码器的参数,对应于 parallel wavegan 预训练模型中的 3 个文件。 +5. `--lang` 对应模型的语言可以是 `zh` 或 `en` 。 +6. `--test_metadata` 应为 `dump` 文件夹中 `test` 下的规范化元数据文件。 +7. `--text` 是文本文件,其中包含要合成的句子。 +8. `--output_dir` 是保存合成音频文件的目录。 +9.
`--ngpu` 要使用的GPU数,如果 ngpu==0,则使用 cpu 。 + +### 推理 +在合成之后,我们将在 `${train_output_path}/inference` 中得到 fastspeech2 和 pwgan 的静态模型 +`./local/inference.sh` 调用 `${BIN_DIR}/inference.py` 为 fastspeech2 + pwgan 综合提供了一个 paddle 静态模型推理示例。 + +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} +``` + +## 预训练模型 +预先训练的 FastSpeech2 模型,在音频边缘没有空白音频: +- [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip) +- [fastspeech2_conformer_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_conformer_baker_ckpt_0.5.zip) + +静态模型可以在这里下载 [fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip). + +Model | Step | eval/loss | eval/l1_loss | eval/duration_loss | eval/pitch_loss| eval/energy_loss +:-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------: +default| 2(gpu) x 76000|1.0991|0.59132|0.035815|0.31915|0.15287| +conformer| 2(gpu) x 76000|1.0675|0.56103|0.035869|0.31553|0.15509| + +FastSpeech2检查点包含下列文件。 +```text +fastspeech2_nosil_baker_ckpt_0.4 +├── default.yaml # 用于训练 fastspeech2 的默认配置 +├── phone_id_map.txt # 训练 fastspeech2 时的音素词汇文件 +├── snapshot_iter_76000.pdz # 模型参数和优化器状态 +└── speech_stats.npy # 训练 fastspeech2 时用于规范化频谱图的统计数据 +``` +您可以使用以下脚本通过使用预训练的 fastspeech2 和 parallel wavegan 模型为 `${BIN_DIR}/../sentences.txt` 合成句子 +```bash +source path.sh + +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=fastspeech2_csmsc \ + --am_config=fastspeech2_nosil_baker_ckpt_0.4/default.yaml \ + --am_ckpt=fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \ + --am_stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \ + --voc=pwgan_csmsc \ + --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \ + 
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ + --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../sentences.txt \ + --output_dir=exp/default/test_e2e \ + --inference_dir=exp/default/inference \ + --phones_dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt +``` diff --git a/examples/csmsc/tts3/conf/conformer.yaml b/examples/csmsc/tts3/conf/conformer.yaml index 252f634d..fcad8615 100644 --- a/examples/csmsc/tts3/conf/conformer.yaml +++ b/examples/csmsc/tts3/conf/conformer.yaml @@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis. n_mels: 80 # The number of mel basis. # Only used for the model using pitch features (e.g. FastSpeech2) -f0min: 80 # Maximum f0 for pitch extraction. -f0max: 400 # Minimum f0 for pitch extraction. +f0min: 80 # Minimum f0 for pitch extraction. +f0max: 400 # Maximum f0 for pitch extraction. ########################################################### @@ -53,8 +53,8 @@ model: conformer_pos_enc_layer_type: rel_pos # conformer positional encoding type conformer_self_attn_layer_type: rel_selfattn # conformer self-attention type conformer_activation_type: swish # conformer activation type - use_macaron_style_in_conformer: true # whether to use macaron style in conformer - use_cnn_in_conformer: true # whether to use CNN in conformer + use_macaron_style_in_conformer: True # whether to use macaron style in conformer + use_cnn_in_conformer: True # whether to use CNN in conformer conformer_enc_kernel_size: 7 # kernel size in CNN module of conformer-based encoder conformer_dec_kernel_size: 31 # kernel size in CNN module of conformer-based decoder init_type: xavier_uniform # initialization type @@ -70,14 +70,14 @@ model: pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch - stop_gradient_from_pitch_predictor: true # whether to stop the 
gradient from pitch predictor to encoder + stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder energy_predictor_layers: 2 # number of conv layers in energy predictor energy_predictor_chans: 256 # number of channels of conv layers in energy predictor energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor energy_predictor_dropout: 0.5 # dropout rate in energy predictor energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy - stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder + stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder diff --git a/examples/csmsc/tts3/conf/default.yaml b/examples/csmsc/tts3/conf/default.yaml index 1f723d67..2c2a1ea1 100644 --- a/examples/csmsc/tts3/conf/default.yaml +++ b/examples/csmsc/tts3/conf/default.yaml @@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis. n_mels: 80 # The number of mel basis. # Only used for the model using pitch features (e.g. FastSpeech2) -f0min: 80 # Maximum f0 for pitch extraction. -f0max: 400 # Minimum f0 for pitch extraction. +f0min: 80 # Minimum f0 for pitch extraction. +f0max: 400 # Maximum f0 for pitch extraction. 
########################################################### @@ -64,14 +64,14 @@ model: pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch - stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder + stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder energy_predictor_layers: 2 # number of conv layers in energy predictor energy_predictor_chans: 256 # number of channels of conv layers in energy predictor energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor energy_predictor_dropout: 0.5 # dropout rate in energy predictor energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy - stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder + stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder @@ -82,7 +82,6 @@ updater: use_masking: True # whether to apply masking for padded part in loss calculation - ########################################################### # OPTIMIZER SETTING # ########################################################### diff --git a/examples/csmsc/tts3/local/inference.sh b/examples/csmsc/tts3/local/inference.sh index 7c58980c..9322cfd6 100755 --- a/examples/csmsc/tts3/local/inference.sh +++ b/examples/csmsc/tts3/local/inference.sh @@ -48,4 +48,15 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --text=${BIN_DIR}/../sentences.txt \ --output_dir=${train_output_path}/pd_infer_out \ --phones_dict=dump/phone_id_map.txt +fi + +# wavernn +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + python3 ${BIN_DIR}/../inference.py \ + 
--inference_dir=${train_output_path}/inference \ + --am=fastspeech2_csmsc \ + --voc=wavernn_csmsc \ + --text=${BIN_DIR}/../sentences.txt \ + --output_dir=${train_output_path}/pd_infer_out \ + --phones_dict=dump/phone_id_map.txt fi \ No newline at end of file diff --git a/examples/csmsc/tts3/local/synthesize_e2e.sh b/examples/csmsc/tts3/local/synthesize_e2e.sh index d4744486..44356e4b 100755 --- a/examples/csmsc/tts3/local/synthesize_e2e.sh +++ b/examples/csmsc/tts3/local/synthesize_e2e.sh @@ -89,3 +89,25 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --inference_dir=${train_output_path}/inference \ --phones_dict=dump/phone_id_map.txt fi + + +# wavernn +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + echo "in wavernn syn_e2e" + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=fastspeech2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=wavernn_csmsc \ + --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \ + --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \ + --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --inference_dir=${train_output_path}/inference +fi diff --git a/examples/csmsc/tts3/run.sh b/examples/csmsc/tts3/run.sh index c1ddd3b9..e1a149b6 100755 --- a/examples/csmsc/tts3/run.sh +++ b/examples/csmsc/tts3/run.sh @@ -18,7 +18,7 @@ source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # prepare data - bash ./local/preprocess.sh ${conf_path} || exit -1 + ./local/preprocess.sh ${conf_path} || exit -1 fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then @@ -40,3 +40,4 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # inference with 
static model CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} || exit -1 fi + diff --git a/examples/csmsc/voc1/conf/default.yaml b/examples/csmsc/voc1/conf/default.yaml index 28d218ff..703be21b 100644 --- a/examples/csmsc/voc1/conf/default.yaml +++ b/examples/csmsc/voc1/conf/default.yaml @@ -34,10 +34,10 @@ generator_params: aux_context_window: 2 # Context window size for auxiliary feature. # If set to 2, previous 2 and future 2 frames will be considered. dropout: 0.0 # Dropout rate. 0.0 means no dropout applied. - bias: true # use bias in residual blocks - use_weight_norm: true # Whether to use weight norm. + bias: True # use bias in residual blocks + use_weight_norm: True # Whether to use weight norm. # If set to true, it will be applied to all of the conv layers. - use_causal_conv: false # use causal conv in residual blocks and upsample layers + use_causal_conv: False # use causal conv in residual blocks and upsample layers upsample_scales: [4, 5, 3, 5] # Upsampling scales. Prodcut of these must be the same as hop size. interpolate_mode: "nearest" # upsample net interpolate mode freq_axis_kernel_size: 1 # upsamling net: convolution kernel size in frequencey axis @@ -53,8 +53,8 @@ discriminator_params: kernel_size: 3 # Number of output channels. layers: 10 # Number of conv layers. conv_channels: 64 # Number of chnn layers. - bias: true # Whether to use bias parameter in conv. - use_weight_norm: true # Whether to use weight norm. + bias: True # Whether to use bias parameter in conv. + use_weight_norm: True # Whether to use weight norm. # If set to true, it will be applied to all of the conv layers. nonlinear_activation: "leakyrelu" # Nonlinear function after each conv. 
nonlinear_activation_params: # Nonlinear function parameters diff --git a/examples/csmsc/voc3/conf/default.yaml b/examples/csmsc/voc3/conf/default.yaml index 27e97664..fbff54f1 100644 --- a/examples/csmsc/voc3/conf/default.yaml +++ b/examples/csmsc/voc3/conf/default.yaml @@ -63,13 +63,13 @@ discriminator_params: ########################################################### # STFT LOSS SETTING # ########################################################### -use_stft_loss: true +use_stft_loss: True stft_loss_params: fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss. hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss. win_lengths: [600, 1200, 240] # List of window length for STFT-based loss. window: "hann" # Window function for STFT-based loss -use_subband_stft_loss: true +use_subband_stft_loss: True subband_stft_loss_params: fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss. hop_sizes: [30, 60, 10] # List of hop size for STFT-based loss @@ -79,7 +79,7 @@ subband_stft_loss_params: ########################################################### # ADVERSARIAL LOSS SETTING # ########################################################### -use_feat_match_loss: false # Whether to use feature matching loss. +use_feat_match_loss: False # Whether to use feature matching loss. lambda_adv: 2.5 # Loss balancing coefficient for adversarial loss. ########################################################### diff --git a/examples/csmsc/voc3/conf/finetune.yaml b/examples/csmsc/voc3/conf/finetune.yaml index a3b1d8b1..0a38c282 100644 --- a/examples/csmsc/voc3/conf/finetune.yaml +++ b/examples/csmsc/voc3/conf/finetune.yaml @@ -63,13 +63,13 @@ discriminator_params: ########################################################### # STFT LOSS SETTING # ########################################################### -use_stft_loss: true +use_stft_loss: True stft_loss_params: fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss. 
hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss win_lengths: [600, 1200, 240] # List of window length for STFT-based loss. window: "hann" # Window function for STFT-based loss -use_subband_stft_loss: true +use_subband_stft_loss: True subband_stft_loss_params: fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss. hop_sizes: [30, 60, 10] # List of hop size for STFT-based loss. @@ -79,7 +79,7 @@ subband_stft_loss_params: ########################################################### # ADVERSARIAL LOSS SETTING # ########################################################### -use_feat_match_loss: false # Whether to use feature matching loss. +use_feat_match_loss: False # Whether to use feature matching loss. lambda_adv: 2.5 # Loss balancing coefficient for adversarial loss. ########################################################### diff --git a/examples/csmsc/voc4/conf/default.yaml b/examples/csmsc/voc4/conf/default.yaml index c9abf78d..cd8f8e28 100644 --- a/examples/csmsc/voc4/conf/default.yaml +++ b/examples/csmsc/voc4/conf/default.yaml @@ -65,7 +65,7 @@ discriminator_params: ########################################################### # STFT LOSS SETTING # ########################################################### -use_stft_loss: true +use_stft_loss: True stft_loss_params: fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss. hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss @@ -78,9 +78,9 @@ lambda_aux: 1.0 # Loss balancing coefficient for aux loss. ########################################################### lambda_adv: 1.0 # Loss balancing coefficient for adv loss. generator_adv_loss_params: - average_by_discriminators: false # Whether to average loss by #discriminators. + average_by_discriminators: False # Whether to average loss by #discriminators. discriminator_adv_loss_params: - average_by_discriminators: false # Whether to average loss by #discriminators. 
+ average_by_discriminators: False # Whether to average loss by #discriminators. ########################################################### # DATA LOADER SETTING # diff --git a/examples/csmsc/voc5/conf/default.yaml b/examples/csmsc/voc5/conf/default.yaml index f42fc385..38b94cf5 100644 --- a/examples/csmsc/voc5/conf/default.yaml +++ b/examples/csmsc/voc5/conf/default.yaml @@ -35,12 +35,12 @@ generator_params: - [1, 3, 5] - [1, 3, 5] - [1, 3, 5] - use_additional_convs: true # Whether to use additional conv layer in residual blocks. - bias: true # Whether to use bias parameter in conv. + use_additional_convs: True # Whether to use additional conv layer in residual blocks. + bias: True # Whether to use bias parameter in conv. nonlinear_activation: "leakyrelu" # Nonlinear activation type. nonlinear_activation_params: # Nonlinear activation paramters. negative_slope: 0.1 - use_weight_norm: true # Whether to apply weight normalization. + use_weight_norm: True # Whether to apply weight normalization. ########################################################### @@ -60,12 +60,12 @@ discriminator_params: channels: 128 # Initial number of channels. max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. max_groups: 16 # Maximum number of groups in downsampling conv layers. - bias: true + bias: True downsample_scales: [4, 4, 4, 4, 1] # Downsampling scales. nonlinear_activation: "leakyrelu" # Nonlinear activation. nonlinear_activation_params: negative_slope: 0.1 - follow_official_norm: true # Whether to follow the official norm setting. + follow_official_norm: True # Whether to follow the official norm setting. periods: [2, 3, 5, 7, 11] # List of period for multi-period discriminator. period_discriminator_params: in_channels: 1 # Number of input channels. @@ -74,19 +74,19 @@ discriminator_params: channels: 32 # Initial number of channels. downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales. 
max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. - bias: true # Whether to use bias parameter in conv layer." + bias: True # Whether to use bias parameter in conv layer." nonlinear_activation: "leakyrelu" # Nonlinear activation. nonlinear_activation_params: # Nonlinear activation paramters. negative_slope: 0.1 - use_weight_norm: true # Whether to apply weight normalization. - use_spectral_norm: false # Whether to apply spectral normalization. + use_weight_norm: True # Whether to apply weight normalization. + use_spectral_norm: False # Whether to apply spectral normalization. ########################################################### # STFT LOSS SETTING # ########################################################### -use_stft_loss: false # Whether to use multi-resolution STFT loss. -use_mel_loss: true # Whether to use Mel-spectrogram loss. +use_stft_loss: False # Whether to use multi-resolution STFT loss. +use_mel_loss: True # Whether to use Mel-spectrogram loss. mel_loss_params: fs: 24000 fft_size: 2048 @@ -98,14 +98,14 @@ mel_loss_params: fmax: 12000 log_base: null generator_adv_loss_params: - average_by_discriminators: false # Whether to average loss by #discriminators. + average_by_discriminators: False # Whether to average loss by #discriminators. discriminator_adv_loss_params: - average_by_discriminators: false # Whether to average loss by #discriminators. -use_feat_match_loss: true + average_by_discriminators: False # Whether to average loss by #discriminators. +use_feat_match_loss: True feat_match_loss_params: - average_by_discriminators: false # Whether to average loss by #discriminators. - average_by_layers: false # Whether to average loss by #layers in each discriminator. - include_final_outputs: false # Whether to include final outputs in feat match loss calculation. + average_by_discriminators: False # Whether to average loss by #discriminators. 
+ average_by_layers: False # Whether to average loss by #layers in each discriminator. + include_final_outputs: False # Whether to include final outputs in feat match loss calculation. ########################################################### # ADVERSARIAL LOSS SETTING # diff --git a/examples/csmsc/voc5/conf/finetune.yaml b/examples/csmsc/voc5/conf/finetune.yaml index 73420625..110ae052 100644 --- a/examples/csmsc/voc5/conf/finetune.yaml +++ b/examples/csmsc/voc5/conf/finetune.yaml @@ -35,12 +35,12 @@ generator_params: - [1, 3, 5] - [1, 3, 5] - [1, 3, 5] - use_additional_convs: true # Whether to use additional conv layer in residual blocks. - bias: true # Whether to use bias parameter in conv. + use_additional_convs: True # Whether to use additional conv layer in residual blocks. + bias: True # Whether to use bias parameter in conv. nonlinear_activation: "leakyrelu" # Nonlinear activation type. nonlinear_activation_params: # Nonlinear activation paramters. negative_slope: 0.1 - use_weight_norm: true # Whether to apply weight normalization. + use_weight_norm: True # Whether to apply weight normalization. ########################################################### @@ -60,12 +60,12 @@ discriminator_params: channels: 128 # Initial number of channels. max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. max_groups: 16 # Maximum number of groups in downsampling conv layers. - bias: true + bias: True downsample_scales: [4, 4, 4, 4, 1] # Downsampling scales. nonlinear_activation: "leakyrelu" # Nonlinear activation. nonlinear_activation_params: negative_slope: 0.1 - follow_official_norm: true # Whether to follow the official norm setting. + follow_official_norm: True # Whether to follow the official norm setting. periods: [2, 3, 5, 7, 11] # List of period for multi-period discriminator. period_discriminator_params: in_channels: 1 # Number of input channels. 
@@ -74,19 +74,19 @@ discriminator_params: channels: 32 # Initial number of channels. downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales. max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. - bias: true # Whether to use bias parameter in conv layer." + bias: True # Whether to use bias parameter in conv layer." nonlinear_activation: "leakyrelu" # Nonlinear activation. nonlinear_activation_params: # Nonlinear activation paramters. negative_slope: 0.1 - use_weight_norm: true # Whether to apply weight normalization. - use_spectral_norm: false # Whether to apply spectral normalization. + use_weight_norm: True # Whether to apply weight normalization. + use_spectral_norm: False # Whether to apply spectral normalization. ########################################################### # STFT LOSS SETTING # ########################################################### -use_stft_loss: false # Whether to use multi-resolution STFT loss. -use_mel_loss: true # Whether to use Mel-spectrogram loss. +use_stft_loss: False # Whether to use multi-resolution STFT loss. +use_mel_loss: True # Whether to use Mel-spectrogram loss. mel_loss_params: fs: 24000 fft_size: 2048 @@ -98,14 +98,14 @@ mel_loss_params: fmax: 12000 log_base: null generator_adv_loss_params: - average_by_discriminators: false # Whether to average loss by #discriminators. + average_by_discriminators: False # Whether to average loss by #discriminators. discriminator_adv_loss_params: - average_by_discriminators: false # Whether to average loss by #discriminators. -use_feat_match_loss: true + average_by_discriminators: False # Whether to average loss by #discriminators. +use_feat_match_loss: True feat_match_loss_params: - average_by_discriminators: false # Whether to average loss by #discriminators. - average_by_layers: false # Whether to average loss by #layers in each discriminator. - include_final_outputs: false # Whether to include final outputs in feat match loss calculation. 
+ average_by_discriminators: False # Whether to average loss by #discriminators. + average_by_layers: False # Whether to average loss by #layers in each discriminator. + include_final_outputs: False # Whether to include final outputs in feat match loss calculation. ########################################################### # ADVERSARIAL LOSS SETTING # diff --git a/examples/csmsc/voc6/README.md b/examples/csmsc/voc6/README.md new file mode 100644 index 00000000..7763b355 --- /dev/null +++ b/examples/csmsc/voc6/README.md @@ -0,0 +1,127 @@ +# WaveRNN with CSMSC +This example contains code used to train a [WaveRNN](https://arxiv.org/abs/1802.08435) model with [Chinese Standard Mandarin Speech Corpus](https://www.data-baker.com/open_source.html). +## Dataset +### Download and Extract +Download CSMSC from the [official website](https://www.data-baker.com/data/index/source) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/BZNSYP`. + +### Get MFA Result and Extract +We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence at the edge of audio. +You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. + +## Get Started +Assume the path to the dataset is `~/datasets/BZNSYP`. +Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`. +Run the command below to +1. **source path**. +2. preprocess the dataset. +3. train the model. +4. synthesize wavs. + - synthesize waveform from `metadata.jsonl`. +```bash +./run.sh +``` +You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage, for example, running the following command will only preprocess the dataset.
+```bash +./run.sh --stage 0 --stop-stage 0 +``` +### Data Preprocessing +```bash +./local/preprocess.sh ${conf_path} +``` +When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below. + +```text +dump +├── dev +│ ├── norm +│ └── raw +├── test +│ ├── norm +│ └── raw +└── train + ├── norm + ├── raw + └── feats_stats.npy +``` +The dataset is split into 3 parts, namely `train`, `dev`, and `test`, each of which contains a `norm` and `raw` subfolder. The `raw` folder contains the log magnitude of the mel spectrogram of each utterance, while the norm folder contains the normalized spectrogram. The statistics used to normalize the spectrogram are computed from the training set, which is located in `dump/train/feats_stats.npy`. + +Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains id and paths to the spectrogram of each utterance. + +### Model Training +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} +``` +`./local/train.sh` calls `${BIN_DIR}/train.py`. +Here's the complete help message. + +```text +usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] + [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] + [--ngpu NGPU] + +Train a WaveRNN model. + +optional arguments: + -h, --help show this help message and exit + --config CONFIG config file to overwrite default config. + --train-metadata TRAIN_METADATA + training data. + --dev-metadata DEV_METADATA + dev data. + --output-dir OUTPUT_DIR + output dir. + --ngpu NGPU if ngpu == 0, use cpu. +``` + +1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. +2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. +3. `--output-dir` is the directory to save the results of the experiment. 
Checkpoints are saved in `checkpoints/` inside this directory. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. + +### Synthesizing +`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`. +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} +``` +```text +usage: synthesize.py [-h] [--config CONFIG] [--checkpoint CHECKPOINT] + [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] + [--ngpu NGPU] + +Synthesize with WaveRNN. + +optional arguments: + -h, --help show this help message and exit + --config CONFIG Vocoder config file. + --checkpoint CHECKPOINT + snapshot to load. + --test-metadata TEST_METADATA + dev data. + --output-dir OUTPUT_DIR + output dir. + --ngpu NGPU if ngpu == 0, use cpu. +``` + +1. `--config` wavernn config file. You should use the same config with which the model is trained. +2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory. +3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory. +4. `--output-dir` is the directory to save the synthesized audio files. +5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. + +## Pretrained Models +The pretrained model can be downloaded here [wavernn_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_ckpt_0.2.0.zip). + +The static model can be downloaded here [wavernn_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_static_0.2.0.zip). + +Model | Step | eval/loss +:-------------:|:------------:| :------------: +default| 1(gpu) x 400000|2.602768 + +WaveRNN checkpoint contains files listed below. 
+ +```text +wavernn_csmsc_ckpt_0.2.0 +├── default.yaml # default config used to train wavernn +├── feats_stats.npy # statistics used to normalize spectrogram when training wavernn +└── snapshot_iter_400000.pdz # parameters of wavernn +``` diff --git a/examples/csmsc/voc6/conf/default.yaml b/examples/csmsc/voc6/conf/default.yaml new file mode 100644 index 00000000..e7696cf4 --- /dev/null +++ b/examples/csmsc/voc6/conf/default.yaml @@ -0,0 +1,67 @@ + +########################################################### +# FEATURE EXTRACTION SETTING # +########################################################### +fs: 24000 # Sampling rate. +n_fft: 2048 # FFT size (samples). +n_shift: 300 # Hop size (samples). 12.5ms +win_length: 1200 # Window length (samples). 50ms + # If set to null, it will be the same as fft_size. +window: "hann" # Window function. +n_mels: 80 # Number of mel basis. +fmin: 80 # Minimum freq in mel basis calculation. (Hz) +fmax: 7600 # Maximum frequency in mel basis calculation. (Hz) +mu_law: True # Recommended to suppress noise if using raw bits + + +########################################################### +# MODEL SETTING # +########################################################### +model: + rnn_dims: 512 # Hidden dims of RNN Layers. + fc_dims: 512 + bits: 9 # Bit depth of signal + aux_context_window: 2 # Context window size for auxiliary feature. + # If set to 2, previous 2 and future 2 frames will be considered. + aux_channels: 80 # Number of channels for auxiliary feature conv. + # Must be the same as num_mels. + upsample_scales: [4, 5, 3, 5] # Upsampling scales. Product of these must be the same as hop size, same with pwgan here + compute_dims: 128 # Dims of Conv1D in MelResNet. + res_out_dims: 128 # Dims of output in MelResNet. + res_blocks: 10 # Number of residual blocks.
+ mode: RAW # either 'raw'(softmax on raw bits) or 'mold' (sample from mixture of logistics) +inference: + gen_batched: True # whether to generate sample in batch mode + target: 12000 # target number of samples to be generated in each batch entry + overlap: 600 # number of samples for crossfading between batches + + +########################################################### +# DATA LOADER SETTING # +########################################################### +batch_size: 64 # Batch size. +batch_max_steps: 4500 # Length of each audio in batch. Make sure divisible by hop_size. +num_workers: 2 # Number of workers in DataLoader. + +########################################################### +# OPTIMIZER SETTING # +########################################################### +grad_clip: 4.0 +learning_rate: 1.0e-4 + + +########################################################### +# INTERVAL SETTING # +########################################################### + +train_max_steps: 400000 # Number of training steps. +save_interval_steps: 5000 # Interval steps to save checkpoint. +eval_interval_steps: 1000 # Interval steps to evaluate the network.
+gen_eval_samples_interval_steps: 5000 # the iteration interval of generating valid samples +generate_num: 5 # number of samples to generate at each checkpoint + +########################################################### +# OTHER SETTING # +########################################################### +num_snapshots: 10 # max number of snapshots to keep while training +seed: 42 # random seed for paddle, random, and np.random diff --git a/examples/csmsc/voc6/local/preprocess.sh b/examples/csmsc/voc6/local/preprocess.sh new file mode 100755 index 00000000..2dcc39ac --- /dev/null +++ b/examples/csmsc/voc6/local/preprocess.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +stage=0 +stop_stage=100 + +config_path=$1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # get durations from MFA's result + echo "Generate durations.txt from MFA results ..." + python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ + --inputdir=./baker_alignment_tone \ + --output=durations.txt \ + --config=${config_path} +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # extract features + echo "Extract features ..." + python3 ${BIN_DIR}/../gan_vocoder/preprocess.py \ + --rootdir=~/datasets/BZNSYP/ \ + --dataset=baker \ + --dumpdir=dump \ + --dur-file=durations.txt \ + --config=${config_path} \ + --cut-sil=True \ + --num-cpu=20 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # get features' stats(mean and std) + echo "Get features' stats ..." + python3 ${MAIN_ROOT}/utils/compute_statistics.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --field-name="feats" +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # normalize, dev and test should use train's stats + echo "Normalize ..." 
+ + python3 ${BIN_DIR}/../gan_vocoder/normalize.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --dumpdir=dump/train/norm \ + --stats=dump/train/feats_stats.npy + python3 ${BIN_DIR}/../gan_vocoder/normalize.py \ + --metadata=dump/dev/raw/metadata.jsonl \ + --dumpdir=dump/dev/norm \ + --stats=dump/train/feats_stats.npy + + python3 ${BIN_DIR}/../gan_vocoder/normalize.py \ + --metadata=dump/test/raw/metadata.jsonl \ + --dumpdir=dump/test/norm \ + --stats=dump/train/feats_stats.npy +fi diff --git a/examples/csmsc/voc6/local/synthesize.sh b/examples/csmsc/voc6/local/synthesize.sh new file mode 100755 index 00000000..7f0cbe48 --- /dev/null +++ b/examples/csmsc/voc6/local/synthesize.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/synthesize.py \ + --config=${config_path} \ + --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ + --test-metadata=dump/test/norm/metadata.jsonl \ + --output-dir=${train_output_path}/test diff --git a/examples/csmsc/voc6/local/train.sh b/examples/csmsc/voc6/local/train.sh new file mode 100755 index 00000000..9695631e --- /dev/null +++ b/examples/csmsc/voc6/local/train.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 + +FLAGS_cudnn_exhaustive_search=true \ +FLAGS_conv_workspace_size_limit=4000 \ +python ${BIN_DIR}/train.py \ + --train-metadata=dump/train/norm/metadata.jsonl \ + --dev-metadata=dump/dev/norm/metadata.jsonl \ + --config=${config_path} \ + --output-dir=${train_output_path} \ + --ngpu=1 diff --git a/examples/csmsc/voc6/path.sh b/examples/csmsc/voc6/path.sh new file mode 100755 index 00000000..b0c98584 --- /dev/null +++ b/examples/csmsc/voc6/path.sh @@ -0,0 +1,13 @@ +#!/bin/bash +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python 
to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +MODEL=wavernn +export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} \ No newline at end of file diff --git a/examples/csmsc/voc6/run.sh b/examples/csmsc/voc6/run.sh new file mode 100755 index 00000000..5f754fff --- /dev/null +++ b/examples/csmsc/voc6/run.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +set -e +source path.sh + +gpus=0,1 +stage=0 +stop_stage=100 + +conf_path=conf/default.yaml +train_output_path=exp/default +test_input=dump/dump_gta_test +ckpt_name=snapshot_iter_100000.pdz + +source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + ./local/preprocess.sh ${conf_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # synthesize + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi diff --git a/examples/esc50/README.md b/examples/esc50/README.md index 2ce57ae0..911a72ad 100644 --- a/examples/esc50/README.md +++ b/examples/esc50/README.md @@ -122,3 +122,6 @@ $ CUDA_VISIBLE_DEVICES=0 ./run.sh 4 cpu ./export /audio/dog.wav - `device`: 指定模型预测时使用的设备。 - `model_dir`: 导出静态图模型和参数文件的保存目录。 - `wav`: 指定预测的音频文件。 + +## Reference +* [PANNs(PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition)](https://arxiv.org/abs/1912.10211) diff --git a/examples/ljspeech/tts0/README.md b/examples/ljspeech/tts0/README.md index baaec818..ba7ad619 100644 --- a/examples/ljspeech/tts0/README.md +++ b/examples/ljspeech/tts0/README.md @@ -1,20 +1,25 @@ -# Tacotron2 with LJSpeech -PaddlePaddle dynamic graph implementation of Tacotron2, a neural network architecture for speech synthesis directly from the text.
The implementation is based on [Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions](https://arxiv.org/abs/1712.05884). +# Tacotron2 with LJSpeech-1.1 +This example contains code used to train a [Tacotron2](https://arxiv.org/abs/1712.05884) model with [LJSpeech-1.1](https://keithito.com/LJ-Speech-Dataset/) ## Dataset -We experiment with the LJSpeech dataset. Download and unzip [LJSpeech](https://keithito.com/LJ-Speech-Dataset/). +### Download and Extract +Download LJSpeech-1.1 from the [official website](https://keithito.com/LJ-Speech-Dataset/). + +### Get MFA Result and Extract +We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get phonemes for Tacotron2, the durations of MFA are not needed here. +You can download from here [ljspeech_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. -```bash -wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2 -tar xjvf LJSpeech-1.1.tar.bz2 -``` ## Get Started Assume the path to the dataset is `~/datasets/LJSpeech-1.1`. +Assume the path to the MFA result of LJSpeech-1.1 is `./ljspeech_alignment`. Run the command below to 1. **source path**. 2. preprocess the dataset. 3. train the model. -4. synthesize mels. +4. synthesize wavs. + - synthesize waveform from `metadata.jsonl`. + - synthesize waveform from a text file. + ```bash ./run.sh ``` @@ -26,64 +31,217 @@ You can choose a range of stages you want to run, or set `stage` equal to `stop- ```bash ./local/preprocess.sh ${conf_path} ``` +When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below. 
+ +```text +dump +├── dev +│ ├── norm +│ └── raw +├── phone_id_map.txt +├── speaker_id_map.txt +├── test +│ ├── norm +│ └── raw +└── train + ├── norm + ├── raw + └── speech_stats.npy +``` +The dataset is split into 3 parts, namely `train`, `dev`, and `test`, each of which contains a `norm` and `raw` subfolder. The raw folder contains speech features of each utterance, while the norm folder contains normalized ones. The statistics used to normalize features are computed from the training set, which is located in `dump/train/*_stats.npy`. + +Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains phones, text_lengths, speech_lengths, durations, the path of speech features, speaker, and the id of each utterance. + ### Model Training -`./local/train.sh` calls `${BIN_DIR}/train.py`. ```bash CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} ``` +`./local/train.sh` calls `${BIN_DIR}/train.py`. Here's the complete help message. ```text -usage: train.py [-h] [--config FILE] [--data DATA_DIR] [--output OUTPUT_DIR] - [--checkpoint_path CHECKPOINT_PATH] [--ngpu NGPU] [--opts ...] +usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] + [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] + [--ngpu NGPU] [--phones-dict PHONES_DICT] + +Train a Tacotron2 model. optional arguments: -h, --help show this help message and exit - --config FILE path of the config file to overwrite to default config - with. - --data DATA_DIR path to the dataset. - --output OUTPUT_DIR path to save checkpoint and logs. - --checkpoint_path CHECKPOINT_PATH - path of the checkpoint to load + --config CONFIG tacotron2 config file. + --train-metadata TRAIN_METADATA + training data. + --dev-metadata DEV_METADATA + dev data. + --output-dir OUTPUT_DIR + output dir. --ngpu NGPU if ngpu == 0, use cpu. - --opts ...
options to overwrite --config file and the default - config, passing in KEY VALUE pairs + --phones-dict PHONES_DICT + phone vocabulary file. ``` - -If you want to train on CPU, just set `--ngpu=0`. -If you want to train on multiple GPUs, just set `--ngpu` as the num of GPU. -By default, training will be resumed from the latest checkpoint in `--output`, if you want to start a new training, please use a new `${OUTPUTPATH}` with no checkpoint. -And if you want to resume from another existing model, you should set `checkpoint_path` to be the checkpoint path you want to load. -**Note: The checkpoint path cannot contain the file extension.** +1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. +2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. +3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. +5. `--phones-dict` is the path of the phone vocabulary file. ### Synthesizing -`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which synthesize **mels** from text_list here. +We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc1) as the neural vocoder. +Download pretrained parallel wavegan model from [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip) and unzip it. +```bash +unzip pwg_ljspeech_ckpt_0.5.zip +``` +Parallel WaveGAN checkpoint contains files listed below. 
+```text +pwg_ljspeech_ckpt_0.5 +├── pwg_default.yaml # default config used to train parallel wavegan +├── pwg_snapshot_iter_400000.pdz # generator parameters of parallel wavegan +└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan +``` +`./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`. ```bash -CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${train_output_path} ${ckpt_name} +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} ``` ```text -usage: synthesize.py [-h] [--config FILE] [--checkpoint_path CHECKPOINT_PATH] - [--input INPUT] [--output OUTPUT] [--ngpu NGPU] - [--opts ...] [-v] +usage: synthesize.py [-h] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}] + [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] + [--am_stat AM_STAT] [--phones_dict PHONES_DICT] + [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] + [--voice-cloning VOICE_CLONING] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] + [--voc_stat VOC_STAT] [--ngpu NGPU] + [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] -generate mel spectrogram with TransformerTTS. +Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --config FILE extra config to overwrite the default config - --checkpoint_path CHECKPOINT_PATH - path of the checkpoint to load. - --input INPUT path of the text sentences - --output OUTPUT path to save outputs + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc} + Choose acoustic model type of tts task. + --am_config AM_CONFIG + Config of acoustic model. Use deault config when it is + None. + --am_ckpt AM_CKPT Checkpoint file of acoustic model. 
+ --am_stat AM_STAT mean and standard deviation used to normalize + spectrogram when training acoustic model. + --phones_dict PHONES_DICT + phone vocabulary file. + --tones_dict TONES_DICT + tone vocabulary file. + --speaker_dict SPEAKER_DICT + speaker id map file. + --voice-cloning VOICE_CLONING + whether training voice cloning model. + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + Choose vocoder type of tts task. + --voc_config VOC_CONFIG + Config of voc. Use deault config when it is None. + --voc_ckpt VOC_CKPT Checkpoint file of voc. + --voc_stat VOC_STAT mean and standard deviation used to normalize + spectrogram when training voc. --ngpu NGPU if ngpu == 0, use cpu. - --opts ... options to overwrite --config file and the default - config, passing in KEY VALUE pairs - -v, --verbose print msg + --test_metadata TEST_METADATA + test metadata. + --output_dir OUTPUT_DIR + output dir. ``` -**Ps.** You can use [waveflow](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0) as the neural vocoder to synthesize mels to wavs. (Please refer to `synthesize.sh` in our LJSpeech waveflow example) +`./local/synthesize_e2e.sh` calls `${BIN_DIR}/../synthesize_e2e.py`, which can synthesize waveform from text file. 
+```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} +``` +```text +usage: synthesize_e2e.py [-h] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}] + [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] + [--am_stat AM_STAT] [--phones_dict PHONES_DICT] + [--tones_dict TONES_DICT] + [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc}] + [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] + [--voc_stat VOC_STAT] [--lang LANG] + [--inference_dir INFERENCE_DIR] [--ngpu NGPU] + [--text TEXT] [--output_dir OUTPUT_DIR] + +Synthesize with acoustic model & vocoder + +optional arguments: + -h, --help show this help message and exit + --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc} + Choose acoustic model type of tts task. + --am_config AM_CONFIG + Config of acoustic model. Use deault config when it is + None. + --am_ckpt AM_CKPT Checkpoint file of acoustic model. + --am_stat AM_STAT mean and standard deviation used to normalize + spectrogram when training acoustic model. + --phones_dict PHONES_DICT + phone vocabulary file. + --tones_dict TONES_DICT + tone vocabulary file. + --speaker_dict SPEAKER_DICT + speaker id map file. + --spk_id SPK_ID spk id for multi speaker acoustic model + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc} + Choose vocoder type of tts task. + --voc_config VOC_CONFIG + Config of voc. Use deault config when it is None. + --voc_ckpt VOC_CKPT Checkpoint file of voc. + --voc_stat VOC_STAT mean and standard deviation used to normalize + spectrogram when training voc. + --lang LANG Choose model language. 
zh or en + --inference_dir INFERENCE_DIR + dir to save inference models + --ngpu NGPU if ngpu == 0, use cpu. + --text TEXT text to synthesize, a 'utt_id sentence' pair per line. + --output_dir OUTPUT_DIR + output dir. +``` +1. `--am` is acoustic model type with the format {model_name}_{dataset} +2. `--am_config`, `--am_checkpoint`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the Tacotron2 pretrained model. +3. `--voc` is vocoder type with the format {model_name}_{dataset} +4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. +5. `--lang` is the model language, which can be `zh` or `en`. +6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. +7. `--text` is the text file, which contains sentences to synthesize. +8. `--output_dir` is the directory to save synthesized audio files. +9. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. + + +## Pretrained Model +Pretrained Tacotron2 model with no silence in the edge of audios: +- [tacotron2_ljspeech_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.2.0.zip) + -## Pretrained Models -Pretrained Models can be downloaded from the links below. We provide 2 models with different configurations. +Model | Step | eval/loss | eval/l1_loss | eval/mse_loss | eval/bce_loss| eval/attn_loss +:-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------: +default| 1(gpu) x 60300|0.554092|0.394260|0.141046|0.018747|3.8e-05| -1. This model uses a binary classifier to predict the stop token. [tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.3.zip) +Tacotron2 checkpoint contains files listed below. 
+```text +tacotron2_ljspeech_ckpt_0.2.0 +├── default.yaml # default config used to train Tacotron2 +├── phone_id_map.txt # phone vocabulary file when training Tacotron2 +├── snapshot_iter_60300.pdz # model parameters and optimizer states +└── speech_stats.npy # statistics used to normalize spectrogram when training Tacotron2 +``` +You can use the following scripts to synthesize for `${BIN_DIR}/../sentences_en.txt` using pretrained Tacotron2 and parallel wavegan models. +```bash +source path.sh -2. This model does not have a stop token predictor. It uses the attention peak position to decide whether all the contents have been uttered. Also, guided attention loss is used to speed up training. This model is trained with `configs/alternative.yaml`.[tacotron2_ljspeech_ckpt_0.3_alternative.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.3_alternative.zip) +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=tacotron2_ljspeech \ + --am_config=tacotron2_ljspeech_ckpt_0.2.0/default.yaml \ + --am_ckpt=tacotron2_ljspeech_ckpt_0.2.0/snapshot_iter_60300.pdz \ + --am_stat=tacotron2_ljspeech_ckpt_0.2.0/speech_stats.npy \ + --voc=pwgan_ljspeech\ + --voc_config=pwg_ljspeech_ckpt_0.5/pwg_default.yaml \ + --voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \ + --voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \ + --lang=en \ + --text=${BIN_DIR}/../sentences_en.txt \ + --output_dir=exp/default/test_e2e \ + --phones_dict=tacotron2_ljspeech_ckpt_0.2.0/phone_id_map.txt +``` diff --git a/examples/ljspeech/tts0/conf/default.yaml b/examples/ljspeech/tts0/conf/default.yaml new file mode 100644 index 00000000..d76ebd43 --- /dev/null +++ b/examples/ljspeech/tts0/conf/default.yaml @@ -0,0 +1,87 @@ +# This configuration is for Paddle to train Tacotron 2. 
Compared to the +# original paper, this configuration additionally use the guided attention +# loss to accelerate the learning of the diagonal attention. It requires +# only a single GPU with 12 GB memory and it takes ~1 days to finish the +# training on Titan V. + +########################################################### +# FEATURE EXTRACTION SETTING # +########################################################### +fs: 22050 # Sampling rate. +n_fft: 1024 # FFT size (samples). +n_shift: 256 # Hop size (samples). 11.6ms +win_length: null # Window length (samples). + # If set to null, it will be the same as fft_size. +window: "hann" # Window function. +n_mels: 80 # Number of mel basis. +fmin: 80 # Minimum freq in mel basis calculation. (Hz) +fmax: 7600 # Maximum frequency in mel basis calculation. (Hz) + +########################################################### +# DATA SETTING # +########################################################### +batch_size: 64 +num_workers: 2 + +########################################################### +# MODEL SETTING # +########################################################### +model: # keyword arguments for the selected model + embed_dim: 512 # char or phn embedding dimension + elayers: 1 # number of blstm layers in encoder + eunits: 512 # number of blstm units + econv_layers: 3 # number of convolutional layers in encoder + econv_chans: 512 # number of channels in convolutional layer + econv_filts: 5 # filter size of convolutional layer + atype: location # attention function type + adim: 512 # attention dimension + aconv_chans: 32 # number of channels in convolutional layer of attention + aconv_filts: 15 # filter size of convolutional layer of attention + cumulate_att_w: True # whether to cumulate attention weight + dlayers: 2 # number of lstm layers in decoder + dunits: 1024 # number of lstm units in decoder + prenet_layers: 2 # number of layers in prenet + prenet_units: 256 # number of units in prenet + postnet_layers: 5 # 
number of layers in postnet + postnet_chans: 512 # number of channels in postnet + postnet_filts: 5 # filter size of postnet layer + output_activation: null # activation function for the final output + use_batch_norm: True # whether to use batch normalization in encoder + use_concate: True # whether to concatenate encoder embedding with decoder outputs + use_residual: False # whether to use residual connection in encoder + dropout_rate: 0.5 # dropout rate + zoneout_rate: 0.1 # zoneout rate + reduction_factor: 1 # reduction factor + spk_embed_dim: null # speaker embedding dimension + + +########################################################### +# UPDATER SETTING # +########################################################### +updater: + use_masking: True # whether to apply masking for padded part in loss calculation + bce_pos_weight: 5.0 # weight of positive sample in binary cross entropy calculation + use_guided_attn_loss: True # whether to use guided attention loss + guided_attn_loss_sigma: 0.4 # sigma of guided attention loss + guided_attn_loss_lambda: 1.0 # strength of guided attention loss + + +########################################################## +# OPTIMIZER SETTING # +########################################################## +optimizer: + optim: adam # optimizer type + learning_rate: 1.0e-03 # learning rate + epsilon: 1.0e-06 # epsilon + weight_decay: 0.0 # weight decay coefficient + +########################################################### +# TRAINING SETTING # +########################################################### +max_epoch: 300 +num_snapshots: 5 + +########################################################### +# OTHER SETTING # +########################################################### +seed: 42 diff --git a/examples/ljspeech/tts0/local/preprocess.sh b/examples/ljspeech/tts0/local/preprocess.sh index c39a3172..e0e4bc7a 100755 --- a/examples/ljspeech/tts0/local/preprocess.sh +++ b/examples/ljspeech/tts0/local/preprocess.sh @@ -1,8 +1,62 @@ 
#!/bin/bash -preprocess_path=$1 +stage=0 +stop_stage=100 -python3 ${BIN_DIR}/preprocess.py \ - --input=~/datasets/LJSpeech-1.1 \ - --output=${preprocess_path} \ - -v \ \ No newline at end of file +config_path=$1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # get durations from MFA's result + echo "Generate durations.txt from MFA results ..." + python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ + --inputdir=./ljspeech_alignment \ + --output=durations.txt \ + --config=${config_path} +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # extract features + echo "Extract features ..." + python3 ${BIN_DIR}/preprocess.py \ + --dataset=ljspeech \ + --rootdir=~/datasets/LJSpeech-1.1/ \ + --dumpdir=dump \ + --dur-file=durations.txt \ + --config=${config_path} \ + --num-cpu=20 \ + --cut-sil=True +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # get features' stats(mean and std) + echo "Get features' stats ..." + python3 ${MAIN_ROOT}/utils/compute_statistics.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --field-name="speech" + +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # normalize and convert phone to id, dev and test should use train's stats + echo "Normalize ..."
+ python3 ${BIN_DIR}/normalize.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --dumpdir=dump/train/norm \ + --speech-stats=dump/train/speech_stats.npy \ + --phones-dict=dump/phone_id_map.txt \ + --speaker-dict=dump/speaker_id_map.txt + + python3 ${BIN_DIR}/normalize.py \ + --metadata=dump/dev/raw/metadata.jsonl \ + --dumpdir=dump/dev/norm \ + --speech-stats=dump/train/speech_stats.npy \ + --phones-dict=dump/phone_id_map.txt \ + --speaker-dict=dump/speaker_id_map.txt + + python3 ${BIN_DIR}/normalize.py \ + --metadata=dump/test/raw/metadata.jsonl \ + --dumpdir=dump/test/norm \ + --speech-stats=dump/train/speech_stats.npy \ + --phones-dict=dump/phone_id_map.txt \ + --speaker-dict=dump/speaker_id_map.txt +fi diff --git a/examples/ljspeech/tts0/local/synthesize.sh b/examples/ljspeech/tts0/local/synthesize.sh index 3f5f9c06..0d005820 100755 --- a/examples/ljspeech/tts0/local/synthesize.sh +++ b/examples/ljspeech/tts0/local/synthesize.sh @@ -1,11 +1,20 @@ #!/bin/bash -train_output_path=$1 -ckpt_name=$2 +config_path=$1 +train_output_path=$2 +ckpt_name=$3 -python3 ${BIN_DIR}/synthesize.py \ - --config=${train_output_path}/config.yaml \ - --checkpoint_path=${train_output_path}/checkpoints/${ckpt_name} \ - --input=${BIN_DIR}/../sentences_en.txt \ - --output=${train_output_path}/test \ - --ngpu=1 +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/../synthesize.py \ + --am=tacotron2_ljspeech \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=pwgan_ljspeech \ + --voc_config=pwg_ljspeech_ckpt_0.5/pwg_default.yaml \ + --voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \ + --voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt diff --git a/examples/ljspeech/tts0/local/synthesize_e2e.sh 
b/examples/ljspeech/tts0/local/synthesize_e2e.sh new file mode 100755 index 00000000..73dfff60 --- /dev/null +++ b/examples/ljspeech/tts0/local/synthesize_e2e.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 +# TODO: dygraph to static graph is not good for tacotron2_ljspeech now +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=tacotron2_ljspeech \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=pwgan_ljspeech \ + --voc_config=pwg_ljspeech_ckpt_0.5/pwg_default.yaml \ + --voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \ + --voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \ + --lang=en \ + --text=${BIN_DIR}/../sentences_en.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + # --inference_dir=${train_output_path}/inference \ No newline at end of file diff --git a/examples/ljspeech/tts0/local/train.sh b/examples/ljspeech/tts0/local/train.sh index a94f955a..f90db915 100755 --- a/examples/ljspeech/tts0/local/train.sh +++ b/examples/ljspeech/tts0/local/train.sh @@ -1,9 +1,12 @@ #!/bin/bash -preprocess_path=$1 +config_path=$1 train_output_path=$2 python3 ${BIN_DIR}/train.py \ - --data=${preprocess_path} \ - --output=${train_output_path} \ - --ngpu=1 \ \ No newline at end of file + --train-metadata=dump/train/norm/metadata.jsonl \ + --dev-metadata=dump/dev/norm/metadata.jsonl \ + --config=${config_path} \ + --output-dir=${train_output_path} \ + --ngpu=1 \ + --phones-dict=dump/phone_id_map.txt \ No newline at end of file diff --git a/examples/ljspeech/tts0/run.sh b/examples/ljspeech/tts0/run.sh index 47c76c3d..c64fa888 100755 --- a/examples/ljspeech/tts0/run.sh +++ b/examples/ljspeech/tts0/run.sh @@ -3,13 +3,13 @@ set -e source path.sh -gpus=0 +gpus=0,1 stage=0 stop_stage=100 
-preprocess_path=preprocessed_ljspeech -train_output_path=output -ckpt_name=step-35000 +conf_path=conf/default.yaml +train_output_path=exp/default +ckpt_name=snapshot_iter_201.pdz # with the following command, you can choose the stage range you want to run # such as `./run.sh --stage 0 --stop-stage 0` @@ -18,16 +18,20 @@ source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # prepare data - ./local/preprocess.sh ${preprocess_path} || exit -1 + ./local/preprocess.sh ${conf_path} || exit -1 fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # train model, all `ckpt` under `train_output_path/checkpoints/` dir - CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path} || exit -1 + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1 fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # train model, all `ckpt` under `train_output_path/checkpoints/` dir - CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${train_output_path} ${ckpt_name} || exit -1 + # synthesize, vocoder is pwgan + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 fi +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # synthesize_e2e, vocoder is pwgan + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi diff --git a/examples/ljspeech/tts1/conf/default.yaml b/examples/ljspeech/tts1/conf/default.yaml index 6b495eff..456b6a1e 100644 --- a/examples/ljspeech/tts1/conf/default.yaml +++ b/examples/ljspeech/tts1/conf/default.yaml @@ -63,9 +63,9 @@ model: # keyword arguments for the selected model # UPDATER SETTING # ########################################################### updater: - use_masking: true # whether to apply masking for padded part in loss calculation + use_masking: True # whether to apply masking for padded part in loss calculation 
loss_type: L1 - use_guided_attn_loss: true # whether to use guided attention loss + use_guided_attn_loss: True # whether to use guided attention loss guided_attn_loss_sigma: 0.4 # sigma in guided attention loss guided_attn_loss_lambda: 10.0 # lambda in guided attention loss modules_applied_guided_attn: ["encoder-decoder"] # modules to apply guided attention loss diff --git a/examples/ljspeech/tts3/README.md b/examples/ljspeech/tts3/README.md index f3602c34..f5e919c0 100644 --- a/examples/ljspeech/tts3/README.md +++ b/examples/ljspeech/tts3/README.md @@ -1,4 +1,4 @@ -# FastSpeech2 with the LJSpeech-1.1 +# FastSpeech2 with LJSpeech-1.1 This example contains code used to train a [Fastspeech2](https://arxiv.org/abs/2006.04558) model with [LJSpeech-1.1](https://keithito.com/LJ-Speech-Dataset/). ## Dataset diff --git a/examples/ljspeech/tts3/conf/default.yaml b/examples/ljspeech/tts3/conf/default.yaml index 872dafcb..5305c912 100644 --- a/examples/ljspeech/tts3/conf/default.yaml +++ b/examples/ljspeech/tts3/conf/default.yaml @@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis. n_mels: 80 # The number of mel basis. # Only used for the model using pitch features (e.g. FastSpeech2) -f0min: 80 # Maximum f0 for pitch extraction. -f0max: 400 # Minimum f0 for pitch extraction. +f0min: 80 # Minimum f0 for pitch extraction. +f0max: 400 # Maximum f0 for pitch extraction. 
########################################################### @@ -64,14 +64,14 @@ model: pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch - stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder + stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder energy_predictor_layers: 2 # number of conv layers in energy predictor energy_predictor_chans: 256 # number of channels of conv layers in energy predictor energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor energy_predictor_dropout: 0.5 # dropout rate in energy predictor energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy - stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder + stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder diff --git a/examples/ljspeech/voc0/run.sh b/examples/ljspeech/voc0/run.sh index ddd82cb4..b040c0b2 100755 --- a/examples/ljspeech/voc0/run.sh +++ b/examples/ljspeech/voc0/run.sh @@ -10,7 +10,7 @@ stop_stage=100 preprocess_path=preprocessed_ljspeech train_output_path=output # mel generated by Tacotron2 -input_mel_path=../tts0/output/test +input_mel_path=${preprocess_path}/mel_test ckpt_name=step-10000 # with the following command, you can choose the stage range you want to run @@ -28,5 +28,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + mkdir -p ${preprocess_path}/mel_test + cp ${preprocess_path}/mel/LJ050-001*.npy ${preprocess_path}/mel_test/ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${input_mel_path} 
${train_output_path} ${ckpt_name} || exit -1 fi diff --git a/examples/ljspeech/voc1/conf/default.yaml b/examples/ljspeech/voc1/conf/default.yaml index 2d39beb7..d30960d6 100644 --- a/examples/ljspeech/voc1/conf/default.yaml +++ b/examples/ljspeech/voc1/conf/default.yaml @@ -33,7 +33,7 @@ generator_params: aux_context_window: 2 # Context window size for auxiliary feature. # If set to 2, previous 2 and future 2 frames will be considered. dropout: 0.0 # Dropout rate. 0.0 means no dropout applied. - use_weight_norm: true # Whether to use weight norm. + use_weight_norm: True # Whether to use weight norm. # If set to true, it will be applied to all of the conv layers. upsample_scales: [4, 4, 4, 4] # Upsampling scales. prod(upsample_scales) == n_shift @@ -46,8 +46,8 @@ discriminator_params: kernel_size: 3 # Number of output channels. layers: 10 # Number of conv layers. conv_channels: 64 # Number of chnn layers. - bias: true # Whether to use bias parameter in conv. - use_weight_norm: true # Whether to use weight norm. + bias: True # Whether to use bias parameter in conv. + use_weight_norm: True # Whether to use weight norm. # If set to true, it will be applied to all of the conv layers. nonlinear_activation: "leakyrelu" # Nonlinear function after each conv. 
nonlinear_activation_params: # Nonlinear function parameters diff --git a/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py b/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py index fb8b321c..f6e185ff 100644 --- a/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py +++ b/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py @@ -162,39 +162,17 @@ class DeepSpeech2Model(nn.Layer): return loss @paddle.no_grad() - def decode(self, audio, audio_len, vocab_list, decoding_method, - lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob, - cutoff_top_n, num_processes): - # init once + def decode(self, audio, audio_len): # decoders only accept string encoded in utf-8 - self.decoder.init_decode( - beam_alpha=beam_alpha, - beam_beta=beam_beta, - lang_model_path=lang_model_path, - vocab_list=vocab_list, - decoding_method=decoding_method) + # Make sure the decoder has been initialized eouts, eouts_len = self.encoder(audio, audio_len) probs = self.decoder.softmax(eouts) - print("probs.shape", probs.shape) - return self.decoder.decode_probs( - probs.numpy(), eouts_len, vocab_list, decoding_method, - lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob, - cutoff_top_n, num_processes) - - def decode_probs_split(self, probs_split, vocab_list, decoding_method, - lang_model_path, beam_alpha, beam_beta, beam_size, - cutoff_prob, cutoff_top_n, num_processes): - self.decoder.init_decode( - beam_alpha=beam_alpha, - beam_beta=beam_beta, - lang_model_path=lang_model_path, - vocab_list=vocab_list, - decoding_method=decoding_method) - return self.decoder.decode_probs_split( - probs_split, vocab_list, decoding_method, lang_model_path, - beam_alpha, beam_beta, beam_size, cutoff_prob, cutoff_top_n, - num_processes) + batch_size = probs.shape[0] + self.decoder.reset_decoder(batch_size=batch_size) + self.decoder.next(probs, eouts_len) + trans_best, trans_beam = self.decoder.decode() + return trans_best @classmethod def 
from_pretrained(cls, dataloader, config, checkpoint_path): diff --git a/examples/other/1xt2x/src_deepspeech2x/test_model.py b/examples/other/1xt2x/src_deepspeech2x/test_model.py index 2a38fb5c..11b85442 100644 --- a/examples/other/1xt2x/src_deepspeech2x/test_model.py +++ b/examples/other/1xt2x/src_deepspeech2x/test_model.py @@ -254,12 +254,10 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer - vocab_list = self.test_loader.collate_fn.vocab_list - target_transcripts = self.ordid2token(texts, texts_len) - result_transcripts = self.compute_result_transcripts(audio, audio_len, - vocab_list, cfg) + result_transcripts = self.compute_result_transcripts(audio, audio_len) + for utt, target, result in zip(utts, target_transcripts, result_transcripts): errors, len_ref = errors_func(target, result) @@ -280,19 +278,9 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): error_rate=errors_sum / len_refs, error_rate_type=cfg.error_rate_type) - def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg): - result_transcripts = self.model.decode( - audio, - audio_len, - vocab_list, - decoding_method=cfg.decoding_method, - lang_model_path=cfg.lang_model_path, - beam_alpha=cfg.alpha, - beam_beta=cfg.beta, - beam_size=cfg.beam_size, - cutoff_prob=cfg.cutoff_prob, - cutoff_top_n=cfg.cutoff_top_n, - num_processes=cfg.num_proc_bsearch) + def compute_result_transcripts(self, audio, audio_len): + result_transcripts = self.model.decode(audio, audio_len) + result_transcripts = [ self._text_featurizer.detokenize(item) for item in result_transcripts @@ -307,6 +295,17 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): cfg = self.config error_rate_type = None errors_sum, len_refs, num_ins = 0.0, 0, 0 + + # Initialized the decoder in model + decode_cfg = self.config.decode + vocab_list = 
self.test_loader.collate_fn.vocab_list + decode_batch_size = self.test_loader.batch_size + self.model.decoder.init_decoder( + decode_batch_size, vocab_list, decode_cfg.decoding_method, + decode_cfg.lang_model_path, decode_cfg.alpha, decode_cfg.beta, + decode_cfg.beam_size, decode_cfg.cutoff_prob, + decode_cfg.cutoff_top_n, decode_cfg.num_proc_bsearch) + with open(self.args.result_file, 'w') as fout: for i, batch in enumerate(self.test_loader): utts, audio, audio_len, texts, texts_len = batch @@ -326,6 +325,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): msg += "Final error rate [%s] (%d/%d) = %f" % ( error_rate_type, num_ins, num_ins, errors_sum / len_refs) logger.info(msg) + self.model.decoder.del_decoder() def run_test(self): self.resume_or_scratch() diff --git a/examples/other/g2p/run.sh b/examples/other/g2p/run.sh index 214b8b3d..9794e791 100755 --- a/examples/other/g2p/run.sh +++ b/examples/other/g2p/run.sh @@ -4,6 +4,10 @@ source path.sh USE_SCLITE=true # test g2p +if [ ! -d ~/datasets/BZNSYP ];then + echo "Please download BZNSYP dataset" + exit +fi echo "Start get g2p test data ..." python3 get_g2p_data.py --root-dir=~/datasets/BZNSYP --output-dir=data/g2p echo "Start test g2p ..." diff --git a/examples/ted_en_zh/st0/conf/tuning/decode.yaml b/examples/ted_en_zh/st0/conf/tuning/decode.yaml index ed081cf4..7d8d1daf 100644 --- a/examples/ted_en_zh/st0/conf/tuning/decode.yaml +++ b/examples/ted_en_zh/st0/conf/tuning/decode.yaml @@ -1,8 +1,9 @@ -batch_size: 5 +batch_size: 1 error_rate_type: char-bleu decoding_method: fullsentence # 'fullsentence', 'simultaneous' beam_size: 10 word_reward: 0.7 +maxlenratio: 0.3 decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. # <0: for decoding, use full chunk. # >0: for decoding, use fixed chunk size as set. 
diff --git a/examples/ted_en_zh/st1/conf/tuning/decode.yaml b/examples/ted_en_zh/st1/conf/tuning/decode.yaml index d6104dbc..4f10acf7 100644 --- a/examples/ted_en_zh/st1/conf/tuning/decode.yaml +++ b/examples/ted_en_zh/st1/conf/tuning/decode.yaml @@ -1,9 +1,10 @@ -batch_size: 5 +batch_size: 1 error_rate_type: char-bleu decoding_method: fullsentence # 'fullsentence', 'simultaneous' beam_size: 10 word_reward: 0.7 +maxlenratio: 0.3 decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. # <0: for decoding, use full chunk. # >0: for decoding, use fixed chunk size as set. diff --git a/examples/thchs30/align0/README.md b/examples/thchs30/align0/README.md index da56fffc..5195ab80 100644 --- a/examples/thchs30/align0/README.md +++ b/examples/thchs30/align0/README.md @@ -27,7 +27,7 @@ cd a0 应用程序会自动下载 THCHS-30数据集,处理成 MFA 所需的文件格式并开始训练,您可以修改 `run.sh` 中的参数 `LEXICON_NAME` 来决定您需要强制对齐的级别(word、syllable 和 phone) ## MFA 所使用的字典 --- -MFA 字典的格式请参考: [MFA 官方文档 Dictionary format ](https://montreal-forced-aligner.readthedocs.io/en/latest/dictionary.html) +MFA 字典的格式请参考: [MFA 官方文档](https://montreal-forced-aligner.readthedocs.io/en/latest/) phone.lexicon 直接使用的是 `THCHS-30/data_thchs30/lm_phone/lexicon.txt` word.lexicon 考虑到了中文的多音字,使用**带概率的字典**, 生成规则请参考 `local/gen_word2phone.py` `syllable.lexicon` 获取自 [DNSun/thchs30-pinyin2tone](https://github.com/DNSun/thchs30-pinyin2tone) @@ -39,4 +39,4 @@ word.lexicon 考虑到了中文的多音字,使用**带概率的字典**, 生 **syllabel 级别:** [syllable.lexicon](https://paddlespeech.bj.bcebos.com/MFA/THCHS30/syllable/syllable.lexicon)、[对齐结果](https://paddlespeech.bj.bcebos.com/MFA/THCHS30/syllable/thchs30_alignment.tar.gz)、[模型](https://paddlespeech.bj.bcebos.com/MFA/THCHS30/syllable/thchs30_model.zip) **word 级别:** [word.lexicon](https://paddlespeech.bj.bcebos.com/MFA/THCHS30/word/word.lexicon)、[对齐结果](https://paddlespeech.bj.bcebos.com/MFA/THCHS30/word/thchs30_alignment.tar.gz)、[模型](https://paddlespeech.bj.bcebos.com/MFA/THCHS30/word/thchs30_model.zip) -随后,您可以参考 [MFA 官方文档 Align using 
pretrained models](https://montreal-forced-aligner.readthedocs.io/en/stable/aligning.html#align-using-pretrained-models) 使用我们给您提供好的模型直接对自己的数据集进行强制对齐,注意,您需要使用和模型对应的 lexicon 文件,当文本是汉字时,您需要用空格把不同的**汉字**(而不是词语)分开 +随后,您可以参考 [MFA 官方文档](https://montreal-forced-aligner.readthedocs.io/en/latest/) 使用我们给您提供好的模型直接对自己的数据集进行强制对齐,注意,您需要使用和模型对应的 lexicon 文件,当文本是汉字时,您需要用空格把不同的**汉字**(而不是词语)分开 diff --git a/examples/vctk/tts3/README.md b/examples/vctk/tts3/README.md index 74c1086a..157949d1 100644 --- a/examples/vctk/tts3/README.md +++ b/examples/vctk/tts3/README.md @@ -240,13 +240,14 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \ --am_ckpt=fastspeech2_nosil_vctk_ckpt_0.5/snapshot_iter_66200.pdz \ --am_stat=fastspeech2_nosil_vctk_ckpt_0.5/speech_stats.npy \ --voc=pwgan_vctk \ - --voc_config=pwg_vctk_ckpt_0.5/pwg_default.yaml \ - --voc_ckpt=pwg_vctk_ckpt_0.5/pwg_snapshot_iter_1000000.pdz \ - --voc_stat=pwg_vctk_ckpt_0.5/pwg_stats.npy \ + --voc_config=pwg_vctk_ckpt_0.1.1/default.yaml \ + --voc_ckpt=pwg_vctk_ckpt_0.1.1/snapshot_iter_1500000.pdz \ + --voc_stat=pwg_vctk_ckpt_0.1.1/feats_stats.npy \ --lang=en \ --text=${BIN_DIR}/../sentences_en.txt \ --output_dir=exp/default/test_e2e \ --phones_dict=fastspeech2_nosil_vctk_ckpt_0.5/phone_id_map.txt \ --speaker_dict=fastspeech2_nosil_vctk_ckpt_0.5/speaker_id_map.txt \ - --spk_id=0 + --spk_id=0 \ + --inference_dir=exp/default/inference ``` diff --git a/examples/vctk/tts3/conf/default.yaml b/examples/vctk/tts3/conf/default.yaml index 2738e7c2..1bca9107 100644 --- a/examples/vctk/tts3/conf/default.yaml +++ b/examples/vctk/tts3/conf/default.yaml @@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis. n_mels: 80 # The number of mel basis. # Only used for the model using pitch features (e.g. FastSpeech2) -f0min: 80 # Maximum f0 for pitch extraction. -f0max: 400 # Minimum f0 for pitch extraction. +f0min: 80 # Minimum f0 for pitch extraction. +f0max: 400 # Maximum f0 for pitch extraction. 
########################################################### @@ -64,14 +64,14 @@ model: pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch - stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder + stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder energy_predictor_layers: 2 # number of conv layers in energy predictor energy_predictor_chans: 256 # number of channels of conv layers in energy predictor energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor energy_predictor_dropout: 0.5 # dropout rate in energy predictor energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy - stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder + stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder spk_embed_dim: 256 # speaker embedding dimension spk_embed_integration_type: concat # speaker embedding integration type diff --git a/examples/vctk/tts3/local/inference.sh b/examples/vctk/tts3/local/inference.sh new file mode 100755 index 00000000..caef89d8 --- /dev/null +++ b/examples/vctk/tts3/local/inference.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +train_output_path=$1 + +stage=0 +stop_stage=0 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + python3 ${BIN_DIR}/../inference.py \ + --inference_dir=${train_output_path}/inference \ + --am=fastspeech2_vctk \ + --voc=pwgan_vctk \ + --text=${BIN_DIR}/../sentences_en.txt \ + --output_dir=${train_output_path}/pd_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --speaker_dict=dump/speaker_id_map.txt \ + --spk_id=0 \ + --lang=en +fi + diff --git 
a/examples/vctk/tts3/local/synthesize_e2e.sh b/examples/vctk/tts3/local/synthesize_e2e.sh index 51bb9e19..60d56d1c 100755 --- a/examples/vctk/tts3/local/synthesize_e2e.sh +++ b/examples/vctk/tts3/local/synthesize_e2e.sh @@ -20,4 +20,5 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \ --output_dir=${train_output_path}/test_e2e \ --phones_dict=dump/phone_id_map.txt \ --speaker_dict=dump/speaker_id_map.txt \ - --spk_id=0 + --spk_id=0 \ + --inference_dir=${train_output_path}/inference diff --git a/examples/vctk/voc1/conf/default.yaml b/examples/vctk/voc1/conf/default.yaml index 59ce3825..af859d4c 100644 --- a/examples/vctk/voc1/conf/default.yaml +++ b/examples/vctk/voc1/conf/default.yaml @@ -33,7 +33,7 @@ generator_params: aux_context_window: 2 # Context window size for auxiliary feature. # If set to 2, previous 2 and future 2 frames will be considered. dropout: 0.0 # Dropout rate. 0.0 means no dropout applied. - use_weight_norm: true # Whether to use weight norm. + use_weight_norm: True # Whether to use weight norm. # If set to true, it will be applied to all of the conv layers. upsample_scales: [4, 5, 3, 5] # Upsampling scales. prod(upsample_scales) == n_shift @@ -46,8 +46,8 @@ discriminator_params: kernel_size: 3 # Number of output channels. layers: 10 # Number of conv layers. conv_channels: 64 # Number of chnn layers. - bias: true # Whether to use bias parameter in conv. - use_weight_norm: true # Whether to use weight norm. + bias: True # Whether to use bias parameter in conv. + use_weight_norm: True # Whether to use weight norm. # If set to true, it will be applied to all of the conv layers. nonlinear_activation: "leakyrelu" # Nonlinear function after each conv. 
nonlinear_activation_params: # Nonlinear function parameters diff --git a/examples/voxceleb/README.md b/examples/voxceleb/README.md new file mode 100644 index 00000000..2c8ad138 --- /dev/null +++ b/examples/voxceleb/README.md @@ -0,0 +1,8 @@ + +dataset info refer to [VoxCeleb](https://www.robots.ox.ac.uk/~vgg/data/voxceleb/index.html#about) + +sv0 - speaker verification with softmax backend etc, all python code + more info refer to the sv0/readme.txt + +sv1 - dependence on kaldi, speaker verification with plda/sc backend, + more info refer to the sv1/readme.txt diff --git a/examples/voxceleb/sv0/local/make_voxceleb_kaldi_trial.py b/examples/voxceleb/sv0/local/make_voxceleb_kaldi_trial.py new file mode 100644 index 00000000..c92ede1a --- /dev/null +++ b/examples/voxceleb/sv0/local/make_voxceleb_kaldi_trial.py @@ -0,0 +1,81 @@ +#!/usr/bin/python3 +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Make VoxCeleb1 trial of kaldi format +this script creates the test trial from kaldi trial voxceleb1_test_v2.txt or official trial veri_test2.txt +to kaldi trial format +""" + +import argparse +import codecs +import os + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument("--voxceleb_trial", + default="voxceleb1_test_v2", + type=str, + help="VoxCeleb trial file.
Default we use the kaldi trial voxceleb1_test_v2.txt") +parser.add_argument("--trial", + default="data/test/trial", + type=str, + help="Kaldi format trial file") +args = parser.parse_args() + +def main(voxceleb_trial, trial): + """ + VoxCeleb provides several trial files, whose format differs from the kaldi format. + + VoxCeleb format's meaning is as follows: + -------------------------------- + target_or_nontarget path1 path2 + -------------------------------- + target_or_nontarget is an integer: 1 target path1 is equal to path2 + 0 nontarget path1 is unequal to path2 + path1: spkr_id/rec_id/name + path2: spkr_id/rec_id/name + + Kaldi format's meaning is as follows: + --------------------------------------- + utt_id1 utt_id2 target_or_nontarget + --------------------------------------- + utt_id1: utterance identification or speaker identification + utt_id2: utterance identification or speaker identification + target_or_nontarget is a string: 'target' utt_id1 is equal to utt_id2 + 'nontarget' utt_id1 is unequal to utt_id2 + """ + print("Start convert the voxceleb trial to kaldi format") + if not os.path.exists(voxceleb_trial): + raise RuntimeError("{} does not exist.
Pleas input the correct file path".format(voxceleb_trial)) + + trial_dirname = os.path.dirname(trial) + if not os.path.exists(trial_dirname): + os.mkdir(trial_dirname) + + with codecs.open(voxceleb_trial, 'r', encoding='utf-8') as f, \ + codecs.open(trial, 'w', encoding='utf-8') as w: + for line in f: + target_or_nontarget, path1, path2 = line.strip().split() + + utt_id1 = "-".join(path1.split("/")) + utt_id2 = "-".join(path2.split("/")) + target = "nontarget" + if int(target_or_nontarget): + target = "target" + w.write("{} {} {}\n".format(utt_id1, utt_id2, target)) + print("Convert the voxceleb trial to kaldi format successfully") + +if __name__ == "__main__": + main(args.voxceleb_trial, args.trial) diff --git a/paddleaudio/CHANGELOG.md b/paddleaudio/CHANGELOG.md index 4dc68c6f..825c32f0 100644 --- a/paddleaudio/CHANGELOG.md +++ b/paddleaudio/CHANGELOG.md @@ -1,2 +1 @@ # Changelog - diff --git a/paddleaudio/__init__.py b/paddleaudio/__init__.py index b717777d..2685cf57 100644 --- a/paddleaudio/__init__.py +++ b/paddleaudio/__init__.py @@ -13,5 +13,3 @@ # limitations under the License. from .backends import * from .features import * - -__version__ = '0.1.0' diff --git a/paddleaudio/features/core.py b/paddleaudio/features/core.py index d3c2e290..01925ec6 100644 --- a/paddleaudio/features/core.py +++ b/paddleaudio/features/core.py @@ -415,11 +415,11 @@ def mfcc(x, **kwargs) # librosa mfcc: - spect = librosa.feature.melspectrogram(x,sr=16000,n_fft=512, + spect = librosa.feature.melspectrogram(y=x,sr=16000,n_fft=512, win_length=512, hop_length=320, n_mels=64, fmin=50) - b = librosa.feature.mfcc(x, + b = librosa.feature.mfcc(y=x, sr=16000, S=spect, n_mfcc=20, diff --git a/paddlespeech/__init__.py b/paddlespeech/__init__.py index 8d32f287..185a92b8 100644 --- a/paddlespeech/__init__.py +++ b/paddlespeech/__init__.py @@ -11,5 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. - -__version__ = '0.1.0' diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py index aa4e31d9..ef769fbc 100644 --- a/paddlespeech/cli/asr/infer.py +++ b/paddlespeech/cli/asr/infer.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. import argparse -import io import os import sys from typing import List @@ -23,9 +22,9 @@ import librosa import numpy as np import paddle import soundfile -import yaml from yacs.config import CfgNode +from ..download import get_path_from_url from ..executor import BaseExecutor from ..log import logger from ..utils import cli_register @@ -64,14 +63,61 @@ pretrained_models = { 'ckpt_path': 'exp/transformer/checkpoints/avg_10', }, + "deepspeech2offline_aishell-zh-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz', + 'md5': + '932c3593d62fe5c741b59b31318aa314', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/deepspeech2/checkpoints/avg_1', + 'lm_url': + 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm', + 'lm_md5': + '29e02312deb2e59b3c8686c7966d4fe3' + }, + "deepspeech2online_aishell-zh-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.1.1.model.tar.gz', + 'md5': + 'd5e076217cf60486519f72c217d21b9b', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/deepspeech2_online/checkpoints/avg_1', + 'lm_url': + 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm', + 'lm_md5': + '29e02312deb2e59b3c8686c7966d4fe3' + }, + "deepspeech2offline_librispeech-en-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr0/asr0_deepspeech2_librispeech_ckpt_0.1.1.model.tar.gz', + 'md5': + 'f5666c81ad015c8de03aac2bc92e5762', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 
'exp/deepspeech2/checkpoints/avg_1', + 'lm_url': + 'https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm', + 'lm_md5': + '099a601759d467cd0a8523ff939819c5' + }, } model_alias = { - "deepspeech2offline": "paddlespeech.s2t.models.ds2:DeepSpeech2Model", - "deepspeech2online": "paddlespeech.s2t.models.ds2_online:DeepSpeech2ModelOnline", - "conformer": "paddlespeech.s2t.models.u2:U2Model", - "transformer": "paddlespeech.s2t.models.u2:U2Model", - "wenetspeech": "paddlespeech.s2t.models.u2:U2Model", + "deepspeech2offline": + "paddlespeech.s2t.models.ds2:DeepSpeech2Model", + "deepspeech2online": + "paddlespeech.s2t.models.ds2_online:DeepSpeech2ModelOnline", + "conformer": + "paddlespeech.s2t.models.u2:U2Model", + "transformer": + "paddlespeech.s2t.models.u2:U2Model", + "wenetspeech": + "paddlespeech.s2t.models.u2:U2Model", } @@ -95,7 +141,8 @@ class ASRExecutor(BaseExecutor): '--lang', type=str, default='zh', - help='Choose model language. zh or en, zh:[conformer_wenetspeech-zh-16k], en:[transformer_librispeech-en-16k]') + help='Choose model language. zh or en, zh:[conformer_wenetspeech-zh-16k], en:[transformer_librispeech-en-16k]' + ) self.parser.add_argument( "--sample_rate", type=int, @@ -111,7 +158,10 @@ class ASRExecutor(BaseExecutor): '--decode_method', type=str, default='attention_rescoring', - choices=['ctc_greedy_search', 'ctc_prefix_beam_search', 'attention', 'attention_rescoring'], + choices=[ + 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention', + 'attention_rescoring' + ], help='only support transformer and conformer model') self.parser.add_argument( '--ckpt_path', @@ -135,8 +185,9 @@ class ASRExecutor(BaseExecutor): """ Download and returns pretrained resources path of current task. 
""" - assert tag in pretrained_models, 'Can not find pretrained resources of {}.'.format( - tag) + support_models = list(pretrained_models.keys()) + assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format( + tag, '\n\t\t'.join(support_models)) res_path = os.path.join(MODEL_HOME, tag) decompressed_path = download_and_decompress(pretrained_models[tag], @@ -187,13 +238,21 @@ class ASRExecutor(BaseExecutor): if "deepspeech2online" in model_type or "deepspeech2offline" in model_type: from paddlespeech.s2t.io.collator import SpeechCollator self.vocab = self.config.vocab_filepath - self.config.decode.lang_model_path = os.path.join(res_path, self.config.decode.lang_model_path) + self.config.decode.lang_model_path = os.path.join( + MODEL_HOME, 'language_model', + self.config.decode.lang_model_path) self.collate_fn_test = SpeechCollator.from_config(self.config) self.text_feature = TextFeaturizer( - unit_type=self.config.unit_type, - vocab=self.vocab) + unit_type=self.config.unit_type, vocab=self.vocab) + lm_url = pretrained_models[tag]['lm_url'] + lm_md5 = pretrained_models[tag]['lm_md5'] + self.download_lm( + lm_url, + os.path.dirname(self.config.decode.lang_model_path), lm_md5) + elif "conformer" in model_type or "transformer" in model_type or "wenetspeech" in model_type: - self.config.spm_model_prefix = os.path.join(self.res_path, self.config.spm_model_prefix) + self.config.spm_model_prefix = os.path.join( + self.res_path, self.config.spm_model_prefix) self.text_feature = TextFeaturizer( unit_type=self.config.unit_type, vocab=self.config.vocab_filepath, @@ -252,8 +311,10 @@ class ASRExecutor(BaseExecutor): audio = audio[:, 0] # pcm16 -> pcm 32 audio = self._pcm16to32(audio) - audio = librosa.resample(audio, audio_sample_rate, - self.sample_rate) + audio = librosa.resample( + audio, + orig_sr=audio_sample_rate, + target_sr=self.sample_rate) audio_sample_rate = 
self.sample_rate # pcm32 -> pcm 16 audio = self._pcm32to16(audio) @@ -284,18 +345,15 @@ class ASRExecutor(BaseExecutor): audio = self._inputs["audio"] audio_len = self._inputs["audio_len"] if "deepspeech2online" in model_type or "deepspeech2offline" in model_type: - result_transcripts = self.model.decode( - audio, - audio_len, - self.text_feature.vocab_list, - decoding_method=cfg.decoding_method, - lang_model_path=cfg.lang_model_path, - beam_alpha=cfg.alpha, - beam_beta=cfg.beta, - beam_size=cfg.beam_size, - cutoff_prob=cfg.cutoff_prob, - cutoff_top_n=cfg.cutoff_top_n, - num_processes=cfg.num_proc_bsearch) + decode_batch_size = audio.shape[0] + self.model.decoder.init_decoder( + decode_batch_size, self.text_feature.vocab_list, + cfg.decoding_method, cfg.lang_model_path, cfg.alpha, cfg.beta, + cfg.beam_size, cfg.cutoff_prob, cfg.cutoff_top_n, + cfg.num_proc_bsearch) + + result_transcripts = self.model.decode(audio, audio_len) + self.model.decoder.del_decoder() self._outputs["result"] = result_transcripts[0] elif "conformer" in model_type or "transformer" in model_type: @@ -319,6 +377,13 @@ class ASRExecutor(BaseExecutor): """ return self._outputs["result"] + def download_lm(self, url, lm_dir, md5sum): + download_path = get_path_from_url( + url=url, + root_dir=lm_dir, + md5sum=md5sum, + decompress=False, ) + def _pcm16to32(self, audio): assert (audio.dtype == np.int16) audio = audio.astype("float32") @@ -411,7 +476,7 @@ class ASRExecutor(BaseExecutor): try: res = self(audio_file, model, lang, sample_rate, config, ckpt_path, - decode_method, force_yes, device) + decode_method, force_yes, device) logger.info('ASR Result: {}'.format(res)) return True except Exception as e: @@ -435,7 +500,8 @@ class ASRExecutor(BaseExecutor): audio_file = os.path.abspath(audio_file) self._check(audio_file, sample_rate, force_yes) paddle.set_device(device) - self._init_from_path(model, lang, sample_rate, config, decode_method, ckpt_path) + self._init_from_path(model, lang, sample_rate, 
config, decode_method, + ckpt_path) self.preprocess(model, audio_file) self.infer(model) res = self.postprocess() # Retrieve result of asr. diff --git a/paddlespeech/cli/cls/infer.py b/paddlespeech/cli/cls/infer.py index 52bc1972..5839ff30 100644 --- a/paddlespeech/cli/cls/infer.py +++ b/paddlespeech/cli/cls/infer.py @@ -114,8 +114,9 @@ class CLSExecutor(BaseExecutor): """ Download and returns pretrained resources path of current task. """ - assert tag in pretrained_models, 'Can not find pretrained resources of {}.'.format( - tag) + support_models = list(pretrained_models.keys()) + assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format( + tag, '\n\t\t'.join(support_models)) res_path = os.path.join(MODEL_HOME, tag) decompressed_path = download_and_decompress(pretrained_models[tag], diff --git a/paddlespeech/cli/st/infer.py b/paddlespeech/cli/st/infer.py index 1276424c..1709c754 100644 --- a/paddlespeech/cli/st/infer.py +++ b/paddlespeech/cli/st/infer.py @@ -112,8 +112,9 @@ class STExecutor(BaseExecutor): """ Download and returns pretrained resources path of current task. 
""" - assert tag in pretrained_models, "Can not find pretrained resources of {}.".format( - tag) + support_models = list(pretrained_models.keys()) + assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format( + tag, '\n\t\t'.join(support_models)) res_path = os.path.join(MODEL_HOME, tag) decompressed_path = download_and_decompress(pretrained_models[tag], @@ -173,8 +174,8 @@ class STExecutor(BaseExecutor): self.config.decode.decoding_method = "fullsentence" with UpdateConfig(self.config): - self.config.cmvn_path = os.path.join( - res_path, self.config.cmvn_path) + self.config.cmvn_path = os.path.join(res_path, + self.config.cmvn_path) self.config.spm_model_prefix = os.path.join( res_path, self.config.spm_model_prefix) self.text_feature = TextFeaturizer( diff --git a/paddlespeech/cli/text/infer.py b/paddlespeech/cli/text/infer.py index 1cef8fcf..b0977c88 100644 --- a/paddlespeech/cli/text/infer.py +++ b/paddlespeech/cli/text/infer.py @@ -124,8 +124,9 @@ class TextExecutor(BaseExecutor): """ Download and returns pretrained resources path of current task. 
""" - assert tag in pretrained_models, 'Can not find pretrained resources of {}.'.format( - tag) + support_models = list(pretrained_models.keys()) + assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format( + tag, '\n\t\t'.join(support_models)) res_path = os.path.join(MODEL_HOME, tag) decompressed_path = download_and_decompress(pretrained_models[tag], diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py index a39a5c4e..dfd6a42f 100644 --- a/paddlespeech/cli/tts/infer.py +++ b/paddlespeech/cli/tts/infer.py @@ -117,6 +117,36 @@ pretrained_models = { 'speaker_dict': 'speaker_id_map.txt', }, + # tacotron2 + "tacotron2_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_ckpt_0.2.0.zip', + 'md5': + '0df4b6f0bcbe0d73c5ed6df8867ab91a', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_30600.pdz', + 'speech_stats': + 'speech_stats.npy', + 'phones_dict': + 'phone_id_map.txt', + }, + "tacotron2_ljspeech-en": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.2.0.zip', + 'md5': + '6a5eddd81ae0e81d16959b97481135f3', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_60300.pdz', + 'speech_stats': + 'speech_stats.npy', + 'phones_dict': + 'phone_id_map.txt', + }, + # pwgan "pwgan_csmsc-zh": { 'url': @@ -205,6 +235,20 @@ pretrained_models = { 'speech_stats': 'feats_stats.npy', }, + + # wavernn + "wavernn_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_ckpt_0.2.0.zip', + 'md5': + 'ee37b752f09bcba8f2af3b777ca38e13', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_400000.pdz', + 'speech_stats': + 'feats_stats.npy', + } } model_alias = { @@ -217,6 +261,10 @@ model_alias = { "paddlespeech.t2s.models.fastspeech2:FastSpeech2", "fastspeech2_inference": 
"paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference", + "tacotron2": + "paddlespeech.t2s.models.tacotron2:Tacotron2", + "tacotron2_inference": + "paddlespeech.t2s.models.tacotron2:Tacotron2Inference", # voc "pwgan": "paddlespeech.t2s.models.parallel_wavegan:PWGGenerator", @@ -234,6 +282,10 @@ model_alias = { "paddlespeech.t2s.models.hifigan:HiFiGANGenerator", "hifigan_inference": "paddlespeech.t2s.models.hifigan:HiFiGANInference", + "wavernn": + "paddlespeech.t2s.models.wavernn:WaveRNN", + "wavernn_inference": + "paddlespeech.t2s.models.wavernn:WaveRNNInference", } @@ -253,9 +305,13 @@ class TTSExecutor(BaseExecutor): type=str, default='fastspeech2_csmsc', choices=[ - 'speedyspeech_csmsc', 'fastspeech2_csmsc', - 'fastspeech2_ljspeech', 'fastspeech2_aishell3', - 'fastspeech2_vctk' + 'speedyspeech_csmsc', + 'fastspeech2_csmsc', + 'fastspeech2_ljspeech', + 'fastspeech2_aishell3', + 'fastspeech2_vctk', + 'tacotron2_csmsc', + 'tacotron2_ljspeech', ], help='Choose acoustic model type of tts task.') self.parser.add_argument( @@ -300,8 +356,14 @@ class TTSExecutor(BaseExecutor): type=str, default='pwgan_csmsc', choices=[ - 'pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', 'pwgan_vctk', - 'mb_melgan_csmsc', 'style_melgan_csmsc', 'hifigan_csmsc' + 'pwgan_csmsc', + 'pwgan_ljspeech', + 'pwgan_aishell3', + 'pwgan_vctk', + 'mb_melgan_csmsc', + 'style_melgan_csmsc', + 'hifigan_csmsc', + 'wavernn_csmsc', ], help='Choose vocoder type of tts task.') @@ -340,8 +402,9 @@ class TTSExecutor(BaseExecutor): """ Download and returns pretrained resources path of current task. 
""" - assert tag in pretrained_models, 'Can not find pretrained resources of {}.'.format( - tag) + support_models = list(pretrained_models.keys()) + assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format( + tag, '\n\t\t'.join(support_models)) res_path = os.path.join(MODEL_HOME, tag) decompressed_path = download_and_decompress(pretrained_models[tag], @@ -368,7 +431,7 @@ class TTSExecutor(BaseExecutor): """ Init model and other resources from a specific path. """ - if hasattr(self, 'am') and hasattr(self, 'voc'): + if hasattr(self, 'am_inference') and hasattr(self, 'voc_inference'): logger.info('Models had been initialized.') return # am @@ -488,6 +551,8 @@ class TTSExecutor(BaseExecutor): vocab_size=vocab_size, tone_size=tone_size, **self.am_config["model"]) + elif am_name == 'tacotron2': + am = am_class(idim=vocab_size, odim=odim, **self.am_config["model"]) am.set_state_dict(paddle.load(self.am_ckpt)["main_params"]) am.eval() @@ -505,10 +570,15 @@ class TTSExecutor(BaseExecutor): voc_class = dynamic_import(voc_name, model_alias) voc_inference_class = dynamic_import(voc_name + '_inference', model_alias) - voc = voc_class(**self.voc_config["generator_params"]) - voc.set_state_dict(paddle.load(self.voc_ckpt)["generator_params"]) - voc.remove_weight_norm() - voc.eval() + if voc_name != 'wavernn': + voc = voc_class(**self.voc_config["generator_params"]) + voc.set_state_dict(paddle.load(self.voc_ckpt)["generator_params"]) + voc.remove_weight_norm() + voc.eval() + else: + voc = voc_class(**self.voc_config["model"]) + voc.set_state_dict(paddle.load(self.voc_ckpt)["main_params"]) + voc.eval() voc_mu, voc_std = np.load(self.voc_stat) voc_mu = paddle.to_tensor(voc_mu) voc_std = paddle.to_tensor(voc_std) diff --git a/paddlespeech/cli/utils.py b/paddlespeech/cli/utils.py index 63b670c8..d7dcc90c 100644 --- a/paddlespeech/cli/utils.py +++ b/paddlespeech/cli/utils.py @@ 
-24,14 +24,17 @@ from typing import Any from typing import Dict import paddle -import paddleaudio import requests import yaml from paddle.framework import load +import paddleaudio from . import download -from .. import __version__ from .entry import commands +try: + from .. import __version__ +except ImportError: + __version__ = "0.0.0" # for develop branch requests.adapters.DEFAULT_RETRIES = 3 diff --git a/paddlespeech/s2t/decoders/ctcdecoder/__init__.py b/paddlespeech/s2t/decoders/ctcdecoder/__init__.py index 185a92b8..37ceae6e 100644 --- a/paddlespeech/s2t/decoders/ctcdecoder/__init__.py +++ b/paddlespeech/s2t/decoders/ctcdecoder/__init__.py @@ -11,3 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from .swig_wrapper import ctc_beam_search_decoding +from .swig_wrapper import ctc_beam_search_decoding_batch +from .swig_wrapper import ctc_greedy_decoding +from .swig_wrapper import CTCBeamSearchDecoder +from .swig_wrapper import Scorer diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig_wrapper.py b/paddlespeech/s2t/decoders/ctcdecoder/swig_wrapper.py index d883d430..9e2a8506 100644 --- a/paddlespeech/s2t/decoders/ctcdecoder/swig_wrapper.py +++ b/paddlespeech/s2t/decoders/ctcdecoder/swig_wrapper.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. """Wrapper for various CTC decoders in SWIG.""" -import swig_decoders +import paddlespeech_ctcdecoders -class Scorer(swig_decoders.Scorer): +class Scorer(paddlespeech_ctcdecoders.Scorer): """Wrapper for Scorer. :param alpha: Parameter associated with language model. Don't use @@ -26,14 +26,17 @@ class Scorer(swig_decoders.Scorer): :type beta: float :model_path: Path to load language model. :type model_path: str + :param vocabulary: Vocabulary list. 
+ :type vocabulary: list """ def __init__(self, alpha, beta, model_path, vocabulary): - swig_decoders.Scorer.__init__(self, alpha, beta, model_path, vocabulary) + paddlespeech_ctcdecoders.Scorer.__init__(self, alpha, beta, model_path, + vocabulary) -def ctc_greedy_decoder(probs_seq, vocabulary, blank_id): - """Wrapper for ctc best path decoder in swig. +def ctc_greedy_decoding(probs_seq, vocabulary, blank_id): + """Wrapper for ctc best path decodeing function in swig. :param probs_seq: 2-D list of probability distributions over each time step, with each element being a list of normalized @@ -44,19 +47,19 @@ def ctc_greedy_decoder(probs_seq, vocabulary, blank_id): :return: Decoding result string. :rtype: str """ - result = swig_decoders.ctc_greedy_decoder(probs_seq.tolist(), vocabulary, - blank_id) + result = paddlespeech_ctcdecoders.ctc_greedy_decoding(probs_seq.tolist(), + vocabulary, blank_id) return result -def ctc_beam_search_decoder(probs_seq, - vocabulary, - beam_size, - cutoff_prob=1.0, - cutoff_top_n=40, - ext_scoring_func=None, - blank_id=0): - """Wrapper for the CTC Beam Search Decoder. +def ctc_beam_search_decoding(probs_seq, + vocabulary, + beam_size, + cutoff_prob=1.0, + cutoff_top_n=40, + ext_scoring_func=None, + blank_id=0): + """Wrapper for the CTC Beam Search Decoding function. :param probs_seq: 2-D list of probability distributions over each time step, with each element being a list of normalized @@ -81,22 +84,22 @@ def ctc_beam_search_decoder(probs_seq, results, in descending order of the probability. 
:rtype: list """ - beam_results = swig_decoders.ctc_beam_search_decoder( + beam_results = paddlespeech_ctcdecoders.ctc_beam_search_decoding( probs_seq.tolist(), vocabulary, beam_size, cutoff_prob, cutoff_top_n, ext_scoring_func, blank_id) beam_results = [(res[0], res[1].decode('utf-8')) for res in beam_results] return beam_results -def ctc_beam_search_decoder_batch(probs_split, - vocabulary, - beam_size, - num_processes, - cutoff_prob=1.0, - cutoff_top_n=40, - ext_scoring_func=None, - blank_id=0): - """Wrapper for the batched CTC beam search decoder. +def ctc_beam_search_decoding_batch(probs_split, + vocabulary, + beam_size, + num_processes, + cutoff_prob=1.0, + cutoff_top_n=40, + ext_scoring_func=None, + blank_id=0): + """Wrapper for the batched CTC beam search decodeing batch function. :param probs_seq: 3-D list with each element as an instance of 2-D list of probabilities used by ctc_beam_search_decoder(). @@ -126,9 +129,31 @@ def ctc_beam_search_decoder_batch(probs_split, """ probs_split = [probs_seq.tolist() for probs_seq in probs_split] - batch_beam_results = swig_decoders.ctc_beam_search_decoder_batch( + batch_beam_results = paddlespeech_ctcdecoders.ctc_beam_search_decoding_batch( probs_split, vocabulary, beam_size, num_processes, cutoff_prob, cutoff_top_n, ext_scoring_func, blank_id) batch_beam_results = [[(res[0], res[1]) for res in beam_results] for beam_results in batch_beam_results] return batch_beam_results + + +class CTCBeamSearchDecoder(paddlespeech_ctcdecoders.CtcBeamSearchDecoderBatch): + """Wrapper for CtcBeamSearchDecoderBatch. + Args: + vocab_list (list): Vocabulary list. + beam_size (int): Width for beam search. + num_processes (int): Number of parallel processes. + param cutoff_prob (float): Cutoff probability in vocabulary pruning, + default 1.0, no pruning. + cutoff_top_n (int): Cutoff number in pruning, only top cutoff_top_n + characters with highest probs in vocabulary will be + used in beam search, default 40. 
+ param ext_scorer (Scorer): External scorer for partially decoded sentence, e.g. word count + or language model. + """ + + def __init__(self, vocab_list, batch_size, beam_size, num_processes, + cutoff_prob, cutoff_top_n, _ext_scorer, blank_id): + paddlespeech_ctcdecoders.CtcBeamSearchDecoderBatch.__init__( + self, vocab_list, batch_size, beam_size, num_processes, cutoff_prob, + cutoff_top_n, _ext_scorer, blank_id) diff --git a/paddlespeech/s2t/exps/deepspeech2/model.py b/paddlespeech/s2t/exps/deepspeech2/model.py index 049311c7..3e9ede76 100644 --- a/paddlespeech/s2t/exps/deepspeech2/model.py +++ b/paddlespeech/s2t/exps/deepspeech2/model.py @@ -267,12 +267,9 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): errors_func = error_rate.char_errors if decode_cfg.error_rate_type == 'cer' else error_rate.word_errors error_rate_func = error_rate.cer if decode_cfg.error_rate_type == 'cer' else error_rate.wer - vocab_list = self.test_loader.collate_fn.vocab_list - target_transcripts = self.ordid2token(texts, texts_len) - result_transcripts = self.compute_result_transcripts( - audio, audio_len, vocab_list, decode_cfg) + result_transcripts = self.compute_result_transcripts(audio, audio_len) for utt, target, result in zip(utts, target_transcripts, result_transcripts): @@ -296,21 +293,8 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): error_rate=errors_sum / len_refs, error_rate_type=decode_cfg.error_rate_type) - def compute_result_transcripts(self, audio, audio_len, vocab_list, - decode_cfg): - result_transcripts = self.model.decode( - audio, - audio_len, - vocab_list, - decoding_method=decode_cfg.decoding_method, - lang_model_path=decode_cfg.lang_model_path, - beam_alpha=decode_cfg.alpha, - beam_beta=decode_cfg.beta, - beam_size=decode_cfg.beam_size, - cutoff_prob=decode_cfg.cutoff_prob, - cutoff_top_n=decode_cfg.cutoff_top_n, - num_processes=decode_cfg.num_proc_bsearch) - + def compute_result_transcripts(self, audio, audio_len): + result_transcripts = self.model.decode(audio, 
audio_len) return result_transcripts @mp_tools.rank_zero_only @@ -320,6 +304,17 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): self.model.eval() error_rate_type = None errors_sum, len_refs, num_ins = 0.0, 0, 0 + + # Initialized the decoder in model + decode_cfg = self.config.decode + vocab_list = self.test_loader.collate_fn.vocab_list + decode_batch_size = self.test_loader.batch_size + self.model.decoder.init_decoder( + decode_batch_size, vocab_list, decode_cfg.decoding_method, + decode_cfg.lang_model_path, decode_cfg.alpha, decode_cfg.beta, + decode_cfg.beam_size, decode_cfg.cutoff_prob, + decode_cfg.cutoff_top_n, decode_cfg.num_proc_bsearch) + with jsonlines.open(self.args.result_file, 'w') as fout: for i, batch in enumerate(self.test_loader): utts, audio, audio_len, texts, texts_len = batch @@ -339,6 +334,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): msg += "Final error rate [%s] (%d/%d) = %f" % ( error_rate_type, num_ins, num_ins, errors_sum / len_refs) logger.info(msg) + self.model.decoder.del_decoder() @paddle.no_grad() def export(self): @@ -377,6 +373,22 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): self.model.eval() error_rate_type = None errors_sum, len_refs, num_ins = 0.0, 0, 0 + + # Initialized the decoder in model + decode_cfg = self.config.decode + vocab_list = self.test_loader.collate_fn.vocab_list + if self.args.model_type == "online": + decode_batch_size = 1 + elif self.args.model_type == "offline": + decode_batch_size = self.test_loader.batch_size + else: + raise Exception("wrong model type") + self.model.decoder.init_decoder( + decode_batch_size, vocab_list, decode_cfg.decoding_method, + decode_cfg.lang_model_path, decode_cfg.alpha, decode_cfg.beta, + decode_cfg.beam_size, decode_cfg.cutoff_prob, + decode_cfg.cutoff_top_n, decode_cfg.num_proc_bsearch) + with jsonlines.open(self.args.result_file, 'w') as fout: for i, batch in enumerate(self.test_loader): utts, audio, audio_len, texts, texts_len = batch @@ -388,7 +400,6 @@ class 
DeepSpeech2ExportTester(DeepSpeech2Tester): error_rate_type = metrics['error_rate_type'] logger.info("Error rate [%s] (%d/?) = %f" % (error_rate_type, num_ins, errors_sum / len_refs)) - # logging msg = "Test: " msg += "epoch: {}, ".format(self.epoch) @@ -398,30 +409,31 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): logger.info(msg) if self.args.enable_auto_log is True: self.autolog.report() + self.model.decoder.del_decoder() - def compute_result_transcripts(self, audio, audio_len, vocab_list, - decode_cfg): + def compute_result_transcripts(self, audio, audio_len): if self.args.model_type == "online": - output_probs, output_lens = self.static_forward_online(audio, - audio_len) + output_probs, output_lens, trans_batch = self.static_forward_online( + audio, audio_len, decoder_chunk_size=1) + result_transcripts = [trans[-1] for trans in trans_batch] elif self.args.model_type == "offline": output_probs, output_lens = self.static_forward_offline(audio, audio_len) + batch_size = output_probs.shape[0] + self.model.decoder.reset_decoder(batch_size=batch_size) + + self.model.decoder.next(output_probs, output_lens) + + trans_best, trans_beam = self.model.decoder.decode() + + result_transcripts = trans_best + else: raise Exception("wrong model type") self.predictor.clear_intermediate_tensor() self.predictor.try_shrink_memory() - self.model.decoder.init_decode(decode_cfg.alpha, decode_cfg.beta, - decode_cfg.lang_model_path, vocab_list, - decode_cfg.decoding_method) - - result_transcripts = self.model.decoder.decode_probs( - output_probs, output_lens, vocab_list, decode_cfg.decoding_method, - decode_cfg.lang_model_path, decode_cfg.alpha, decode_cfg.beta, - decode_cfg.beam_size, decode_cfg.cutoff_prob, - decode_cfg.cutoff_top_n, decode_cfg.num_proc_bsearch) #replace the with ' ' result_transcripts = [ self._text_featurizer.detokenize(sentence) @@ -451,6 +463,7 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): ------- output_probs(numpy.array): shape[B, T, vocab_size] 
output_lens(numpy.array): shape[B] + trans(list(list(str))): shape[B, T] """ output_probs_list = [] output_lens_list = [] @@ -464,14 +477,15 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): batch_size, Tmax, x_dim = x_batch.shape x_len_batch = audio_len.numpy().astype(np.int64) if (Tmax - chunk_size) % chunk_stride != 0: - padding_len_batch = chunk_stride - ( - Tmax - chunk_size - ) % chunk_stride # The length of padding for the batch + # The length of padding for the batch + padding_len_batch = chunk_stride - (Tmax - chunk_size + ) % chunk_stride else: padding_len_batch = 0 x_list = np.split(x_batch, batch_size, axis=0) x_len_list = np.split(x_len_batch, batch_size, axis=0) + trans_batch = [] for x, x_len in zip(x_list, x_len_list): if self.args.enable_auto_log is True: self.autolog.times.start() @@ -504,12 +518,14 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): h_box_handle = self.predictor.get_input_handle(input_names[2]) c_box_handle = self.predictor.get_input_handle(input_names[3]) + trans = [] probs_chunk_list = [] probs_chunk_lens_list = [] if self.args.enable_auto_log is True: # record the model preprocessing time self.autolog.times.stamp() + self.model.decoder.reset_decoder(batch_size=1) for i in range(0, num_chunk): start = i * chunk_stride end = start + chunk_size @@ -518,9 +534,8 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): x_chunk_lens = 0 else: x_chunk_lens = min(x_len - i * chunk_stride, chunk_size) - - if (x_chunk_lens < - receptive_field_length): #means the number of input frames in the chunk is not enough for predicting one prob + #means the number of input frames in the chunk is not enough for predicting one prob + if (x_chunk_lens < receptive_field_length): break x_chunk_lens = np.array([x_chunk_lens]) audio_handle.reshape(x_chunk.shape) @@ -549,9 +564,12 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): output_chunk_lens = output_lens_handle.copy_to_cpu() chunk_state_h_box = output_state_h_handle.copy_to_cpu() 
chunk_state_c_box = output_state_c_handle.copy_to_cpu() - + self.model.decoder.next(output_chunk_probs, output_chunk_lens) probs_chunk_list.append(output_chunk_probs) probs_chunk_lens_list.append(output_chunk_lens) + trans_best, trans_beam = self.model.decoder.decode() + trans.append(trans_best[0]) + trans_batch.append(trans) output_probs = np.concatenate(probs_chunk_list, axis=1) output_lens = np.sum(probs_chunk_lens_list, axis=0) vocab_size = output_probs.shape[2] @@ -573,7 +591,7 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): self.autolog.times.end() output_probs = np.concatenate(output_probs_list, axis=0) output_lens = np.concatenate(output_lens_list, axis=0) - return output_probs, output_lens + return output_probs, output_lens, trans_batch def static_forward_offline(self, audio, audio_len): """ diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index 85bb877b..d7bee6d7 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -175,7 +175,7 @@ class U2Trainer(Trainer): observation['batch_cost'] = observation[ 'reader_cost'] + observation['step_cost'] observation['samples'] = observation['batch_size'] - observation['ips,sent./sec'] = observation[ + observation['ips,samples/s'] = observation[ 'batch_size'] / observation['batch_cost'] for k, v in observation.items(): msg += f" {k.split(',')[0]}: " diff --git a/paddlespeech/s2t/exps/u2_st/model.py b/paddlespeech/s2t/exps/u2_st/model.py index b03ca38b..6a32eda7 100644 --- a/paddlespeech/s2t/exps/u2_st/model.py +++ b/paddlespeech/s2t/exps/u2_st/model.py @@ -285,7 +285,7 @@ class U2STTrainer(Trainer): subsampling_factor=1, load_aux_output=load_transcript, num_encs=1, - dist_sampler=True) + dist_sampler=False) logger.info("Setup train/valid Dataloader!") else: # test dataset, return raw text @@ -408,6 +408,7 @@ class U2STTester(U2STTrainer): decoding_method=decode_cfg.decoding_method, beam_size=decode_cfg.beam_size, 
word_reward=decode_cfg.word_reward, + maxlenratio=decode_cfg.maxlenratio, decoding_chunk_size=decode_cfg.decoding_chunk_size, num_decoding_left_chunks=decode_cfg.num_decoding_left_chunks, simulate_streaming=decode_cfg.simulate_streaming) @@ -435,6 +436,7 @@ class U2STTester(U2STTrainer): decoding_method=decode_cfg.decoding_method, beam_size=decode_cfg.beam_size, word_reward=decode_cfg.word_reward, + maxlenratio=decode_cfg.maxlenratio, decoding_chunk_size=decode_cfg.decoding_chunk_size, num_decoding_left_chunks=decode_cfg.num_decoding_left_chunks, simulate_streaming=decode_cfg.simulate_streaming) diff --git a/paddlespeech/s2t/io/batchfy.py b/paddlespeech/s2t/io/batchfy.py index f59fb24c..f3630f2e 100644 --- a/paddlespeech/s2t/io/batchfy.py +++ b/paddlespeech/s2t/io/batchfy.py @@ -419,7 +419,7 @@ def make_batchset( # sort it by input lengths (long to short) sorted_data = sorted( d.items(), - key=lambda data: int(data[1][batch_sort_key][batch_sort_axis]["shape"][0]), + key=lambda data: float(data[1][batch_sort_key][batch_sort_axis]["shape"][0]), reverse=not shortest_first, ) logger.info("# utts: " + str(len(sorted_data))) diff --git a/paddlespeech/s2t/io/dataloader.py b/paddlespeech/s2t/io/dataloader.py index 920de34f..55aa13ff 100644 --- a/paddlespeech/s2t/io/dataloader.py +++ b/paddlespeech/s2t/io/dataloader.py @@ -61,7 +61,7 @@ class BatchDataLoader(): def __init__(self, json_file: str, train_mode: bool, - sortagrad: bool=False, + sortagrad: int=0, batch_size: int=0, maxlen_in: float=float('inf'), maxlen_out: float=float('inf'), diff --git a/paddlespeech/s2t/io/sampler.py b/paddlespeech/s2t/io/sampler.py index ac55af12..89752bb9 100644 --- a/paddlespeech/s2t/io/sampler.py +++ b/paddlespeech/s2t/io/sampler.py @@ -51,7 +51,7 @@ def _batch_shuffle(indices, batch_size, epoch, clipped=False): """ rng = np.random.RandomState(epoch) shift_len = rng.randint(0, batch_size - 1) - batch_indices = list(zip(* [iter(indices[shift_len:])] * batch_size)) + batch_indices = 
list(zip(*[iter(indices[shift_len:])] * batch_size)) rng.shuffle(batch_indices) batch_indices = [item for batch in batch_indices for item in batch] assert clipped is False diff --git a/paddlespeech/s2t/models/ds2/__init__.py b/paddlespeech/s2t/models/ds2/__init__.py index 8d5959c8..b3222067 100644 --- a/paddlespeech/s2t/models/ds2/__init__.py +++ b/paddlespeech/s2t/models/ds2/__init__.py @@ -16,7 +16,7 @@ from .deepspeech2 import DeepSpeech2Model from paddlespeech.s2t.utils import dynamic_pip_install try: - import swig_decoders + import paddlespeech_ctcdecoders except ImportError: try: package_name = 'paddlespeech_ctcdecoders' diff --git a/paddlespeech/s2t/models/ds2/deepspeech2.py b/paddlespeech/s2t/models/ds2/deepspeech2.py index 4a4d67ce..9c6b66c2 100644 --- a/paddlespeech/s2t/models/ds2/deepspeech2.py +++ b/paddlespeech/s2t/models/ds2/deepspeech2.py @@ -164,24 +164,18 @@ class DeepSpeech2Model(nn.Layer): return loss @paddle.no_grad() - def decode(self, audio, audio_len, vocab_list, decoding_method, - lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob, - cutoff_top_n, num_processes): - # init once + def decode(self, audio, audio_len): # decoders only accept string encoded in utf-8 - self.decoder.init_decode( - beam_alpha=beam_alpha, - beam_beta=beam_beta, - lang_model_path=lang_model_path, - vocab_list=vocab_list, - decoding_method=decoding_method) + # Make sure the decoder has been initialized eouts, eouts_len = self.encoder(audio, audio_len) probs = self.decoder.softmax(eouts) - return self.decoder.decode_probs( - probs.numpy(), eouts_len, vocab_list, decoding_method, - lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob, - cutoff_top_n, num_processes) + batch_size = probs.shape[0] + self.decoder.reset_decoder(batch_size=batch_size) + self.decoder.next(probs, eouts_len) + trans_best, trans_beam = self.decoder.decode() + + return trans_best @classmethod def from_pretrained(cls, dataloader, config, checkpoint_path): diff --git 
a/paddlespeech/s2t/models/ds2_online/__init__.py b/paddlespeech/s2t/models/ds2_online/__init__.py index 2d304237..c5fdab1b 100644 --- a/paddlespeech/s2t/models/ds2_online/__init__.py +++ b/paddlespeech/s2t/models/ds2_online/__init__.py @@ -16,7 +16,7 @@ from .deepspeech2 import DeepSpeech2ModelOnline from paddlespeech.s2t.utils import dynamic_pip_install try: - import swig_decoders + import paddlespeech_ctcdecoders except ImportError: try: package_name = 'paddlespeech_ctcdecoders' diff --git a/paddlespeech/s2t/models/ds2_online/deepspeech2.py b/paddlespeech/s2t/models/ds2_online/deepspeech2.py index 5e4981c0..9574a62b 100644 --- a/paddlespeech/s2t/models/ds2_online/deepspeech2.py +++ b/paddlespeech/s2t/models/ds2_online/deepspeech2.py @@ -293,25 +293,17 @@ class DeepSpeech2ModelOnline(nn.Layer): return loss @paddle.no_grad() - def decode(self, audio, audio_len, vocab_list, decoding_method, - lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob, - cutoff_top_n, num_processes): - # init once + def decode(self, audio, audio_len): # decoders only accept string encoded in utf-8 - self.decoder.init_decode( - beam_alpha=beam_alpha, - beam_beta=beam_beta, - lang_model_path=lang_model_path, - vocab_list=vocab_list, - decoding_method=decoding_method) - + # Make sure the decoder has been initialized eouts, eouts_len, final_state_h_box, final_state_c_box = self.encoder( audio, audio_len, None, None) probs = self.decoder.softmax(eouts) - return self.decoder.decode_probs( - probs.numpy(), eouts_len, vocab_list, decoding_method, - lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob, - cutoff_top_n, num_processes) + batch_size = probs.shape[0] + self.decoder.reset_decoder(batch_size=batch_size) + self.decoder.next(probs, eouts_len) + trans_best, trans_beam = self.decoder.decode() + return trans_best @classmethod def from_pretrained(cls, dataloader, config, checkpoint_path): diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 
ff4012e8..91079812 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -32,7 +32,7 @@ from paddlespeech.s2t.frontend.utility import IGNORE_ID from paddlespeech.s2t.frontend.utility import load_cmvn from paddlespeech.s2t.models.asr_interface import ASRInterface from paddlespeech.s2t.modules.cmvn import GlobalCMVN -from paddlespeech.s2t.modules.ctc import CTCDecoder +from paddlespeech.s2t.modules.ctc import CTCDecoderBase from paddlespeech.s2t.modules.decoder import TransformerDecoder from paddlespeech.s2t.modules.encoder import ConformerEncoder from paddlespeech.s2t.modules.encoder import TransformerEncoder @@ -63,7 +63,7 @@ class U2BaseModel(ASRInterface, nn.Layer): vocab_size: int, encoder: TransformerEncoder, decoder: TransformerDecoder, - ctc: CTCDecoder, + ctc: CTCDecoderBase, ctc_weight: float=0.5, ignore_id: int=IGNORE_ID, lsm_weight: float=0.0, @@ -663,7 +663,7 @@ class U2BaseModel(ASRInterface, nn.Layer): # (num_hyps, max_hyps_len, vocab_size) decoder_out, _ = self.decoder(encoder_out, encoder_mask, hyps, hyps_lens) - decoder_out = paddle.nn.functional.log_softmax(decoder_out, dim=-1) + decoder_out = paddle.nn.functional.log_softmax(decoder_out, axis=-1) return decoder_out @paddle.no_grad() @@ -840,7 +840,7 @@ class U2Model(U2DecodeModel): model_conf = configs.get('model_conf', dict()) dropout_rate = model_conf.get('ctc_dropout_rate', 0.0) grad_norm_type = model_conf.get('ctc_grad_norm_type', None) - ctc = CTCDecoder( + ctc = CTCDecoderBase( odim=vocab_size, enc_n_units=encoder.output_size(), blank_id=0, diff --git a/paddlespeech/s2t/models/u2_st/u2_st.py b/paddlespeech/s2t/models/u2_st/u2_st.py index 79ca423f..f7b05714 100644 --- a/paddlespeech/s2t/models/u2_st/u2_st.py +++ b/paddlespeech/s2t/models/u2_st/u2_st.py @@ -28,7 +28,7 @@ from paddle import nn from paddlespeech.s2t.frontend.utility import IGNORE_ID from paddlespeech.s2t.frontend.utility import load_cmvn from paddlespeech.s2t.modules.cmvn import GlobalCMVN -from 
paddlespeech.s2t.modules.ctc import CTCDecoder +from paddlespeech.s2t.modules.ctc import CTCDecoderBase from paddlespeech.s2t.modules.decoder import TransformerDecoder from paddlespeech.s2t.modules.encoder import ConformerEncoder from paddlespeech.s2t.modules.encoder import TransformerEncoder @@ -56,7 +56,7 @@ class U2STBaseModel(nn.Layer): encoder: TransformerEncoder, st_decoder: TransformerDecoder, decoder: TransformerDecoder=None, - ctc: CTCDecoder=None, + ctc: CTCDecoderBase=None, ctc_weight: float=0.0, asr_weight: float=0.0, ignore_id: int=IGNORE_ID, @@ -264,14 +264,17 @@ class U2STBaseModel(nn.Layer): speech_lengths: paddle.Tensor, beam_size: int=10, word_reward: float=0.0, + maxlenratio: float=0.5, decoding_chunk_size: int=-1, num_decoding_left_chunks: int=-1, simulate_streaming: bool=False, ) -> paddle.Tensor: - """ Apply beam search on attention decoder + """ Apply beam search on attention decoder with length penalty Args: speech (paddle.Tensor): (batch, max_len, feat_dim) speech_length (paddle.Tensor): (batch, ) beam_size (int): beam size for beam search + word_reward (float): word reward used in beam search + maxlenratio (float): max length ratio to bound the length of translated text decoding_chunk_size (int): decoding chunk for dynamic chunk trained model. <0: for decoding, use full chunk. @@ -284,90 +287,89 @@ class U2STBaseModel(nn.Layer): """ assert speech.shape[0] == speech_lengths.shape[0] assert decoding_chunk_size != 0 + assert speech.shape[0] == 1 device = speech.place - batch_size = speech.shape[0] # Let's assume B = batch_size and N = beam_size - # 1. Encoder + # 1. 
Encoder and init hypothesis encoder_out, encoder_mask = self._forward_encoder( speech, speech_lengths, decoding_chunk_size, num_decoding_left_chunks, simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.shape[1] - encoder_dim = encoder_out.shape[2] - running_size = batch_size * beam_size - encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view( - running_size, maxlen, encoder_dim) # (B*N, maxlen, encoder_dim) - encoder_mask = encoder_mask.unsqueeze(1).repeat( - 1, beam_size, 1, 1).view(running_size, 1, - maxlen) # (B*N, 1, max_len) - - hyps = paddle.ones( - [running_size, 1], dtype=paddle.long).fill_(self.sos) # (B*N, 1) - # log scale score - scores = paddle.to_tensor( - [0.0] + [-float('inf')] * (beam_size - 1), dtype=paddle.float) - scores = scores.to(device).repeat(batch_size).unsqueeze(1).to( - device) # (B*N, 1) - end_flag = paddle.zeros_like(scores, dtype=paddle.bool) # (B*N, 1) - cache: Optional[List[paddle.Tensor]] = None + + maxlen = max(int(encoder_out.shape[1] * maxlenratio), 5) + + hyp = {"score": 0.0, "yseq": [self.sos], "cache": None} + hyps = [hyp] + ended_hyps = [] + cur_best_score = -float("inf") + cache = None + # 2. 
Decoder forward step by step for i in range(1, maxlen + 1): - # Stop if all batch and all beam produce eos - # TODO(Hui Zhang): if end_flag.sum() == running_size: - if end_flag.cast(paddle.int64).sum() == running_size: - break + ys = paddle.ones((len(hyps), i), dtype=paddle.long) + + if hyps[0]["cache"] is not None: + cache = [ + paddle.ones( + (len(hyps), i - 1, hyp_cache.shape[-1]), + dtype=paddle.float32) for hyp_cache in hyps[0]["cache"] + ] + for j, hyp in enumerate(hyps): + ys[j, :] = paddle.to_tensor(hyp["yseq"]) + if hyps[0]["cache"] is not None: + for k in range(len(cache)): + cache[k][j] = hyps[j]["cache"][k] + ys_mask = subsequent_mask(i).unsqueeze(0).to(device) - # 2.1 Forward decoder step - hyps_mask = subsequent_mask(i).unsqueeze(0).repeat( - running_size, 1, 1).to(device) # (B*N, i, i) - # logp: (B*N, vocab) logp, cache = self.st_decoder.forward_one_step( - encoder_out, encoder_mask, hyps, hyps_mask, cache) - - # 2.2 First beam prune: select topk best prob at current time - top_k_logp, top_k_index = logp.topk(beam_size) # (B*N, N) - top_k_logp += word_reward - top_k_logp = mask_finished_scores(top_k_logp, end_flag) - top_k_index = mask_finished_preds(top_k_index, end_flag, self.eos) - - # 2.3 Seconde beam prune: select topk score with history - scores = scores + top_k_logp # (B*N, N), broadcast add - scores = scores.view(batch_size, beam_size * beam_size) # (B, N*N) - scores, offset_k_index = scores.topk(k=beam_size) # (B, N) - scores = scores.view(-1, 1) # (B*N, 1) - - # 2.4. 
Compute base index in top_k_index, - # regard top_k_index as (B*N*N),regard offset_k_index as (B*N), - # then find offset_k_index in top_k_index - base_k_index = paddle.arange(batch_size).view(-1, 1).repeat( - 1, beam_size) # (B, N) - base_k_index = base_k_index * beam_size * beam_size - best_k_index = base_k_index.view(-1) + offset_k_index.view( - -1) # (B*N) - - # 2.5 Update best hyps - best_k_pred = paddle.index_select( - top_k_index.view(-1), index=best_k_index, axis=0) # (B*N) - best_hyps_index = best_k_index // beam_size - last_best_k_hyps = paddle.index_select( - hyps, index=best_hyps_index, axis=0) # (B*N, i) - hyps = paddle.cat( - (last_best_k_hyps, best_k_pred.view(-1, 1)), - dim=1) # (B*N, i+1) - - # 2.6 Update end flag - end_flag = paddle.eq(hyps[:, -1], self.eos).view(-1, 1) + encoder_out.repeat(len(hyps), 1, 1), + encoder_mask.repeat(len(hyps), 1, 1), ys, ys_mask, cache) + + hyps_best_kept = [] + for j, hyp in enumerate(hyps): + top_k_logp, top_k_index = logp[j:j + 1].topk(beam_size) + + for b in range(beam_size): + new_hyp = {} + new_hyp["score"] = hyp["score"] + float(top_k_logp[0, b]) + new_hyp["yseq"] = [0] * (1 + len(hyp["yseq"])) + new_hyp["yseq"][:len(hyp["yseq"])] = hyp["yseq"] + new_hyp["yseq"][len(hyp["yseq"])] = int(top_k_index[0, b]) + new_hyp["cache"] = [cache_[j] for cache_ in cache] + # will be (2 x beam) hyps at most + hyps_best_kept.append(new_hyp) + + hyps_best_kept = sorted( + hyps_best_kept, key=lambda x: -x["score"])[:beam_size] + + # sort and get nbest + hyps = hyps_best_kept + if i == maxlen: + for hyp in hyps: + hyp["yseq"].append(self.eos) + + # finalize the ended hypotheses with word reward (by length) + remained_hyps = [] + for hyp in hyps: + if hyp["yseq"][-1] == self.eos: + hyp["score"] += (i - 1) * word_reward + cur_best_score = max(cur_best_score, hyp["score"]) + ended_hyps.append(hyp) + else: + # stop while guarantee the optimality + if hyp["score"] + maxlen * word_reward > cur_best_score: + remained_hyps.append(hyp) + 
+ # stop predition when there is no unended hypothesis + if not remained_hyps: + break + hyps = remained_hyps # 3. Select best of best - scores = scores.view(batch_size, beam_size) - # TODO: length normalization - best_index = paddle.argmax(scores, axis=-1).long() # (B) - best_hyps_index = best_index + paddle.arange( - batch_size, dtype=paddle.long) * beam_size - best_hyps = paddle.index_select(hyps, index=best_hyps_index, axis=0) - best_hyps = best_hyps[:, 1:] - return best_hyps + best_hyp = max(ended_hyps, key=lambda x: x["score"]) + + return paddle.to_tensor([best_hyp["yseq"][1:]]) # @jit.to_static def subsampling_rate(self) -> int: @@ -472,6 +474,7 @@ class U2STBaseModel(nn.Layer): decoding_method: str, beam_size: int, word_reward: float=0.0, + maxlenratio: float=0.5, decoding_chunk_size: int=-1, num_decoding_left_chunks: int=-1, simulate_streaming: bool=False): @@ -507,6 +510,7 @@ class U2STBaseModel(nn.Layer): feats_lengths, beam_size=beam_size, word_reward=word_reward, + maxlenratio=maxlenratio, decoding_chunk_size=decoding_chunk_size, num_decoding_left_chunks=num_decoding_left_chunks, simulate_streaming=simulate_streaming) @@ -591,7 +595,7 @@ class U2STModel(U2STBaseModel): model_conf = configs['model_conf'] dropout_rate = model_conf.get('ctc_dropout_rate', 0.0) grad_norm_type = model_conf.get('ctc_grad_norm_type', None) - ctc = CTCDecoder( + ctc = CTCDecoderBase( odim=vocab_size, enc_n_units=encoder.output_size(), blank_id=0, diff --git a/paddlespeech/s2t/modules/ctc.py b/paddlespeech/s2t/modules/ctc.py index 1f983807..2094182a 100644 --- a/paddlespeech/s2t/modules/ctc.py +++ b/paddlespeech/s2t/modules/ctc.py @@ -25,17 +25,19 @@ from paddlespeech.s2t.utils.log import Log logger = Log(__name__).getlog() try: - from paddlespeech.s2t.decoders.ctcdecoder.swig_wrapper import ctc_beam_search_decoder_batch # noqa: F401 - from paddlespeech.s2t.decoders.ctcdecoder.swig_wrapper import ctc_greedy_decoder # noqa: F401 - from 
paddlespeech.s2t.decoders.ctcdecoder.swig_wrapper import Scorer # noqa: F401 + from paddlespeech.s2t.decoders.ctcdecoder import ctc_beam_search_decoding_batch # noqa: F401 + from paddlespeech.s2t.decoders.ctcdecoder import ctc_greedy_decoding # noqa: F401 + from paddlespeech.s2t.decoders.ctcdecoder import Scorer # noqa: F401 + from paddlespeech.s2t.decoders.ctcdecoder import CTCBeamSearchDecoder # noqa: F401 except ImportError: try: from paddlespeech.s2t.utils import dynamic_pip_install package_name = 'paddlespeech_ctcdecoders' dynamic_pip_install.install(package_name) - from paddlespeech.s2t.decoders.ctcdecoder.swig_wrapper import ctc_beam_search_decoder_batch # noqa: F401 - from paddlespeech.s2t.decoders.ctcdecoder.swig_wrapper import ctc_greedy_decoder # noqa: F401 - from paddlespeech.s2t.decoders.ctcdecoder.swig_wrapper import Scorer # noqa: F401 + from paddlespeech.s2t.decoders.ctcdecoder import ctc_beam_search_decoding_batch # noqa: F401 + from paddlespeech.s2t.decoders.ctcdecoder import ctc_greedy_decoding # noqa: F401 + from paddlespeech.s2t.decoders.ctcdecoder import Scorer # noqa: F401 + from paddlespeech.s2t.decoders.ctcdecoder import CTCBeamSearchDecoder # noqa: F401 except Exception as e: logger.info("paddlespeech_ctcdecoders not installed!") @@ -139,9 +141,11 @@ class CTCDecoder(CTCDecoderBase): super().__init__(*args, **kwargs) # CTCDecoder LM Score handle self._ext_scorer = None + self.beam_search_decoder = None - def _decode_batch_greedy(self, probs_split, vocab_list): - """Decode by best path for a batch of probs matrix input. + def _decode_batch_greedy_offline(self, probs_split, vocab_list): + """This function will be deprecated in future. + Decode by best path for a batch of probs matrix input. :param probs_split: List of 2-D probability matrix, and each consists of prob vectors for one speech utterancce. 
:param probs_split: List of matrix @@ -152,7 +156,7 @@ class CTCDecoder(CTCDecoderBase): """ results = [] for i, probs in enumerate(probs_split): - output_transcription = ctc_greedy_decoder( + output_transcription = ctc_greedy_decoding( probs_seq=probs, vocabulary=vocab_list, blank_id=self.blank_id) results.append(output_transcription) return results @@ -194,10 +198,12 @@ class CTCDecoder(CTCDecoderBase): logger.info("no language model provided, " "decoding by pure beam search without scorer.") - def _decode_batch_beam_search(self, probs_split, beam_alpha, beam_beta, - beam_size, cutoff_prob, cutoff_top_n, - vocab_list, num_processes): - """Decode by beam search for a batch of probs matrix input. + def _decode_batch_beam_search_offline( + self, probs_split, beam_alpha, beam_beta, beam_size, cutoff_prob, + cutoff_top_n, vocab_list, num_processes): + """ + This function will be deprecated in future. + Decode by beam search for a batch of probs matrix input. :param probs_split: List of 2-D probability matrix, and each consists of prob vectors for one speech utterancce. 
:param probs_split: List of matrix @@ -226,7 +232,7 @@ class CTCDecoder(CTCDecoderBase): # beam search decode num_processes = min(num_processes, len(probs_split)) - beam_search_results = ctc_beam_search_decoder_batch( + beam_search_results = ctc_beam_search_decoding_batch( probs_split=probs_split, vocabulary=vocab_list, beam_size=beam_size, @@ -239,30 +245,69 @@ class CTCDecoder(CTCDecoderBase): results = [result[0][1] for result in beam_search_results] return results - def init_decode(self, beam_alpha, beam_beta, lang_model_path, vocab_list, - decoding_method): + def init_decoder(self, batch_size, vocab_list, decoding_method, + lang_model_path, beam_alpha, beam_beta, beam_size, + cutoff_prob, cutoff_top_n, num_processes): + """ + init ctc decoders + Args: + batch_size(int): Batch size for input data + vocab_list (list): List of tokens in the vocabulary, for decoding + decoding_method (str): ctc_beam_search + lang_model_path (str): language model path + beam_alpha (float): beam_alpha + beam_beta (float): beam_beta + beam_size (int): beam_size + cutoff_prob (float): cutoff probability in beam search + cutoff_top_n (int): cutoff_top_n + num_processes (int): num_processes + + Raises: + ValueError: when decoding_method not support. 
+ Returns: + CTCBeamSearchDecoder + """ + self.batch_size = batch_size + self.vocab_list = vocab_list + self.decoding_method = decoding_method + self.beam_size = beam_size + self.cutoff_prob = cutoff_prob + self.cutoff_top_n = cutoff_top_n + self.num_processes = num_processes if decoding_method == "ctc_beam_search": self._init_ext_scorer(beam_alpha, beam_beta, lang_model_path, vocab_list) + if self.beam_search_decoder is None: + self.beam_search_decoder = self.get_decoder( + vocab_list, batch_size, beam_alpha, beam_beta, beam_size, + num_processes, cutoff_prob, cutoff_top_n) + return self.beam_search_decoder + elif decoding_method == "ctc_greedy": + self._init_ext_scorer(beam_alpha, beam_beta, lang_model_path, + vocab_list) + else: + raise ValueError(f"Not support: {decoding_method}") - def decode_probs(self, probs, logits_lens, vocab_list, decoding_method, - lang_model_path, beam_alpha, beam_beta, beam_size, - cutoff_prob, cutoff_top_n, num_processes): - """ctc decoding with probs. - + def decode_probs_offline(self, probs, logits_lens, vocab_list, + decoding_method, lang_model_path, beam_alpha, + beam_beta, beam_size, cutoff_prob, cutoff_top_n, + num_processes): + """ + This function will be deprecated in future. + ctc decoding with probs. 
Args: probs (Tensor): activation after softmax logits_lens (Tensor): audio output lens - vocab_list ([type]): [description] - decoding_method ([type]): [description] - lang_model_path ([type]): [description] - beam_alpha ([type]): [description] - beam_beta ([type]): [description] - beam_size ([type]): [description] - cutoff_prob ([type]): [description] - cutoff_top_n ([type]): [description] - num_processes ([type]): [description] + vocab_list (list): List of tokens in the vocabulary, for decoding + decoding_method (str): ctc_beam_search + lang_model_path (str): language model path + beam_alpha (float): beam_alpha + beam_beta (float): beam_beta + beam_size (int): beam_size + cutoff_prob (float): cutoff probability in beam search + cutoff_top_n (int): cutoff_top_n + num_processes (int): num_processes Raises: ValueError: when decoding_method not support. @@ -270,13 +315,14 @@ class CTCDecoder(CTCDecoderBase): Returns: List[str]: transcripts. """ - + logger.warn( + "This function will be deprecated in future: decode_probs_offline") probs_split = [probs[i, :l, :] for i, l in enumerate(logits_lens)] if decoding_method == "ctc_greedy": - result_transcripts = self._decode_batch_greedy( + result_transcripts = self._decode_batch_greedy_offline( probs_split=probs_split, vocab_list=vocab_list) elif decoding_method == "ctc_beam_search": - result_transcripts = self._decode_batch_beam_search( + result_transcripts = self._decode_batch_beam_search_offline( probs_split=probs_split, beam_alpha=beam_alpha, beam_beta=beam_beta, @@ -288,3 +334,136 @@ class CTCDecoder(CTCDecoderBase): else: raise ValueError(f"Not support: {decoding_method}") return result_transcripts + + def get_decoder(self, vocab_list, batch_size, beam_alpha, beam_beta, + beam_size, num_processes, cutoff_prob, cutoff_top_n): + """ + init get ctc decoder + Args: + vocab_list (list): List of tokens in the vocabulary, for decoding. 
+ batch_size(int): Batch size for input data + beam_alpha (float): beam_alpha + beam_beta (float): beam_beta + beam_size (int): beam_size + num_processes (int): num_processes + cutoff_prob (float): cutoff probability in beam search + cutoff_top_n (int): cutoff_top_n + + Raises: + ValueError: when decoding_method not support. + + Returns: + CTCBeamSearchDecoder + """ + num_processes = min(num_processes, batch_size) + if self._ext_scorer is not None: + self._ext_scorer.reset_params(beam_alpha, beam_beta) + if self.decoding_method == "ctc_beam_search": + beam_search_decoder = CTCBeamSearchDecoder( + vocab_list, batch_size, beam_size, num_processes, cutoff_prob, + cutoff_top_n, self._ext_scorer, self.blank_id) + else: + raise ValueError(f"Not support: {decoding_method}") + return beam_search_decoder + + def next(self, probs, logits_lens): + """ + Input probs into ctc decoder + Args: + probs (list(list(float))): probs for a batch of data + logits_lens (list(int)): logits lens for a batch of data + Raises: + Exception: when the ctc decoder is not initialized + ValueError: when decoding_method not support. + """ + + if self.beam_search_decoder is None: + raise Exception( + "You need to initialize the beam_search_decoder firstly") + beam_search_decoder = self.beam_search_decoder + + has_value = (logits_lens > 0).tolist() + has_value = [ + "true" if has_value[i] is True else "false" + for i in range(len(has_value)) + ] + probs_split = [ + probs[i, :l, :].tolist() if has_value[i] else probs[i].tolist() + for i, l in enumerate(logits_lens) + ] + if self.decoding_method == "ctc_beam_search": + beam_search_decoder.next(probs_split, has_value) + else: + raise ValueError(f"Not support: {decoding_method}") + + return + + def decode(self): + """ + Get the decoding result + Raises: + Exception: when the ctc decoder is not initialized + ValueError: when decoding_method not support. 
+ Returns: + results_best (list(str)): The best result for a batch of data + results_beam (list(list(str))): The beam search result for a batch of data + """ + if self.beam_search_decoder is None: + raise Exception( + "You need to initialize the beam_search_decoder firstly") + + beam_search_decoder = self.beam_search_decoder + if self.decoding_method == "ctc_beam_search": + batch_beam_results = beam_search_decoder.decode() + batch_beam_results = [[(res[0], res[1]) for res in beam_results] + for beam_results in batch_beam_results] + results_best = [result[0][1] for result in batch_beam_results] + results_beam = [[trans[1] for trans in result] + for result in batch_beam_results] + + else: + raise ValueError(f"Not support: {decoding_method}") + + return results_best, results_beam + + def reset_decoder(self, + batch_size=-1, + beam_size=-1, + num_processes=-1, + cutoff_prob=-1.0, + cutoff_top_n=-1): + if batch_size > 0: + self.batch_size = batch_size + if beam_size > 0: + self.beam_size = beam_size + if num_processes > 0: + self.num_processes = num_processes + if cutoff_prob > 0: + self.cutoff_prob = cutoff_prob + if cutoff_top_n > 0: + self.cutoff_top_n = cutoff_top_n + """ + Reset the decoder state + Args: + batch_size(int): Batch size for input data + beam_size (int): beam_size + num_processes (int): num_processes + cutoff_prob (float): cutoff probability in beam search + cutoff_top_n (int): cutoff_top_n + Raises: + Exception: when the ctc decoder is not initialized + """ + if self.beam_search_decoder is None: + raise Exception( + "You need to initialize the beam_search_decoder firstly") + self.beam_search_decoder.reset_state( + self.batch_size, self.beam_size, self.num_processes, + self.cutoff_prob, self.cutoff_top_n) + + def del_decoder(self): + """ + Delete the decoder + """ + if self.beam_search_decoder is not None: + del self.beam_search_decoder + self.beam_search_decoder = None diff --git a/paddlespeech/s2t/training/trainer.py 
b/paddlespeech/s2t/training/trainer.py index cac5e570..de90c9ef 100644 --- a/paddlespeech/s2t/training/trainer.py +++ b/paddlespeech/s2t/training/trainer.py @@ -252,8 +252,7 @@ class Trainer(): if self.args.benchmark_max_step and self.iteration > self.args.benchmark_max_step: logger.info( f"Reach benchmark-max-step: {self.args.benchmark_max_step}") - sys.exit( - f"Reach benchmark-max-step: {self.args.benchmark_max_step}") + sys.exit(0) def do_train(self): """The training process control by epoch.""" @@ -282,7 +281,7 @@ class Trainer(): observation['batch_cost'] = observation[ 'reader_cost'] + observation['step_cost'] observation['samples'] = observation['batch_size'] - observation['ips[sent./sec]'] = observation[ + observation['ips samples/s'] = observation[ 'batch_size'] / observation['batch_cost'] for k, v in observation.items(): msg += f" {k}: " diff --git a/paddlespeech/s2t/transform/perturb.py b/paddlespeech/s2t/transform/perturb.py index 226885f3..9e41b824 100644 --- a/paddlespeech/s2t/transform/perturb.py +++ b/paddlespeech/s2t/transform/perturb.py @@ -90,7 +90,8 @@ class SpeedPerturbation(): # Note1: resample requires the sampling-rate of input and output, # but actually only the ratio is used. 
- y = librosa.resample(x, ratio, 1, res_type=self.res_type) + y = librosa.resample( + x, orig_sr=ratio, target_sr=1, res_type=self.res_type) if self.keep_length: diff = abs(len(x) - len(y)) diff --git a/paddlespeech/s2t/transform/spectrogram.py b/paddlespeech/s2t/transform/spectrogram.py index a6346c34..889cd349 100644 --- a/paddlespeech/s2t/transform/spectrogram.py +++ b/paddlespeech/s2t/transform/spectrogram.py @@ -38,7 +38,7 @@ def stft(x, x = np.stack( [ librosa.stft( - x[:, ch], + y=x[:, ch], n_fft=n_fft, hop_length=n_shift, win_length=win_length, @@ -67,7 +67,7 @@ def istft(x, n_shift, win_length=None, window="hann", center=True): x = np.stack( [ librosa.istft( - x[:, ch].T, # [Time, Freq] -> [Freq, Time] + stft_matrix=x[:, ch].T, # [Time, Freq] -> [Freq, Time] hop_length=n_shift, win_length=win_length, window=window, @@ -95,7 +95,8 @@ def stft2logmelspectrogram(x_stft, # spc: (Time, Channel, Freq) or (Time, Freq) spc = np.abs(x_stft) # mel_basis: (Mel_freq, Freq) - mel_basis = librosa.filters.mel(fs, n_fft, n_mels, fmin, fmax) + mel_basis = librosa.filters.mel( + sr=fs, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax) # lmspc: (Time, Channel, Mel_freq) or (Time, Mel_freq) lmspc = np.log10(np.maximum(eps, np.dot(spc, mel_basis.T))) diff --git a/paddlespeech/t2s/__init__.py b/paddlespeech/t2s/__init__.py index 8a0acc48..7d93c026 100644 --- a/paddlespeech/t2s/__init__.py +++ b/paddlespeech/t2s/__init__.py @@ -13,7 +13,6 @@ # limitations under the License. import logging -from . import data from . import datasets from . import exps from . import frontend diff --git a/paddlespeech/t2s/audio/__init__.py b/paddlespeech/t2s/audio/__init__.py index 7747b794..0deefc8b 100644 --- a/paddlespeech/t2s/audio/__init__.py +++ b/paddlespeech/t2s/audio/__init__.py @@ -12,5 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
from .audio import AudioProcessor +from .codec import * from .spec_normalizer import LogMagnitude from .spec_normalizer import NormalizerBase diff --git a/paddlespeech/t2s/audio/audio.py b/paddlespeech/t2s/audio/audio.py index ab9a45d3..59ea8c87 100644 --- a/paddlespeech/t2s/audio/audio.py +++ b/paddlespeech/t2s/audio/audio.py @@ -53,8 +53,8 @@ class AudioProcessor(object): def _create_mel_filter(self): mel_filter = librosa.filters.mel( - self.sample_rate, - self.n_fft, + sr=self.sample_rate, + n_fft=self.n_fft, n_mels=self.n_mels, fmin=self.fmin, fmax=self.fmax) diff --git a/paddlespeech/t2s/audio/codec.py b/paddlespeech/t2s/audio/codec.py new file mode 100644 index 00000000..2a759ce4 --- /dev/null +++ b/paddlespeech/t2s/audio/codec.py @@ -0,0 +1,51 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math + +import numpy as np +import paddle + + +# x: [0: 2**bit-1], return: [-1, 1] +def label_2_float(x, bits): + return 2 * x / (2**bits - 1.) - 1. + + +#x: [-1, 1], return: [0, 2**bits-1] +def float_2_label(x, bits): + assert abs(x).max() <= 1.0 + x = (x + 1.) 
* (2**bits - 1) / 2 + return x.clip(0, 2**bits - 1) + + +# y: [-1, 1], mu: 2**bits, return: [0, 2**bits-1] +# see https://en.wikipedia.org/wiki/%CE%9C-law_algorithm +# be careful the input `mu` here, which is +1 than that of the link above +def encode_mu_law(x, mu): + mu = mu - 1 + fx = np.sign(x) * np.log(1 + mu * np.abs(x)) / np.log(1 + mu) + return np.floor((fx + 1) / 2 * mu + 0.5) + + +# from_labels = True: +# y: [0: 2**bit-1], mu: 2**bits, return: [-1,1] +# from_labels = False: +# y: [-1, 1], return: [-1, 1] +def decode_mu_law(y, mu, from_labels=True): + # TODO: get rid of log2 - makes no sense + if from_labels: + y = label_2_float(y, math.log2(mu)) + mu = mu - 1 + x = paddle.sign(y) / mu * ((1 + mu)**paddle.abs(y) - 1) + return x diff --git a/paddlespeech/t2s/datasets/__init__.py b/paddlespeech/t2s/datasets/__init__.py index fc64a82f..caf20aac 100644 --- a/paddlespeech/t2s/datasets/__init__.py +++ b/paddlespeech/t2s/datasets/__init__.py @@ -11,5 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from .common import * from .ljspeech import * diff --git a/paddlespeech/t2s/datasets/am_batch_fn.py b/paddlespeech/t2s/datasets/am_batch_fn.py index 526871a2..4e3ad3c1 100644 --- a/paddlespeech/t2s/datasets/am_batch_fn.py +++ b/paddlespeech/t2s/datasets/am_batch_fn.py @@ -14,7 +14,77 @@ import numpy as np import paddle -from paddlespeech.t2s.data.batch import batch_sequences +from paddlespeech.t2s.datasets.batch import batch_sequences + + +def tacotron2_single_spk_batch_fn(examples): + # fields = ["text", "text_lengths", "speech", "speech_lengths"] + text = [np.array(item["text"], dtype=np.int64) for item in examples] + speech = [np.array(item["speech"], dtype=np.float32) for item in examples] + text_lengths = [ + np.array(item["text_lengths"], dtype=np.int64) for item in examples + ] + speech_lengths = [ + np.array(item["speech_lengths"], dtype=np.int64) for item in examples + ] + + text = batch_sequences(text) + speech = batch_sequences(speech) + + # convert each batch to paddle.Tensor + text = paddle.to_tensor(text) + speech = paddle.to_tensor(speech) + text_lengths = paddle.to_tensor(text_lengths) + speech_lengths = paddle.to_tensor(speech_lengths) + + batch = { + "text": text, + "text_lengths": text_lengths, + "speech": speech, + "speech_lengths": speech_lengths, + } + return batch + + +def tacotron2_multi_spk_batch_fn(examples): + # fields = ["text", "text_lengths", "speech", "speech_lengths"] + text = [np.array(item["text"], dtype=np.int64) for item in examples] + speech = [np.array(item["speech"], dtype=np.float32) for item in examples] + text_lengths = [ + np.array(item["text_lengths"], dtype=np.int64) for item in examples + ] + speech_lengths = [ + np.array(item["speech_lengths"], dtype=np.int64) for item in examples + ] + + text = batch_sequences(text) + speech = batch_sequences(speech) + + # convert each batch to paddle.Tensor + text = paddle.to_tensor(text) + speech = paddle.to_tensor(speech) + text_lengths = paddle.to_tensor(text_lengths) + 
speech_lengths = paddle.to_tensor(speech_lengths) + + batch = { + "text": text, + "text_lengths": text_lengths, + "speech": speech, + "speech_lengths": speech_lengths, + } + # spk_emb has a higher priority than spk_id + if "spk_emb" in examples[0]: + spk_emb = [ + np.array(item["spk_emb"], dtype=np.float32) for item in examples + ] + spk_emb = batch_sequences(spk_emb) + spk_emb = paddle.to_tensor(spk_emb) + batch["spk_emb"] = spk_emb + elif "spk_id" in examples[0]: + spk_id = [np.array(item["spk_id"], dtype=np.int64) for item in examples] + spk_id = paddle.to_tensor(spk_id) + batch["spk_id"] = spk_id + return batch def speedyspeech_single_spk_batch_fn(examples): @@ -56,7 +126,7 @@ def speedyspeech_single_spk_batch_fn(examples): def speedyspeech_multi_spk_batch_fn(examples): - # fields = ["phones", "tones", "num_phones", "num_frames", "feats", "durations"] + # fields = ["phones", "tones", "num_phones", "num_frames", "feats", "durations", "spk_id"] phones = [np.array(item["phones"], dtype=np.int64) for item in examples] tones = [np.array(item["tones"], dtype=np.int64) for item in examples] feats = [np.array(item["feats"], dtype=np.float32) for item in examples] diff --git a/paddlespeech/t2s/data/batch.py b/paddlespeech/t2s/datasets/batch.py similarity index 100% rename from paddlespeech/t2s/data/batch.py rename to paddlespeech/t2s/datasets/batch.py diff --git a/paddlespeech/t2s/datasets/common.py b/paddlespeech/t2s/datasets/common.py deleted file mode 100644 index d6fa3a84..00000000 --- a/paddlespeech/t2s/datasets/common.py +++ /dev/null @@ -1,92 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from pathlib import Path -from typing import List - -import librosa -import numpy as np -from paddle.io import Dataset - -__all__ = ["AudioSegmentDataset", "AudioDataset", "AudioFolderDataset"] - - -class AudioSegmentDataset(Dataset): - """A simple dataset adaptor for audio files to train vocoders. - Read -> trim silence -> normalize -> extract a segment - """ - - def __init__(self, - file_paths: List[Path], - sample_rate: int, - length: int, - top_db: float): - self.file_paths = file_paths - self.sr = sample_rate - self.top_db = top_db - self.length = length # samples in the clip - - def __getitem__(self, i): - fpath = self.file_paths[i] - y, sr = librosa.load(fpath, self.sr) - y, _ = librosa.effects.trim(y, top_db=self.top_db) - y = librosa.util.normalize(y) - y = y.astype(np.float32) - - # pad or trim - if y.size <= self.length: - y = np.pad(y, [0, self.length - len(y)], mode='constant') - else: - start = np.random.randint(0, 1 + len(y) - self.length) - y = y[start:start + self.length] - return y - - def __len__(self): - return len(self.file_paths) - - -class AudioDataset(Dataset): - """A simple dataset adaptor for the audio files. 
- Read -> trim silence -> normalize - """ - - def __init__(self, - file_paths: List[Path], - sample_rate: int, - top_db: float=60): - self.file_paths = file_paths - self.sr = sample_rate - self.top_db = top_db - - def __getitem__(self, i): - fpath = self.file_paths[i] - y, sr = librosa.load(fpath, self.sr) - y, _ = librosa.effects.trim(y, top_db=self.top_db) - y = librosa.util.normalize(y) - y = y.astype(np.float32) - return y - - def __len__(self): - return len(self.file_paths) - - -class AudioFolderDataset(AudioDataset): - def __init__( - self, - root, - sample_rate, - top_db=60, - extension=".wav", ): - root = Path(root).expanduser() - file_paths = sorted(list(root.rglob("*{}".format(extension)))) - super().__init__(file_paths, sample_rate, top_db) diff --git a/paddlespeech/t2s/datasets/data_table.py b/paddlespeech/t2s/datasets/data_table.py index b0e4c891..c9815af2 100644 --- a/paddlespeech/t2s/datasets/data_table.py +++ b/paddlespeech/t2s/datasets/data_table.py @@ -22,26 +22,17 @@ from paddle.io import Dataset class DataTable(Dataset): """Dataset to load and convert data for general purpose. - - Parameters - ---------- - data : List[Dict[str, Any]] - Metadata, a list of meta datum, each of which is composed of - several fields - fields : List[str], optional - Fields to use, if not specified, all the fields in the data are - used, by default None - converters : Dict[str, Callable], optional - Converters used to process each field, by default None - use_cache : bool, optional - Whether to use cache, by default False - - Raises - ------ - ValueError - If there is some field that does not exist in data. - ValueError - If there is some field in converters that does not exist in fields. 
+ Args: + data (List[Dict[str, Any]]): Metadata, a list of meta datum, each of which is composed of several fields + fields (List[str], optional): Fields to use, if not specified, all the fields in the data are used, by default None + converters (Dict[str, Callable], optional): Converters used to process each field, by default None + use_cache (bool, optional): Whether to use cache, by default False + + Raises: + ValueError: + If there is some field that does not exist in data. + ValueError: + If there is some field in converters that does not exist in fields. """ def __init__(self, @@ -95,15 +86,11 @@ class DataTable(Dataset): """Convert a meta datum to an example by applying the corresponding converters to each fields requested. - Parameters - ---------- - meta_datum : Dict[str, Any] - Meta datum + Args: + meta_datum (Dict[str, Any]): Meta datum - Returns - ------- - Dict[str, Any] - Converted example + Returns: + Dict[str, Any]: Converted example """ example = {} for field in self.fields: @@ -118,16 +105,11 @@ class DataTable(Dataset): def __getitem__(self, idx: int) -> Dict[str, Any]: """Get an example given an index. 
+ Args: + idx (int): Index of the example to get - Parameters - ---------- - idx : int - Index of the example to get - - Returns - ------- - Dict[str, Any] - A converted example + Returns: + Dict[str, Any]: A converted example """ if self.use_cache and self.caches[idx] is not None: return self.caches[idx] diff --git a/paddlespeech/t2s/data/dataset.py b/paddlespeech/t2s/datasets/dataset.py similarity index 99% rename from paddlespeech/t2s/data/dataset.py rename to paddlespeech/t2s/datasets/dataset.py index 2d6c03cb..f81c2877 100644 --- a/paddlespeech/t2s/data/dataset.py +++ b/paddlespeech/t2s/datasets/dataset.py @@ -258,4 +258,4 @@ class ChainDataset(Dataset): return dataset[i] i -= len(dataset) - raise IndexError("dataset index out of range") + raise IndexError("dataset index out of range") \ No newline at end of file diff --git a/paddlespeech/t2s/data/get_feats.py b/paddlespeech/t2s/datasets/get_feats.py similarity index 100% rename from paddlespeech/t2s/data/get_feats.py rename to paddlespeech/t2s/datasets/get_feats.py diff --git a/paddlespeech/t2s/datasets/preprocess_utils.py b/paddlespeech/t2s/datasets/preprocess_utils.py index 8b01f6c3..445b69bd 100644 --- a/paddlespeech/t2s/datasets/preprocess_utils.py +++ b/paddlespeech/t2s/datasets/preprocess_utils.py @@ -18,14 +18,10 @@ import re def get_phn_dur(file_name): ''' read MFA duration.txt - Parameters - ---------- - file_name : str or Path - path of gen_duration_from_textgrid.py's result - Returns - ---------- - Dict - sentence: {'utt': ([char], [int])} + Args: + file_name (str or Path): path of gen_duration_from_textgrid.py's result + Returns: + Dict: sentence: {'utt': ([char], [int])} ''' f = open(file_name, 'r') sentence = {} @@ -48,10 +44,8 @@ def get_phn_dur(file_name): def merge_silence(sentence): ''' merge silences - Parameters - ---------- - sentence : Dict - sentence: {'utt': (([char], [int]), str)} + Args: + sentence (Dict): sentence: {'utt': (([char], [int]), str)} ''' for utt in sentence: cur_phn, 
cur_dur, speaker = sentence[utt] @@ -81,12 +75,9 @@ def merge_silence(sentence): def get_input_token(sentence, output_path, dataset="baker"): ''' get phone set from training data and save it - Parameters - ---------- - sentence : Dict - sentence: {'utt': ([char], [int])} - output_path : str or path - path to save phone_id_map + Args: + sentence (Dict): sentence: {'utt': ([char], [int])} + output_path (str or path):path to save phone_id_map ''' phn_token = set() for utt in sentence: @@ -112,14 +103,10 @@ def get_phones_tones(sentence, dataset="baker"): ''' get phone set and tone set from training data and save it - Parameters - ---------- - sentence : Dict - sentence: {'utt': ([char], [int])} - phones_output_path : str or path - path to save phone_id_map - tones_output_path : str or path - path to save tone_id_map + Args: + sentence (Dict): sentence: {'utt': ([char], [int])} + phones_output_path (str or path): path to save phone_id_map + tones_output_path (str or path): path to save tone_id_map ''' phn_token = set() tone_token = set() @@ -162,14 +149,10 @@ def get_spk_id_map(speaker_set, output_path): def compare_duration_and_mel_length(sentences, utt, mel): ''' check duration error, correct sentences[utt] if possible, else pop sentences[utt] - Parameters - ---------- - sentences : Dict - sentences[utt] = [phones_list ,durations_list] - utt : str - utt_id - mel : np.ndarry - features (num_frames, n_mels) + Args: + sentences (Dict): sentences[utt] = [phones_list ,durations_list] + utt (str): utt_id + mel (np.ndarry): features (num_frames, n_mels) ''' if utt in sentences: diff --git a/paddlespeech/t2s/datasets/vocoder_batch_fn.py b/paddlespeech/t2s/datasets/vocoder_batch_fn.py index 2e4f740f..08748de0 100644 --- a/paddlespeech/t2s/datasets/vocoder_batch_fn.py +++ b/paddlespeech/t2s/datasets/vocoder_batch_fn.py @@ -14,6 +14,10 @@ import numpy as np import paddle +from paddlespeech.t2s.audio.codec import encode_mu_law +from paddlespeech.t2s.audio.codec import 
float_2_label +from paddlespeech.t2s.audio.codec import label_2_float + class Clip(object): """Collate functor for training vocoders. @@ -25,15 +29,11 @@ class Clip(object): hop_size=256, aux_context_window=0, ): """Initialize customized collater for DataLoader. + Args: - Parameters - ---------- - batch_max_steps : int - The maximum length of input signal in batch. - hop_size : int - Hop size of auxiliary features. - aux_context_window : int - Context window size for auxiliary feature conv. + batch_max_steps (int): The maximum length of input signal in batch. + hop_size (int): Hop size of auxiliary features. + aux_context_window (int): Context window size for auxiliary feature conv. """ if batch_max_steps % hop_size != 0: @@ -49,29 +49,26 @@ class Clip(object): self.end_offset = -(self.batch_max_frames + aux_context_window) self.mel_threshold = self.batch_max_frames + 2 * aux_context_window - def __call__(self, examples): + def __call__(self, batch): """Convert into batch tensors. - Parameters - ---------- - batch : list - list of tuple of the pair of audio and features. Audio shape (T, ), features shape(T', C). + Args: + batch (list): list of tuple of the pair of audio and features. Audio shape (T, ), features shape(T', C). - Returns - ---------- - Tensor - Auxiliary feature batch (B, C, T'), where - T = (T' - 2 * aux_context_window) * hop_size. - Tensor - Target signal batch (B, 1, T). + Returns: + Tensor: + Auxiliary feature batch (B, C, T'), where + T = (T' - 2 * aux_context_window) * hop_size. + Tensor: + Target signal batch (B, 1, T). 
""" # check length - examples = [ - self._adjust_length(b['wave'], b['feats']) for b in examples + batch = [ + self._adjust_length(b['wave'], b['feats']) for b in batch if b['feats'].shape[0] > self.mel_threshold ] - xs, cs = [b[0] for b in examples], [b[1] for b in examples] + xs, cs = [b[0] for b in batch], [b[1] for b in batch] # make batch with random cut c_lengths = [c.shape[0] for c in cs] @@ -89,7 +86,7 @@ class Clip(object): c_batch = np.stack( [c[start:end] for c, start, end in zip(cs, c_starts, c_ends)]) - # convert each batch to tensor, asuume that each item in batch has the same length + # convert each batch to tensor, assume that each item in batch has the same length y_batch = paddle.to_tensor( y_batch, dtype=paddle.float32).unsqueeze(1) # (B, 1, T) c_batch = paddle.to_tensor( @@ -100,11 +97,10 @@ class Clip(object): def _adjust_length(self, x, c): """Adjust the audio and feature lengths. - Note - ------- - Basically we assume that the length of x and c are adjusted - through preprocessing stage, but if we use other library processed - features, this process will be needed. + Note: + Basically we assume that the length of x and c are adjusted + through preprocessing stage, but if we use other library processed + features, this process will be needed. 
""" if len(x) < c.shape[0] * self.hop_size: @@ -120,3 +116,105 @@ class Clip(object): 0] * self.hop_size, f"wave length: ({len(x)}), mel length: ({c.shape[0]})" return x, c + + +class WaveRNNClip(Clip): + def __init__(self, + mode: str='RAW', + batch_max_steps: int=4500, + hop_size: int=300, + aux_context_window: int=2, + bits: int=9, + mu_law: bool=True): + self.mode = mode + self.mel_win = batch_max_steps // hop_size + 2 * aux_context_window + self.batch_max_steps = batch_max_steps + self.hop_size = hop_size + self.aux_context_window = aux_context_window + self.mu_law = mu_law + self.batch_max_frames = batch_max_steps // hop_size + self.mel_threshold = self.batch_max_frames + 2 * aux_context_window + if self.mode == 'MOL': + self.bits = 16 + else: + self.bits = bits + + def to_quant(self, wav): + if self.mode == 'RAW': + if self.mu_law: + quant = encode_mu_law(wav, mu=2**self.bits) + else: + quant = float_2_label(wav, bits=self.bits) + elif self.mode == 'MOL': + quant = float_2_label(wav, bits=16) + quant = quant.astype(np.int64) + return quant + + def __call__(self, batch): + # voc_pad = 2 this will pad the input so that the resnet can 'see' wider than input length + # max_offsets = n_frames - 2 - (mel_win + 2 * hp.voc_pad) = n_frames - 15 + """Convert into batch tensors. + Args: + batch (list): list of tuple of the pair of audio and features. Audio shape (T, ), features shape(T', C). + + Returns: + Tensor: Input signal batch (B, 1, T). + Tensor: Target signal batch (B, 1, T). + Tensor: Auxiliary feature batch (B, C, T'), + where T = (T' - 2 * aux_context_window) * hop_size. 
+ + """ + # check length + batch = [ + self._adjust_length(b['wave'], b['feats']) for b in batch + if b['feats'].shape[0] > self.mel_threshold + ] + wav, mel = [b[0] for b in batch], [b[1] for b in batch] + # mel 此处需要转置 + mel = [x.T for x in mel] + max_offsets = [ + x.shape[-1] - 2 - (self.mel_win + 2 * self.aux_context_window) + for x in mel + ] + # the slice point of mel selecting randomly + mel_offsets = [np.random.randint(0, offset) for offset in max_offsets] + # the slice point of wav selecting randomly, which is behind 2(=pad) frames + sig_offsets = [(offset + self.aux_context_window) * self.hop_size + for offset in mel_offsets] + # mels.shape[1] = voc_seq_len // hop_length + 2 * voc_pad + mels = [ + x[:, mel_offsets[i]:mel_offsets[i] + self.mel_win] + for i, x in enumerate(mel) + ] + # label.shape[1] = voc_seq_len + 1 + wav = [self.to_quant(x) for x in wav] + + labels = [ + x[sig_offsets[i]:sig_offsets[i] + self.batch_max_steps + 1] + for i, x in enumerate(wav) + ] + + mels = np.stack(mels).astype(np.float32) + labels = np.stack(labels).astype(np.int64) + + mels = paddle.to_tensor(mels) + labels = paddle.to_tensor(labels, dtype='int64') + # x is input, y is label + x = labels[:, :self.batch_max_steps] + y = labels[:, 1:] + ''' + mode = RAW: + mu_law = True: + quant: bits = 9 0, 1, 2, ..., 509, 510, 511 int + mu_law = False + quant bits = 9 [0, 511] float + mode = MOL: + quant: bits = 16 [0. 
65536] float + ''' + # x should be normalizes in.[0, 1] in RAW mode + x = label_2_float(paddle.cast(x, dtype='float32'), self.bits) + # y should be normalizes in.[0, 1] in MOL mode + if self.mode == 'MOL': + y = label_2_float(paddle.cast(y, dtype='float32'), self.bits) + + return x, y, mels diff --git a/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py b/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py index 4ddd19f7..3fded29b 100644 --- a/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py +++ b/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py @@ -15,20 +15,21 @@ # for mb melgan finetune # 长度和原本的 mel 不一致怎么办? import argparse +import os from pathlib import Path import numpy as np import paddle import yaml -from yacs.config import CfgNode from tqdm import tqdm -import os +from yacs.config import CfgNode from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur from paddlespeech.t2s.datasets.preprocess_utils import merge_silence from paddlespeech.t2s.models.fastspeech2 import FastSpeech2 from paddlespeech.t2s.models.fastspeech2 import StyleFastSpeech2Inference from paddlespeech.t2s.modules.normalizer import ZScore +from paddlespeech.t2s.utils import str2bool def evaluate(args, fastspeech2_config): @@ -50,11 +51,14 @@ def evaluate(args, fastspeech2_config): spk_id_list = [line.strip().split() for line in f.readlines()] spk_num = len(spk_id_list) else: - spk_num=None + spk_num = None odim = fastspeech2_config.n_mels model = FastSpeech2( - idim=vocab_size, odim=odim, **fastspeech2_config["model"], spk_num=spk_num) + idim=vocab_size, + odim=odim, + **fastspeech2_config["model"], + spk_num=spk_num) model.set_state_dict( paddle.load(args.fastspeech2_checkpoint)["main_params"]) @@ -99,9 +103,15 @@ def evaluate(args, fastspeech2_config): else: train_wav_files += wav_files - train_wav_files = [os.path.basename(str(str_path)) for str_path in train_wav_files] - dev_wav_files = [os.path.basename(str(str_path)) for str_path in dev_wav_files] - test_wav_files = 
[os.path.basename(str(str_path)) for str_path in test_wav_files] + train_wav_files = [ + os.path.basename(str(str_path)) for str_path in train_wav_files + ] + dev_wav_files = [ + os.path.basename(str(str_path)) for str_path in dev_wav_files + ] + test_wav_files = [ + os.path.basename(str(str_path)) for str_path in test_wav_files + ] for i, utt_id in enumerate(tqdm(sentences)): phones = sentences[utt_id][0] @@ -122,7 +132,8 @@ def evaluate(args, fastspeech2_config): phone_ids = paddle.to_tensor(np.array(phone_ids)) if args.speaker_dict: - speaker_id = int([item[1] for item in spk_id_list if speaker == item[0]][0]) + speaker_id = int( + [item[1] for item in spk_id_list if speaker == item[0]][0]) speaker_id = paddle.to_tensor(speaker_id) else: speaker_id = None @@ -143,7 +154,8 @@ def evaluate(args, fastspeech2_config): sub_output_dir.mkdir(parents=True, exist_ok=True) with paddle.no_grad(): - mel = fastspeech2_inference(phone_ids, durations=durations, spk_id=speaker_id) + mel = fastspeech2_inference( + phone_ids, durations=durations, spk_id=speaker_id) np.save(sub_output_dir / (utt_id + "_feats.npy"), mel) @@ -175,12 +187,9 @@ def main(): type=str, default="phone_id_map.txt", help="phone vocabulary file.") - + parser.add_argument( - "--speaker-dict", - type=str, - default=None, - help="speaker id map file.") + "--speaker-dict", type=str, default=None, help="speaker id map file.") parser.add_argument( "--dur-file", default=None, type=str, help="path to durations.txt.") @@ -188,9 +197,6 @@ def main(): parser.add_argument( "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") - def str2bool(str): - return True if str.lower() == 'true' else False - parser.add_argument( "--cut-sil", type=str2bool, diff --git a/paddlespeech/t2s/exps/fastspeech2/preprocess.py b/paddlespeech/t2s/exps/fastspeech2/preprocess.py index b874b3a7..5bda7545 100644 --- a/paddlespeech/t2s/exps/fastspeech2/preprocess.py +++ b/paddlespeech/t2s/exps/fastspeech2/preprocess.py @@ -27,14 +27,15 @@ 
import tqdm import yaml from yacs.config import CfgNode -from paddlespeech.t2s.data.get_feats import Energy -from paddlespeech.t2s.data.get_feats import LogMelFBank -from paddlespeech.t2s.data.get_feats import Pitch +from paddlespeech.t2s.datasets.get_feats import Energy +from paddlespeech.t2s.datasets.get_feats import LogMelFBank +from paddlespeech.t2s.datasets.get_feats import Pitch from paddlespeech.t2s.datasets.preprocess_utils import compare_duration_and_mel_length from paddlespeech.t2s.datasets.preprocess_utils import get_input_token from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur from paddlespeech.t2s.datasets.preprocess_utils import get_spk_id_map from paddlespeech.t2s.datasets.preprocess_utils import merge_silence +from paddlespeech.t2s.utils import str2bool def process_sentence(config: Dict[str, Any], @@ -203,9 +204,6 @@ def main(): parser.add_argument( "--num-cpu", type=int, default=1, help="number of process.") - def str2bool(str): - return True if str.lower() == 'true' else False - parser.add_argument( "--cut-sil", type=str2bool, diff --git a/paddlespeech/t2s/exps/fastspeech2/train.py b/paddlespeech/t2s/exps/fastspeech2/train.py index 1dfa575a..10e023d0 100644 --- a/paddlespeech/t2s/exps/fastspeech2/train.py +++ b/paddlespeech/t2s/exps/fastspeech2/train.py @@ -38,6 +38,7 @@ from paddlespeech.t2s.training.extensions.visualizer import VisualDL from paddlespeech.t2s.training.optimizer import build_optimizers from paddlespeech.t2s.training.seeding import seed_everything from paddlespeech.t2s.training.trainer import Trainer +from paddlespeech.t2s.utils import str2bool def train_sp(args, config): @@ -159,9 +160,8 @@ def train_sp(args, config): if dist.get_rank() == 0: trainer.extend(evaluator, trigger=(1, "epoch")) trainer.extend(VisualDL(output_dir), trigger=(1, "iteration")) - trainer.extend( - Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch')) - # print(trainer.extensions) + trainer.extend( + 
Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch')) trainer.run() @@ -182,9 +182,6 @@ def main(): default=None, help="speaker id map file for multiple speaker model.") - def str2bool(str): - return True if str.lower() == 'true' else False - parser.add_argument( "--voice-cloning", type=str2bool, diff --git a/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py b/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py index 9ac6cbd3..c70821e7 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py @@ -231,9 +231,9 @@ def train_sp(args, config): trainer.extend( evaluator, trigger=(config.eval_interval_steps, 'iteration')) trainer.extend(VisualDL(output_dir), trigger=(1, 'iteration')) - trainer.extend( - Snapshot(max_size=config.num_snapshots), - trigger=(config.save_interval_steps, 'iteration')) + trainer.extend( + Snapshot(max_size=config.num_snapshots), + trigger=(config.save_interval_steps, 'iteration')) print("Trainer Done!") trainer.run() diff --git a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py index 3d0ff7d3..27ffded6 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py @@ -219,9 +219,9 @@ def train_sp(args, config): trainer.extend( evaluator, trigger=(config.eval_interval_steps, 'iteration')) trainer.extend(VisualDL(output_dir), trigger=(1, 'iteration')) - trainer.extend( - Snapshot(max_size=config.num_snapshots), - trigger=(config.save_interval_steps, 'iteration')) + trainer.extend( + Snapshot(max_size=config.num_snapshots), + trigger=(config.save_interval_steps, 'iteration')) print("Trainer Done!") trainer.run() diff --git a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py index f5affb50..def30e67 100644 --- 
a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py +++ b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py @@ -23,7 +23,7 @@ import soundfile as sf import yaml from yacs.config import CfgNode -from paddlespeech.t2s.data.get_feats import LogMelFBank +from paddlespeech.t2s.datasets.get_feats import LogMelFBank from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator from paddlespeech.t2s.models.parallel_wavegan import PWGInference from paddlespeech.t2s.modules.normalizer import ZScore diff --git a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py index a7881d6b..92de7a2c 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py @@ -41,6 +41,7 @@ from paddlespeech.t2s.training.extensions.snapshot import Snapshot from paddlespeech.t2s.training.extensions.visualizer import VisualDL from paddlespeech.t2s.training.seeding import seed_everything from paddlespeech.t2s.training.trainer import Trainer +from paddlespeech.t2s.utils import str2bool def train_sp(args, config): @@ -193,19 +194,16 @@ def train_sp(args, config): trainer.extend( evaluator, trigger=(config.eval_interval_steps, 'iteration')) trainer.extend(VisualDL(output_dir), trigger=(1, 'iteration')) - trainer.extend( - Snapshot(max_size=config.num_snapshots), - trigger=(config.save_interval_steps, 'iteration')) + trainer.extend( + Snapshot(max_size=config.num_snapshots), + trigger=(config.save_interval_steps, 'iteration')) - # print(trainer.extensions.keys()) print("Trainer Done!") trainer.run() def main(): # parse args and config and redirect to train_sp - def str2bool(str): - return True if str.lower() == 'true' else False parser = argparse.ArgumentParser( description="Train a ParallelWaveGAN model.") diff --git a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py 
b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py index 782fbdf2..4871bca7 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py +++ b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py @@ -27,9 +27,10 @@ import tqdm import yaml from yacs.config import CfgNode -from paddlespeech.t2s.data.get_feats import LogMelFBank +from paddlespeech.t2s.datasets.get_feats import LogMelFBank from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur from paddlespeech.t2s.datasets.preprocess_utils import merge_silence +from paddlespeech.t2s.utils import str2bool def process_sentence(config: Dict[str, Any], @@ -165,9 +166,6 @@ def main(): parser.add_argument( "--dur-file", default=None, type=str, help="path to durations.txt.") - def str2bool(str): - return True if str.lower() == 'true' else False - parser.add_argument( "--cut-sil", type=str2bool, diff --git a/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py b/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py index b162260d..be3ba742 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py @@ -212,9 +212,9 @@ def train_sp(args, config): trainer.extend( evaluator, trigger=(config.eval_interval_steps, 'iteration')) trainer.extend(VisualDL(output_dir), trigger=(1, 'iteration')) - trainer.extend( - Snapshot(max_size=config.num_snapshots), - trigger=(config.save_interval_steps, 'iteration')) + trainer.extend( + Snapshot(max_size=config.num_snapshots), + trigger=(config.save_interval_steps, 'iteration')) print("Trainer Done!") trainer.run() diff --git a/paddlespeech/t2s/exps/inference.py b/paddlespeech/t2s/exps/inference.py index e1d5306c..26d7e2c0 100644 --- a/paddlespeech/t2s/exps/inference.py +++ b/paddlespeech/t2s/exps/inference.py @@ -14,9 +14,11 @@ import argparse from pathlib import Path +import numpy import soundfile as sf from paddle import inference +from paddlespeech.t2s.frontend import English from 
paddlespeech.t2s.frontend.zh_frontend import Frontend @@ -29,20 +31,38 @@ def main(): '--am', type=str, default='fastspeech2_csmsc', - choices=['speedyspeech_csmsc', 'fastspeech2_csmsc'], + choices=[ + 'speedyspeech_csmsc', 'fastspeech2_csmsc', 'fastspeech2_aishell3', + 'fastspeech2_vctk', 'tacotron2_csmsc' + ], help='Choose acoustic model type of tts task.') parser.add_argument( "--phones_dict", type=str, default=None, help="phone vocabulary file.") parser.add_argument( "--tones_dict", type=str, default=None, help="tone vocabulary file.") + parser.add_argument( + "--speaker_dict", type=str, default=None, help="speaker id map file.") + parser.add_argument( + '--spk_id', + type=int, + default=0, + help='spk id for multi speaker acoustic model') # voc parser.add_argument( '--voc', type=str, default='pwgan_csmsc', - choices=['pwgan_csmsc', 'mb_melgan_csmsc', 'hifigan_csmsc'], + choices=[ + 'pwgan_csmsc', 'mb_melgan_csmsc', 'hifigan_csmsc', 'pwgan_aishell3', + 'pwgan_vctk', 'wavernn_csmsc' + ], help='Choose vocoder type of tts task.') # other + parser.add_argument( + '--lang', + type=str, + default='zh', + help='Choose model language. 
zh or en') parser.add_argument( "--text", type=str, @@ -53,8 +73,12 @@ def main(): args, _ = parser.parse_known_args() - frontend = Frontend( - phone_vocab_path=args.phones_dict, tone_vocab_path=args.tones_dict) + # frontend + if args.lang == 'zh': + frontend = Frontend( + phone_vocab_path=args.phones_dict, tone_vocab_path=args.tones_dict) + elif args.lang == 'en': + frontend = English(phone_vocab_path=args.phones_dict) print("frontend done!") # model: {model_name}_{dataset} @@ -83,30 +107,53 @@ def main(): print("in new inference") + # construct dataset for evaluation + sentences = [] with open(args.text, 'rt') as f: for line in f: items = line.strip().split() utt_id = items[0] - sentence = "".join(items[1:]) + if args.lang == 'zh': + sentence = "".join(items[1:]) + elif args.lang == 'en': + sentence = " ".join(items[1:]) sentences.append((utt_id, sentence)) get_tone_ids = False + get_spk_id = False if am_name == 'speedyspeech': get_tone_ids = True + if am_dataset in {"aishell3", "vctk"} and args.speaker_dict: + get_spk_id = True + spk_id = numpy.array([args.spk_id]) am_input_names = am_predictor.get_input_names() - + print("am_input_names:", am_input_names) + merge_sentences = True for utt_id, sentence in sentences: - input_ids = frontend.get_input_ids( - sentence, merge_sentences=True, get_tone_ids=get_tone_ids) - phone_ids = input_ids["phone_ids"] + if args.lang == 'zh': + input_ids = frontend.get_input_ids( + sentence, + merge_sentences=merge_sentences, + get_tone_ids=get_tone_ids) + phone_ids = input_ids["phone_ids"] + elif args.lang == 'en': + input_ids = frontend.get_input_ids( + sentence, merge_sentences=merge_sentences) + phone_ids = input_ids["phone_ids"] + else: + print("lang should in {'zh', 'en'}!") + if get_tone_ids: tone_ids = input_ids["tone_ids"] tones = tone_ids[0].numpy() tones_handle = am_predictor.get_input_handle(am_input_names[1]) tones_handle.reshape(tones.shape) tones_handle.copy_from_cpu(tones) - + if get_spk_id: + spk_id_handle = 
am_predictor.get_input_handle(am_input_names[1]) + spk_id_handle.reshape(spk_id.shape) + spk_id_handle.copy_from_cpu(spk_id) phones = phone_ids[0].numpy() phones_handle = am_predictor.get_input_handle(am_input_names[0]) phones_handle.reshape(phones.shape) diff --git a/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py b/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py index b6440fd6..31b7d2ea 100644 --- a/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py +++ b/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py @@ -30,6 +30,7 @@ from paddlespeech.t2s.frontend.zh_frontend import Frontend from paddlespeech.t2s.models.speedyspeech import SpeedySpeech from paddlespeech.t2s.models.speedyspeech import SpeedySpeechInference from paddlespeech.t2s.modules.normalizer import ZScore +from paddlespeech.t2s.utils import str2bool def evaluate(args, speedyspeech_config): @@ -213,9 +214,6 @@ def main(): parser.add_argument( "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") - def str2bool(str): - return True if str.lower() == 'true' else False - parser.add_argument( "--cut-sil", type=str2bool, diff --git a/paddlespeech/t2s/exps/speedyspeech/normalize.py b/paddlespeech/t2s/exps/speedyspeech/normalize.py index a427c469..249a4d6d 100644 --- a/paddlespeech/t2s/exps/speedyspeech/normalize.py +++ b/paddlespeech/t2s/exps/speedyspeech/normalize.py @@ -23,6 +23,7 @@ from sklearn.preprocessing import StandardScaler from tqdm import tqdm from paddlespeech.t2s.datasets.data_table import DataTable +from paddlespeech.t2s.utils import str2bool def main(): @@ -55,9 +56,6 @@ def main(): default=1, help="logging level. higher is more logging. 
(default=1)") - def str2bool(str): - return True if str.lower() == 'true' else False - parser.add_argument( "--use-relative-path", type=str2bool, diff --git a/paddlespeech/t2s/exps/speedyspeech/preprocess.py b/paddlespeech/t2s/exps/speedyspeech/preprocess.py index 9ff77144..3f81c4e1 100644 --- a/paddlespeech/t2s/exps/speedyspeech/preprocess.py +++ b/paddlespeech/t2s/exps/speedyspeech/preprocess.py @@ -27,12 +27,13 @@ import tqdm import yaml from yacs.config import CfgNode -from paddlespeech.t2s.data.get_feats import LogMelFBank +from paddlespeech.t2s.datasets.get_feats import LogMelFBank from paddlespeech.t2s.datasets.preprocess_utils import compare_duration_and_mel_length from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur from paddlespeech.t2s.datasets.preprocess_utils import get_phones_tones from paddlespeech.t2s.datasets.preprocess_utils import get_spk_id_map from paddlespeech.t2s.datasets.preprocess_utils import merge_silence +from paddlespeech.t2s.utils import str2bool def process_sentence(config: Dict[str, Any], @@ -190,9 +191,6 @@ def main(): parser.add_argument( "--num-cpu", type=int, default=1, help="number of process.") - def str2bool(str): - return True if str.lower() == 'true' else False - parser.add_argument( "--cut-sil", type=str2bool, diff --git a/paddlespeech/t2s/exps/speedyspeech/train.py b/paddlespeech/t2s/exps/speedyspeech/train.py index 448cd7bb..bda5370c 100644 --- a/paddlespeech/t2s/exps/speedyspeech/train.py +++ b/paddlespeech/t2s/exps/speedyspeech/train.py @@ -38,6 +38,7 @@ from paddlespeech.t2s.training.extensions.visualizer import VisualDL from paddlespeech.t2s.training.optimizer import build_optimizers from paddlespeech.t2s.training.seeding import seed_everything from paddlespeech.t2s.training.trainer import Trainer +from paddlespeech.t2s.utils import str2bool def train_sp(args, config): @@ -170,8 +171,8 @@ def train_sp(args, config): if dist.get_rank() == 0: trainer.extend(evaluator, trigger=(1, "epoch")) 
trainer.extend(VisualDL(output_dir), trigger=(1, "iteration")) - trainer.extend( - Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch')) + trainer.extend( + Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch')) trainer.run() @@ -186,9 +187,6 @@ def main(): parser.add_argument( "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") - def str2bool(str): - return True if str.lower() == 'true' else False - parser.add_argument( "--use-relative-path", type=str2bool, diff --git a/paddlespeech/t2s/exps/synthesize.py b/paddlespeech/t2s/exps/synthesize.py index f5477470..1c42a87c 100644 --- a/paddlespeech/t2s/exps/synthesize.py +++ b/paddlespeech/t2s/exps/synthesize.py @@ -25,6 +25,7 @@ from yacs.config import CfgNode from paddlespeech.s2t.utils.dynamic_import import dynamic_import from paddlespeech.t2s.datasets.data_table import DataTable from paddlespeech.t2s.modules.normalizer import ZScore +from paddlespeech.t2s.utils import str2bool model_alias = { # acoustic model @@ -36,6 +37,10 @@ model_alias = { "paddlespeech.t2s.models.fastspeech2:FastSpeech2", "fastspeech2_inference": "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference", + "tacotron2": + "paddlespeech.t2s.models.tacotron2:Tacotron2", + "tacotron2_inference": + "paddlespeech.t2s.models.tacotron2:Tacotron2Inference", # voc "pwgan": "paddlespeech.t2s.models.parallel_wavegan:PWGGenerator", @@ -91,6 +96,11 @@ def evaluate(args): print("spk_num:", spk_num) elif am_name == 'speedyspeech': fields = ["utt_id", "phones", "tones"] + elif am_name == 'tacotron2': + fields = ["utt_id", "text"] + if args.voice_cloning: + print("voice cloning!") + fields += ["spk_emb"] test_dataset = DataTable(data=test_metadata, fields=fields) @@ -117,6 +127,8 @@ def evaluate(args): elif am_name == 'speedyspeech': am = am_class( vocab_size=vocab_size, tone_size=tone_size, **am_config["model"]) + elif am_name == 'tacotron2': + am = am_class(idim=vocab_size, odim=odim, **am_config["model"]) 
am.set_state_dict(paddle.load(args.am_ckpt)["main_params"]) am.eval() @@ -168,6 +180,13 @@ def evaluate(args): phone_ids = paddle.to_tensor(datum["phones"]) tone_ids = paddle.to_tensor(datum["tones"]) mel = am_inference(phone_ids, tone_ids) + elif am_name == 'tacotron2': + phone_ids = paddle.to_tensor(datum["text"]) + spk_emb = None + # multi speaker + if args.voice_cloning and "spk_emb" in datum: + spk_emb = paddle.to_tensor(np.load(datum["spk_emb"])) + mel = am_inference(phone_ids, spk_emb=spk_emb) # vocoder wav = voc_inference(mel) sf.write( @@ -188,7 +207,8 @@ def main(): default='fastspeech2_csmsc', choices=[ 'speedyspeech_csmsc', 'fastspeech2_csmsc', 'fastspeech2_ljspeech', - 'fastspeech2_aishell3', 'fastspeech2_vctk' + 'fastspeech2_aishell3', 'fastspeech2_vctk', 'tacotron2_csmsc', + 'tacotron2_ljspeech', 'tacotron2_aishell3' ], help='Choose acoustic model type of tts task.') parser.add_argument( @@ -214,9 +234,6 @@ def main(): parser.add_argument( "--speaker_dict", type=str, default=None, help="speaker id map file.") - def str2bool(str): - return True if str.lower() == 'true' else False - parser.add_argument( "--voice-cloning", type=str2bool, diff --git a/paddlespeech/t2s/exps/synthesize_e2e.py b/paddlespeech/t2s/exps/synthesize_e2e.py index 15ed1e4d..75c631b8 100644 --- a/paddlespeech/t2s/exps/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/synthesize_e2e.py @@ -38,6 +38,10 @@ model_alias = { "paddlespeech.t2s.models.fastspeech2:FastSpeech2", "fastspeech2_inference": "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference", + "tacotron2": + "paddlespeech.t2s.models.tacotron2:Tacotron2", + "tacotron2_inference": + "paddlespeech.t2s.models.tacotron2:Tacotron2Inference", # voc "pwgan": "paddlespeech.t2s.models.parallel_wavegan:PWGGenerator", @@ -55,6 +59,10 @@ model_alias = { "paddlespeech.t2s.models.hifigan:HiFiGANGenerator", "hifigan_inference": "paddlespeech.t2s.models.hifigan:HiFiGANInference", + "wavernn": + "paddlespeech.t2s.models.wavernn:WaveRNN", + 
"wavernn_inference": + "paddlespeech.t2s.models.wavernn:WaveRNNInference", } @@ -125,7 +133,12 @@ def evaluate(args): idim=vocab_size, odim=odim, spk_num=spk_num, **am_config["model"]) elif am_name == 'speedyspeech': am = am_class( - vocab_size=vocab_size, tone_size=tone_size, **am_config["model"]) + vocab_size=vocab_size, + tone_size=tone_size, + spk_num=spk_num, + **am_config["model"]) + elif am_name == 'tacotron2': + am = am_class(idim=vocab_size, odim=odim, **am_config["model"]) am.set_state_dict(paddle.load(args.am_ckpt)["main_params"]) am.eval() @@ -142,10 +155,16 @@ def evaluate(args): voc_name = args.voc[:args.voc.rindex('_')] voc_class = dynamic_import(voc_name, model_alias) voc_inference_class = dynamic_import(voc_name + '_inference', model_alias) - voc = voc_class(**voc_config["generator_params"]) - voc.set_state_dict(paddle.load(args.voc_ckpt)["generator_params"]) - voc.remove_weight_norm() - voc.eval() + if voc_name != 'wavernn': + voc = voc_class(**voc_config["generator_params"]) + voc.set_state_dict(paddle.load(args.voc_ckpt)["generator_params"]) + voc.remove_weight_norm() + voc.eval() + else: + voc = voc_class(**voc_config["model"]) + voc.set_state_dict(paddle.load(args.voc_ckpt)["main_params"]) + voc.eval() + voc_mu, voc_std = np.load(args.voc_stat) voc_mu = paddle.to_tensor(voc_mu) voc_std = paddle.to_tensor(voc_std) @@ -159,29 +178,42 @@ def evaluate(args): # acoustic model if am_name == 'fastspeech2': if am_dataset in {"aishell3", "vctk"} and args.speaker_dict: - print( - "Haven't test dygraph to static for multi speaker fastspeech2 now!" 
- ) + am_inference = jit.to_static( + am_inference, + input_spec=[ + InputSpec([-1], dtype=paddle.int64), + InputSpec([1], dtype=paddle.int64) + ]) else: am_inference = jit.to_static( am_inference, input_spec=[InputSpec([-1], dtype=paddle.int64)]) - paddle.jit.save(am_inference, - os.path.join(args.inference_dir, args.am)) - am_inference = paddle.jit.load( - os.path.join(args.inference_dir, args.am)) + elif am_name == 'speedyspeech': + if am_dataset in {"aishell3", "vctk"} and args.speaker_dict: + am_inference = jit.to_static( + am_inference, + input_spec=[ + InputSpec([-1], dtype=paddle.int64), # text + InputSpec([-1], dtype=paddle.int64), # tone + None, # duration + InputSpec([-1], dtype=paddle.int64) # spk_id + ]) + else: + am_inference = jit.to_static( + am_inference, + input_spec=[ + InputSpec([-1], dtype=paddle.int64), + InputSpec([-1], dtype=paddle.int64) + ]) + + elif am_name == 'tacotron2': am_inference = jit.to_static( - am_inference, - input_spec=[ - InputSpec([-1], dtype=paddle.int64), - InputSpec([-1], dtype=paddle.int64) - ]) + am_inference, input_spec=[InputSpec([-1], dtype=paddle.int64)]) - paddle.jit.save(am_inference, - os.path.join(args.inference_dir, args.am)) - am_inference = paddle.jit.load( - os.path.join(args.inference_dir, args.am)) + paddle.jit.save(am_inference, os.path.join(args.inference_dir, args.am)) + am_inference = paddle.jit.load( + os.path.join(args.inference_dir, args.am)) # vocoder voc_inference = jit.to_static( @@ -197,6 +229,11 @@ def evaluate(args): output_dir = Path(args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) merge_sentences = False + # Avoid not stopping at the end of a sub sentence when tacotron2_ljspeech dygraph to static graph + # but still not stopping in the end (NOTE by yuantian01 Feb 9 2022) + if am_name == 'tacotron2': + merge_sentences = True + for utt_id, sentence in sentences: get_tone_ids = False if am_name == 'speedyspeech': @@ -229,7 +266,14 @@ def evaluate(args): mel = 
am_inference(part_phone_ids) elif am_name == 'speedyspeech': part_tone_ids = tone_ids[i] - mel = am_inference(part_phone_ids, part_tone_ids) + if am_dataset in {"aishell3", "vctk"}: + spk_id = paddle.to_tensor(args.spk_id) + mel = am_inference(part_phone_ids, part_tone_ids, + spk_id) + else: + mel = am_inference(part_phone_ids, part_tone_ids) + elif am_name == 'tacotron2': + mel = am_inference(part_phone_ids) # vocoder wav = voc_inference(mel) if flags == 0: @@ -254,8 +298,9 @@ def main(): type=str, default='fastspeech2_csmsc', choices=[ - 'speedyspeech_csmsc', 'fastspeech2_csmsc', 'fastspeech2_ljspeech', - 'fastspeech2_aishell3', 'fastspeech2_vctk' + 'speedyspeech_csmsc', 'speedyspeech_aishell3', 'fastspeech2_csmsc', + 'fastspeech2_ljspeech', 'fastspeech2_aishell3', 'fastspeech2_vctk', + 'tacotron2_csmsc', 'tacotron2_ljspeech' ], help='Choose acoustic model type of tts task.') parser.add_argument( @@ -292,7 +337,8 @@ def main(): default='pwgan_csmsc', choices=[ 'pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', 'pwgan_vctk', - 'mb_melgan_csmsc', 'style_melgan_csmsc', 'hifigan_csmsc' + 'mb_melgan_csmsc', 'style_melgan_csmsc', 'hifigan_csmsc', + 'wavernn_csmsc' ], help='Choose vocoder type of tts task.') diff --git a/paddlespeech/t2s/exps/tacotron2/config.py b/paddlespeech/t2s/exps/tacotron2/config.py deleted file mode 100644 index 0ce2df36..00000000 --- a/paddlespeech/t2s/exps/tacotron2/config.py +++ /dev/null @@ -1,75 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -from yacs.config import CfgNode as CN - -_C = CN() -_C.data = CN( - dict( - batch_size=32, # batch size - valid_size=64, # the first N examples are reserved for validation - sample_rate=22050, # Hz, sample rate - n_fft=1024, # fft frame size - win_length=1024, # window size - hop_length=256, # hop size between ajacent frame - fmax=8000, # Hz, max frequency when converting to mel - fmin=0, # Hz, min frequency when converting to mel - n_mels=80, # mel bands - padding_idx=0, # text embedding's padding index - )) - -_C.model = CN( - dict( - vocab_size=37, # set this according to the frontend's vocab size - n_tones=None, - reduction_factor=1, # reduction factor - d_encoder=512, # embedding & encoder's internal size - encoder_conv_layers=3, # number of conv layer in tacotron2 encoder - encoder_kernel_size=5, # kernel size of conv layers in tacotron2 encoder - d_prenet=256, # hidden size of decoder prenet - d_attention_rnn=1024, # hidden size of the first rnn layer in tacotron2 decoder - d_decoder_rnn=1024, # hidden size of the second rnn layer in tacotron2 decoder - d_attention=128, # hidden size of decoder location linear layer - attention_filters=32, # number of filter in decoder location conv layer - attention_kernel_size=31, # kernel size of decoder location conv layer - d_postnet=512, # hidden size of decoder postnet - postnet_kernel_size=5, # kernel size of conv layers in postnet - postnet_conv_layers=5, # number of conv layer in decoder postnet - p_encoder_dropout=0.5, # droput probability in encoder - p_prenet_dropout=0.5, # droput probability in decoder prenet - p_attention_dropout=0.1, # droput probability of first rnn layer in decoder - p_decoder_dropout=0.1, # droput probability of second rnn layer in decoder - p_postnet_dropout=0.5, # droput probability in decoder postnet - d_global_condition=None, - use_stop_token=True, # wherther to use binary 
classifier to predict when to stop - use_guided_attention_loss=False, # whether to use guided attention loss - guided_attention_loss_sigma=0.2 # sigma in guided attention loss - )) - -_C.training = CN( - dict( - lr=1e-3, # learning rate - weight_decay=1e-6, # the coeff of weight decay - grad_clip_thresh=1.0, # the clip norm of grad clip. - plot_interval=1000, # plot attention and spectrogram - valid_interval=1000, # validation - save_interval=1000, # checkpoint - max_iteration=500000, # max iteration to train - )) - - -def get_cfg_defaults(): - """Get a yacs CfgNode object with default values for my_project.""" - # Return a clone so that the defaults will not be altered - # This is for the "local variable" use pattern - return _C.clone() diff --git a/paddlespeech/t2s/exps/tacotron2/ljspeech.py b/paddlespeech/t2s/exps/tacotron2/ljspeech.py deleted file mode 100644 index 08db2a64..00000000 --- a/paddlespeech/t2s/exps/tacotron2/ljspeech.py +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import pickle -from pathlib import Path - -import numpy as np -from paddle.io import Dataset - -from paddlespeech.t2s.data.batch import batch_spec -from paddlespeech.t2s.data.batch import batch_text_id - - -class LJSpeech(Dataset): - """A simple dataset adaptor for the processed ljspeech dataset.""" - - def __init__(self, root): - self.root = Path(root).expanduser() - records = [] - with open(self.root / "metadata.pkl", 'rb') as f: - metadata = pickle.load(f) - for mel_name, text, ids in metadata: - mel_name = self.root / "mel" / (mel_name + ".npy") - records.append((mel_name, text, ids)) - self.records = records - - def __getitem__(self, i): - mel_name, _, ids = self.records[i] - mel = np.load(mel_name) - return ids, mel - - def __len__(self): - return len(self.records) - - -class LJSpeechCollector(object): - """A simple callable to batch LJSpeech examples.""" - - def __init__(self, padding_idx=0, padding_value=0., padding_stop_token=1.0): - self.padding_idx = padding_idx - self.padding_value = padding_value - self.padding_stop_token = padding_stop_token - - def __call__(self, examples): - texts = [] - mels = [] - text_lens = [] - mel_lens = [] - - for data in examples: - text, mel = data - text = np.array(text, dtype=np.int64) - text_lens.append(len(text)) - mels.append(mel) - texts.append(text) - mel_lens.append(mel.shape[1]) - - # Sort by text_len in descending order - texts = [ - i for i, _ in sorted( - zip(texts, text_lens), key=lambda x: x[1], reverse=True) - ] - mels = [ - i for i, _ in sorted( - zip(mels, text_lens), key=lambda x: x[1], reverse=True) - ] - - mel_lens = [ - i for i, _ in sorted( - zip(mel_lens, text_lens), key=lambda x: x[1], reverse=True) - ] - - mel_lens = np.array(mel_lens, dtype=np.int64) - text_lens = np.array(sorted(text_lens, reverse=True), dtype=np.int64) - - # Pad sequence with largest len of the batch - texts, _ = batch_text_id(texts, pad_id=self.padding_idx) - mels, _ = batch_spec(mels, pad_value=self.padding_value) - mels = 
np.transpose(mels, axes=(0, 2, 1)) - - return texts, mels, text_lens, mel_lens diff --git a/paddlespeech/t2s/exps/tacotron2/normalize.py b/paddlespeech/t2s/exps/tacotron2/normalize.py new file mode 120000 index 00000000..64848f89 --- /dev/null +++ b/paddlespeech/t2s/exps/tacotron2/normalize.py @@ -0,0 +1 @@ +../transformer_tts/normalize.py \ No newline at end of file diff --git a/paddlespeech/t2s/exps/tacotron2/preprocess.py b/paddlespeech/t2s/exps/tacotron2/preprocess.py index 480b3331..7f41089e 100644 --- a/paddlespeech/t2s/exps/tacotron2/preprocess.py +++ b/paddlespeech/t2s/exps/tacotron2/preprocess.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,86 +13,314 @@ # limitations under the License. import argparse import os -import pickle +from concurrent.futures import ThreadPoolExecutor +from operator import itemgetter from pathlib import Path +from typing import Any +from typing import Dict +from typing import List +import jsonlines +import librosa import numpy as np import tqdm +import yaml +from yacs.config import CfgNode -from paddlespeech.t2s.audio import AudioProcessor -from paddlespeech.t2s.audio import LogMagnitude -from paddlespeech.t2s.datasets import LJSpeechMetaData -from paddlespeech.t2s.exps.tacotron2.config import get_cfg_defaults -from paddlespeech.t2s.frontend import EnglishCharacter - - -def create_dataset(config, source_path, target_path, verbose=False): - # create output dir - target_path = Path(target_path).expanduser() - mel_path = target_path / "mel" - os.makedirs(mel_path, exist_ok=True) - - meta_data = LJSpeechMetaData(source_path) - frontend = EnglishCharacter() - processor = AudioProcessor( - sample_rate=config.data.sample_rate, - n_fft=config.data.n_fft, - n_mels=config.data.n_mels, - 
win_length=config.data.win_length, - hop_length=config.data.hop_length, - fmax=config.data.fmax, - fmin=config.data.fmin) - normalizer = LogMagnitude() - - records = [] - for (fname, text, _) in tqdm.tqdm(meta_data): - wav = processor.read_wav(fname) - mel = processor.mel_spectrogram(wav) - mel = normalizer.transform(mel) - ids = frontend(text) - mel_name = os.path.splitext(os.path.basename(fname))[0] - - # save mel spectrogram - records.append((mel_name, text, ids)) - np.save(mel_path / mel_name, mel) - if verbose: - print("save mel spectrograms into {}".format(mel_path)) - - # save meta data as pickle archive - with open(target_path / "metadata.pkl", 'wb') as f: - pickle.dump(records, f) - if verbose: - print("saved metadata into {}".format(target_path / "metadata.pkl")) - - print("Done.") +from paddlespeech.t2s.datasets.get_feats import LogMelFBank +from paddlespeech.t2s.datasets.preprocess_utils import compare_duration_and_mel_length +from paddlespeech.t2s.datasets.preprocess_utils import get_input_token +from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur +from paddlespeech.t2s.datasets.preprocess_utils import get_spk_id_map +from paddlespeech.t2s.datasets.preprocess_utils import merge_silence +from paddlespeech.t2s.utils import str2bool -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="create dataset") +def process_sentence(config: Dict[str, Any], + fp: Path, + sentences: Dict, + output_dir: Path, + mel_extractor=None, + cut_sil: bool=True, + spk_emb_dir: Path=None): + utt_id = fp.stem + # for vctk + if utt_id.endswith("_mic2"): + utt_id = utt_id[:-5] + record = None + if utt_id in sentences: + # reading, resampling may occur + wav, _ = librosa.load(str(fp), sr=config.fs) + if len(wav.shape) != 1 or np.abs(wav).max() > 1.0: + return record + assert len(wav.shape) == 1, f"{utt_id} is not a mono-channel audio." + assert np.abs(wav).max( + ) <= 1.0, f"{utt_id} is seems to be different that 16 bit PCM." 
+ phones = sentences[utt_id][0] + durations = sentences[utt_id][1] + speaker = sentences[utt_id][2] + d_cumsum = np.pad(np.array(durations).cumsum(0), (1, 0), 'constant') + # little imprecise than use *.TextGrid directly + times = librosa.frames_to_time( + d_cumsum, sr=config.fs, hop_length=config.n_shift) + if cut_sil: + start = 0 + end = d_cumsum[-1] + if phones[0] == "sil" and len(durations) > 1: + start = times[1] + durations = durations[1:] + phones = phones[1:] + if phones[-1] == 'sil' and len(durations) > 1: + end = times[-2] + durations = durations[:-1] + phones = phones[:-1] + sentences[utt_id][0] = phones + sentences[utt_id][1] = durations + start, end = librosa.time_to_samples([start, end], sr=config.fs) + wav = wav[start:end] + # extract mel feats + logmel = mel_extractor.get_log_mel_fbank(wav) + # change duration according to mel_length + compare_duration_and_mel_length(sentences, utt_id, logmel) + phones = sentences[utt_id][0] + durations = sentences[utt_id][1] + num_frames = logmel.shape[0] + assert sum(durations) == num_frames + mel_dir = output_dir / "data_speech" + mel_dir.mkdir(parents=True, exist_ok=True) + mel_path = mel_dir / (utt_id + "_speech.npy") + np.save(mel_path, logmel) + record = { + "utt_id": utt_id, + "phones": phones, + "text_lengths": len(phones), + "speech_lengths": num_frames, + "speech": str(mel_path), + "speaker": speaker + } + if spk_emb_dir: + if speaker in os.listdir(spk_emb_dir): + embed_name = utt_id + ".npy" + embed_path = spk_emb_dir / speaker / embed_name + if embed_path.is_file(): + record["spk_emb"] = str(embed_path) + else: + return None + return record + + +def process_sentences(config, + fps: List[Path], + sentences: Dict, + output_dir: Path, + mel_extractor=None, + nprocs: int=1, + cut_sil: bool=True, + spk_emb_dir: Path=None): + if nprocs == 1: + results = [] + for fp in fps: + record = process_sentence(config, fp, sentences, output_dir, + mel_extractor, cut_sil, spk_emb_dir) + if record: + 
results.append(record) + else: + with ThreadPoolExecutor(nprocs) as pool: + futures = [] + with tqdm.tqdm(total=len(fps)) as progress: + for fp in fps: + future = pool.submit(process_sentence, config, fp, + sentences, output_dir, mel_extractor, + cut_sil, spk_emb_dir) + future.add_done_callback(lambda p: progress.update()) + futures.append(future) + + results = [] + for ft in futures: + record = ft.result() + if record: + results.append(record) + + results.sort(key=itemgetter("utt_id")) + with jsonlines.open(output_dir / "metadata.jsonl", 'w') as writer: + for item in results: + writer.write(item) + print("Done") + + +def main(): + # parse config and args + parser = argparse.ArgumentParser( + description="Preprocess audio and then extract features.") + parser.add_argument( - "--config", + "--dataset", + default="baker", type=str, - metavar="FILE", - help="extra config to overwrite the default config") + help="name of dataset, should in {baker, aishell3, ljspeech, vctk} now") + parser.add_argument( - "--input", type=str, help="path of the ljspeech dataset") + "--rootdir", default=None, type=str, help="directory to dataset.") + + parser.add_argument( + "--dumpdir", + type=str, + required=True, + help="directory to dump feature files.") + parser.add_argument( + "--dur-file", default=None, type=str, help="path to durations.txt.") + + parser.add_argument("--config", type=str, help="fastspeech2 config file.") + parser.add_argument( - "--output", type=str, help="path to save output dataset") + "--verbose", + type=int, + default=1, + help="logging level. higher is more logging. 
(default=1)") parser.add_argument( - "--opts", - nargs=argparse.REMAINDER, - help="options to overwrite --config file and the default config, passing in KEY VALUE pairs" - ) + "--num-cpu", type=int, default=1, help="number of process.") + parser.add_argument( - "-v", "--verbose", action="store_true", help="print msg") + "--cut-sil", + type=str2bool, + default=True, + help="whether cut sil in the edge of audio") - config = get_cfg_defaults() + parser.add_argument( + "--spk_emb_dir", + default=None, + type=str, + help="directory to speaker embedding files.") args = parser.parse_args() - if args.config: - config.merge_from_file(args.config) - if args.opts: - config.merge_from_list(args.opts) - config.freeze() - print(config.data) - - create_dataset(config, args.input, args.output, args.verbose) + + rootdir = Path(args.rootdir).expanduser() + dumpdir = Path(args.dumpdir).expanduser() + # use absolute path + dumpdir = dumpdir.resolve() + dumpdir.mkdir(parents=True, exist_ok=True) + dur_file = Path(args.dur_file).expanduser() + + if args.spk_emb_dir: + spk_emb_dir = Path(args.spk_emb_dir).expanduser().resolve() + else: + spk_emb_dir = None + + assert rootdir.is_dir() + assert dur_file.is_file() + + with open(args.config, 'rt') as f: + config = CfgNode(yaml.safe_load(f)) + + if args.verbose > 1: + print(vars(args)) + print(config) + + sentences, speaker_set = get_phn_dur(dur_file) + + merge_silence(sentences) + phone_id_map_path = dumpdir / "phone_id_map.txt" + speaker_id_map_path = dumpdir / "speaker_id_map.txt" + get_input_token(sentences, phone_id_map_path, args.dataset) + get_spk_id_map(speaker_set, speaker_id_map_path) + + if args.dataset == "baker": + wav_files = sorted(list((rootdir / "Wave").rglob("*.wav"))) + # split data into 3 sections + num_train = 9800 + num_dev = 100 + train_wav_files = wav_files[:num_train] + dev_wav_files = wav_files[num_train:num_train + num_dev] + test_wav_files = wav_files[num_train + num_dev:] + elif args.dataset == "aishell3": + 
sub_num_dev = 5 + wav_dir = rootdir / "train" / "wav" + train_wav_files = [] + dev_wav_files = [] + test_wav_files = [] + for speaker in os.listdir(wav_dir): + wav_files = sorted(list((wav_dir / speaker).rglob("*.wav"))) + if len(wav_files) > 100: + train_wav_files += wav_files[:-sub_num_dev * 2] + dev_wav_files += wav_files[-sub_num_dev * 2:-sub_num_dev] + test_wav_files += wav_files[-sub_num_dev:] + else: + train_wav_files += wav_files + + elif args.dataset == "ljspeech": + wav_files = sorted(list((rootdir / "wavs").rglob("*.wav"))) + # split data into 3 sections + num_train = 12900 + num_dev = 100 + train_wav_files = wav_files[:num_train] + dev_wav_files = wav_files[num_train:num_train + num_dev] + test_wav_files = wav_files[num_train + num_dev:] + elif args.dataset == "vctk": + sub_num_dev = 5 + wav_dir = rootdir / "wav48_silence_trimmed" + train_wav_files = [] + dev_wav_files = [] + test_wav_files = [] + for speaker in os.listdir(wav_dir): + wav_files = sorted(list((wav_dir / speaker).rglob("*_mic2.flac"))) + if len(wav_files) > 100: + train_wav_files += wav_files[:-sub_num_dev * 2] + dev_wav_files += wav_files[-sub_num_dev * 2:-sub_num_dev] + test_wav_files += wav_files[-sub_num_dev:] + else: + train_wav_files += wav_files + + else: + print("dataset should in {baker, aishell3, ljspeech, vctk} now!") + + train_dump_dir = dumpdir / "train" / "raw" + train_dump_dir.mkdir(parents=True, exist_ok=True) + dev_dump_dir = dumpdir / "dev" / "raw" + dev_dump_dir.mkdir(parents=True, exist_ok=True) + test_dump_dir = dumpdir / "test" / "raw" + test_dump_dir.mkdir(parents=True, exist_ok=True) + + # Extractor + mel_extractor = LogMelFBank( + sr=config.fs, + n_fft=config.n_fft, + hop_length=config.n_shift, + win_length=config.win_length, + window=config.window, + n_mels=config.n_mels, + fmin=config.fmin, + fmax=config.fmax) + + # process for the 3 sections + if train_wav_files: + process_sentences( + config, + train_wav_files, + sentences, + train_dump_dir, + mel_extractor, + 
nprocs=args.num_cpu, + cut_sil=args.cut_sil, + spk_emb_dir=spk_emb_dir) + if dev_wav_files: + process_sentences( + config, + dev_wav_files, + sentences, + dev_dump_dir, + mel_extractor, + cut_sil=args.cut_sil, + spk_emb_dir=spk_emb_dir) + if test_wav_files: + process_sentences( + config, + test_wav_files, + sentences, + test_dump_dir, + mel_extractor, + nprocs=args.num_cpu, + cut_sil=args.cut_sil, + spk_emb_dir=spk_emb_dir) + + +if __name__ == "__main__": + main() diff --git a/paddlespeech/t2s/exps/tacotron2/synthesize.ipynb b/paddlespeech/t2s/exps/tacotron2/synthesize.ipynb deleted file mode 100644 index cc424311..00000000 --- a/paddlespeech/t2s/exps/tacotron2/synthesize.ipynb +++ /dev/null @@ -1,342 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## TTS with Tacotron2 + Waveflow" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import paddle\n", - "from matplotlib import pyplot as plt\n", - "from IPython import display as ipd\n", - "%matplotlib inline\n", - "\n", - "from paddlespeech.t2s.utils import display\n", - "from paddlespeech.t2s.utils import layer_tools\n", - "paddle.set_device(\"gpu:0\")\n", - "\n", - "import sys\n", - "sys.path.append(\"../..\")\n", - "import examples" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Tacotron2: synthesizer model\n", - "\n", - "Tacotron2 is used here as a phonemes to spectrogram model. Here we will use an alternative config. In this config, the tacotron2 model does not have a binary classifier to predict whether the generation should stop.\n", - "\n", - "Instead, the peak position is used as the criterion. When the peak position of the attention reaches the end of the encoder outputs, it implies that the content is exhausted. So we stop the generated after 10 frames." 
- ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from paddlespeech.t2s.models.tacotron2 import Tacotron2\n", - "from paddlespeech.t2s.frontend import EnglishCharacter" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "data:\n", - " batch_size: 32\n", - " fmax: 8000\n", - " fmin: 0\n", - " hop_length: 256\n", - " n_fft: 1024\n", - " n_mels: 80\n", - " padding_idx: 0\n", - " sample_rate: 22050\n", - " valid_size: 64\n", - " win_length: 1024\n", - "model:\n", - " attention_filters: 32\n", - " attention_kernel_size: 31\n", - " d_attention: 128\n", - " d_attention_rnn: 1024\n", - " d_decoder_rnn: 1024\n", - " d_encoder: 512\n", - " d_global_condition: None\n", - " d_postnet: 512\n", - " d_prenet: 256\n", - " encoder_conv_layers: 3\n", - " encoder_kernel_size: 5\n", - " guided_attention_loss_sigma: 0.2\n", - " n_tones: None\n", - " p_attention_dropout: 0.1\n", - " p_decoder_dropout: 0.1\n", - " p_encoder_dropout: 0.5\n", - " p_postnet_dropout: 0.5\n", - " p_prenet_dropout: 0.5\n", - " postnet_conv_layers: 5\n", - " postnet_kernel_size: 5\n", - " reduction_factor: 1\n", - " use_guided_attention_loss: True\n", - " use_stop_token: False\n", - " vocab_size: 37\n", - "training:\n", - " grad_clip_thresh: 1.0\n", - " lr: 0.001\n", - " max_iteration: 500000\n", - " plot_interval: 1000\n", - " save_interval: 1000\n", - " valid_interval: 1000\n", - " weight_decay: 1e-06\n" - ] - } - ], - "source": [ - "from examples.tacotron2 import config as tacotron2_config\n", - "synthesizer_config = tacotron2_config.get_cfg_defaults()\n", - "synthesizer_config.merge_from_file(\"configs/alternative.yaml\")\n", - "print(synthesizer_config)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[checkpoint] Rank 0: loaded 
model from ../../pretrained/tacotron2/tacotron2_ljspeech_ckpt_0.3_alternative/step-50000.pdparams\n" - ] - } - ], - "source": [ - "frontend = EnglishCharacter()\n", - "model = Tacotron2.from_pretrained(\n", - " synthesizer_config, \"../../pretrained/tacotron2/tacotron2_ljspeech_ckpt_0.3_alternative/step-50000\")\n", - "model.eval()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 36%|███▋ | 365/1000 [00:01<00:02, 256.89it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "content exhausted!\n" - ] - } - ], - "source": [ - "sentence = \"Life was like a box of chocolates, you never know what you're gonna get.\" \n", - "sentence = paddle.to_tensor(frontend(sentence)).unsqueeze(0)\n", - "\n", - "with paddle.no_grad():\n", - " outputs = model.infer(sentence)\n", - "mel_output = outputs[\"mel_outputs_postnet\"][0].numpy().T\n", - "alignment = outputs[\"alignments\"][0].numpy().T" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAZIAAAEYCAYAAAB2qXBEAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAAgAElEQVR4nO3de5xcdZ3n/9f7nKrq+yWdhCY3SLgblQAygKCAgAquP5jZUQd0d+M8WJlx1NHxssDM6qi7s4Pjb1DHZXQyiuKsI+JtwBkUWQRUHDDhFm7BhEDIPeTe6U53V9X57B/ndLrS6Ut1d1XX6c7nmUc9+pxT5/Lp09X9zfl8bzIznHPOuYkKah2Ac8656c0LEuecc5PiBYlzzrlJ8YLEOefcpHhB4pxzblK8IHHOOTcpVStIJJ0q6YmS135JH5HUIeleSWuTr7OqFYNzzrnq01T0I5EUApuBc4EPALvN7CZJNwCzzOz6qgfhnHOuKqaqIHkL8JdmdoGk54GLzWyrpHnAA2Z26mjH51Rn9TRVPU7n3FFEIJUkZcKQqCFLoVG0dxwgwAgVESoiMtEU9KH4sEOKiO6oDoCsighDGNs25+naXRAV8NY3Ndmu3cWy9390dd89ZnZ5Ja5drswUXedq4DvJcqeZbU2WtwGdYx1cTxPn6tJqxeacOwopk0G53KH1YFY7Pa9dwI4zs/zeH/ySxqCftkwP7WEPXcV6zmtYT52KhBr8z3dXlOXhgycSWcD87B6yKpBTkQ9e9WLF4ty5u8gj9ywse//svBfmVOziZap6QSIpB1wJ3Dj0PTMzScM+Ekm6DrgOoJ7GqsbonHPpZRQtqnUQo5qKJ5IrgMfMbHuyvl3SvJLU1o7hDjKzFcAKgFZ1+IBgzrnKkFAuR9DcBMfMpvvEWWy+KMTm9ZLZkKFxKzz2nqVobxcEAdbbh8KAO7sWY/15sAiL4j9Jymbi82QyUChAvoAVCuw6uK1i4RoQke4/gVPR/PcaBtNaAHcBy5Pl5cCdUxCDc85NW9E4/tVCVZ9IJDUBbwb+qGTzTcAdkq4FNgDvqmYMzjk3nRlGMeWjtFe1IDGzbmD2kG27AK85d865MqU9tTVVrbacc672JDIL5lOc287+E5t55cyA4JQDhGuamXNnjtbn9hDs2ktx9x4sXwCLYLinASUteyODfAEgrj8BCMPB9yvAgKIXJM455ybDn0icc85NmAH5o7mOxDnnqko6MvWUpJUUhvHXTPxnTg0NbLv6NP7Hx77Bqu4T+MGLy2j4+SwWrNhL4eWnAbAwJEo6KSoQhIMdFoksTnUpgEAQGVbIU9zff0RYZuX3RB+LYZ7acs45NwkGxXSXI16QOOdcmsUdEtPNCxLn3PQ1kNYaaCV1aD0ABYQL53HwxDlsuizHf3nb/dz6+BK+/OplEBnzeQEsolAsHjrOCgWsZJ1CYfTrS0deu+JEkcq1AqsGL0iccy7FjLh6Js28IHHOuZTzJxLnnKuCoL4eMzvsv+tBexvWOZv9r2rj4OyAPWcWIBLZPfDQu8/g1A1rifr74zTUaCmp4VqDlb4HoAAFilNhCuKJSgZadVWu0VbSIdELEuecc5MQmRckzjnnJsifSJxzzk2KIfIW1jqMUXlB4pybdpTJsOHjZ3HJVY/yjo5VBIpYnDnA2nwbTx48nnu2L2X37ll03NtM68sF6h94nKi39/CTjNZcN6nzUBjGvdiLxbgupPRYK3Jo4sLSnuwV7NUO/kTinHNu0kTRpmIOwonzgsQ551Is7tnuBYlzzk2MhMKQ/EXL2HpeHflX97C4cxfZoEj7P0S8eOUsPrfvPCxfGJJ62sxxtunQallDjJQ06yWKz2VRGWmq4eYeqXAHQk9tOeecmzAzT20555ybpMifSJxzrkwSQXMzwZwODp44h6BgbD+nnsuv/nd+sPosWn7TSM+WenL7i7T/+lkKXV2VHyyxnHRWSbwKQwjDeGreQ9srF07
casufSJxzzk2Yp7acc85Ngrfacs45NymG6Pee7c45NzJlcyiXRbkcam9l89sXELx5F7ObdtCWO8gLv13M6tfByfboYaPyRplM3FS3Uj3JJ1LXYoYVCkdOgFXpapuUp7aqGp2kdknfl7RG0nOSXi+pQ9K9ktYmX2dVMwbnnJvOBirby33VQrWv+iXgp2Z2GrAMeA64AbjPzE4G7kvWnXPODcMQRSv/VQtVS21JagMuBN4LYGb9QL+kq4CLk91uAx4Arq9WHM65lApCglyWYO4cotmtdC9qpvuYkMxBo/0LTeSe20tPUZy6d3U8gRUcln6yyOKJpWyUSahmiKO5sn0J8ArwDUnLgEeBDwOdZrY12Wcb0DncwZKuA64DqKeximE651x6mZH65r/VjC4DnAV8xczOBLoZksay+L8Zw/5XwsxWmNnZZnZ2lroqhumcc2kmonG8aqGaTySbgE1m9kiy/n3igmS7pHlmtlXSPGBHFWNwzqVNEBI0NVJ87Qm8fHETjefvJJfpZc+BgN5tTSz4OeQef4HCvv3x/iOlrSzConT/T70SjKP4icTMtgEbJZ2abLoUeBa4C1iebFsO3FmtGJxzbiZIe6utavcj+RDwbUk5YD3wh8SF1x2SrgU2AO+qcgzOOTdtGSKqUWusclW1IDGzJ4Czh3nr0mpe1zmXMhJhSwu2eD6985rZ8boc/W1Gbi+0f76J3IZdtHVtwQ72Eh08SLGMVlgKQ1CQ9EeMZmzLLQPylu6+4+mOzjnnjnryia2cc85NnJH+IVK8IHHOuZTzJxLn3FFJ2RxYBGFIeMxcDp7ayf7jc8igbo8x+9kiLat3UNy8lWJk8ZzrNo66jjAZEbcwc+tHIJ5qt5JPJJIuJx6+KgS+ZmY3DXn/OOJRR9qTfW4ws7tHO6cXJM45l3KV6kciKQRuAd5M3NdvpaS7zOzZkt3+O3CHmX1F0lLgbmDxaOdNd+LNOeeOcvHEVhXr2X4OsM7M1ifjH94OXDXMJVuT5TZgy1gn9ScS51xlBSGZ+cey79yF9BwTsGdZkbqOgxQLRWyL0bAjYP4vDhCu3URhz754jnSVOfCiFM9BkrD+/hmd1oqNe6rdOZJWlayvMLMVyfICYGPJe5uAc4cc/2ngZ5I+BDQBl411QS9InHMuxeJWW+OqbN9pZsP13yvXNcA3zexvJb0e+CdJrzGzaKQDvCBxzrmUq+DQJ5uBRSXrC5Ntpa4FLgcws3+XVA/MYZRxEb0gcc5NnkSm8xgKxx1D93GNvPLOgxw3Zwu7drfTvLKFBV8zwk1bKe7aEw+2WChw2AS55aanzA5NrWv9M7u11oAKD5GyEjhZ0hLiAuRq4N1D9nmZePSRb0p6FVBPPCXIiLwgcc65FDODvIUVOpcVJH0QuIe4ae+tZvaMpM8Cq8zsLuBjwD9K+jPizNp7zUYvsb0gcc65lKvkoI1Jn5C7h2z7VMnys8AF4zmnFyTOOZdicWor3T01vCBxzo1PMpIvgC2ez64zZ9H19gN86czbeaj7FFbuPp7oH44juH01xyUtTQ0oVDyOgJk86m8pHyLFOefchE2g+e+U84LEOedSzVNbzrmZQkK5HMHxC+l6zRwOzA/pWhIRZYyOnzVz8zXLkoEXt9Ay9qgakzfQI/4oUMbQJzXlBYlzzqWYGRQ9teWcc24yPLXlnJu+gpCgvo5gdgf9J8xl3+J6upaI3mMKqFhkzqqA2U/shbUbiAqTaJdV7qCNQx0FLbYq3LO9Krwgcc65FDOg4E8kzjnnJsNTW8656SUIUTZD0NwEx8zm5Svn0jfLKMwqEBw02p6HJd/dA1t3EHUfJCrkj0wxjSdVdWiOkaOjc+G4mae2nHPOTcLADIlpVtWCRNJLQBdQBApmdrakDuC7xHMAvwS8y8z2VDMO55ybztL+RDIVibc3mdkZJTN23QDcZ2YnA/cl684554YxMERKua9aqEVq6yrg4mT5NuAB4PoaxOGcA5AI6uoIjj2G3ef
PZ//xAf2zjHDxAU6fv4Xup2fRuCHDggehccNueGUP0d59cS/2kYynrqNksio3vKP9icSIJ5F/VNJ1ybZOM9uaLG8DOoc7UNJ1klZJWpWnr8phOudcOg30Izman0jeYGabJR0D3CtpTembZmaShv2vi5mtAFYAtKrDm3I4545aR3Vlu5ltTr7ukPQj4Bxgu6R5ZrZV0jxGmVDeOVdFQUjwmpM5/bY1vLphHaHW8vcvXkTPi3PI7QppvbuZA490cOraxw6lsYpmR81AialhR3FqS1KTpJaBZeAtwNPAXcDyZLflwJ3VisE556Y7AwpRUParFqr5RNIJ/Ejx/14ywD+b2U8lrQTukHQtsAF4VxVjcM65ae2oHmvLzNYDy4bZvgu4tFrXdc4NL5zdAZ1z6Dm+ja7jMuw9zYiaigT/8XiePthJtL+L5t4NnMIGsAhIUlkwmM4a6IWevA94b/QpYEdrQeKcc64yZkRlu6RjiSvKDVhpZtuqGpVzzjkgfuCb9qktSf8V+BTwc0DAlyV91sxurXZwzrkKkAgaG+k+/yS2XBBSbIogMpo2Biz86V4KGzbF09aWy9NaU24mpLY+AZyZ1G0gaTbwa8ALEuecq7qZUdm+i3jgxQFdyTbnnHNTYCY8kawDHpF0J3EdyVXAakkfBTCzm6sYn3POHdUGBm1Ms3IKkheS14CBDoQtlQ/HOVcRQUjQUE/QOZeouZ6+Y5t5+fciWp7JMOsho+mFPdjGrUTdPeXXjyjp7DZQRzJa/UgQDi6bT1g1KZb+2zdmQWJmnwGQ1GhmPdUPyTnn3AADiimfanfM6CS9XtKzwJpkfZmkv696ZM4552CGjP77ReCtxGNkYWZPSrqwqlE558ZFmQxWLKIwJOw8hmc/uZBrzn2Y1zU9xbZCG/e+spRX/XErxTXrwIxxz/4x3jlDxtOc2I1p2qe2AMxsow4f8dM/Jc45N0VmQqutjZLOB0xSFvgw8Fx1w3LOOQfJw+AMKEj+GPgSsADYDPwM+JNqBuWcG4M0mO+QUC6H9fYRtLfRdfZCml7K8NjXl/HY47lkLpEdYNvHPpdLpZnQ/PdUM3tP6QZJFwAPVSck55xzpdJezpfTpuzLZW5zzjlXBWYq+1ULIz6RSHo9cD4wd6AXe6IVCIc/yjnnXCUZtSsgyjVaaisHNCf7lPZi3w+8o5pBOedGIaEwhDAkaG2F9hbWfLKdH194C2vzc/noQ4s55X1PYPn+I44bNkeS9ryJI+0/oRELEjN7EHhQ0jfNbAOApABoNrP9UxWgc84d1QwsqtwTiaTLiRtQhcDXzOymYfZ5F/Dp+Oo8aWbvHu2c5dSR/LWkVklNwNPAs5I+Md7gnXPOTUyl6kgkhcAtwBXAUuAaSUuH7HMycCNwgZm9GvjIWPGV02prqZntl/Qe4CfADcCjwOfLONY5N0HKZCAMsf44RZU5biE7bmng9tfeSrdl6AgKbC/mWNvfyd984Wo+/v7LsL4+TgmfPjKtBSOmsJTJYJEdPmHVKPu7qVfBH8U5wDozWw8g6XbiEd2fLdnnfcAtZrYnvrbtGOuk5TyRZJOOiL8L3GVmedKfsnPOuRnBGPcTyRxJq0pe15WcbgGwsWR9U7Kt1CnAKZIekvRwkgobVTlPJP8AvAQ8CfxC0vHEFe7OOeeqzYDxtdraaWZnT+KKGeBk4GJgIfHf/dea2d7RDhiVmf0d8HclmzZIetMkgnTOjUWCMCR/wWvY8oY6eo/rh0ic/L/6+cCjl0JkWCF/KOcxVw8TDeQ/CoVxXcfGs7+riQqmtjYDi0rWFybbSm0CHkmyTy9K+i1xwbJypJOWM4x8p6SvS/pJsr4UWD7O4J1zzk2UjeM1upXAyZKWSMoBV5OM7F7iX4ifRpA0hzjVtX60k5ZTR/JN4B5gfrL+W8qoxR8gKZT0uKR/TdaXSHpE0jpJ302+Geecc8Mqv35krFZbZlYAPkj8N/054A4ze0bSZyVdmex2D7ArmYfqfuA
TZrZrtPOWU0cyx8zukHTjQCCSxjOM/MBowa3J+ueAL5jZ7ZK+ClwLfGUc53OutgamVCgZNPEQs8PXS6anVRiCAhQGEARYvgAWxYMqJscFdXUEc2ZzcOk8+ltDdvx+L/m9IY0v5Jj7ZJ7wseeJ+vpGjmukHIgUxzK0ZZabHirYvMnM7gbuHrLtUyXLBnw0eZWlnCeSbkmzSb4VSecB+8o5uaSFwH8AvpasC7gE+H6yy23ErcGcc84Nx6bxWFslPkqcQztR0kPAXMofIuWLwH9jcIiV2cDe5PEKhm96BkDSZO06gHoay7ycc87NQNN4rC0AzOwxSRcBpwICnk9q80cl6e3ADjN7VNLF4w3MzFYAKwBa1eH9VpxzR6+U/wUcsyBJutS/DVic7P8WSZjZzWMcegFwpaS3AfXEdSRfAtolZZKnkuGanjmXbkPrIUZbL5nnPK4LKXDov2ESymRRGPcu3/3OM9n5OmPeaTvo7jvA/q4GTvmrPmzNM/F87Llc3AN9qKF1NiPFPJ451126pLwgKaeO5MfAe4nTUi0lr1GZ2Y1mttDMFhM3Mft5MkHW/QymxpYDd44/bOecO0oMdEgs91UD5dSRLDSz0yt4zeuB2yX9T+Bx4OsVPLdzzs04aR/2rJyC5CeS3mJmP5voRczsAeCBZHk98cBhzk0fYzWtHem9IIyb3JqhTDbujQ6ggHBWGxw7F6KI7pNmsfOtvdQ/00DdXbNoe3En83v3Utj+CkRxSsr6+gjq67HCMKms0uVyUl1D4x/P/m7qpfxHU05B8jDwo2QukjxxhbuZWevohznnnKuI6d5qC7gZeD3wVNJRxTnn3BRSyv/yllOQbASe9kLEzVjDpXaSnuAKQxQGqK0V+vNEPT2HHxqGBJ1zsV17IAyJenpQLgdRhBobUVMD9PVjvX1EJy7A6jJk128jmtvOmmvbOfE1m7lo7loe3r2E096bJdqxE6KIohlKptS1JLWFFPdqH0+rsXL4r3a6lTeGVk2VU5CsBx5IBm08NDZDGc1/nXPOTVrtWmOVq5yC5MXklUtezjnnplLKh0grp2f7Z6YiEOeccyOYrqktSV80s49I+jHDfBtmduUwhzmXHsON0qsABcn2ZCRe1dfBMXMozGkmOFggqs/Q355jzylZ8i1QrDPybRHWXCTYlyHsFVEmrgAt1hnZ/SKqm0+xziCweJC9rEFoIMg294OM/MEsRKK5o4N8PqTtwYDsikZ+ta4N6992RF3FML90Vb1dLqXGP0PilBvtieSfkq///1QE4pxzbnjTttWWmT2aLJ5hZl8qfU/Sh4EHqxmYc865xHQtSEosJx5ssdR7h9nmXM0F9fUQhmA2OHFUZChQ3Cw3ac5LGKKWZqK2Jkxi8yXtHFjWS6bOaGroQ+qlu6uR4v4sQV9A4+aQsC+kd45RrDdUEASQ2ydUEBYa9fsDolz8XpCHTA9ku41Mb0ihXjRvLYBBlGsmu79I7pEnKXZ31/qWOTdpo9WRXAO8G1giqXRO3xZgd7UDc845F5u2qS3g18BWYA7wtyXbu4DV1QzKOedciela2W5mG4ANxMOjOJdOGmyBhUXxnB1maH4nL/3BMfzOFU/zPxfcTQB0RQE9liFvIf2EPHnweJ7oWkRXoY7st9o47XMHKLQ1kNlbRPu7OWb7CwBYoTDy9Sch5V0DXFrMkJ7tzjnnaskLEuecc5OhlD++jlqQJNPsfiuZ2dC52lNpZ8KQcE4HUWcHfXMa2PWaOnrO6aGzYz9dvVnq74UtN57EHz18GVYs+U20KD6+vg41NkB/nmMW7EG9/WS278T680T9/fHUuCNd3zsHuqmU8o/bqAWJmRUlHS8pZ2b9UxWUc865mGx6t9oasB54KGkCfKjRu4/+65xzU2S6ttoq8ULyCoj7kDjnnJtK0/2JZGD0X0mNZtYz1v7OVYsyGdTQQNDWijXUke9sZdtZjRxYZLDgIFGhl2PurqduS0jbizuxrh0Ud+8lioap6wA
s3w9dXfHKvv1xc1wrrUsZ5rfX60ZcDaQ9tRWMtYOk10t6FliTrC+T9PdVj8w551zMxvGqgTELEuCLwFuBXQBm9iRwYTWDcs45l7DBCvdyXrVQVj8SM9soHVbZM3yuwLnJCEKCXBZIepMnvdUHBl3c9v5zqL9iB5fNf57O7H5e6p3NDx85m1lPhRxzRxE9tYaoPw8WUUiOLTsVNUL6y7lUmO6pLWCjpPMBk5SV9HHgubEOklQv6TeSnpT0jKSBupYlkh6RtE7SdyX59L3OOTeaGZDa+mPgA8ACYDNwRrI+lj7gEjNblhxzuaTzgM8BXzCzk4A9wLUTCdw5544W0z61ZWY7gXH3bDczAw4kq9nkZcAlxMPTA9wGfBr4ynjP72YWZXMEHe1Ex3VSrAvZ8bpGll39NDfO+ynzM3Fa9Yx7zqD9y7N4/NnFRLv3Eh04wCmsBAXxf8ZsSO/1cQUwTDt9b6Hl0iLlH8XR5iP5MqOEb2Z/OtbJkyFWHgVOAm4h7o+y18wGhlPdRPyk45xzbjjToGf7aKmtVcSFQD1wFrA2eZ0BlFWvYWZFMzsDWAicA5xWbmCSrpO0StKqPH3lHuacczNPyutIRpuP5DYASe8H3jDwFCHpq8Avx3MRM9sr6X7iuU3aJWWS8y0krncZ7pgVwAqAVnWkvDx2zrkqSvlfwHKa/84CWhmcXrc52TYqSXOBfFKINABvJq5ovx94B3A78Xzwd04gbpdSQX394ERQClA2AxIKw3jCqfp6or37DqvDCBob6b7wNF6+yjh36QvMyXWz4YVT2fjpU/jYv0PxQDdYxKmZ1RCIYrEYj8w7UIdhFWi66/UhLqXE9E5tDbgJeFzSNyXdBjwG/K8yjpsH3C9pNbASuNfM/hW4HviopHXAbODrEwvdOeeOEhVMbUm6XNLzSReMG0bZ7/clmaSzxzpnOa22viHpJ8C5SZjXm9m2Mo5bDZw5zPb1xPUlzjnnxlLByvakAdQtxBmiTcBKSXeZ2bND9msBPgw8Us55y50h8RzgjcmyAT8u8zh3tJAIW1qI+vritJMCFERYsYhyOchm0OxZYMb7HnqEixq2klX8QLyzWORdq49l3v/pYP+fh+x5pY8lrAYOH0LB8lWeEkfyFJdLp8p9LM8B1iX/oUfS7cBVwLND9vsfxFURnyjnpOUM2ngTccn0bPL6U0nlpLacc85VQuVSWwuAjSXrR3TBkHQWsMjM/q3c8Mp5InkbcIYlvb2SepLHgT8v9yLOOecmbpxzts+RtKpkfUXSCnbs60gBcDPw3vFcsNzUVjuDrbbaxnMBN8MFIWFbK9FJC+n5qy6uXriSjswB6pXnxOwuioiuKMe2Qjsb8x3sKzRy65svYsXGLYcNlDgniOdPK9Zy8ERPa7k0Gn//kJ1mNlIF+WZgUcn60C4YLcBrgAeSgXqPBe6SdKWZlRZOhymnIPlr4lZb9xO3RLsQGLGm3znnXGVVsPnvSuBkSUuIC5CrGRyyCjPbB8w5dF3pAeDjoxUiUF6rre8kJ/udZFNZrbacc85VSIUKEjMrSPogcA8QArea2TOSPgusMrO7JnLeMQsSSb8H/HzgApLaJf2umf3LRC7ophkJ5XIEJxwHr+ymuGs3CkN63n4W26/pZfmrHqEts4Nf7hG7P7GIO586gejAgThNNDAQ4hEpo41HXIaoOPzAiZOM3dNVbiaoZIdEM7sbuHvItk+NsO/F5ZyznA6Jf5k87gyceC/wl+Wc3DnnXAVM17G2SgxX2JRbSe+cc24yalhAlKucJ5JVkm6WdGLyupl4VGDnnHNVpnG+aqGcJ4sPAZ8Evpus30t5MyS6aU6ZDMpkCObO4eW3z6V3zhwKswqEzXlm/zSk8zv1/GrtGWh/N0QRmf4tFAfqR6D29RO1vr5zlZLyj3I5rba68ea+zjlXM2kf/becVlunAB8HFpfub2aXVC8s55xzh4xz5uipVk5q63vAV4GvcfgYem6mCkIynXMpzp/NgeO
a2XNqSPYA1O8S0d4s9bszzH54B/byZop9fYMppCAsL500XLPcSjb99Wa/biaZBlPtllOQFMzsK1WPxDnn3PBmQEHyY0l/AvwIBidPN7PdIx/inHOuUmbCE8ny5GvpuPQGnFD5cFy1KZuDQBDZkfN7JOmlA79/NlsuL3LF6U/zq81LyN3TTue3VmP9eayQBwUULToyfTSZARcrmYrytJabaVL+kS6n1daSqQjEOefc8NL+RDJih0RJ/61k+Z1D3vOJrZxzbiqMZ3iUGhU4o/Vsv7pk+cYh711ehVicc84NJ+UFyWipLY2wPNy6q5XSEXZLm9AqGLbOwgp5guZmJFEsFg+NuqtMlmDxQvJfzdMWbKT3O8fz0p/lmNe1BoUhUaFQcpIR6kIkgro6rBjFdSkDcQAKhEUWfy0919Dvxes3nDuMSH9qa7SCxEZYHm7dOedctaT8L+5oBckySfuJC8SGZJlkvb7qkTnnnANAKX9SH7EgMbNwKgNxE1T6ATtsuYiyOYKGkjI/m4EgxHp6sIF9JRSGKJdFhSLZD9ZTXLOOOdoeN/EFrFiEIESBDqWqrFiEYZoAR729Q+KL02DJqQ59HfN7cc7FDJTyIVLKGUZ+QiQtknS/pGclPSPpw8n2Dkn3SlqbfJ1VrRicc25GSHlle9UKEqAAfMzMlgLnAR+QtJR4JOH7zOxk4D58ZGHnnBuVrPxXLVRtpkMz2wpsTZa7JD0HLACuAi5OdrsNeAC4vlpxzHilLZ0Up54UhiibQS3NcLAXtTRjrU1QjOhdPIuX35rhLRc+wSc7H6QtyNFjeV4pii/uuJSX37c4Pp+VzKGugCCXxQqFuDd80spLYZaoPz/YOsxTU85VR8p/taZkylxJi4EzgUeAzqSQAdgGdE5FDM45Ny3NkNF/J0VSM/AD4CNmtl8lfR3MzKThb5Gk64DrAOpprHaYzjmXXkdzQSIpS1yIfNvMfphs3i5pnpltlTQP2DHcsWa2AlgB0KqOlN/GKSQRtrWy9T+9mr3L8lzy2uc4rXkrV7Wspr6kTN5dzHJv91I2981iX97oyvfTlu3lwfUtHP9148VPhby3542Hp6PUC6wZXD80ZW6RqLd42HbL92P56powZAwAABPNSURBVH6rzrnp0SGxmq22BHwdeM7Mbi556y4GRxReDtxZrRicc25GMCv/VQPVfCK5APjPwFOSnki2/TlwE3CHpGuBDcC7qhiDc85Ne2l/Iqlmq61fMfKYXJdW67rOOTej1LB/SLmmpNWWmzhlMug1p9B1UgsHOwJ65ovsGXvo+Mc88771PFvDLJt75vBzXYoVo8O6jquuDjU3xSuR0d0dcXK0hqg/TzTcJFTefNe5VNIk5oybCl6QOOdcyh21qS3nnHMVYKQ+W+AFSUopm4t7py+azws3Zjj12A1s2tdG75Y2Ov65jcZV6yke6EZhiBXyh6dQk7lJ7GAv6uvDIhtMeSkYY+RE51za+BOJc865yfGCxDnn3ERNhw6JXpCkQTLXhxUKEIT0/ofX8fJVEbdd8jXu3HMW+Y8uo+/BbcxlG3MhTls1NKDXLSVY+zLRgSieH6RUMvDiEVmskabJdc6lUw07GpbLCxLnnEs5fyJxzjk3OV6QuDFFRcwEQUhm0Xya1u/jVTcbf/2n5wOQ0W+JSkZNViZL1NsHK5+iGISD84EMNZAyG2i1lUyTO9wUueNSOgeKc67q/InEOefcxBlQTHdJ4gWJc86lXNqfSKo5Z7tzzrlKqOAw8pIul/S8pHWSbhjm/Y9KelbSakn3STp+rHN6QVJLQYiyOTKLFtL1rnPh3nmceedLvHBNBz3HtxH19hH19MTNgjX4o7JCyTzpI9WPAEEuS9DYSNBQT1BXF8/lHobxuTTSwMxl8PoR56aUrPzXqOeRQuAW4ApgKXCNpKVDdnscONvMTge+D/zNWPF5QeKcc2lm43yN7hxgnZmtN7N+4HbgqsMuZ3a/mfUkqw8DC8c6qdeROOdcisU
928eVBZgjaVXJ+opk6nKABcDGkvc2AeeOcq5rgZ+MdUEvSKZSEKJshqC5CSKjcNpx7DuxgX0nBPTPirCvLeTRu/awpPAYli8cSltZX9+ELhf19kJvbyW/A+dcLYxvnNWdZnb2ZC8p6T8BZwMXjbWvFyTOOZdy43wiGc1mYFHJ+sJk2+HXky4D/gK4yMzG/J+s15E451yaVbaOZCVwsqQlknLA1cBdpTtIOhP4B+BKM9tRToj+RFJJ0qH5PpTLEbS2wtxZvPCe2Zx4/gZuOeEOQsG2Yh0hxj1dr+XuLa/GftPJCf/SR/CLJ4iSuUTKETQ1xWkvBVi+v8rfnHOuNio3aKOZFSR9ELgHCIFbzewZSZ8FVpnZXcDngWbge4r/Fr1sZleOdl4vSJxzLuUUVa7JvZndDdw9ZNunSpYvG+85vSBxzrk0M1DKJzX1gsQ559Iu5Z2AvSAZamBk25FGuE3qQRQM1oeggKChnsKrl7B2eR3/39mPs3z2L3mqbyHrejtZf+cb6b55IX9y31uwfCHumQ5gRhPraWL94dco80MT9fSk/gPmnKuAlP+aV63VlqRbJe2Q9HTJtg5J90pam3ydVa3rO+fcTCGzsl+1UM3mv98ELh+y7QbgPjM7GbgvWXfOOTeaCg7aWA1VS22Z2S8kLR6y+Srg4mT5NuAB4PpqxTBhw6W1Bpr2AspmUCZD0N4GxSLRsbPZdXobXceJ1jXwzPdO55OPNiZprALH5x+BqDjOzqll8LSWczOfMd6e7VNuqutIOs1sa7K8DegcaUdJ1wHXAdTTOAWhOedc+ojapazKVbPKdjMzaeRBj5NBxlYAtKoj3XfROeeqyQuSw2yXNM/MtkqaB5TV/X7KqLQlVtK7XAFBLgvZLHbq8Rw8tpFXzszwznc8yIl1z3Nibge/7jmZW587n/r7W5h3x/MUd+2mONk50SH1Hx7n3BRJ+d+CqR5r6y5gebK8HLhziq/vnHPTi4GKVvarFqrZ/Pc7wL8Dp0raJOla4CbgzZLWApcl684550ZzFLfaumaEty6t1jUnLAhRIMI5s7HWZqKWevaf3ELmYMTuV2XoX9bNgjl7MTtAoC4Kj85n5TtO5ZGXmrDCIjDjOJ4CaXIprQEpf4x1zk2l2hUQ5fKe7c45l2aGFyTOOecmyfuROOecmwzvR5JWEgpDCEOUyaD6OmhsoNjeyIHFTWw/DzpO2kvP+ll0/LyR4it1ND+3m2j9y5xgmykWi0c+bqb8h+2cm6ZS/rfl6C1InHNuOjCgghNbVYMXJM45l2reait9kl7jYccs1NpC1NJA37HNdC3MUqyD/lahIpz0z11kNh9gbt8uoq4DUCxSjOzQ/CPOOTdlvCBxzjk3YQYU091sywsS55xLtSQTkmJHTUESNDailmZ6lx3HK2fk4PV7WdS+lygqcrA/z84ts2h+PsfsZwo0/XYX0YZNFPr7h5+XpHQ55Y+czrkZIOV/Z46agsQ556Ylb7XlnHNu0vyJxDnn3KR4QVIlI82rDiiTjb/W16FMBjIZus9bwiunZ7AzuujdI467tRnWFwl37qF9307a2QxhGDfz7c9DVBz+upZMoJzyH6xzbqbwfiTOOecmw4DIW20555ybDH8iqRCJoK4OggA1NaFcln3nLWLHO3p532t/xX9seZIA2FBopSPsoV5F9kY5Qoy8hdy08W1sefgETvpMHlvzHFYsHjkJVblNe1P+Q3XOzTAp/5szfQoS55w7GplhxRHqbFPCCxLnnEs770cyeaec3sO//nQVW4sH2VJooNtyvNDfyZfXtHPMt1u4/9Nnct/62XGpXTqUwGGPg9s5ge2jTzSW8sdH59xRKuV/m6ZFQeKcc0ctM2+15ZxzbpL8iWTyfru6kbcvPBvlcgQN9Vi+gBrqWaRXIL+Z4t59ZZ9LdXUoDFFdHRSLhzohRn19KJMhOtiLAmGFwpHHZpLblRxjkcUdF4MQZTOHb0tamamhAcIA6zlI1NMzZnxh5zE
wZxZs3Epx//7hv4dsjgNXnknTD38Tbyj3QyYRNDdjpy0m3LaHwuatgx0vk6mHD33fQRjH09GO6uuJdu7CCoX4fYkD7zyX9l+8SLS/C8sXwKIktTiBD7xEOLtjzHukTGbw/g4IwkP3G7P4ZxGI8NhOyGYobtqKFfIlJwkG058DsQ6cY4opkxn2cza4Q/I9SSiTJZzfifX0UnzllZEPqasjaG0ddZ+BawctLdjx82DtBqLu7nEGX8aApck+QWMjWrII7d5HtGcvhGH8cx6Y12e89z753QpmtVPYum1chwb19US9vUeeMpsjaGrAFs4j6OqmsGFj+bE0Ng7evyoN5Gr+ROKcc27i0t+zvSZT/Um6XNLzktZJuqEWMTjn3LQwMPpvua8amPInEkkhcAvwZmATsFLSXWb27FTH4pxz00LKJ7aSTfEjk6TXA582s7cm6zcCmNlfj3RMqzrsXF06mMcvFlEYxrnXtlbs4EHsYC8EAUFHO7ZvP1aMUBhgvX2YGZYvENTXER08CGYomwOL4nqR/jiHrkBxPUgQEPX2Df7wFAzm4QfyuqV59mS7spm4viDJ+Sqbwwp5grq6Q9cgKsbXUIDl+4Ekb9vXd3jOHuLvsbWZ4q7dR9yToKWFwrITyW3cBWFI4aWNcUwD5xgpV5vcQ2AwPz/Qo18BQS57eA55oF6lvx/r6xu8fn19/D0NyW8HTU0Es9pBorjjFay/P743YRjXVYz0eQtCgob6wTqqYnHwvpdcEzj8Xo1mpIE9U5ImUCaDFYsEDQ3x5w0GP0MDSn+myT2ygfq8kp/HcPVbpfVf8bmG/zwEjY1YX198fFLHZJEdfu2xvo9CIf5ch+Fhn5NDSuuyFKBgcBQJG/hf9NDrjfGzClpaIJ+Pr93QQHTgQDxgq0WHf7ZHq4sZeH/gd7au7tD9D+fOxroOxD+jujqKB7oPrwsd6ZzAI3Yf+223hn1znFqD2XZe5q1l739v/juPmtnZlbh2uWpRR7IAKK3J2gScO3QnSdcB1wHU0zg1kTnnXNp4z/aJM7MVwAqIn0hqHI5zztWOp7aGXHACqS1JXcDzUxPhhM0BdtY6iDJMhzg9xsrwGCtjIjEeb2ZzK3FxST9NYijXTjO7vBLXLlctCpIM8FvgUmAzsBJ4t5k9M8oxq6Y65zde0yFGmB5xeoyV4TFWxnSIsdamPLVlZgVJHwTuAULg1tEKEeecc+lWkzoSM7sbuLsW13bOOVdZNemQOAErah1AGaZDjDA94vQYK8NjrIzpEGNNTXkdiXPOuZllujyROOecS6nUFyRpHZdL0kuSnpL0hKRVybYOSfdKWpt8nTXFMd0qaYekp0u2DRuTYn+X3NfVks6qYYyflrQ5uZdPSHpbyXs3JjE+L6n87r2Ti3GRpPslPSvpGUkfTran5l6OEmPa7mW9pN9IejKJ8zPJ9iWSHkni+a6kXLK9Lllfl7y/uIYxflPSiyX38oxke01+d1LNzFL7Im7V9QJwApADngSW1jquJLaXgDlDtv0NcEOyfAPwuSmO6ULgLODpsWIC3gb8BBBwHvBIDWP8NPDxYfZdmvzM64AlyWchnIIY5wFnJcstxM3Vl6bpXo4SY9rupYDmZDkLPJLcozuAq5PtXwXenyz/CfDVZPlq4Ls1jPGbwDuG2b8mvztpfqX9ieQcYJ2ZrTezfuB24KoaxzSaq4DbkuXbgN+dyoub2S+AoQNzjRTTVcC3LPYw0C5pXo1iHMlVwO1m1mdmLwLriD8TVWVmW83ssWS5C3iOeGif1NzLUWIcSa3upZnZgWQ1m7wMuAT4frJ96L0cuMffBy6VVJExqyYQ40hq8ruTZmkvSIYbl2u0X5apZMDPJD2ajAsG0GlmW5PlbUBnbUI7zEgxpe3efjBJE9xakhKseYxJauVM4v+lpvJeDokRUnYvJYWSngB2APcSPw3tNbOB0SlLYzkUZ/L+PmD2VMdoZgP38q+Se/kFSXVDYxwm/qNS2guSNHuDmZ0
FXAF8QNKFpW9a/AycqiZxaYwp8RXgROAMYCvwt7UNJyapGfgB8BEzO2y6yrTcy2FiTN29NLOimZ0BLCR+CjqtxiEdYWiMkl4D3Egc6+8AHcD1NQwx1dJekGwGFpWsL0y21ZyZbU6+7gB+RPwLsn3gETf5uqN2ER4yUkypubdmtj35RY6Af2Qw5VKzGCVlif9Af9vMfphsTtW9HC7GNN7LAWa2F7gfeD1xOmigQ3RpLIfiTN5vA3bVIMbLk/ShmVkf8A1SdC/TJu0FyUrg5KSFR4648u2uGseEpCZJLQPLwFuAp4ljW57sthy4szYRHmakmO4C/kvSAuU8YF9J2mZKDckv/x7xvYQ4xquTljxLgJOB30xBPAK+DjxnZjeXvJWaezlSjCm8l3MltSfLDcQT2j1H/Mf6HcluQ+/lwD1+B/Dz5OlvqmNcU/KfBhHX4ZTey1T87qRGrWv7x3oRt5D4LXFe9S9qHU8S0wnELWCeBJ4ZiIs4l3sfsBb4v0DHFMf1HeJ0Rp44b3vtSDERtzi5JbmvTwFn1zDGf0piWE38SzqvZP+/SGJ8HrhiimJ8A3HaajXwRPJ6W5ru5Sgxpu1eng48nsTzNPCpZPsJxAXZOuB7QF2yvT5ZX5e8f0INY/x5ci+fBv4Pgy27avK7k+aX92x3zjk3KWlPbTnnnEs5L0icc85NihckzjnnJsULEuecc5PiBYlzzrlJ8YLEVZSkYjJS6jPJaKofk1S1z5mkxSoZSXiC5/jzIeu/nlxUo15rsaR3V+v8ztWCFySu0g6a2Rlm9mrijl1XAH9Z45gOKelNXeqwgsTMzq9iCIsBL0jcjOIFiasai4ePuY54EEElA+N9XtLKZCC8PxrYV9L1iud3eVLSTcm2MyQ9nOz7Iw3O//G6ZL8ngQ+UnGPY80u6WNIvJd0FPFsaY3KthuQp6tvJtgMlxz0o6U5J6yXdJOk9iueueErSicl+cyX9ILnuSkkXJNsv0uBcFo8noyHcBLwx2fZnY8T8C0n/pnj+kK9W88nOuUmpdY9If82sF3BgmG17iUfKvQ7478m2OmAV8dwYVwC/BhqT9wZ6jK8GLkqWPwt8sWT7hcny50nmNhnl/BcD3cCScmIeWE+O20s890cd8XhKn0ne+3BJPP9MPIgnwHHEw5YA/Bi4IFluBjLJOf+15FqjxdxL3AM8JB4194i5MfzlrzS8hnvMd65a3gKcLmlgjKU24jGfLgO+YWY9AGa2W1Ib0G5mDyb73gZ8LxkTqd3ieU0gHhLkijHO3w/8xuJ5OMZrpSXjKEl6AfhZsv0p4E3J8mXAUg1Om9GqeFTeh4CbkyedH5rZJh05tcZYMa9Prv0d4mFRvj/0BM7VmhckrqoknQAUiUfKFfAhM7tnyD6VmvZ1pPNfTPxEMhF9JctRyXrE4O9PAJxnZr1Djr1J0r8Rj4H10Ajf52gxDx2/yMczcqnkOVdXNZLmEk+j+r/NzIB7gPcrHv4cSacoHj35XuAPJTUm2zvMbB+wR9Ibk9P9Z+BBi4f53ivpDcn295RccqTzjyU/cMwE/Qz40MCKBuf2PtHMnjKzzxGPZH0a0EU8NW45MZ+jeOTrAPgD4FeTiNG5qvEnEldpDYpnmssCBeLU08Aw518jbrX0WDI09yvA75rZT5M/vqsk9QN3E7ekWg58NSlg1gN/mJznD4FbJRmDqaYRz19GzCuA1ZIeM7P3jLn3kf4UuEXSauLfqV8Afwx8RNKbiJ9eniGe5zsCiklDgW8CXxol5pXA/wZOIh52/UcTiM25qvPRf51LoSS19XEze3utY3FuLJ7acs45Nyn+ROKcc25S/InEOefcpHhB4pxzblK8IHHOOTcpXpA455ybFC9InHPOTYoXJM455ybl/wG2SGZbPnFFOwAAAABJRU5ErkJggg==\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "fig = display.plot_alignment(alignment)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## WaveFlow: vocoder model\n", - "Generated spectrogram is converted to raw audio using a pretrained waveflow model." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "from paddlespeech.t2s.models.waveflow import ConditionalWaveFlow" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "data:\n", - " batch_size: 8\n", - " clip_frames: 65\n", - " fmax: 8000\n", - " fmin: 0\n", - " hop_length: 256\n", - " n_fft: 1024\n", - " n_mels: 80\n", - " sample_rate: 22050\n", - " valid_size: 16\n", - " win_length: 1024\n", - "model:\n", - " channels: 128\n", - " kernel_size: [3, 3]\n", - " n_flows: 8\n", - " n_group: 16\n", - " n_layers: 8\n", - " sigma: 1.0\n", - " upsample_factors: [16, 16]\n", - "training:\n", - " lr: 0.0002\n", - " max_iteration: 3000000\n", - " save_interval: 10000\n", - " valid_interval: 1000\n" - ] - } - ], - "source": [ - "from examples.waveflow import config as waveflow_config\n", - "vocoder_config = waveflow_config.get_cfg_defaults()\n", - "print(vocoder_config)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[checkpoint] Rank 0: loaded model from ../../pretrained/waveflow/waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams\n" - ] - } - ], - "source": [ - "vocoder = ConditionalWaveFlow.from_pretrained(\n", - " vocoder_config, \n", - " \"../../pretrained/waveflow/waveflow_ljspeech_ckpt_0.3/step-2000000\")\n", - "layer_tools.recursively_remove_weight_norm(vocoder)\n", - "vocoder.eval()" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - 
"metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "time: 9.412613868713379s\n" - ] - } - ], - "source": [ - "audio = vocoder.infer(paddle.transpose(outputs[\"mel_outputs_postnet\"], [0, 2, 1]))\n", - "wav = audio[0].numpy()" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ipd.Audio(wav, rate=22050)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.7" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/paddlespeech/t2s/exps/tacotron2/synthesize.py b/paddlespeech/t2s/exps/tacotron2/synthesize.py deleted file mode 100644 index c73c32d2..00000000 --- a/paddlespeech/t2s/exps/tacotron2/synthesize.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import argparse -from pathlib import Path - -import numpy as np -import paddle -from matplotlib import pyplot as plt - -from paddlespeech.t2s.exps.tacotron2.config import get_cfg_defaults -from paddlespeech.t2s.frontend import EnglishCharacter -from paddlespeech.t2s.models.tacotron2 import Tacotron2 -from paddlespeech.t2s.utils import display - - -def main(config, args): - if args.ngpu == 0: - paddle.set_device("cpu") - elif args.ngpu > 0: - paddle.set_device("gpu") - else: - print("ngpu should >= 0 !") - - # model - frontend = EnglishCharacter() - model = Tacotron2.from_pretrained(config, args.checkpoint_path) - model.eval() - - # inputs - input_path = Path(args.input).expanduser() - sentences = [] - with open(input_path, "rt") as f: - for line in f: - line_list = line.strip().split() - utt_id = line_list[0] - sentence = " ".join(line_list[1:]) - sentences.append((utt_id, sentence)) - - if args.output is None: - output_dir = input_path.parent / "synthesis" - else: - output_dir = Path(args.output).expanduser() - output_dir.mkdir(exist_ok=True) - - for i, sentence in enumerate(sentences): - sentence = paddle.to_tensor(frontend(sentence)).unsqueeze(0) - outputs = model.infer(sentence) - mel_output = outputs["mel_outputs_postnet"][0].numpy().T - alignment = outputs["alignments"][0].numpy().T - - np.save(str(output_dir / f"sentence_{i}"), mel_output) - display.plot_alignment(alignment) - plt.savefig(str(output_dir / f"sentence_{i}.png")) - if args.verbose: - print("spectrogram saved at {}".format(output_dir / - f"sentence_{i}.npy")) - - -if __name__ == "__main__": - config = get_cfg_defaults() - - parser = argparse.ArgumentParser( - description="generate mel spectrogram with TransformerTTS.") - parser.add_argument( - "--config", - type=str, - metavar="FILE", - help="extra config to overwrite the default config") - parser.add_argument( - "--checkpoint_path", type=str, help="path of the checkpoint to load.") - parser.add_argument("--input", type=str, help="path of the 
text sentences") - parser.add_argument("--output", type=str, help="path to save outputs") - parser.add_argument( - "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") - parser.add_argument( - "--opts", - nargs=argparse.REMAINDER, - help="options to overwrite --config file and the default config, passing in KEY VALUE pairs" - ) - parser.add_argument( - "-v", "--verbose", action="store_true", help="print msg") - - args = parser.parse_args() - if args.config: - config.merge_from_file(args.config) - if args.opts: - config.merge_from_list(args.opts) - config.freeze() - print(config) - print(args) - - main(config, args) diff --git a/paddlespeech/t2s/exps/tacotron2/train.py b/paddlespeech/t2s/exps/tacotron2/train.py index 8198348f..69ff80e4 100644 --- a/paddlespeech/t2s/exps/tacotron2/train.py +++ b/paddlespeech/t2s/exps/tacotron2/train.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,210 +11,192 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import time -from collections import defaultdict +import argparse +import logging +import os +import shutil +from pathlib import Path +import jsonlines import numpy as np import paddle +import yaml +from paddle import DataParallel from paddle import distributed as dist from paddle.io import DataLoader from paddle.io import DistributedBatchSampler +from yacs.config import CfgNode -from paddlespeech.t2s.data import dataset -from paddlespeech.t2s.exps.tacotron2.config import get_cfg_defaults -from paddlespeech.t2s.exps.tacotron2.ljspeech import LJSpeech -from paddlespeech.t2s.exps.tacotron2.ljspeech import LJSpeechCollector +from paddlespeech.t2s.datasets.am_batch_fn import tacotron2_multi_spk_batch_fn +from paddlespeech.t2s.datasets.am_batch_fn import tacotron2_single_spk_batch_fn +from paddlespeech.t2s.datasets.data_table import DataTable from paddlespeech.t2s.models.tacotron2 import Tacotron2 -from paddlespeech.t2s.models.tacotron2 import Tacotron2Loss -from paddlespeech.t2s.training.cli import default_argument_parser -from paddlespeech.t2s.training.experiment import ExperimentBase -from paddlespeech.t2s.utils import display -from paddlespeech.t2s.utils import mp_tools - - -class Experiment(ExperimentBase): - def compute_losses(self, inputs, outputs): - texts, mel_targets, plens, slens = inputs - - mel_outputs = outputs["mel_output"] - mel_outputs_postnet = outputs["mel_outputs_postnet"] - attention_weight = outputs["alignments"] - if self.config.model.use_stop_token: - stop_logits = outputs["stop_logits"] - else: - stop_logits = None - - losses = self.criterion(mel_outputs, mel_outputs_postnet, mel_targets, - attention_weight, slens, plens, stop_logits) - return losses - - def train_batch(self): - start = time.time() - batch = self.read_batch() - data_loader_time = time.time() - start - - self.optimizer.clear_grad() - self.model.train() - texts, mels, text_lens, output_lens = batch - outputs = self.model(texts, text_lens, mels, output_lens) - losses = 
self.compute_losses(batch, outputs) - loss = losses["loss"] - loss.backward() - self.optimizer.step() - iteration_time = time.time() - start - - losses_np = {k: float(v) for k, v in losses.items()} - # logging - msg = "Rank: {}, ".format(dist.get_rank()) - msg += "step: {}, ".format(self.iteration) - msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time, - iteration_time) - msg += ', '.join('{}: {:>.6f}'.format(k, v) - for k, v in losses_np.items()) - self.logger.info(msg) - - if dist.get_rank() == 0: - for k, v in losses_np.items(): - self.visualizer.add_scalar(f"train_loss/{k}", v, self.iteration) - - @mp_tools.rank_zero_only - @paddle.no_grad() - def valid(self): - valid_losses = defaultdict(list) - for i, batch in enumerate(self.valid_loader): - texts, mels, text_lens, output_lens = batch - outputs = self.model(texts, text_lens, mels, output_lens) - losses = self.compute_losses(batch, outputs) - for k, v in losses.items(): - valid_losses[k].append(float(v)) - - attention_weights = outputs["alignments"] - self.visualizer.add_figure( - f"valid_sentence_{i}_alignments", - display.plot_alignment(attention_weights[0].numpy().T), - self.iteration) - self.visualizer.add_figure( - f"valid_sentence_{i}_target_spectrogram", - display.plot_spectrogram(mels[0].numpy().T), self.iteration) - self.visualizer.add_figure( - f"valid_sentence_{i}_predicted_spectrogram", - display.plot_spectrogram(outputs['mel_outputs_postnet'][0] - .numpy().T), self.iteration) - - # write visual log - valid_losses = {k: np.mean(v) for k, v in valid_losses.items()} - - # logging - msg = "Valid: " - msg += "step: {}, ".format(self.iteration) - msg += ', '.join('{}: {:>.6f}'.format(k, v) - for k, v in valid_losses.items()) - self.logger.info(msg) - - for k, v in valid_losses.items(): - self.visualizer.add_scalar(f"valid/{k}", v, self.iteration) - - def setup_model(self): - config = self.config - model = Tacotron2( - vocab_size=config.model.vocab_size, - d_mels=config.data.n_mels, - 
d_encoder=config.model.d_encoder, - encoder_conv_layers=config.model.encoder_conv_layers, - encoder_kernel_size=config.model.encoder_kernel_size, - d_prenet=config.model.d_prenet, - d_attention_rnn=config.model.d_attention_rnn, - d_decoder_rnn=config.model.d_decoder_rnn, - attention_filters=config.model.attention_filters, - attention_kernel_size=config.model.attention_kernel_size, - d_attention=config.model.d_attention, - d_postnet=config.model.d_postnet, - postnet_kernel_size=config.model.postnet_kernel_size, - postnet_conv_layers=config.model.postnet_conv_layers, - reduction_factor=config.model.reduction_factor, - p_encoder_dropout=config.model.p_encoder_dropout, - p_prenet_dropout=config.model.p_prenet_dropout, - p_attention_dropout=config.model.p_attention_dropout, - p_decoder_dropout=config.model.p_decoder_dropout, - p_postnet_dropout=config.model.p_postnet_dropout, - use_stop_token=config.model.use_stop_token) - - if self.parallel: - model = paddle.DataParallel(model) - - grad_clip = paddle.nn.ClipGradByGlobalNorm( - config.training.grad_clip_thresh) - optimizer = paddle.optimizer.Adam( - learning_rate=config.training.lr, - parameters=model.parameters(), - weight_decay=paddle.regularizer.L2Decay( - config.training.weight_decay), - grad_clip=grad_clip) - criterion = Tacotron2Loss( - use_stop_token_loss=config.model.use_stop_token, - use_guided_attention_loss=config.model.use_guided_attention_loss, - sigma=config.model.guided_attention_loss_sigma) - self.model = model - self.optimizer = optimizer - self.criterion = criterion - - def setup_dataloader(self): - args = self.args - config = self.config - ljspeech_dataset = LJSpeech(args.data) - - valid_set, train_set = dataset.split(ljspeech_dataset, - config.data.valid_size) - batch_fn = LJSpeechCollector(padding_idx=config.data.padding_idx) - - if not self.parallel: - self.train_loader = DataLoader( - train_set, - batch_size=config.data.batch_size, - shuffle=True, - drop_last=True, - collate_fn=batch_fn) - else: - 
sampler = DistributedBatchSampler( - train_set, - batch_size=config.data.batch_size, - shuffle=True, - drop_last=True) - self.train_loader = DataLoader( - train_set, batch_sampler=sampler, collate_fn=batch_fn) - - self.valid_loader = DataLoader( - valid_set, - batch_size=config.data.batch_size, - shuffle=False, - drop_last=False, - collate_fn=batch_fn) - - -def main_sp(config, args): - exp = Experiment(config, args) - exp.setup() - exp.resume_or_load() - exp.run() - - -def main(config, args): - if args.ngpu > 1: - dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) +from paddlespeech.t2s.models.tacotron2 import Tacotron2Evaluator +from paddlespeech.t2s.models.tacotron2 import Tacotron2Updater +from paddlespeech.t2s.training.extensions.snapshot import Snapshot +from paddlespeech.t2s.training.extensions.visualizer import VisualDL +from paddlespeech.t2s.training.optimizer import build_optimizers +from paddlespeech.t2s.training.seeding import seed_everything +from paddlespeech.t2s.training.trainer import Trainer +from paddlespeech.t2s.utils import str2bool + + +def train_sp(args, config): + # decides device type and whether to run in parallel + # setup running environment correctly + if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0: + paddle.set_device("cpu") else: - main_sp(config, args) - + paddle.set_device("gpu") + world_size = paddle.distributed.get_world_size() + if world_size > 1: + paddle.distributed.init_parallel_env() + + # set the random seed, it is a must for multiprocess training + seed_everything(config.seed) + + print( + f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}", + ) + + # dataloader has been too verbose + logging.getLogger("DataLoader").disabled = True + + fields = [ + "text", + "text_lengths", + "speech", + "speech_lengths", + ] + + converters = { + "speech": np.load, + } + if args.voice_cloning: + print("Training voice cloning!") + collate_fn = tacotron2_multi_spk_batch_fn + fields += ["spk_emb"] + 
converters["spk_emb"] = np.load + else: + print("single speaker tacotron2!") + collate_fn = tacotron2_single_spk_batch_fn + + # construct dataset for training and validation + with jsonlines.open(args.train_metadata, 'r') as reader: + train_metadata = list(reader) + train_dataset = DataTable( + data=train_metadata, + fields=fields, + converters=converters, ) + with jsonlines.open(args.dev_metadata, 'r') as reader: + dev_metadata = list(reader) + dev_dataset = DataTable( + data=dev_metadata, + fields=fields, + converters=converters, ) + + # collate function and dataloader + train_sampler = DistributedBatchSampler( + train_dataset, + batch_size=config.batch_size, + shuffle=True, + drop_last=True) + + print("samplers done!") + + train_dataloader = DataLoader( + train_dataset, + batch_sampler=train_sampler, + collate_fn=collate_fn, + num_workers=config.num_workers) + + dev_dataloader = DataLoader( + dev_dataset, + shuffle=False, + drop_last=False, + batch_size=config.batch_size, + collate_fn=collate_fn, + num_workers=config.num_workers) + print("dataloaders done!") + + with open(args.phones_dict, "r") as f: + phn_id = [line.strip().split() for line in f.readlines()] + vocab_size = len(phn_id) + print("vocab_size:", vocab_size) + + odim = config.n_mels + model = Tacotron2(idim=vocab_size, odim=odim, **config["model"]) + if world_size > 1: + model = DataParallel(model) + print("model done!") + + optimizer = build_optimizers(model, **config["optimizer"]) + print("optimizer done!") + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + if dist.get_rank() == 0: + config_name = args.config.split("/")[-1] + # copy conf to output_dir + shutil.copyfile(args.config, output_dir / config_name) + + updater = Tacotron2Updater( + model=model, + optimizer=optimizer, + dataloader=train_dataloader, + output_dir=output_dir, + **config["updater"]) + + trainer = Trainer(updater, (config.max_epoch, 'epoch'), output_dir) + + evaluator = Tacotron2Evaluator( + 
model, dev_dataloader, output_dir=output_dir, **config["updater"]) + + if dist.get_rank() == 0: + trainer.extend(evaluator, trigger=(1, "epoch")) + trainer.extend(VisualDL(output_dir), trigger=(1, "iteration")) + trainer.extend( + Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch')) + trainer.run() + + +def main(): + # parse args and config and redirect to train_sp + parser = argparse.ArgumentParser(description="Train a Tacotron2 model.") + parser.add_argument("--config", type=str, help="tacotron2 config file.") + parser.add_argument("--train-metadata", type=str, help="training data.") + parser.add_argument("--dev-metadata", type=str, help="dev data.") + parser.add_argument("--output-dir", type=str, help="output dir.") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") + parser.add_argument( + "--phones-dict", type=str, default=None, help="phone vocabulary file.") + + parser.add_argument( + "--voice-cloning", + type=str2bool, + default=False, + help="whether training voice cloning model.") -if __name__ == "__main__": - config = get_cfg_defaults() - parser = default_argument_parser() args = parser.parse_args() - if args.config: - config.merge_from_file(args.config) - if args.opts: - config.merge_from_list(args.opts) - config.freeze() + + with open(args.config) as f: + config = CfgNode(yaml.safe_load(f)) + + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") print(config) - print(args) + print( + f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}" + ) - main(config, args) + # dispatch + if args.ngpu > 1: + dist.spawn(train_sp, (args, config), nprocs=args.ngpu) + else: + train_sp(args, config) + + +if __name__ == "__main__": + main() diff --git a/paddlespeech/t2s/exps/transformer_tts/normalize.py b/paddlespeech/t2s/exps/transformer_tts/normalize.py index 4bb77c79..87e975b8 100644 --- a/paddlespeech/t2s/exps/transformer_tts/normalize.py +++ 
b/paddlespeech/t2s/exps/transformer_tts/normalize.py @@ -130,6 +130,9 @@ def main(): "speech_lengths": item['speech_lengths'], "speech": str(speech_path), } + # add spk_emb for voice cloning + if "spk_emb" in item: + record["spk_emb"] = str(item["spk_emb"]) output_metadata.append(record) output_metadata.sort(key=itemgetter('utt_id')) output_metadata_path = Path(args.dumpdir) / "metadata.jsonl" diff --git a/paddlespeech/t2s/exps/transformer_tts/preprocess.py b/paddlespeech/t2s/exps/transformer_tts/preprocess.py index 93158b67..9aa87e91 100644 --- a/paddlespeech/t2s/exps/transformer_tts/preprocess.py +++ b/paddlespeech/t2s/exps/transformer_tts/preprocess.py @@ -26,20 +26,17 @@ import tqdm import yaml from yacs.config import CfgNode as Configuration -from paddlespeech.t2s.data.get_feats import LogMelFBank +from paddlespeech.t2s.datasets.get_feats import LogMelFBank from paddlespeech.t2s.frontend import English def get_lj_sentences(file_name, frontend): - ''' - read MFA duration.txt - Parameters - ---------- - file_name : str or Path - Returns - ---------- - Dict - sentence: {'utt': ([char], [int])} + '''read MFA duration.txt + + Args: + file_name (str or Path) + Returns: + Dict: sentence: {'utt': ([char], [int])} ''' f = open(file_name, 'r') sentence = {} @@ -59,14 +56,11 @@ def get_lj_sentences(file_name, frontend): def get_input_token(sentence, output_path): - ''' - get phone set from training data and save it - Parameters - ---------- - sentence : Dict - sentence: {'utt': ([char], str)} - output_path : str or path - path to save phone_id_map + '''get phone set from training data and save it + + Args: + sentence (Dict): sentence: {'utt': ([char], str)} + output_path (str or path): path to save phone_id_map ''' phn_token = set() for utt in sentence: diff --git a/paddlespeech/t2s/exps/transformer_tts/train.py b/paddlespeech/t2s/exps/transformer_tts/train.py index 8695c06a..d521ce89 100644 --- a/paddlespeech/t2s/exps/transformer_tts/train.py +++ 
b/paddlespeech/t2s/exps/transformer_tts/train.py @@ -148,9 +148,8 @@ def train_sp(args, config): if dist.get_rank() == 0: trainer.extend(evaluator, trigger=(1, "epoch")) trainer.extend(VisualDL(output_dir), trigger=(1, "iteration")) - trainer.extend( - Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch')) - # print(trainer.extensions) + trainer.extend( + Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch')) trainer.run() diff --git a/paddlespeech/t2s/exps/fastspeech2/voice_cloning.py b/paddlespeech/t2s/exps/voice_cloning.py similarity index 57% rename from paddlespeech/t2s/exps/fastspeech2/voice_cloning.py rename to paddlespeech/t2s/exps/voice_cloning.py index 9fbd4964..3de30774 100644 --- a/paddlespeech/t2s/exps/fastspeech2/voice_cloning.py +++ b/paddlespeech/t2s/exps/voice_cloning.py @@ -21,17 +21,43 @@ import soundfile as sf import yaml from yacs.config import CfgNode +from paddlespeech.s2t.utils.dynamic_import import dynamic_import from paddlespeech.t2s.frontend.zh_frontend import Frontend -from paddlespeech.t2s.models.fastspeech2 import FastSpeech2 -from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Inference -from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator -from paddlespeech.t2s.models.parallel_wavegan import PWGInference from paddlespeech.t2s.modules.normalizer import ZScore from paddlespeech.vector.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor from paddlespeech.vector.models.lstm_speaker_encoder import LSTMSpeakerEncoder +model_alias = { + # acoustic model + "fastspeech2": + "paddlespeech.t2s.models.fastspeech2:FastSpeech2", + "fastspeech2_inference": + "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference", + "tacotron2": + "paddlespeech.t2s.models.tacotron2:Tacotron2", + "tacotron2_inference": + "paddlespeech.t2s.models.tacotron2:Tacotron2Inference", + # voc + "pwgan": + "paddlespeech.t2s.models.parallel_wavegan:PWGGenerator", + "pwgan_inference": + 
"paddlespeech.t2s.models.parallel_wavegan:PWGInference", +} + + +def voice_cloning(args): + # Init body. + with open(args.am_config) as f: + am_config = CfgNode(yaml.safe_load(f)) + with open(args.voc_config) as f: + voc_config = CfgNode(yaml.safe_load(f)) + + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(am_config) + print(voc_config) -def voice_cloning(args, fastspeech2_config, pwg_config): # speaker encoder p = SpeakerVerificationPreprocessor( sampling_rate=16000, @@ -57,40 +83,52 @@ def voice_cloning(args, fastspeech2_config, pwg_config): phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) print("vocab_size:", vocab_size) - odim = fastspeech2_config.n_mels - model = FastSpeech2( - idim=vocab_size, odim=odim, **fastspeech2_config["model"]) - model.set_state_dict( - paddle.load(args.fastspeech2_checkpoint)["main_params"]) - model.eval() - - vocoder = PWGGenerator(**pwg_config["generator_params"]) - vocoder.set_state_dict(paddle.load(args.pwg_checkpoint)["generator_params"]) - vocoder.remove_weight_norm() - vocoder.eval() - print("model done!") + # acoustic model + odim = am_config.n_mels + # model: {model_name}_{dataset} + am_name = args.am[:args.am.rindex('_')] + am_dataset = args.am[args.am.rindex('_') + 1:] + + am_class = dynamic_import(am_name, model_alias) + am_inference_class = dynamic_import(am_name + '_inference', model_alias) + + if am_name == 'fastspeech2': + am = am_class( + idim=vocab_size, odim=odim, spk_num=None, **am_config["model"]) + elif am_name == 'tacotron2': + am = am_class(idim=vocab_size, odim=odim, **am_config["model"]) + + am.set_state_dict(paddle.load(args.am_ckpt)["main_params"]) + am.eval() + am_mu, am_std = np.load(args.am_stat) + am_mu = paddle.to_tensor(am_mu) + am_std = paddle.to_tensor(am_std) + am_normalizer = ZScore(am_mu, am_std) + am_inference = am_inference_class(am_normalizer, am) + am_inference.eval() + print("acoustic model done!") 
+ + # vocoder + # model: {model_name}_{dataset} + voc_name = args.voc[:args.voc.rindex('_')] + voc_class = dynamic_import(voc_name, model_alias) + voc_inference_class = dynamic_import(voc_name + '_inference', model_alias) + voc = voc_class(**voc_config["generator_params"]) + voc.set_state_dict(paddle.load(args.voc_ckpt)["generator_params"]) + voc.remove_weight_norm() + voc.eval() + voc_mu, voc_std = np.load(args.voc_stat) + voc_mu = paddle.to_tensor(voc_mu) + voc_std = paddle.to_tensor(voc_std) + voc_normalizer = ZScore(voc_mu, voc_std) + voc_inference = voc_inference_class(voc_normalizer, voc) + voc_inference.eval() + print("voc done!") frontend = Frontend(phone_vocab_path=args.phones_dict) print("frontend done!") - stat = np.load(args.fastspeech2_stat) - mu, std = stat - mu = paddle.to_tensor(mu) - std = paddle.to_tensor(std) - fastspeech2_normalizer = ZScore(mu, std) - - stat = np.load(args.pwg_stat) - mu, std = stat - mu = paddle.to_tensor(mu) - std = paddle.to_tensor(std) - pwg_normalizer = ZScore(mu, std) - - fastspeech2_inference = FastSpeech2Inference(fastspeech2_normalizer, model) - fastspeech2_inference.eval() - pwg_inference = PWGInference(pwg_normalizer, vocoder) - pwg_inference.eval() - output_dir = Path(args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) @@ -112,24 +150,23 @@ def voice_cloning(args, fastspeech2_config, pwg_config): # print("spk_emb shape: ", spk_emb.shape) with paddle.no_grad(): - wav = pwg_inference( - fastspeech2_inference(phone_ids, spk_emb=spk_emb)) + wav = voc_inference(am_inference(phone_ids, spk_emb=spk_emb)) sf.write( str(output_dir / (utt_id + ".wav")), wav.numpy(), - samplerate=fastspeech2_config.fs) + samplerate=am_config.fs) print(f"{utt_id} done!") # Randomly generate numbers of 0 ~ 0.2, 256 is the dim of spk_emb random_spk_emb = np.random.rand(256) * 0.2 random_spk_emb = paddle.to_tensor(random_spk_emb) utt_id = "random_spk_emb" with paddle.no_grad(): - wav = pwg_inference(fastspeech2_inference(phone_ids, 
spk_emb=spk_emb)) + wav = voc_inference(am_inference(phone_ids, spk_emb=spk_emb)) sf.write( str(output_dir / (utt_id + ".wav")), wav.numpy(), - samplerate=fastspeech2_config.fs) + samplerate=am_config.fs) print(f"{utt_id} done!") @@ -137,32 +174,53 @@ def main(): # parse args and config and redirect to train_sp parser = argparse.ArgumentParser(description="") parser.add_argument( - "--fastspeech2-config", type=str, help="fastspeech2 config file.") - parser.add_argument( - "--fastspeech2-checkpoint", + '--am', type=str, - help="fastspeech2 checkpoint to load.") + default='fastspeech2_csmsc', + choices=['fastspeech2_aishell3', 'tacotron2_aishell3'], + help='Choose acoustic model type of tts task.') parser.add_argument( - "--fastspeech2-stat", + '--am_config', type=str, - help="mean and standard deviation used to normalize spectrogram when training fastspeech2." - ) + default=None, + help='Config of acoustic model. Use deault config when it is None.') parser.add_argument( - "--pwg-config", type=str, help="parallel wavegan config file.") - parser.add_argument( - "--pwg-checkpoint", + '--am_ckpt', type=str, - help="parallel wavegan generator parameters to load.") + default=None, + help='Checkpoint file of acoustic model.') parser.add_argument( - "--pwg-stat", + "--am_stat", type=str, - help="mean and standard deviation used to normalize spectrogram when training parallel wavegan." + default=None, + help="mean and standard deviation used to normalize spectrogram when training acoustic model." ) parser.add_argument( "--phones-dict", type=str, default="phone_id_map.txt", help="phone vocabulary file.") + # vocoder + parser.add_argument( + '--voc', + type=str, + default='pwgan_csmsc', + choices=['pwgan_aishell3'], + help='Choose vocoder type of tts task.') + + parser.add_argument( + '--voc_config', + type=str, + default=None, + help='Config of voc. 
Use deault config when it is None.') + parser.add_argument( + '--voc_ckpt', type=str, default=None, help='Checkpoint file of voc.') + parser.add_argument( + "--voc_stat", + type=str, + default=None, + help="mean and standard deviation used to normalize spectrogram when training voc." + ) parser.add_argument( "--text", type=str, @@ -190,18 +248,7 @@ def main(): else: print("ngpu should >= 0 !") - with open(args.fastspeech2_config) as f: - fastspeech2_config = CfgNode(yaml.safe_load(f)) - with open(args.pwg_config) as f: - pwg_config = CfgNode(yaml.safe_load(f)) - - print("========Args========") - print(yaml.safe_dump(vars(args))) - print("========Config========") - print(fastspeech2_config) - print(pwg_config) - - voice_cloning(args, fastspeech2_config, pwg_config) + voice_cloning(args) if __name__ == "__main__": diff --git a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/aishell3.py b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/aishell3.py deleted file mode 100644 index da95582d..00000000 --- a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/aishell3.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import pickle -from pathlib import Path - -import numpy as np -from paddle.io import Dataset - -from paddlespeech.t2s.data import batch_spec -from paddlespeech.t2s.data import batch_text_id -from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.preprocess_transcription import _phones -from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.preprocess_transcription import _tones -from paddlespeech.t2s.frontend import Vocab - -voc_phones = Vocab(sorted(list(_phones))) -print("vocab_phones:\n", voc_phones) -voc_tones = Vocab(sorted(list(_tones))) -print("vocab_tones:\n", voc_tones) - - -class AiShell3(Dataset): - """Processed AiShell3 dataset.""" - - def __init__(self, root): - super().__init__() - self.root = Path(root).expanduser() - self.embed_dir = self.root / "embed" - self.mel_dir = self.root / "mel" - - with open(self.root / "metadata.pickle", 'rb') as f: - self.records = pickle.load(f) - - def __getitem__(self, index): - metadatum = self.records[index] - sentence_id = metadatum["sentence_id"] - speaker_id = sentence_id[:7] - phones = metadatum["phones"] - tones = metadatum["tones"] - phones = np.array( - [voc_phones.lookup(item) for item in phones], dtype=np.int64) - tones = np.array( - [voc_tones.lookup(item) for item in tones], dtype=np.int64) - mel = np.load(str(self.mel_dir / speaker_id / (sentence_id + ".npy"))) - embed = np.load( - str(self.embed_dir / speaker_id / (sentence_id + ".npy"))) - return phones, tones, mel, embed - - def __len__(self): - return len(self.records) - - -def collate_aishell3_examples(examples): - phones, tones, mel, embed = list(zip(*examples)) - - text_lengths = np.array([item.shape[0] for item in phones], dtype=np.int64) - spec_lengths = np.array([item.shape[1] for item in mel], dtype=np.int64) - T_dec = np.max(spec_lengths) - stop_tokens = ( - np.arange(T_dec) >= np.expand_dims(spec_lengths, -1)).astype(np.float32) - phones, _ = batch_text_id(phones) - tones, _ = batch_text_id(tones) - mel, _ = batch_spec(mel) - mel = 
np.transpose(mel, (0, 2, 1)) - embed = np.stack(embed) - # 7 fields - # (B, T), (B, T), (B, T, C), (B, C), (B,), (B,), (B, T) - return phones, tones, mel, embed, text_lengths, spec_lengths, stop_tokens - - -if __name__ == "__main__": - dataset = AiShell3("~/datasets/aishell3/train") - example = dataset[0] - - examples = [dataset[i] for i in range(10)] - batch = collate_aishell3_examples(examples) - - for field in batch: - print(field.shape, field.dtype) diff --git a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/chinese_g2p.py b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/chinese_g2p.py deleted file mode 100644 index 12de3bb7..00000000 --- a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/chinese_g2p.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from typing import List -from typing import Tuple - -from pypinyin import lazy_pinyin -from pypinyin import Style - -from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.preprocess_transcription import split_syllable - - -def convert_to_pinyin(text: str) -> List[str]: - """convert text into list of syllables, other characters that are not chinese, thus - cannot be converted to pinyin are splited. 
- """ - syllables = lazy_pinyin( - text, style=Style.TONE3, neutral_tone_with_five=True) - return syllables - - -def convert_sentence(text: str) -> List[Tuple[str]]: - """convert a sentence into two list: phones and tones""" - syllables = convert_to_pinyin(text) - phones = [] - tones = [] - for syllable in syllables: - p, t = split_syllable(syllable) - phones.extend(p) - tones.extend(t) - - return phones, tones diff --git a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/config.py b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/config.py deleted file mode 100644 index 8d8c9c4e..00000000 --- a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/config.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from yacs.config import CfgNode as CN - -_C = CN() -_C.data = CN( - dict( - batch_size=32, # batch size - valid_size=64, # the first N examples are reserved for validation - sample_rate=22050, # Hz, sample rate - n_fft=1024, # fft frame size - win_length=1024, # window size - hop_length=256, # hop size between ajacent frame - fmax=8000, # Hz, max frequency when converting to mel - fmin=0, # Hz, min frequency when converting to mel - d_mels=80, # mel bands - padding_idx=0, # text embedding's padding index - )) - -_C.model = CN( - dict( - vocab_size=70, - n_tones=10, - reduction_factor=1, # reduction factor - d_encoder=512, # embedding & encoder's internal size - encoder_conv_layers=3, # number of conv layer in tacotron2 encoder - encoder_kernel_size=5, # kernel size of conv layers in tacotron2 encoder - d_prenet=256, # hidden size of decoder prenet - # hidden size of the first rnn layer in tacotron2 decoder - d_attention_rnn=1024, - # hidden size of the second rnn layer in tacotron2 decoder - d_decoder_rnn=1024, - d_attention=128, # hidden size of decoder location linear layer - attention_filters=32, # number of filter in decoder location conv layer - attention_kernel_size=31, # kernel size of decoder location conv layer - d_postnet=512, # hidden size of decoder postnet - postnet_kernel_size=5, # kernel size of conv layers in postnet - postnet_conv_layers=5, # number of conv layer in decoder postnet - p_encoder_dropout=0.5, # droput probability in encoder - p_prenet_dropout=0.5, # droput probability in decoder prenet - - # droput probability of first rnn layer in decoder - p_attention_dropout=0.1, - # droput probability of second rnn layer in decoder - p_decoder_dropout=0.1, - p_postnet_dropout=0.5, # droput probability in decoder postnet - guided_attention_loss_sigma=0.2, - d_global_condition=256, - - # whether to use a classifier to predict stop probability - use_stop_token=False, - # whether to use guided attention loss in training - 
use_guided_attention_loss=True, )) - -_C.training = CN( - dict( - lr=1e-3, # learning rate - weight_decay=1e-6, # the coeff of weight decay - grad_clip_thresh=1.0, # the clip norm of grad clip. - valid_interval=1000, # validation - save_interval=1000, # checkpoint - max_iteration=500000, # max iteration to train - )) - - -def get_cfg_defaults(): - """Get a yacs CfgNode object with default values for my_project.""" - # Return a clone so that the defaults will not be altered - # This is for the "local variable" use pattern - return _C.clone() diff --git a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/extract_mel.py b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/extract_mel.py deleted file mode 100644 index d12466f6..00000000 --- a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/extract_mel.py +++ /dev/null @@ -1,95 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import argparse -import multiprocessing as mp -from functools import partial -from pathlib import Path - -import numpy as np -import tqdm - -from paddlespeech.t2s.audio import AudioProcessor -from paddlespeech.t2s.audio.spec_normalizer import LogMagnitude -from paddlespeech.t2s.audio.spec_normalizer import NormalizerBase -from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.config import get_cfg_defaults - - -def extract_mel(fname: Path, - input_dir: Path, - output_dir: Path, - p: AudioProcessor, - n: NormalizerBase): - relative_path = fname.relative_to(input_dir) - out_path = (output_dir / relative_path).with_suffix(".npy") - out_path.parent.mkdir(parents=True, exist_ok=True) - wav = p.read_wav(fname) - mel = p.mel_spectrogram(wav) - mel = n.transform(mel) - np.save(out_path, mel) - - -def extract_mel_multispeaker(config, input_dir, output_dir, extension=".wav"): - input_dir = Path(input_dir).expanduser() - fnames = list(input_dir.rglob(f"*{extension}")) - output_dir = Path(output_dir).expanduser() - output_dir.mkdir(parents=True, exist_ok=True) - - p = AudioProcessor(config.sample_rate, config.n_fft, config.win_length, - config.hop_length, config.d_mels, config.fmin, - config.fmax) - n = LogMagnitude(1e-5) - - func = partial( - extract_mel, input_dir=input_dir, output_dir=output_dir, p=p, n=n) - - with mp.Pool(16) as pool: - list( - tqdm.tqdm( - pool.imap(func, fnames), total=len(fnames), unit="utterance")) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Extract mel spectrogram from processed wav in AiShell3 training dataset." 
- ) - parser.add_argument( - "--config", - type=str, - help="yaml config file to overwrite the default config") - parser.add_argument( - "--input", - type=str, - default="~/datasets/aishell3/train/normalized_wav", - help="path of the processed wav folder") - parser.add_argument( - "--output", - type=str, - default="~/datasets/aishell3/train/mel", - help="path of the folder to save mel spectrograms") - parser.add_argument( - "--opts", - nargs=argparse.REMAINDER, - help="options to overwrite --config file and the default config, passing in KEY VALUE pairs" - ) - default_config = get_cfg_defaults() - - args = parser.parse_args() - if args.config: - default_config.merge_from_file(args.config) - if args.opts: - default_config.merge_from_list(args.opts) - default_config.freeze() - audio_config = default_config.data - - extract_mel_multispeaker(audio_config, args.input, args.output) diff --git a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/lexicon.txt b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/lexicon.txt deleted file mode 100644 index cc56b55d..00000000 --- a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/lexicon.txt +++ /dev/null @@ -1,4150 +0,0 @@ -zhi1 zh iii1 -zhi2 zh iii2 -zhi3 zh iii3 -zhi4 zh iii4 -zhi5 zh iii5 -chi1 ch iii1 -chi2 ch iii2 -chi3 ch iii3 -chi4 ch iii4 -chi5 ch iii5 -shi1 sh iii1 -shi2 sh iii2 -shi3 sh iii3 -shi4 sh iii4 -shi5 sh iii5 -ri1 r iii1 -ri2 r iii2 -ri3 r iii3 -ri4 r iii4 -ri5 r iii5 -zi1 z ii1 -zi2 z ii2 -zi3 z ii3 -zi4 z ii4 -zi5 z ii5 -ci1 c ii1 -ci2 c ii2 -ci3 c ii3 -ci4 c ii4 -ci5 c ii5 -si1 s ii1 -si2 s ii2 -si3 s ii3 -si4 s ii4 -si5 s ii5 -a1 a1 -a2 a2 -a3 a3 -a4 a4 -a5 a5 -ba1 b a1 -ba2 b a2 -ba3 b a3 -ba4 b a4 -ba5 b a5 -pa1 p a1 -pa2 p a2 -pa3 p a3 -pa4 p a4 -pa5 p a5 -ma1 m a1 -ma2 m a2 -ma3 m a3 -ma4 m a4 -ma5 m a5 -fa1 f a1 -fa2 f a2 -fa3 f a3 -fa4 f a4 -fa5 f a5 -da1 d a1 -da2 d a2 -da3 d a3 -da4 d a4 -da5 d a5 -ta1 t a1 -ta2 t a2 -ta3 t a3 -ta4 t a4 -ta5 t a5 -na1 n a1 -na2 n a2 -na3 n a3 -na4 n a4 -na5 n 
a5 -la1 l a1 -la2 l a2 -la3 l a3 -la4 l a4 -la5 l a5 -ga1 g a1 -ga2 g a2 -ga3 g a3 -ga4 g a4 -ga5 g a5 -ka1 k a1 -ka2 k a2 -ka3 k a3 -ka4 k a4 -ka5 k a5 -ha1 h a1 -ha2 h a2 -ha3 h a3 -ha4 h a4 -ha5 h a5 -zha1 zh a1 -zha2 zh a2 -zha3 zh a3 -zha4 zh a4 -zha5 zh a5 -cha1 ch a1 -cha2 ch a2 -cha3 ch a3 -cha4 ch a4 -cha5 ch a5 -sha1 sh a1 -sha2 sh a2 -sha3 sh a3 -sha4 sh a4 -sha5 sh a5 -za1 z a1 -za2 z a2 -za3 z a3 -za4 z a4 -za5 z a5 -ca1 c a1 -ca2 c a2 -ca3 c a3 -ca4 c a4 -ca5 c a5 -sa1 s a1 -sa2 s a2 -sa3 s a3 -sa4 s a4 -sa5 s a5 -o1 o1 -o2 o2 -o3 o3 -o4 o4 -o5 o5 -bo1 b uo1 -bo2 b uo2 -bo3 b uo3 -bo4 b uo4 -bo5 b uo5 -po1 p uo1 -po2 p uo2 -po3 p uo3 -po4 p uo4 -po5 p uo5 -mo1 m uo1 -mo2 m uo2 -mo3 m uo3 -mo4 m uo4 -mo5 m uo5 -fo1 f uo1 -fo2 f uo2 -fo3 f uo3 -fo4 f uo4 -fo5 f uo5 -lo1 l o1 -lo2 l o2 -lo3 l o3 -lo4 l o4 -lo5 l o5 -e1 e1 -e2 e2 -e3 e3 -e4 e4 -e5 e5 -me1 m e1 -me2 m e2 -me3 m e3 -me4 m e4 -me5 m e5 -de1 d e1 -de2 d e2 -de3 d e3 -de4 d e4 -de5 d e5 -te1 t e1 -te2 t e2 -te3 t e3 -te4 t e4 -te5 t e5 -ne1 n e1 -ne2 n e2 -ne3 n e3 -ne4 n e4 -ne5 n e5 -le1 l e1 -le2 l e2 -le3 l e3 -le4 l e4 -le5 l e5 -ge1 g e1 -ge2 g e2 -ge3 g e3 -ge4 g e4 -ge5 g e5 -ke1 k e1 -ke2 k e2 -ke3 k e3 -ke4 k e4 -ke5 k e5 -he1 h e1 -he2 h e2 -he3 h e3 -he4 h e4 -he5 h e5 -zhe1 zh e1 -zhe2 zh e2 -zhe3 zh e3 -zhe4 zh e4 -zhe5 zh e5 -che1 ch e1 -che2 ch e2 -che3 ch e3 -che4 ch e4 -che5 ch e5 -she1 sh e1 -she2 sh e2 -she3 sh e3 -she4 sh e4 -she5 sh e5 -re1 r e1 -re2 r e2 -re3 r e3 -re4 r e4 -re5 r e5 -ze1 z e1 -ze2 z e2 -ze3 z e3 -ze4 z e4 -ze5 z e5 -ce1 c e1 -ce2 c e2 -ce3 c e3 -ce4 c e4 -ce5 c e5 -se1 s e1 -se2 s e2 -se3 s e3 -se4 s e4 -se5 s e5 -ea1 ea1 -ea2 ea2 -ea3 ea3 -ea4 ea4 -ea5 ea5 -ai1 ai1 -ai2 ai2 -ai3 ai3 -ai4 ai4 -ai5 ai5 -bai1 b ai1 -bai2 b ai2 -bai3 b ai3 -bai4 b ai4 -bai5 b ai5 -pai1 p ai1 -pai2 p ai2 -pai3 p ai3 -pai4 p ai4 -pai5 p ai5 -mai1 m ai1 -mai2 m ai2 -mai3 m ai3 -mai4 m ai4 -mai5 m ai5 -dai1 d ai1 -dai2 d ai2 -dai3 d ai3 -dai4 d ai4 -dai5 d ai5 -tai1 t ai1 
-tai2 t ai2 -tai3 t ai3 -tai4 t ai4 -tai5 t ai5 -nai1 n ai1 -nai2 n ai2 -nai3 n ai3 -nai4 n ai4 -nai5 n ai5 -lai1 l ai1 -lai2 l ai2 -lai3 l ai3 -lai4 l ai4 -lai5 l ai5 -gai1 g ai1 -gai2 g ai2 -gai3 g ai3 -gai4 g ai4 -gai5 g ai5 -kai1 k ai1 -kai2 k ai2 -kai3 k ai3 -kai4 k ai4 -kai5 k ai5 -hai1 h ai1 -hai2 h ai2 -hai3 h ai3 -hai4 h ai4 -hai5 h ai5 -zhai1 zh ai1 -zhai2 zh ai2 -zhai3 zh ai3 -zhai4 zh ai4 -zhai5 zh ai5 -chai1 ch ai1 -chai2 ch ai2 -chai3 ch ai3 -chai4 ch ai4 -chai5 ch ai5 -shai1 sh ai1 -shai2 sh ai2 -shai3 sh ai3 -shai4 sh ai4 -shai5 sh ai5 -zai1 z ai1 -zai2 z ai2 -zai3 z ai3 -zai4 z ai4 -zai5 z ai5 -cai1 c ai1 -cai2 c ai2 -cai3 c ai3 -cai4 c ai4 -cai5 c ai5 -sai1 s ai1 -sai2 s ai2 -sai3 s ai3 -sai4 s ai4 -sai5 s ai5 -ei1 ei1 -ei2 ei2 -ei3 ei3 -ei4 ei4 -ei5 ei5 -bei1 b ei1 -bei2 b ei2 -bei3 b ei3 -bei4 b ei4 -bei5 b ei5 -pei1 p ei1 -pei2 p ei2 -pei3 p ei3 -pei4 p ei4 -pei5 p ei5 -mei1 m ei1 -mei2 m ei2 -mei3 m ei3 -mei4 m ei4 -mei5 m ei5 -fei1 f ei1 -fei2 f ei2 -fei3 f ei3 -fei4 f ei4 -fei5 f ei5 -dei1 d ei1 -dei2 d ei2 -dei3 d ei3 -dei4 d ei4 -dei5 d ei5 -tei1 t ei1 -tei2 t ei2 -tei3 t ei3 -tei4 t ei4 -tei5 t ei5 -nei1 n ei1 -nei2 n ei2 -nei3 n ei3 -nei4 n ei4 -nei5 n ei5 -lei1 l ei1 -lei2 l ei2 -lei3 l ei3 -lei4 l ei4 -lei5 l ei5 -gei1 g ei1 -gei2 g ei2 -gei3 g ei3 -gei4 g ei4 -gei5 g ei5 -kei1 k ei1 -kei2 k ei2 -kei3 k ei3 -kei4 k ei4 -kei5 k ei5 -hei1 h ei1 -hei2 h ei2 -hei3 h ei3 -hei4 h ei4 -hei5 h ei5 -zhei1 zh ei1 -zhei2 zh ei2 -zhei3 zh ei3 -zhei4 zh ei4 -zhei5 zh ei5 -shei1 sh ei1 -shei2 sh ei2 -shei3 sh ei3 -shei4 sh ei4 -shei5 sh ei5 -zei1 z ei1 -zei2 z ei2 -zei3 z ei3 -zei4 z ei4 -zei5 z ei5 -ao1 au1 -ao2 au2 -ao3 au3 -ao4 au4 -ao5 au5 -bao1 b au1 -bao2 b au2 -bao3 b au3 -bao4 b au4 -bao5 b au5 -pao1 p au1 -pao2 p au2 -pao3 p au3 -pao4 p au4 -pao5 p au5 -mao1 m au1 -mao2 m au2 -mao3 m au3 -mao4 m au4 -mao5 m au5 -dao1 d au1 -dao2 d au2 -dao3 d au3 -dao4 d au4 -dao5 d au5 -tao1 t au1 -tao2 t au2 -tao3 t au3 -tao4 t au4 -tao5 t au5 -nao1 n au1 
-nao2 n au2 -nao3 n au3 -nao4 n au4 -nao5 n au5 -lao1 l au1 -lao2 l au2 -lao3 l au3 -lao4 l au4 -lao5 l au5 -gao1 g au1 -gao2 g au2 -gao3 g au3 -gao4 g au4 -gao5 g au5 -kao1 k au1 -kao2 k au2 -kao3 k au3 -kao4 k au4 -kao5 k au5 -hao1 h au1 -hao2 h au2 -hao3 h au3 -hao4 h au4 -hao5 h au5 -zhao1 zh au1 -zhao2 zh au2 -zhao3 zh au3 -zhao4 zh au4 -zhao5 zh au5 -chao1 ch au1 -chao2 ch au2 -chao3 ch au3 -chao4 ch au4 -chao5 ch au5 -shao1 sh au1 -shao2 sh au2 -shao3 sh au3 -shao4 sh au4 -shao5 sh au5 -rao1 r au1 -rao2 r au2 -rao3 r au3 -rao4 r au4 -rao5 r au5 -zao1 z au1 -zao2 z au2 -zao3 z au3 -zao4 z au4 -zao5 z au5 -cao1 c au1 -cao2 c au2 -cao3 c au3 -cao4 c au4 -cao5 c au5 -sao1 s au1 -sao2 s au2 -sao3 s au3 -sao4 s au4 -sao5 s au5 -ou1 ou1 -ou2 ou2 -ou3 ou3 -ou4 ou4 -ou5 ou5 -pou1 p ou1 -pou2 p ou2 -pou3 p ou3 -pou4 p ou4 -pou5 p ou5 -mou1 m ou1 -mou2 m ou2 -mou3 m ou3 -mou4 m ou4 -mou5 m ou5 -fou1 f ou1 -fou2 f ou2 -fou3 f ou3 -fou4 f ou4 -fou5 f ou5 -dou1 d ou1 -dou2 d ou2 -dou3 d ou3 -dou4 d ou4 -dou5 d ou5 -tou1 t ou1 -tou2 t ou2 -tou3 t ou3 -tou4 t ou4 -tou5 t ou5 -nou1 n ou1 -nou2 n ou2 -nou3 n ou3 -nou4 n ou4 -nou5 n ou5 -lou1 l ou1 -lou2 l ou2 -lou3 l ou3 -lou4 l ou4 -lou5 l ou5 -gou1 g ou1 -gou2 g ou2 -gou3 g ou3 -gou4 g ou4 -gou5 g ou5 -kou1 k ou1 -kou2 k ou2 -kou3 k ou3 -kou4 k ou4 -kou5 k ou5 -hou1 h ou1 -hou2 h ou2 -hou3 h ou3 -hou4 h ou4 -hou5 h ou5 -zhou1 zh ou1 -zhou2 zh ou2 -zhou3 zh ou3 -zhou4 zh ou4 -zhou5 zh ou5 -chou1 ch ou1 -chou2 ch ou2 -chou3 ch ou3 -chou4 ch ou4 -chou5 ch ou5 -shou1 sh ou1 -shou2 sh ou2 -shou3 sh ou3 -shou4 sh ou4 -shou5 sh ou5 -rou1 r ou1 -rou2 r ou2 -rou3 r ou3 -rou4 r ou4 -rou5 r ou5 -zou1 z ou1 -zou2 z ou2 -zou3 z ou3 -zou4 z ou4 -zou5 z ou5 -cou1 c ou1 -cou2 c ou2 -cou3 c ou3 -cou4 c ou4 -cou5 c ou5 -sou1 s ou1 -sou2 s ou2 -sou3 s ou3 -sou4 s ou4 -sou5 s ou5 -an1 an1 -an2 an2 -an3 an3 -an4 an4 -an5 an5 -ban1 b an1 -ban2 b an2 -ban3 b an3 -ban4 b an4 -ban5 b an5 -pan1 p an1 -pan2 p an2 -pan3 p an3 -pan4 p an4 -pan5 p an5 
-man1 m an1 -man2 m an2 -man3 m an3 -man4 m an4 -man5 m an5 -fan1 f an1 -fan2 f an2 -fan3 f an3 -fan4 f an4 -fan5 f an5 -dan1 d an1 -dan2 d an2 -dan3 d an3 -dan4 d an4 -dan5 d an5 -tan1 t an1 -tan2 t an2 -tan3 t an3 -tan4 t an4 -tan5 t an5 -nan1 n an1 -nan2 n an2 -nan3 n an3 -nan4 n an4 -nan5 n an5 -lan1 l an1 -lan2 l an2 -lan3 l an3 -lan4 l an4 -lan5 l an5 -gan1 g an1 -gan2 g an2 -gan3 g an3 -gan4 g an4 -gan5 g an5 -kan1 k an1 -kan2 k an2 -kan3 k an3 -kan4 k an4 -kan5 k an5 -han1 h an1 -han2 h an2 -han3 h an3 -han4 h an4 -han5 h an5 -zhan1 zh an1 -zhan2 zh an2 -zhan3 zh an3 -zhan4 zh an4 -zhan5 zh an5 -chan1 ch an1 -chan2 ch an2 -chan3 ch an3 -chan4 ch an4 -chan5 ch an5 -shan1 sh an1 -shan2 sh an2 -shan3 sh an3 -shan4 sh an4 -shan5 sh an5 -ran1 r an1 -ran2 r an2 -ran3 r an3 -ran4 r an4 -ran5 r an5 -zan1 z an1 -zan2 z an2 -zan3 z an3 -zan4 z an4 -zan5 z an5 -can1 c an1 -can2 c an2 -can3 c an3 -can4 c an4 -can5 c an5 -san1 s an1 -san2 s an2 -san3 s an3 -san4 s an4 -san5 s an5 -en1 en1 -en2 en2 -en3 en3 -en4 en4 -en5 en5 -ben1 b en1 -ben2 b en2 -ben3 b en3 -ben4 b en4 -ben5 b en5 -pen1 p en1 -pen2 p en2 -pen3 p en3 -pen4 p en4 -pen5 p en5 -men1 m en1 -men2 m en2 -men3 m en3 -men4 m en4 -men5 m en5 -fen1 f en1 -fen2 f en2 -fen3 f en3 -fen4 f en4 -fen5 f en5 -den1 d en1 -den2 d en2 -den3 d en3 -den4 d en4 -den5 d en5 -nen1 n en1 -nen2 n en2 -nen3 n en3 -nen4 n en4 -nen5 n en5 -gen1 g en1 -gen2 g en2 -gen3 g en3 -gen4 g en4 -gen5 g en5 -ken1 k en1 -ken2 k en2 -ken3 k en3 -ken4 k en4 -ken5 k en5 -hen1 h en1 -hen2 h en2 -hen3 h en3 -hen4 h en4 -hen5 h en5 -zhen1 zh en1 -zhen2 zh en2 -zhen3 zh en3 -zhen4 zh en4 -zhen5 zh en5 -chen1 ch en1 -chen2 ch en2 -chen3 ch en3 -chen4 ch en4 -chen5 ch en5 -shen1 sh en1 -shen2 sh en2 -shen3 sh en3 -shen4 sh en4 -shen5 sh en5 -ren1 r en1 -ren2 r en2 -ren3 r en3 -ren4 r en4 -ren5 r en5 -zen1 z en1 -zen2 z en2 -zen3 z en3 -zen4 z en4 -zen5 z en5 -cen1 c en1 -cen2 c en2 -cen3 c en3 -cen4 c en4 -cen5 c en5 -sen1 s en1 -sen2 s en2 -sen3 s 
en3 -sen4 s en4 -sen5 s en5 -ang1 ang1 -ang2 ang2 -ang3 ang3 -ang4 ang4 -ang5 ang5 -bang1 b ang1 -bang2 b ang2 -bang3 b ang3 -bang4 b ang4 -bang5 b ang5 -pang1 p ang1 -pang2 p ang2 -pang3 p ang3 -pang4 p ang4 -pang5 p ang5 -mang1 m ang1 -mang2 m ang2 -mang3 m ang3 -mang4 m ang4 -mang5 m ang5 -fang1 f ang1 -fang2 f ang2 -fang3 f ang3 -fang4 f ang4 -fang5 f ang5 -dang1 d ang1 -dang2 d ang2 -dang3 d ang3 -dang4 d ang4 -dang5 d ang5 -tang1 t ang1 -tang2 t ang2 -tang3 t ang3 -tang4 t ang4 -tang5 t ang5 -nang1 n ang1 -nang2 n ang2 -nang3 n ang3 -nang4 n ang4 -nang5 n ang5 -lang1 l ang1 -lang2 l ang2 -lang3 l ang3 -lang4 l ang4 -lang5 l ang5 -gang1 g ang1 -gang2 g ang2 -gang3 g ang3 -gang4 g ang4 -gang5 g ang5 -kang1 k ang1 -kang2 k ang2 -kang3 k ang3 -kang4 k ang4 -kang5 k ang5 -hang1 h ang1 -hang2 h ang2 -hang3 h ang3 -hang4 h ang4 -hang5 h ang5 -zhang1 zh ang1 -zhang2 zh ang2 -zhang3 zh ang3 -zhang4 zh ang4 -zhang5 zh ang5 -chang1 ch ang1 -chang2 ch ang2 -chang3 ch ang3 -chang4 ch ang4 -chang5 ch ang5 -shang1 sh ang1 -shang2 sh ang2 -shang3 sh ang3 -shang4 sh ang4 -shang5 sh ang5 -rang1 r ang1 -rang2 r ang2 -rang3 r ang3 -rang4 r ang4 -rang5 r ang5 -zang1 z ang1 -zang2 z ang2 -zang3 z ang3 -zang4 z ang4 -zang5 z ang5 -cang1 c ang1 -cang2 c ang2 -cang3 c ang3 -cang4 c ang4 -cang5 c ang5 -sang1 s ang1 -sang2 s ang2 -sang3 s ang3 -sang4 s ang4 -sang5 s ang5 -eng1 eng1 -eng2 eng2 -eng3 eng3 -eng4 eng4 -eng5 eng5 -beng1 b eng1 -beng2 b eng2 -beng3 b eng3 -beng4 b eng4 -beng5 b eng5 -peng1 p eng1 -peng2 p eng2 -peng3 p eng3 -peng4 p eng4 -peng5 p eng5 -meng1 m eng1 -meng2 m eng2 -meng3 m eng3 -meng4 m eng4 -meng5 m eng5 -feng1 f eng1 -feng2 f eng2 -feng3 f eng3 -feng4 f eng4 -feng5 f eng5 -deng1 d eng1 -deng2 d eng2 -deng3 d eng3 -deng4 d eng4 -deng5 d eng5 -teng1 t eng1 -teng2 t eng2 -teng3 t eng3 -teng4 t eng4 -teng5 t eng5 -neng1 n eng1 -neng2 n eng2 -neng3 n eng3 -neng4 n eng4 -neng5 n eng5 -leng1 l eng1 -leng2 l eng2 -leng3 l eng3 -leng4 l eng4 -leng5 l eng5 -geng1 g 
eng1 -geng2 g eng2 -geng3 g eng3 -geng4 g eng4 -geng5 g eng5 -keng1 k eng1 -keng2 k eng2 -keng3 k eng3 -keng4 k eng4 -keng5 k eng5 -heng1 h eng1 -heng2 h eng2 -heng3 h eng3 -heng4 h eng4 -heng5 h eng5 -zheng1 zh eng1 -zheng2 zh eng2 -zheng3 zh eng3 -zheng4 zh eng4 -zheng5 zh eng5 -cheng1 ch eng1 -cheng2 ch eng2 -cheng3 ch eng3 -cheng4 ch eng4 -cheng5 ch eng5 -sheng1 sh eng1 -sheng2 sh eng2 -sheng3 sh eng3 -sheng4 sh eng4 -sheng5 sh eng5 -reng1 r eng1 -reng2 r eng2 -reng3 r eng3 -reng4 r eng4 -reng5 r eng5 -zeng1 z eng1 -zeng2 z eng2 -zeng3 z eng3 -zeng4 z eng4 -zeng5 z eng5 -ceng1 c eng1 -ceng2 c eng2 -ceng3 c eng3 -ceng4 c eng4 -ceng5 c eng5 -seng1 s eng1 -seng2 s eng2 -seng3 s eng3 -seng4 s eng4 -seng5 s eng5 -er1 er1 -er2 er2 -er3 er3 -er4 er4 -er5 er5 -yi1 y i1 -yi2 y i2 -yi3 y i3 -yi4 y i4 -yi5 y i5 -bi1 b i1 -bi2 b i2 -bi3 b i3 -bi4 b i4 -bi5 b i5 -pi1 p i1 -pi2 p i2 -pi3 p i3 -pi4 p i4 -pi5 p i5 -mi1 m i1 -mi2 m i2 -mi3 m i3 -mi4 m i4 -mi5 m i5 -di1 d i1 -di2 d i2 -di3 d i3 -di4 d i4 -di5 d i5 -ti1 t i1 -ti2 t i2 -ti3 t i3 -ti4 t i4 -ti5 t i5 -ni1 n i1 -ni2 n i2 -ni3 n i3 -ni4 n i4 -ni5 n i5 -li1 l i1 -li2 l i2 -li3 l i3 -li4 l i4 -li5 l i5 -ji1 j i1 -ji2 j i2 -ji3 j i3 -ji4 j i4 -ji5 j i5 -qi1 q i1 -qi2 q i2 -qi3 q i3 -qi4 q i4 -qi5 q i5 -xi1 x i1 -xi2 x i2 -xi3 x i3 -xi4 x i4 -xi5 x i5 -ya1 y ia1 -ya2 y ia2 -ya3 y ia3 -ya4 y ia4 -ya5 y ia5 -dia1 d ia1 -dia2 d ia2 -dia3 d ia3 -dia4 d ia4 -dia5 d ia5 -lia1 l ia1 -lia2 l ia2 -lia3 l ia3 -lia4 l ia4 -lia5 l ia5 -jia1 j ia1 -jia2 j ia2 -jia3 j ia3 -jia4 j ia4 -jia5 j ia5 -qia1 q ia1 -qia2 q ia2 -qia3 q ia3 -qia4 q ia4 -qia5 q ia5 -xia1 x ia1 -xia2 x ia2 -xia3 x ia3 -xia4 x ia4 -xia5 x ia5 -yo1 y io1 -yo2 y io2 -yo3 y io3 -yo4 y io4 -yo5 y io5 -ye1 y ie1 -ye2 y ie2 -ye3 y ie3 -ye4 y ie4 -ye5 y ie5 -bie1 b ie1 -bie2 b ie2 -bie3 b ie3 -bie4 b ie4 -bie5 b ie5 -pie1 p ie1 -pie2 p ie2 -pie3 p ie3 -pie4 p ie4 -pie5 p ie5 -mie1 m ie1 -mie2 m ie2 -mie3 m ie3 -mie4 m ie4 -mie5 m ie5 -die1 d ie1 -die2 d ie2 -die3 d ie3 
-die4 d ie4 -die5 d ie5 -tie1 t ie1 -tie2 t ie2 -tie3 t ie3 -tie4 t ie4 -tie5 t ie5 -nie1 n ie1 -nie2 n ie2 -nie3 n ie3 -nie4 n ie4 -nie5 n ie5 -lie1 l ie1 -lie2 l ie2 -lie3 l ie3 -lie4 l ie4 -lie5 l ie5 -jie1 j ie1 -jie2 j ie2 -jie3 j ie3 -jie4 j ie4 -jie5 j ie5 -qie1 q ie1 -qie2 q ie2 -qie3 q ie3 -qie4 q ie4 -qie5 q ie5 -xie1 x ie1 -xie2 x ie2 -xie3 x ie3 -xie4 x ie4 -xie5 x ie5 -yai1 y ai1 -yai2 y ai2 -yai3 y ai3 -yai4 y ai4 -yai5 y ai5 -yao1 y au1 -yao2 y au2 -yao3 y au3 -yao4 y au4 -yao5 y au5 -biao1 b iau1 -biao2 b iau2 -biao3 b iau3 -biao4 b iau4 -biao5 b iau5 -piao1 p iau1 -piao2 p iau2 -piao3 p iau3 -piao4 p iau4 -piao5 p iau5 -miao1 m iau1 -miao2 m iau2 -miao3 m iau3 -miao4 m iau4 -miao5 m iau5 -fiao1 f iau1 -fiao2 f iau2 -fiao3 f iau3 -fiao4 f iau4 -fiao5 f iau5 -diao1 d iau1 -diao2 d iau2 -diao3 d iau3 -diao4 d iau4 -diao5 d iau5 -tiao1 t iau1 -tiao2 t iau2 -tiao3 t iau3 -tiao4 t iau4 -tiao5 t iau5 -niao1 n iau1 -niao2 n iau2 -niao3 n iau3 -niao4 n iau4 -niao5 n iau5 -liao1 l iau1 -liao2 l iau2 -liao3 l iau3 -liao4 l iau4 -liao5 l iau5 -jiao1 j iau1 -jiao2 j iau2 -jiao3 j iau3 -jiao4 j iau4 -jiao5 j iau5 -qiao1 q iau1 -qiao2 q iau2 -qiao3 q iau3 -qiao4 q iau4 -qiao5 q iau5 -xiao1 x iau1 -xiao2 x iau2 -xiao3 x iau3 -xiao4 x iau4 -xiao5 x iau5 -you1 y iou1 -you2 y iou2 -you3 y iou3 -you4 y iou4 -you5 y iou5 -miu1 m iou1 -miu2 m iou2 -miu3 m iou3 -miu4 m iou4 -miu5 m iou5 -diu1 d iou1 -diu2 d iou2 -diu3 d iou3 -diu4 d iou4 -diu5 d iou5 -niu1 n iou1 -niu2 n iou2 -niu3 n iou3 -niu4 n iou4 -niu5 n iou5 -liu1 l iou1 -liu2 l iou2 -liu3 l iou3 -liu4 l iou4 -liu5 l iou5 -jiu1 j iou1 -jiu2 j iou2 -jiu3 j iou3 -jiu4 j iou4 -jiu5 j iou5 -qiu1 q iou1 -qiu2 q iou2 -qiu3 q iou3 -qiu4 q iou4 -qiu5 q iou5 -xiu1 xiou1 -xiu2 xiou2 -xiu3 xiou3 -xiu4 xiou4 -xiu5 xiou5 -yan1 y ian1 -yan2 y ian2 -yan3 y ian3 -yan4 y ian4 -yan5 y ian5 -bian1 b ian1 -bian2 b ian2 -bian3 b ian3 -bian4 b ian4 -bian5 b ian5 -pian1 p ian1 -pian2 p ian2 -pian3 p ian3 -pian4 p ian4 -pian5 p ian5 
-mian1 m ian1 -mian2 m ian2 -mian3 m ian3 -mian4 m ian4 -mian5 m ian5 -dian1 d ian1 -dian2 d ian2 -dian3 d ian3 -dian4 d ian4 -dian5 d ian5 -tian1 t ian1 -tian2 t ian2 -tian3 t ian3 -tian4 t ian4 -tian5 t ian5 -nian1 n ian1 -nian2 n ian2 -nian3 n ian3 -nian4 n ian4 -nian5 n ian5 -lian1 l ian1 -lian2 l ian2 -lian3 l ian3 -lian4 l ian4 -lian5 l ian5 -jian1 j ian1 -jian2 j ian2 -jian3 j ian3 -jian4 j ian4 -jian5 j ian5 -qian1 q ian1 -qian2 q ian2 -qian3 q ian3 -qian4 q ian4 -qian5 q ian5 -xian1 x ian1 -xian2 x ian2 -xian3 x ian3 -xian4 x ian4 -xian5 x ian5 -yin1 y in1 -yin2 y in2 -yin3 y in3 -yin4 y in4 -yin5 y in5 -bin1 b in1 -bin2 b in2 -bin3 b in3 -bin4 b in4 -bin5 b in5 -pin1 p in1 -pin2 p in2 -pin3 p in3 -pin4 p in4 -pin5 p in5 -min1 m in1 -min2 m in2 -min3 m in3 -min4 m in4 -min5 m in5 -din1 d in1 -din2 d in2 -din3 d in3 -din4 d in4 -din5 d in5 -nin1 n in1 -nin2 n in2 -nin3 n in3 -nin4 n in4 -nin5 n in5 -lin1 l in1 -lin2 l in2 -lin3 l in3 -lin4 l in4 -lin5 l in5 -jin1 j in1 -jin2 j in2 -jin3 j in3 -jin4 j in4 -jin5 j in5 -qin1 q in1 -qin2 q in2 -qin3 q in3 -qin4 q in4 -qin5 q in5 -xin1 x in1 -xin2 x in2 -xin3 x in3 -xin4 x in4 -xin5 x in5 -yang1 y iang1 -yang2 y iang2 -yang3 y iang3 -yang4 y iang4 -yang5 y iang5 -biang1 b iang1 -biang2 b iang2 -biang3 b iang3 -biang4 b iang4 -biang5 b iang5 -niang1 n iang1 -niang2 n iang2 -niang3 n iang3 -niang4 n iang4 -niang5 n iang5 -liang1 l iang1 -liang2 l iang2 -liang3 l iang3 -liang4 l iang4 -liang5 l iang5 -jiang1 j iang1 -jiang2 j iang2 -jiang3 j iang3 -jiang4 j iang4 -jiang5 j iang5 -qiang1 q iang1 -qiang2 q iang2 -qiang3 q iang3 -qiang4 q iang4 -qiang5 q iang5 -xiang1 x iang1 -xiang2 x iang2 -xiang3 x iang3 -xiang4 x iang4 -xiang5 x iang5 -ying1 y ing1 -ying2 y ing2 -ying3 y ing3 -ying4 y ing4 -ying5 y ing5 -bing1 b ing1 -bing2 b ing2 -bing3 b ing3 -bing4 b ing4 -bing5 b ing5 -ping1 p ing1 -ping2 p ing2 -ping3 p ing3 -ping4 p ing4 -ping5 p ing5 -ming1 m ing1 -ming2 m ing2 -ming3 m ing3 -ming4 m ing4 -ming5 m ing5 
-ding1 d ing1 -ding2 d ing2 -ding3 d ing3 -ding4 d ing4 -ding5 d ing5 -ting1 t ing1 -ting2 t ing2 -ting3 t ing3 -ting4 t ing4 -ting5 t ing5 -ning1 n ing1 -ning2 n ing2 -ning3 n ing3 -ning4 n ing4 -ning5 n ing5 -ling1 l ing1 -ling2 l ing2 -ling3 l ing3 -ling4 l ing4 -ling5 l ing5 -jing1 j ing1 -jing2 j ing2 -jing3 j ing3 -jing4 j ing4 -jing5 j ing5 -qing1 q ing1 -qing2 q ing2 -qing3 q ing3 -qing4 q ing4 -qing5 q ing5 -xing1 x ing1 -xing2 x ing2 -xing3 x ing3 -xing4 x ing4 -xing5 x ing5 -wu1 w u1 -wu2 w u2 -wu3 w u3 -wu4 w u4 -wu5 w u5 -bu1 b u1 -bu2 b u2 -bu3 b u3 -bu4 b u4 -bu5 b u5 -pu1 p u1 -pu2 p u2 -pu3 p u3 -pu4 p u4 -pu5 p u5 -mu1 m u1 -mu2 m u2 -mu3 m u3 -mu4 m u4 -mu5 m u5 -fu1 f u1 -fu2 f u2 -fu3 f u3 -fu4 f u4 -fu5 f u5 -du1 d u1 -du2 d u2 -du3 d u3 -du4 d u4 -du5 d u5 -tu1 t u1 -tu2 t u2 -tu3 t u3 -tu4 t u4 -tu5 t u5 -nu1 n u1 -nu2 n u2 -nu3 n u3 -nu4 n u4 -nu5 n u5 -lu1 l u1 -lu2 l u2 -lu3 l u3 -lu4 l u4 -lu5 l u5 -gu1 g u1 -gu2 g u2 -gu3 g u3 -gu4 g u4 -gu5 g u5 -ku1 k u1 -ku2 k u2 -ku3 k u3 -ku4 k u4 -ku5 k u5 -hu1 h u1 -hu2 h u2 -hu3 h u3 -hu4 h u4 -hu5 h u5 -zhu1 zh u1 -zhu2 zh u2 -zhu3 zh u3 -zhu4 zh u4 -zhu5 zh u5 -chu1 ch u1 -chu2 ch u2 -chu3 ch u3 -chu4 ch u4 -chu5 ch u5 -shu1 sh u1 -shu2 sh u2 -shu3 sh u3 -shu4 sh u4 -shu5 sh u5 -ru1 r u1 -ru2 r u2 -ru3 r u3 -ru4 r u4 -ru5 r u5 -zu1 z u1 -zu2 z u2 -zu3 z u3 -zu4 z u4 -zu5 z u5 -cu1 c u1 -cu2 c u2 -cu3 c u3 -cu4 c u4 -cu5 c u5 -su1 s u1 -su2 s u2 -su3 s u3 -su4 s u4 -su5 s u5 -wa1 w ua1 -wa2 w ua2 -wa3 w ua3 -wa4 w ua4 -wa5 w ua5 -gua1 g ua1 -gua2 g ua2 -gua3 g ua3 -gua4 g ua4 -gua5 g ua5 -kua1 k ua1 -kua2 k ua2 -kua3 k ua3 -kua4 k ua4 -kua5 k ua5 -hua1 h ua1 -hua2 h ua2 -hua3 h ua3 -hua4 h ua4 -hua5 h ua5 -zhua1 zh ua1 -zhua2 zh ua2 -zhua3 zh ua3 -zhua4 zh ua4 -zhua5 zh ua5 -chua1 ch ua1 -chua2 ch ua2 -chua3 ch ua3 -chua4 ch ua4 -chua5 ch ua5 -shua1 sh ua1 -shua2 sh ua2 -shua3 sh ua3 -shua4 sh ua4 -shua5 sh ua5 -wo1 w uo1 -wo2 w uo2 -wo3 w uo3 -wo4 w uo4 -wo5 w uo5 -duo1 d uo1 -duo2 d uo2 -duo3 
d uo3 -duo4 d uo4 -duo5 d uo5 -tuo1 t uo1 -tuo2 t uo2 -tuo3 t uo3 -tuo4 t uo4 -tuo5 t uo5 -nuo1 n uo1 -nuo2 n uo2 -nuo3 n uo3 -nuo4 n uo4 -nuo5 n uo5 -luo1 l uo1 -luo2 l uo2 -luo3 l uo3 -luo4 l uo4 -luo5 l uo5 -guo1 g uo1 -guo2 g uo2 -guo3 g uo3 -guo4 g uo4 -guo5 g uo5 -kuo1 k uo1 -kuo2 k uo2 -kuo3 k uo3 -kuo4 k uo4 -kuo5 k uo5 -huo1 h uo1 -huo2 h uo2 -huo3 h uo3 -huo4 h uo4 -huo5 h uo5 -zhuo1 zh uo1 -zhuo2 zh uo2 -zhuo3 zh uo3 -zhuo4 zh uo4 -zhuo5 zh uo5 -chuo1 ch uo1 -chuo2 ch uo2 -chuo3 ch uo3 -chuo4 ch uo4 -chuo5 ch uo5 -shuo1 sh uo1 -shuo2 sh uo2 -shuo3 sh uo3 -shuo4 sh uo4 -shuo5 sh uo5 -ruo1 r uo1 -ruo2 r uo2 -ruo3 r uo3 -ruo4 r uo4 -ruo5 r uo5 -zuo1 z uo1 -zuo2 z uo2 -zuo3 z uo3 -zuo4 z uo4 -zuo5 z uo5 -cuo1 c uo1 -cuo2 c uo2 -cuo3 c uo3 -cuo4 c uo4 -cuo5 c uo5 -suo1 s uo1 -suo2 s uo2 -suo3 s uo3 -suo4 s uo4 -suo5 s uo5 -wai1 w uai1 -wai2 w uai2 -wai3 w uai3 -wai4 w uai4 -wai5 w uai5 -guai1 g uai1 -guai2 g uai2 -guai3 g uai3 -guai4 g uai4 -guai5 g uai5 -kuai1 k uai1 -kuai2 k uai2 -kuai3 k uai3 -kuai4 k uai4 -kuai5 k uai5 -huai1 h uai1 -huai2 h uai2 -huai3 h uai3 -huai4 h uai4 -huai5 h uai5 -zhuai1 zh uai1 -zhuai2 zh uai2 -zhuai3 zh uai3 -zhuai4 zh uai4 -zhuai5 zh uai5 -chuai1 ch uai1 -chuai2 ch uai2 -chuai3 ch uai3 -chuai4 ch uai4 -chuai5 ch uai5 -shuai1 sh uai1 -shuai2 sh uai2 -shuai3 sh uai3 -shuai4 sh uai4 -shuai5 sh uai5 -wei1 w uei1 -wei2 w uei2 -wei3 w uei3 -wei4 w uei4 -wei5 w uei5 -dui1 d uei1 -dui2 d uei2 -dui3 d uei3 -dui4 d uei4 -dui5 d uei5 -tui1 t uei1 -tui2 t uei2 -tui3 t uei3 -tui4 t uei4 -tui5 t uei5 -gui1 g uei1 -gui2 g uei2 -gui3 g uei3 -gui4 g uei4 -gui5 g uei5 -kui1 k uei1 -kui2 k uei2 -kui3 k uei3 -kui4 k uei4 -kui5 k uei5 -hui1 h uei1 -hui2 h uei2 -hui3 h uei3 -hui4 h uei4 -hui5 h uei5 -zhui1 zh uei1 -zhui2 zh uei2 -zhui3 zh uei3 -zhui4 zh uei4 -zhui5 zh uei5 -chui1 ch uei1 -chui2 ch uei2 -chui3 ch uei3 -chui4 ch uei4 -chui5 ch uei5 -shui1 sh uei1 -shui2 sh uei2 -shui3 sh uei3 -shui4 sh uei4 -shui5 sh uei5 -rui1 r uei1 -rui2 r uei2 
-rui3 r uei3 -rui4 r uei4 -rui5 r uei5 -zui1 z uei1 -zui2 z uei2 -zui3 z uei3 -zui4 z uei4 -zui5 z uei5 -cui1 c uei1 -cui2 c uei2 -cui3 c uei3 -cui4 c uei4 -cui5 c uei5 -sui1 s uei1 -sui2 s uei2 -sui3 s uei3 -sui4 s uei4 -sui5 s uei5 -wan1 w uan1 -wan2 w uan2 -wan3 w uan3 -wan4 w uan4 -wan5 w uan5 -duan1 d uan1 -duan2 d uan2 -duan3 d uan3 -duan4 d uan4 -duan5 d uan5 -tuan1 t uan1 -tuan2 t uan2 -tuan3 t uan3 -tuan4 t uan4 -tuan5 t uan5 -nuan1 n uan1 -nuan2 n uan2 -nuan3 n uan3 -nuan4 n uan4 -nuan5 n uan5 -luan1 l uan1 -luan2 l uan2 -luan3 l uan3 -luan4 l uan4 -luan5 l uan5 -guan1 g uan1 -guan2 g uan2 -guan3 g uan3 -guan4 g uan4 -guan5 g uan5 -kuan1 k uan1 -kuan2 k uan2 -kuan3 k uan3 -kuan4 k uan4 -kuan5 k uan5 -huan1 h uan1 -huan2 h uan2 -huan3 h uan3 -huan4 h uan4 -huan5 h uan5 -zhuan1 zh uan1 -zhuan2 zh uan2 -zhuan3 zh uan3 -zhuan4 zh uan4 -zhuan5 zh uan5 -chuan1 ch uan1 -chuan2 ch uan2 -chuan3 ch uan3 -chuan4 ch uan4 -chuan5 ch uan5 -shuan1 sh uan1 -shuan2 sh uan2 -shuan3 sh uan3 -shuan4 sh uan4 -shuan5 sh uan5 -ruan1 r uan1 -ruan2 r uan2 -ruan3 r uan3 -ruan4 r uan4 -ruan5 r uan5 -zuan1 z uan1 -zuan2 z uan2 -zuan3 z uan3 -zuan4 z uan4 -zuan5 z uan5 -cuan1 c uan1 -cuan2 c uan2 -cuan3 c uan3 -cuan4 c uan4 -cuan5 c uan5 -suan1 s uan1 -suan2 s uan2 -suan3 s uan3 -suan4 s uan4 -suan5 s uan5 -wen1 w uen1 -wen2 w uen2 -wen3 w uen3 -wen4 w uen4 -wen5 w uen5 -dun1 d uen1 -dun2 d uen2 -dun3 d uen3 -dun4 d uen4 -dun5 d uen5 -tun1 t uen1 -tun2 t uen2 -tun3 t uen3 -tun4 t uen4 -tun5 t uen5 -nun1 n uen1 -nun2 n uen2 -nun3 n uen3 -nun4 n uen4 -nun5 n uen5 -lun1 l uen1 -lun2 l uen2 -lun3 l uen3 -lun4 l uen4 -lun5 l uen5 -gun1 g uen1 -gun2 g uen2 -gun3 g uen3 -gun4 g uen4 -gun5 g uen5 -kun1 k uen1 -kun2 k uen2 -kun3 k uen3 -kun4 k uen4 -kun5 k uen5 -hun1 h uen1 -hun2 h uen2 -hun3 h uen3 -hun4 h uen4 -hun5 h uen5 -zhun1 zh uen1 -zhun2 zh uen2 -zhun3 zh uen3 -zhun4 zh uen4 -zhun5 zh uen5 -chun1 ch uen1 -chun2 ch uen2 -chun3 ch uen3 -chun4 ch uen4 -chun5 ch uen5 -shun1 sh uen1 
-shun2 sh uen2 -shun3 sh uen3 -shun4 sh uen4 -shun5 sh uen5 -run1 r uen1 -run2 r uen2 -run3 r uen3 -run4 r uen4 -run5 r uen5 -zun1 z uen1 -zun2 z uen2 -zun3 z uen3 -zun4 z uen4 -zun5 z uen5 -cun1 c uen1 -cun2 c uen2 -cun3 c uen3 -cun4 c uen4 -cun5 c uen5 -sun1 s uen1 -sun2 s uen2 -sun3 s uen3 -sun4 s uen4 -sun5 s uen5 -wang1 w uang1 -wang2 w uang2 -wang3 w uang3 -wang4 w uang4 -wang5 w uang5 -guang1 g uang1 -guang2 g uang2 -guang3 g uang3 -guang4 g uang4 -guang5 g uang5 -kuang1 k uang1 -kuang2 k uang2 -kuang3 k uang3 -kuang4 k uang4 -kuang5 k uang5 -huang1 h uang1 -huang2 h uang2 -huang3 h uang3 -huang4 h uang4 -huang5 h uang5 -zhuang1 zh uang1 -zhuang2 zh uang2 -zhuang3 zh uang3 -zhuang4 zh uang4 -zhuang5 zh uang5 -chuang1 ch uang1 -chuang2 ch uang2 -chuang3 ch uang3 -chuang4 ch uang4 -chuang5 ch uang5 -shuang1 sh uang1 -shuang2 sh uang2 -shuang3 sh uang3 -shuang4 sh uang4 -shuang5 sh uang5 -weng1 w ung1 -weng2 w ung2 -weng3 w ung3 -weng4 w ung4 -weng5 w ung5 -dong1 d ung1 -dong2 d ung2 -dong3 d ung3 -dong4 d ung4 -dong5 d ung5 -tong1 t ung1 -tong2 t ung2 -tong3 t ung3 -tong4 t ung4 -tong5 t ung5 -nong1 n ung1 -nong2 n ung2 -nong3 n ung3 -nong4 n ung4 -nong5 n ung5 -long1 l ung1 -long2 l ung2 -long3 l ung3 -long4 l ung4 -long5 l ung5 -gong1 g ung1 -gong2 g ung2 -gong3 g ung3 -gong4 g ung4 -gong5 g ung5 -kong1 k ung1 -kong2 k ung2 -kong3 k ung3 -kong4 k ung4 -kong5 k ung5 -hong1 h ung1 -hong2 h ung2 -hong3 h ung3 -hong4 h ung4 -hong5 h ung5 -zhong1 zh ung1 -zhong2 zh ung2 -zhong3 zh ung3 -zhong4 zh ung4 -zhong5 zh ung5 -chong1 ch ung1 -chong2 ch ung2 -chong3 ch ung3 -chong4 ch ung4 -chong5 ch ung5 -rong1 r ung1 -rong2 r ung2 -rong3 r ung3 -rong4 r ung4 -rong5 r ung5 -zong1 z ung1 -zong2 z ung2 -zong3 z ung3 -zong4 z ung4 -zong5 z ung5 -cong1 c ung1 -cong2 c ung2 -cong3 c ung3 -cong4 c ung4 -cong5 c ung5 -song1 s ung1 -song2 s ung2 -song3 s ung3 -song4 s ung4 -song5 s ung5 -yu1 y v1 -yu2 y v2 -yu3 y v3 -yu4 y v4 -yu5 y v5 -nv1 n v1 -nv2 n v2 -nv3 n v3 -nv4 n v4 -nv5 
n v5 -lv1 l v1 -lv2 l v2 -lv3 l v3 -lv4 l v4 -lv5 l v5 -ju1 j v1 -ju2 j v2 -ju3 j v3 -ju4 j v4 -ju5 j v5 -qu1 q v1 -qu2 q v2 -qu3 q v3 -qu4 q v4 -qu5 q v5 -xu1 x v1 -xu2 x v2 -xu3 x v3 -xu4 x v4 -xu5 x v5 -yue1 y ve1 -yue2 y ve2 -yue3 y ve3 -yue4 y ve4 -yue5 y ve5 -nue1 n ve1 -nue2 n ve2 -nue3 n ve3 -nue4 n ve4 -nue5 n ve5 -nve1 n ve1 -nve2 n ve2 -nve3 n ve3 -nve4 n ve4 -nve5 n ve5 -lue1 l ve1 -lue2 l ve2 -lue3 l ve3 -lue4 l ve4 -lue5 l ve5 -lve1 l ve1 -lve2 l ve2 -lve3 l ve3 -lve4 l ve4 -lve5 l ve5 -jue1 j ve1 -jue2 j ve2 -jue3 j ve3 -jue4 j ve4 -jue5 j ve5 -que1 q ve1 -que2 q ve2 -que3 q ve3 -que4 q ve4 -que5 q ve5 -xue1 x ve1 -xue2 x ve2 -xue3 x ve3 -xue4 x ve4 -xue5 x ve5 -yuan1 y van1 -yuan2 y van2 -yuan3 y van3 -yuan4 y van4 -yuan5 y van5 -juan1 j van1 -juan2 j van2 -juan3 j van3 -juan4 j van4 -juan5 j van5 -quan1 q van1 -quan2 q van2 -quan3 q van3 -quan4 q van4 -quan5 q van5 -xuan1 x van1 -xuan2 x van2 -xuan3 x van3 -xuan4 x van4 -xuan5 x van5 -yun1 y vn1 -yun2 y vn2 -yun3 y vn3 -yun4 y vn4 -yun5 y vn5 -jun1 j vn1 -jun2 j vn2 -jun3 j vn3 -jun4 j vn4 -jun5 j vn5 -qun1 q vn1 -qun2 q vn2 -qun3 q vn3 -qun4 q vn4 -qun5 q vn5 -xun1 x vn1 -xun2 x vn2 -xun3 x vn3 -xun4 x vn4 -xun5 x vn5 -yong1 y vng1 -yong2 y vng2 -yong3 y vng3 -yong4 y vng4 -yong5 y vng5 -jiong1 j vng1 -jiong2 j vng2 -jiong3 j vng3 -jiong4 j vng4 -jiong5 j vng5 -qiong1 q vng1 -qiong2 q vng2 -qiong3 q vng3 -qiong4 q vng4 -qiong5 q vng5 -xiong1 x vng1 -xiong2 x vng2 -xiong3 x vng3 -xiong4 x vng4 -xiong5 x vng5 -zhir1 zh iii1 &r -zhir2 zh iii2 &r -zhir3 zh iii3 &r -zhir4 zh iii4 &r -zhir5 zh iii5 &r -chir1 ch iii1 &r -chir2 ch iii2 &r -chir3 ch iii3 &r -chir4 ch iii4 &r -chir5 ch iii5 &r -shir1 sh iii1 &r -shir2 sh iii2 &r -shir3 sh iii3 &r -shir4 sh iii4 &r -shir5 sh iii5 &r -rir1 r iii1 &r -rir2 r iii2 &r -rir3 r iii3 &r -rir4 r iii4 &r -rir5 r iii5 &r -zir1 z ii1 &r -zir2 z ii2 &r -zir3 z ii3 &r -zir4 z ii4 &r -zir5 z ii5 &r -cir1 c ii1 &r -cir2 c ii2 &r -cir3 c ii3 &r -cir4 c ii4 &r -cir5 c ii5 &r 
-sir1 s ii1 &r -sir2 s ii2 &r -sir3 s ii3 &r -sir4 s ii4 &r -sir5 s ii5 &r -ar1 a1 &r -ar2 a2 &r -ar3 a3 &r -ar4 a4 &r -ar5 a5 &r -bar1 b a1 &r -bar2 b a2 &r -bar3 b a3 &r -bar4 b a4 &r -bar5 b a5 &r -par1 p a1 &r -par2 p a2 &r -par3 p a3 &r -par4 p a4 &r -par5 p a5 &r -mar1 m a1 &r -mar2 m a2 &r -mar3 m a3 &r -mar4 m a4 &r -mar5 m a5 &r -far1 f a1 &r -far2 f a2 &r -far3 f a3 &r -far4 f a4 &r -far5 f a5 &r -dar1 d a1 &r -dar2 d a2 &r -dar3 d a3 &r -dar4 d a4 &r -dar5 d a5 &r -tar1 t a1 &r -tar2 t a2 &r -tar3 t a3 &r -tar4 t a4 &r -tar5 t a5 &r -nar1 n a1 &r -nar2 n a2 &r -nar3 n a3 &r -nar4 n a4 &r -nar5 n a5 &r -lar1 l a1 &r -lar2 l a2 &r -lar3 l a3 &r -lar4 l a4 &r -lar5 l a5 &r -gar1 g a1 &r -gar2 g a2 &r -gar3 g a3 &r -gar4 g a4 &r -gar5 g a5 &r -kar1 k a1 &r -kar2 k a2 &r -kar3 k a3 &r -kar4 k a4 &r -kar5 k a5 &r -har1 h a1 &r -har2 h a2 &r -har3 h a3 &r -har4 h a4 &r -har5 h a5 &r -zhar1 zh a1 &r -zhar2 zh a2 &r -zhar3 zh a3 &r -zhar4 zh a4 &r -zhar5 zh a5 &r -char1 ch a1 &r -char2 ch a2 &r -char3 ch a3 &r -char4 ch a4 &r -char5 ch a5 &r -shar1 sh a1 &r -shar2 sh a2 &r -shar3 sh a3 &r -shar4 sh a4 &r -shar5 sh a5 &r -zar1 z a1 &r -zar2 z a2 &r -zar3 z a3 &r -zar4 z a4 &r -zar5 z a5 &r -car1 c a1 &r -car2 c a2 &r -car3 c a3 &r -car4 c a4 &r -car5 c a5 &r -sar1 s a1 &r -sar2 s a2 &r -sar3 s a3 &r -sar4 s a4 &r -sar5 s a5 &r -or1 o1 &r -or2 o2 &r -or3 o3 &r -or4 o4 &r -or5 o5 &r -bor1 b uo1 &r -bor2 b uo2 &r -bor3 b uo3 &r -bor4 b uo4 &r -bor5 b uo5 &r -por1 p uo1 &r -por2 p uo2 &r -por3 p uo3 &r -por4 p uo4 &r -por5 p uo5 &r -mor1 m uo1 &r -mor2 m uo2 &r -mor3 m uo3 &r -mor4 m uo4 &r -mor5 m uo5 &r -for1 f uo1 &r -for2 f uo2 &r -for3 f uo3 &r -for4 f uo4 &r -for5 f uo5 &r -lor1 l o1 &r -lor2 l o2 &r -lor3 l o3 &r -lor4 l o4 &r -lor5 l o5 &r -mer1 m e1 &r -mer2 m e2 &r -mer3 m e3 &r -mer4 m e4 &r -mer5 m e5 &r -der1 d e1 &r -der2 d e2 &r -der3 d e3 &r -der4 d e4 &r -der5 d e5 &r -ter1 t e1 &r -ter2 t e2 &r -ter3 t e3 &r -ter4 t e4 &r -ter5 t e5 &r -ner1 n e1 &r 
-ner2 n e2 &r -ner3 n e3 &r -ner4 n e4 &r -ner5 n e5 &r -ler1 l e1 &r -ler2 l e2 &r -ler3 l e3 &r -ler4 l e4 &r -ler5 l e5 &r -ger1 g e1 &r -ger2 g e2 &r -ger3 g e3 &r -ger4 g e4 &r -ger5 g e5 &r -ker1 k e1 &r -ker2 k e2 &r -ker3 k e3 &r -ker4 k e4 &r -ker5 k e5 &r -her1 h e1 &r -her2 h e2 &r -her3 h e3 &r -her4 h e4 &r -her5 h e5 &r -zher1 zh e1 &r -zher2 zh e2 &r -zher3 zh e3 &r -zher4 zh e4 &r -zher5 zh e5 &r -cher1 ch e1 &r -cher2 ch e2 &r -cher3 ch e3 &r -cher4 ch e4 &r -cher5 ch e5 &r -sher1 sh e1 &r -sher2 sh e2 &r -sher3 sh e3 &r -sher4 sh e4 &r -sher5 sh e5 &r -rer1 r e1 &r -rer2 r e2 &r -rer3 r e3 &r -rer4 r e4 &r -rer5 r e5 &r -zer1 z e1 &r -zer2 z e2 &r -zer3 z e3 &r -zer4 z e4 &r -zer5 z e5 &r -cer1 c e1 &r -cer2 c e2 &r -cer3 c e3 &r -cer4 c e4 &r -cer5 c e5 &r -ser1 s e1 &r -ser2 s e2 &r -ser3 s e3 &r -ser4 s e4 &r -ser5 s e5 &r -air1 ai1 &r -air2 ai2 &r -air3 ai3 &r -air4 ai4 &r -air5 ai5 &r -bair1 b ai1 &r -bair2 b ai2 &r -bair3 b ai3 &r -bair4 b ai4 &r -bair5 b ai5 &r -pair1 p ai1 &r -pair2 p ai2 &r -pair3 p ai3 &r -pair4 p ai4 &r -pair5 p ai5 &r -mair1 m ai1 &r -mair2 m ai2 &r -mair3 m ai3 &r -mair4 m ai4 &r -mair5 m ai5 &r -dair1 d ai1 &r -dair2 d ai2 &r -dair3 d ai3 &r -dair4 d ai4 &r -dair5 d ai5 &r -tair1 t ai1 &r -tair2 t ai2 &r -tair3 t ai3 &r -tair4 t ai4 &r -tair5 t ai5 &r -nair1 n ai1 &r -nair2 n ai2 &r -nair3 n ai3 &r -nair4 n ai4 &r -nair5 n ai5 &r -lair1 l ai1 &r -lair2 l ai2 &r -lair3 l ai3 &r -lair4 l ai4 &r -lair5 l ai5 &r -gair1 g ai1 &r -gair2 g ai2 &r -gair3 g ai3 &r -gair4 g ai4 &r -gair5 g ai5 &r -kair1 k ai1 &r -kair2 k ai2 &r -kair3 k ai3 &r -kair4 k ai4 &r -kair5 k ai5 &r -hair1 h ai1 &r -hair2 h ai2 &r -hair3 h ai3 &r -hair4 h ai4 &r -hair5 h ai5 &r -zhair1 zh ai1 &r -zhair2 zh ai2 &r -zhair3 zh ai3 &r -zhair4 zh ai4 &r -zhair5 zh ai5 &r -chair1 ch ai1 &r -chair2 ch ai2 &r -chair3 ch ai3 &r -chair4 ch ai4 &r -chair5 ch ai5 &r -shair1 sh ai1 &r -shair2 sh ai2 &r -shair3 sh ai3 &r -shair4 sh ai4 &r -shair5 sh ai5 &r -zair1 z 
ai1 &r -zair2 z ai2 &r -zair3 z ai3 &r -zair4 z ai4 &r -zair5 z ai5 &r -cair1 c ai1 &r -cair2 c ai2 &r -cair3 c ai3 &r -cair4 c ai4 &r -cair5 c ai5 &r -sair1 s ai1 &r -sair2 s ai2 &r -sair3 s ai3 &r -sair4 s ai4 &r -sair5 s ai5 &r -beir1 b ei1 &r -beir2 b ei2 &r -beir3 b ei3 &r -beir4 b ei4 &r -beir5 b ei5 &r -peir1 p ei1 &r -peir2 p ei2 &r -peir3 p ei3 &r -peir4 p ei4 &r -peir5 p ei5 &r -meir1 m ei1 &r -meir2 m ei2 &r -meir3 m ei3 &r -meir4 m ei4 &r -meir5 m ei5 &r -feir1 f ei1 &r -feir2 f ei2 &r -feir3 f ei3 &r -feir4 f ei4 &r -feir5 f ei5 &r -deir1 d ei1 &r -deir2 d ei2 &r -deir3 d ei3 &r -deir4 d ei4 &r -deir5 d ei5 &r -teir1 t ei1 &r -teir2 t ei2 &r -teir3 t ei3 &r -teir4 t ei4 &r -teir5 t ei5 &r -neir1 n ei1 &r -neir2 n ei2 &r -neir3 n ei3 &r -neir4 n ei4 &r -neir5 n ei5 &r -leir1 l ei1 &r -leir2 l ei2 &r -leir3 l ei3 &r -leir4 l ei4 &r -leir5 l ei5 &r -geir1 g ei1 &r -geir2 g ei2 &r -geir3 g ei3 &r -geir4 g ei4 &r -geir5 g ei5 &r -keir1 k ei1 &r -keir2 k ei2 &r -keir3 k ei3 &r -keir4 k ei4 &r -keir5 k ei5 &r -heir1 h ei1 &r -heir2 h ei2 &r -heir3 h ei3 &r -heir4 h ei4 &r -heir5 h ei5 &r -zheir1 zh ei1 &r -zheir2 zh ei2 &r -zheir3 zh ei3 &r -zheir4 zh ei4 &r -zheir5 zh ei5 &r -sheir1 sh ei1 &r -sheir2 sh ei2 &r -sheir3 sh ei3 &r -sheir4 sh ei4 &r -sheir5 sh ei5 &r -zeir1 z ei1 &r -zeir2 z ei2 &r -zeir3 z ei3 &r -zeir4 z ei4 &r -zeir5 z ei5 &r -aor1 au1 &r -aor2 au2 &r -aor3 au3 &r -aor4 au4 &r -aor5 au5 &r -baor1 b au1 &r -baor2 b au2 &r -baor3 b au3 &r -baor4 b au4 &r -baor5 b au5 &r -paor1 p au1 &r -paor2 p au2 &r -paor3 p au3 &r -paor4 p au4 &r -paor5 p au5 &r -maor1 m au1 &r -maor2 m au2 &r -maor3 m au3 &r -maor4 m au4 &r -maor5 m au5 &r -daor1 d au1 &r -daor2 d au2 &r -daor3 d au3 &r -daor4 d au4 &r -daor5 d au5 &r -taor1 t au1 &r -taor2 t au2 &r -taor3 t au3 &r -taor4 t au4 &r -taor5 t au5 &r -naor1 n au1 &r -naor2 n au2 &r -naor3 n au3 &r -naor4 n au4 &r -naor5 n au5 &r -laor1 l au1 &r -laor2 l au2 &r -laor3 l au3 &r -laor4 l au4 &r -laor5 l au5 &r 
-gaor1 g au1 &r -gaor2 g au2 &r -gaor3 g au3 &r -gaor4 g au4 &r -gaor5 g au5 &r -kaor1 k au1 &r -kaor2 k au2 &r -kaor3 k au3 &r -kaor4 k au4 &r -kaor5 k au5 &r -haor1 h au1 &r -haor2 h au2 &r -haor3 h au3 &r -haor4 h au4 &r -haor5 h au5 &r -zhaor1 zh au1 &r -zhaor2 zh au2 &r -zhaor3 zh au3 &r -zhaor4 zh au4 &r -zhaor5 zh au5 &r -chaor1 ch au1 &r -chaor2 ch au2 &r -chaor3 ch au3 &r -chaor4 ch au4 &r -chaor5 ch au5 &r -shaor1 sh au1 &r -shaor2 sh au2 &r -shaor3 sh au3 &r -shaor4 sh au4 &r -shaor5 sh au5 &r -raor1 r au1 &r -raor2 r au2 &r -raor3 r au3 &r -raor4 r au4 &r -raor5 r au5 &r -zaor1 z au1 &r -zaor2 z au2 &r -zaor3 z au3 &r -zaor4 z au4 &r -zaor5 z au5 &r -caor1 c au1 &r -caor2 c au2 &r -caor3 c au3 &r -caor4 c au4 &r -caor5 c au5 &r -saor1 s au1 &r -saor2 s au2 &r -saor3 s au3 &r -saor4 s au4 &r -saor5 s au5 &r -our1 ou1 &r -our2 ou2 &r -our3 ou3 &r -our4 ou4 &r -our5 ou5 &r -pour1 p ou1 &r -pour2 p ou2 &r -pour3 p ou3 &r -pour4 p ou4 &r -pour5 p ou5 &r -mour1 m ou1 &r -mour2 m ou2 &r -mour3 m ou3 &r -mour4 m ou4 &r -mour5 m ou5 &r -four1 f ou1 &r -four2 f ou2 &r -four3 f ou3 &r -four4 f ou4 &r -four5 f ou5 &r -dour1 d ou1 &r -dour2 d ou2 &r -dour3 d ou3 &r -dour4 d ou4 &r -dour5 d ou5 &r -tour1 t ou1 &r -tour2 t ou2 &r -tour3 t ou3 &r -tour4 t ou4 &r -tour5 t ou5 &r -nour1 n ou1 &r -nour2 n ou2 &r -nour3 n ou3 &r -nour4 n ou4 &r -nour5 n ou5 &r -lour1 l ou1 &r -lour2 l ou2 &r -lour3 l ou3 &r -lour4 l ou4 &r -lour5 l ou5 &r -gour1 g ou1 &r -gour2 g ou2 &r -gour3 g ou3 &r -gour4 g ou4 &r -gour5 g ou5 &r -kour1 k ou1 &r -kour2 k ou2 &r -kour3 k ou3 &r -kour4 k ou4 &r -kour5 k ou5 &r -hour1 h ou1 &r -hour2 h ou2 &r -hour3 h ou3 &r -hour4 h ou4 &r -hour5 h ou5 &r -zhour1 zh ou1 &r -zhour2 zh ou2 &r -zhour3 zh ou3 &r -zhour4 zh ou4 &r -zhour5 zh ou5 &r -chour1 ch ou1 &r -chour2 ch ou2 &r -chour3 ch ou3 &r -chour4 ch ou4 &r -chour5 ch ou5 &r -shour1 sh ou1 &r -shour2 sh ou2 &r -shour3 sh ou3 &r -shour4 sh ou4 &r -shour5 sh ou5 &r -rour1 r ou1 &r -rour2 r ou2 &r 
-rour3 r ou3 &r -rour4 r ou4 &r -rour5 r ou5 &r -zour1 z ou1 &r -zour2 z ou2 &r -zour3 z ou3 &r -zour4 z ou4 &r -zour5 z ou5 &r -cour1 c ou1 &r -cour2 c ou2 &r -cour3 c ou3 &r -cour4 c ou4 &r -cour5 c ou5 &r -sour1 s ou1 &r -sour2 s ou2 &r -sour3 s ou3 &r -sour4 s ou4 &r -sour5 s ou5 &r -anr1 an1 &r -anr2 an2 &r -anr3 an3 &r -anr4 an4 &r -anr5 an5 &r -banr1 b an1 &r -banr2 b an2 &r -banr3 b an3 &r -banr4 b an4 &r -banr5 b an5 &r -panr1 p an1 &r -panr2 p an2 &r -panr3 p an3 &r -panr4 p an4 &r -panr5 p an5 &r -manr1 m an1 &r -manr2 m an2 &r -manr3 m an3 &r -manr4 m an4 &r -manr5 m an5 &r -fanr1 f an1 &r -fanr2 f an2 &r -fanr3 f an3 &r -fanr4 f an4 &r -fanr5 f an5 &r -danr1 d an1 &r -danr2 d an2 &r -danr3 d an3 &r -danr4 d an4 &r -danr5 d an5 &r -tanr1 t an1 &r -tanr2 t an2 &r -tanr3 t an3 &r -tanr4 t an4 &r -tanr5 t an5 &r -nanr1 n an1 &r -nanr2 n an2 &r -nanr3 n an3 &r -nanr4 n an4 &r -nanr5 n an5 &r -lanr1 l an1 &r -lanr2 l an2 &r -lanr3 l an3 &r -lanr4 l an4 &r -lanr5 l an5 &r -ganr1 g an1 &r -ganr2 g an2 &r -ganr3 g an3 &r -ganr4 g an4 &r -ganr5 g an5 &r -kanr1 k an1 &r -kanr2 k an2 &r -kanr3 k an3 &r -kanr4 k an4 &r -kanr5 k an5 &r -hanr1 h an1 &r -hanr2 h an2 &r -hanr3 h an3 &r -hanr4 h an4 &r -hanr5 h an5 &r -zhanr1 zh an1 &r -zhanr2 zh an2 &r -zhanr3 zh an3 &r -zhanr4 zh an4 &r -zhanr5 zh an5 &r -chanr1 ch an1 &r -chanr2 ch an2 &r -chanr3 ch an3 &r -chanr4 ch an4 &r -chanr5 ch an5 &r -shanr1 sh an1 &r -shanr2 sh an2 &r -shanr3 sh an3 &r -shanr4 sh an4 &r -shanr5 sh an5 &r -ranr1 r an1 &r -ranr2 r an2 &r -ranr3 r an3 &r -ranr4 r an4 &r -ranr5 r an5 &r -zanr1 z an1 &r -zanr2 z an2 &r -zanr3 z an3 &r -zanr4 z an4 &r -zanr5 z an5 &r -canr1 c an1 &r -canr2 c an2 &r -canr3 c an3 &r -canr4 c an4 &r -canr5 c an5 &r -sanr1 s an1 &r -sanr2 s an2 &r -sanr3 s an3 &r -sanr4 s an4 &r -sanr5 s an5 &r -benr1 b en1 &r -benr2 b en2 &r -benr3 b en3 &r -benr4 b en4 &r -benr5 b en5 &r -penr1 p en1 &r -penr2 p en2 &r -penr3 p en3 &r -penr4 p en4 &r -penr5 p en5 &r -menr1 m en1 &r 
-menr2 m en2 &r -menr3 m en3 &r -menr4 m en4 &r -menr5 m en5 &r -fenr1 f en1 &r -fenr2 f en2 &r -fenr3 f en3 &r -fenr4 f en4 &r -fenr5 f en5 &r -denr1 d en1 &r -denr2 d en2 &r -denr3 d en3 &r -denr4 d en4 &r -denr5 d en5 &r -nenr1 n en1 &r -nenr2 n en2 &r -nenr3 n en3 &r -nenr4 n en4 &r -nenr5 n en5 &r -genr1 g en1 &r -genr2 g en2 &r -genr3 g en3 &r -genr4 g en4 &r -genr5 g en5 &r -kenr1 k en1 &r -kenr2 k en2 &r -kenr3 k en3 &r -kenr4 k en4 &r -kenr5 k en5 &r -henr1 h en1 &r -henr2 h en2 &r -henr3 h en3 &r -henr4 h en4 &r -henr5 h en5 &r -zhenr1 zh en1 &r -zhenr2 zh en2 &r -zhenr3 zh en3 &r -zhenr4 zh en4 &r -zhenr5 zh en5 &r -chenr1 ch en1 &r -chenr2 ch en2 &r -chenr3 ch en3 &r -chenr4 ch en4 &r -chenr5 ch en5 &r -shenr1 sh en1 &r -shenr2 sh en2 &r -shenr3 sh en3 &r -shenr4 sh en4 &r -shenr5 sh en5 &r -renr1 r en1 &r -renr2 r en2 &r -renr3 r en3 &r -renr4 r en4 &r -renr5 r en5 &r -zenr1 z en1 &r -zenr2 z en2 &r -zenr3 z en3 &r -zenr4 z en4 &r -zenr5 z en5 &r -cenr1 c en1 &r -cenr2 c en2 &r -cenr3 c en3 &r -cenr4 c en4 &r -cenr5 c en5 &r -senr1 s en1 &r -senr2 s en2 &r -senr3 s en3 &r -senr4 s en4 &r -senr5 s en5 &r -angr1 ang1 &r -angr2 ang2 &r -angr3 ang3 &r -angr4 ang4 &r -angr5 ang5 &r -bangr1 b ang1 &r -bangr2 b ang2 &r -bangr3 b ang3 &r -bangr4 b ang4 &r -bangr5 b ang5 &r -pangr1 p ang1 &r -pangr2 p ang2 &r -pangr3 p ang3 &r -pangr4 p ang4 &r -pangr5 p ang5 &r -mangr1 m ang1 &r -mangr2 m ang2 &r -mangr3 m ang3 &r -mangr4 m ang4 &r -mangr5 m ang5 &r -fangr1 f ang1 &r -fangr2 f ang2 &r -fangr3 f ang3 &r -fangr4 f ang4 &r -fangr5 f ang5 &r -dangr1 d ang1 &r -dangr2 d ang2 &r -dangr3 d ang3 &r -dangr4 d ang4 &r -dangr5 d ang5 &r -tangr1 t ang1 &r -tangr2 t ang2 &r -tangr3 t ang3 &r -tangr4 t ang4 &r -tangr5 t ang5 &r -nangr1 n ang1 &r -nangr2 n ang2 &r -nangr3 n ang3 &r -nangr4 n ang4 &r -nangr5 n ang5 &r -langr1 l ang1 &r -langr2 l ang2 &r -langr3 l ang3 &r -langr4 l ang4 &r -langr5 l ang5 &r -gangr1 g ang1 &r -gangr2 g ang2 &r -gangr3 g ang3 &r -gangr4 g ang4 
&r -gangr5 g ang5 &r -kangr1 k ang1 &r -kangr2 k ang2 &r -kangr3 k ang3 &r -kangr4 k ang4 &r -kangr5 k ang5 &r -hangr1 h ang1 &r -hangr2 h ang2 &r -hangr3 h ang3 &r -hangr4 h ang4 &r -hangr5 h ang5 &r -zhangr1 zh ang1 &r -zhangr2 zh ang2 &r -zhangr3 zh ang3 &r -zhangr4 zh ang4 &r -zhangr5 zh ang5 &r -changr1 ch ang1 &r -changr2 ch ang2 &r -changr3 ch ang3 &r -changr4 ch ang4 &r -changr5 ch ang5 &r -shangr1 sh ang1 &r -shangr2 sh ang2 &r -shangr3 sh ang3 &r -shangr4 sh ang4 &r -shangr5 sh ang5 &r -rangr1 r ang1 &r -rangr2 r ang2 &r -rangr3 r ang3 &r -rangr4 r ang4 &r -rangr5 r ang5 &r -zangr1 z ang1 &r -zangr2 z ang2 &r -zangr3 z ang3 &r -zangr4 z ang4 &r -zangr5 z ang5 &r -cangr1 c ang1 &r -cangr2 c ang2 &r -cangr3 c ang3 &r -cangr4 c ang4 &r -cangr5 c ang5 &r -sangr1 s ang1 &r -sangr2 s ang2 &r -sangr3 s ang3 &r -sangr4 s ang4 &r -sangr5 s ang5 &r -bengr1 b eng1 &r -bengr2 b eng2 &r -bengr3 b eng3 &r -bengr4 b eng4 &r -bengr5 b eng5 &r -pengr1 p eng1 &r -pengr2 p eng2 &r -pengr3 p eng3 &r -pengr4 p eng4 &r -pengr5 p eng5 &r -mengr1 m eng1 &r -mengr2 m eng2 &r -mengr3 m eng3 &r -mengr4 m eng4 &r -mengr5 m eng5 &r -fengr1 f eng1 &r -fengr2 f eng2 &r -fengr3 f eng3 &r -fengr4 f eng4 &r -fengr5 f eng5 &r -dengr1 d eng1 &r -dengr2 d eng2 &r -dengr3 d eng3 &r -dengr4 d eng4 &r -dengr5 d eng5 &r -tengr1 t eng1 &r -tengr2 t eng2 &r -tengr3 t eng3 &r -tengr4 t eng4 &r -tengr5 t eng5 &r -nengr1 n eng1 &r -nengr2 n eng2 &r -nengr3 n eng3 &r -nengr4 n eng4 &r -nengr5 n eng5 &r -lengr1 l eng1 &r -lengr2 l eng2 &r -lengr3 l eng3 &r -lengr4 l eng4 &r -lengr5 l eng5 &r -gengr1 g eng1 &r -gengr2 g eng2 &r -gengr3 g eng3 &r -gengr4 g eng4 &r -gengr5 g eng5 &r -kengr1 k eng1 &r -kengr2 k eng2 &r -kengr3 k eng3 &r -kengr4 k eng4 &r -kengr5 k eng5 &r -hengr1 h eng1 &r -hengr2 h eng2 &r -hengr3 h eng3 &r -hengr4 h eng4 &r -hengr5 h eng5 &r -zhengr1 zh eng1 &r -zhengr2 zh eng2 &r -zhengr3 zh eng3 &r -zhengr4 zh eng4 &r -zhengr5 zh eng5 &r -chengr1 ch eng1 &r -chengr2 ch eng2 &r -chengr3 
ch eng3 &r -chengr4 ch eng4 &r -chengr5 ch eng5 &r -shengr1 sh eng1 &r -shengr2 sh eng2 &r -shengr3 sh eng3 &r -shengr4 sh eng4 &r -shengr5 sh eng5 &r -rengr1 r eng1 &r -rengr2 r eng2 &r -rengr3 r eng3 &r -rengr4 r eng4 &r -rengr5 r eng5 &r -zengr1 z eng1 &r -zengr2 z eng2 &r -zengr3 z eng3 &r -zengr4 z eng4 &r -zengr5 z eng5 &r -cengr1 c eng1 &r -cengr2 c eng2 &r -cengr3 c eng3 &r -cengr4 c eng4 &r -cengr5 c eng5 &r -sengr1 s eng1 &r -sengr2 s eng2 &r -sengr3 s eng3 &r -sengr4 s eng4 &r -sengr5 s eng5 &r -yir1 y i1 &r -yir2 y i2 &r -yir3 y i3 &r -yir4 y i4 &r -yir5 y i5 &r -bir1 b i1 &r -bir2 b i2 &r -bir3 b i3 &r -bir4 b i4 &r -bir5 b i5 &r -pir1 p i1 &r -pir2 p i2 &r -pir3 p i3 &r -pir4 p i4 &r -pir5 p i5 &r -mir1 m i1 &r -mir2 m i2 &r -mir3 m i3 &r -mir4 m i4 &r -mir5 m i5 &r -dir1 d i1 &r -dir2 d i2 &r -dir3 d i3 &r -dir4 d i4 &r -dir5 d i5 &r -tir1 t i1 &r -tir2 t i2 &r -tir3 t i3 &r -tir4 t i4 &r -tir5 t i5 &r -nir1 n i1 &r -nir2 n i2 &r -nir3 n i3 &r -nir4 n i4 &r -nir5 n i5 &r -lir1 l i1 &r -lir2 l i2 &r -lir3 l i3 &r -lir4 l i4 &r -lir5 l i5 &r -jir1 j i1 &r -jir2 j i2 &r -jir3 j i3 &r -jir4 j i4 &r -jir5 j i5 &r -qir1 q i1 &r -qir2 q i2 &r -qir3 q i3 &r -qir4 q i4 &r -qir5 q i5 &r -xir1 x i1 &r -xir2 x i2 &r -xir3 x i3 &r -xir4 x i4 &r -xir5 x i5 &r -yar1 y ia1 &r -yar2 y ia2 &r -yar3 y ia3 &r -yar4 y ia4 &r -yar5 y ia5 &r -diar1 d ia1 &r -diar2 d ia2 &r -diar3 d ia3 &r -diar4 d ia4 &r -diar5 d ia5 &r -liar1 l ia1 &r -liar2 l ia2 &r -liar3 l ia3 &r -liar4 l ia4 &r -liar5 l ia5 &r -jiar1 j ia1 &r -jiar2 j ia2 &r -jiar3 j ia3 &r -jiar4 j ia4 &r -jiar5 j ia5 &r -qiar1 q ia1 &r -qiar2 q ia2 &r -qiar3 q ia3 &r -qiar4 q ia4 &r -qiar5 q ia5 &r -xiar1 x ia1 &r -xiar2 x ia2 &r -xiar3 x ia3 &r -xiar4 x ia4 &r -xiar5 x ia5 &r -yor1 y io1 &r -yor2 y io2 &r -yor3 y io3 &r -yor4 y io4 &r -yor5 y io5 &r -yer1 y ie1 &r -yer2 y ie2 &r -yer3 y ie3 &r -yer4 y ie4 &r -yer5 y ie5 &r -bier1 b ie1 &r -bier2 b ie2 &r -bier3 b ie3 &r -bier4 b ie4 &r -bier5 b ie5 &r -pier1 p ie1 
&r -pier2 p ie2 &r -pier3 p ie3 &r -pier4 p ie4 &r -pier5 p ie5 &r -mier1 m ie1 &r -mier2 m ie2 &r -mier3 m ie3 &r -mier4 m ie4 &r -mier5 m ie5 &r -dier1 d ie1 &r -dier2 d ie2 &r -dier3 d ie3 &r -dier4 d ie4 &r -dier5 d ie5 &r -tier1 t ie1 &r -tier2 t ie2 &r -tier3 t ie3 &r -tier4 t ie4 &r -tier5 t ie5 &r -nier1 n ie1 &r -nier2 n ie2 &r -nier3 n ie3 &r -nier4 n ie4 &r -nier5 n ie5 &r -lier1 l ie1 &r -lier2 l ie2 &r -lier3 l ie3 &r -lier4 l ie4 &r -lier5 l ie5 &r -jier1 j ie1 &r -jier2 j ie2 &r -jier3 j ie3 &r -jier4 j ie4 &r -jier5 j ie5 &r -qier1 q ie1 &r -qier2 q ie2 &r -qier3 q ie3 &r -qier4 q ie4 &r -qier5 q ie5 &r -xier1 x ie1 &r -xier2 x ie2 &r -xier3 x ie3 &r -xier4 x ie4 &r -xier5 x ie5 &r -yair1 y ai1 &r -yair2 y ai2 &r -yair3 y ai3 &r -yair4 y ai4 &r -yair5 y ai5 &r -yaor1 y au1 &r -yaor2 y au2 &r -yaor3 y au3 &r -yaor4 y au4 &r -yaor5 y au5 &r -biaor1 b iau1 &r -biaor2 b iau2 &r -biaor3 b iau3 &r -biaor4 b iau4 &r -biaor5 b iau5 &r -piaor1 p iau1 &r -piaor2 p iau2 &r -piaor3 p iau3 &r -piaor4 p iau4 &r -piaor5 p iau5 &r -miaor1 m iau1 &r -miaor2 m iau2 &r -miaor3 m iau3 &r -miaor4 m iau4 &r -miaor5 m iau5 &r -fiaor1 f iau1 &r -fiaor2 f iau2 &r -fiaor3 f iau3 &r -fiaor4 f iau4 &r -fiaor5 f iau5 &r -diaor1 d iau1 &r -diaor2 d iau2 &r -diaor3 d iau3 &r -diaor4 d iau4 &r -diaor5 d iau5 &r -tiaor1 t iau1 &r -tiaor2 t iau2 &r -tiaor3 t iau3 &r -tiaor4 t iau4 &r -tiaor5 t iau5 &r -niaor1 n iau1 &r -niaor2 n iau2 &r -niaor3 n iau3 &r -niaor4 n iau4 &r -niaor5 n iau5 &r -liaor1 l iau1 &r -liaor2 l iau2 &r -liaor3 l iau3 &r -liaor4 l iau4 &r -liaor5 l iau5 &r -jiaor1 j iau1 &r -jiaor2 j iau2 &r -jiaor3 j iau3 &r -jiaor4 j iau4 &r -jiaor5 j iau5 &r -qiaor1 q iau1 &r -qiaor2 q iau2 &r -qiaor3 q iau3 &r -qiaor4 q iau4 &r -qiaor5 q iau5 &r -xiaor1 x iau1 &r -xiaor2 x iau2 &r -xiaor3 x iau3 &r -xiaor4 x iau4 &r -xiaor5 x iau5 &r -your1 y iou1 &r -your2 y iou2 &r -your3 y iou3 &r -your4 y iou4 &r -your5 y iou5 &r -miur1 m iou1 &r -miur2 m iou2 &r -miur3 m iou3 &r -miur4 
m iou4 &r -miur5 m iou5 &r -diur1 d iou1 &r -diur2 d iou2 &r -diur3 d iou3 &r -diur4 d iou4 &r -diur5 d iou5 &r -niur1 n iou1 &r -niur2 n iou2 &r -niur3 n iou3 &r -niur4 n iou4 &r -niur5 n iou5 &r -liur1 l iou1 &r -liur2 l iou2 &r -liur3 l iou3 &r -liur4 l iou4 &r -liur5 l iou5 &r -jiur1 j iou1 &r -jiur2 j iou2 &r -jiur3 j iou3 &r -jiur4 j iou4 &r -jiur5 j iou5 &r -qiur1 q iou1 &r -qiur2 q iou2 &r -qiur3 q iou3 &r -qiur4 q iou4 &r -qiur5 q iou5 &r -xiur1 xiou1 &r -xiur2 xiou2 &r -xiur3 xiou3 &r -xiur4 xiou4 &r -xiur5 xiou5 &r -yanr1 y ian1 &r -yanr2 y ian2 &r -yanr3 y ian3 &r -yanr4 y ian4 &r -yanr5 y ian5 &r -bianr1 b ian1 &r -bianr2 b ian2 &r -bianr3 b ian3 &r -bianr4 b ian4 &r -bianr5 b ian5 &r -pianr1 p ian1 &r -pianr2 p ian2 &r -pianr3 p ian3 &r -pianr4 p ian4 &r -pianr5 p ian5 &r -mianr1 m ian1 &r -mianr2 m ian2 &r -mianr3 m ian3 &r -mianr4 m ian4 &r -mianr5 m ian5 &r -dianr1 d ian1 &r -dianr2 d ian2 &r -dianr3 d ian3 &r -dianr4 d ian4 &r -dianr5 d ian5 &r -tianr1 t ian1 &r -tianr2 t ian2 &r -tianr3 t ian3 &r -tianr4 t ian4 &r -tianr5 t ian5 &r -nianr1 n ian1 &r -nianr2 n ian2 &r -nianr3 n ian3 &r -nianr4 n ian4 &r -nianr5 n ian5 &r -lianr1 l ian1 &r -lianr2 l ian2 &r -lianr3 l ian3 &r -lianr4 l ian4 &r -lianr5 l ian5 &r -jianr1 j ian1 &r -jianr2 j ian2 &r -jianr3 j ian3 &r -jianr4 j ian4 &r -jianr5 j ian5 &r -qianr1 q ian1 &r -qianr2 q ian2 &r -qianr3 q ian3 &r -qianr4 q ian4 &r -qianr5 q ian5 &r -xianr1 x ian1 &r -xianr2 x ian2 &r -xianr3 x ian3 &r -xianr4 x ian4 &r -xianr5 x ian5 &r -yinr1 y in1 &r -yinr2 y in2 &r -yinr3 y in3 &r -yinr4 y in4 &r -yinr5 y in5 &r -binr1 b in1 &r -binr2 b in2 &r -binr3 b in3 &r -binr4 b in4 &r -binr5 b in5 &r -pinr1 p in1 &r -pinr2 p in2 &r -pinr3 p in3 &r -pinr4 p in4 &r -pinr5 p in5 &r -minr1 m in1 &r -minr2 m in2 &r -minr3 m in3 &r -minr4 m in4 &r -minr5 m in5 &r -dinr1 d in1 &r -dinr2 d in2 &r -dinr3 d in3 &r -dinr4 d in4 &r -dinr5 d in5 &r -ninr1 n in1 &r -ninr2 n in2 &r -ninr3 n in3 &r -ninr4 n in4 &r -ninr5 n in5 &r 
-linr1 l in1 &r -linr2 l in2 &r -linr3 l in3 &r -linr4 l in4 &r -linr5 l in5 &r -jinr1 j in1 &r -jinr2 j in2 &r -jinr3 j in3 &r -jinr4 j in4 &r -jinr5 j in5 &r -qinr1 q in1 &r -qinr2 q in2 &r -qinr3 q in3 &r -qinr4 q in4 &r -qinr5 q in5 &r -xinr1 x in1 &r -xinr2 x in2 &r -xinr3 x in3 &r -xinr4 x in4 &r -xinr5 x in5 &r -yangr1 y iang1 &r -yangr2 y iang2 &r -yangr3 y iang3 &r -yangr4 y iang4 &r -yangr5 y iang5 &r -biangr1 b iang1 &r -biangr2 b iang2 &r -biangr3 b iang3 &r -biangr4 b iang4 &r -biangr5 b iang5 &r -niangr1 n iang1 &r -niangr2 n iang2 &r -niangr3 n iang3 &r -niangr4 n iang4 &r -niangr5 n iang5 &r -liangr1 l iang1 &r -liangr2 l iang2 &r -liangr3 l iang3 &r -liangr4 l iang4 &r -liangr5 l iang5 &r -jiangr1 j iang1 &r -jiangr2 j iang2 &r -jiangr3 j iang3 &r -jiangr4 j iang4 &r -jiangr5 j iang5 &r -qiangr1 q iang1 &r -qiangr2 q iang2 &r -qiangr3 q iang3 &r -qiangr4 q iang4 &r -qiangr5 q iang5 &r -xiangr1 x iang1 &r -xiangr2 x iang2 &r -xiangr3 x iang3 &r -xiangr4 x iang4 &r -xiangr5 x iang5 &r -yingr1 y ing1 &r -yingr2 y ing2 &r -yingr3 y ing3 &r -yingr4 y ing4 &r -yingr5 y ing5 &r -bingr1 b ing1 &r -bingr2 b ing2 &r -bingr3 b ing3 &r -bingr4 b ing4 &r -bingr5 b ing5 &r -pingr1 p ing1 &r -pingr2 p ing2 &r -pingr3 p ing3 &r -pingr4 p ing4 &r -pingr5 p ing5 &r -mingr1 m ing1 &r -mingr2 m ing2 &r -mingr3 m ing3 &r -mingr4 m ing4 &r -mingr5 m ing5 &r -dingr1 d ing1 &r -dingr2 d ing2 &r -dingr3 d ing3 &r -dingr4 d ing4 &r -dingr5 d ing5 &r -tingr1 t ing1 &r -tingr2 t ing2 &r -tingr3 t ing3 &r -tingr4 t ing4 &r -tingr5 t ing5 &r -ningr1 n ing1 &r -ningr2 n ing2 &r -ningr3 n ing3 &r -ningr4 n ing4 &r -ningr5 n ing5 &r -lingr1 l ing1 &r -lingr2 l ing2 &r -lingr3 l ing3 &r -lingr4 l ing4 &r -lingr5 l ing5 &r -jingr1 j ing1 &r -jingr2 j ing2 &r -jingr3 j ing3 &r -jingr4 j ing4 &r -jingr5 j ing5 &r -qingr1 q ing1 &r -qingr2 q ing2 &r -qingr3 q ing3 &r -qingr4 q ing4 &r -qingr5 q ing5 &r -xingr1 x ing1 &r -xingr2 x ing2 &r -xingr3 x ing3 &r -xingr4 x ing4 &r -xingr5 x 
ing5 &r -wur1 w u1 &r -wur2 w u2 &r -wur3 w u3 &r -wur4 w u4 &r -wur5 w u5 &r -bur1 b u1 &r -bur2 b u2 &r -bur3 b u3 &r -bur4 b u4 &r -bur5 b u5 &r -pur1 p u1 &r -pur2 p u2 &r -pur3 p u3 &r -pur4 p u4 &r -pur5 p u5 &r -mur1 m u1 &r -mur2 m u2 &r -mur3 m u3 &r -mur4 m u4 &r -mur5 m u5 &r -fur1 f u1 &r -fur2 f u2 &r -fur3 f u3 &r -fur4 f u4 &r -fur5 f u5 &r -dur1 d u1 &r -dur2 d u2 &r -dur3 d u3 &r -dur4 d u4 &r -dur5 d u5 &r -tur1 t u1 &r -tur2 t u2 &r -tur3 t u3 &r -tur4 t u4 &r -tur5 t u5 &r -nur1 n u1 &r -nur2 n u2 &r -nur3 n u3 &r -nur4 n u4 &r -nur5 n u5 &r -lur1 l u1 &r -lur2 l u2 &r -lur3 l u3 &r -lur4 l u4 &r -lur5 l u5 &r -gur1 g u1 &r -gur2 g u2 &r -gur3 g u3 &r -gur4 g u4 &r -gur5 g u5 &r -kur1 k u1 &r -kur2 k u2 &r -kur3 k u3 &r -kur4 k u4 &r -kur5 k u5 &r -hur1 h u1 &r -hur2 h u2 &r -hur3 h u3 &r -hur4 h u4 &r -hur5 h u5 &r -zhur1 zh u1 &r -zhur2 zh u2 &r -zhur3 zh u3 &r -zhur4 zh u4 &r -zhur5 zh u5 &r -chur1 ch u1 &r -chur2 ch u2 &r -chur3 ch u3 &r -chur4 ch u4 &r -chur5 ch u5 &r -shur1 sh u1 &r -shur2 sh u2 &r -shur3 sh u3 &r -shur4 sh u4 &r -shur5 sh u5 &r -rur1 r u1 &r -rur2 r u2 &r -rur3 r u3 &r -rur4 r u4 &r -rur5 r u5 &r -zur1 z u1 &r -zur2 z u2 &r -zur3 z u3 &r -zur4 z u4 &r -zur5 z u5 &r -cur1 c u1 &r -cur2 c u2 &r -cur3 c u3 &r -cur4 c u4 &r -cur5 c u5 &r -sur1 s u1 &r -sur2 s u2 &r -sur3 s u3 &r -sur4 s u4 &r -sur5 s u5 &r -war1 w ua1 &r -war2 w ua2 &r -war3 w ua3 &r -war4 w ua4 &r -war5 w ua5 &r -guar1 g ua1 &r -guar2 g ua2 &r -guar3 g ua3 &r -guar4 g ua4 &r -guar5 g ua5 &r -kuar1 k ua1 &r -kuar2 k ua2 &r -kuar3 k ua3 &r -kuar4 k ua4 &r -kuar5 k ua5 &r -huar1 h ua1 &r -huar2 h ua2 &r -huar3 h ua3 &r -huar4 h ua4 &r -huar5 h ua5 &r -zhuar1 zh ua1 &r -zhuar2 zh ua2 &r -zhuar3 zh ua3 &r -zhuar4 zh ua4 &r -zhuar5 zh ua5 &r -chuar1 ch ua1 &r -chuar2 ch ua2 &r -chuar3 ch ua3 &r -chuar4 ch ua4 &r -chuar5 ch ua5 &r -shuar1 sh ua1 &r -shuar2 sh ua2 &r -shuar3 sh ua3 &r -shuar4 sh ua4 &r -shuar5 sh ua5 &r -wor1 w uo1 &r -wor2 w uo2 &r -wor3 w uo3 &r 
-wor4 w uo4 &r -wor5 w uo5 &r -duor1 d uo1 &r -duor2 d uo2 &r -duor3 d uo3 &r -duor4 d uo4 &r -duor5 d uo5 &r -tuor1 t uo1 &r -tuor2 t uo2 &r -tuor3 t uo3 &r -tuor4 t uo4 &r -tuor5 t uo5 &r -nuor1 n uo1 &r -nuor2 n uo2 &r -nuor3 n uo3 &r -nuor4 n uo4 &r -nuor5 n uo5 &r -luor1 l uo1 &r -luor2 l uo2 &r -luor3 l uo3 &r -luor4 l uo4 &r -luor5 l uo5 &r -guor1 g uo1 &r -guor2 g uo2 &r -guor3 g uo3 &r -guor4 g uo4 &r -guor5 g uo5 &r -kuor1 k uo1 &r -kuor2 k uo2 &r -kuor3 k uo3 &r -kuor4 k uo4 &r -kuor5 k uo5 &r -huor1 h uo1 &r -huor2 h uo2 &r -huor3 h uo3 &r -huor4 h uo4 &r -huor5 h uo5 &r -zhuor1 zh uo1 &r -zhuor2 zh uo2 &r -zhuor3 zh uo3 &r -zhuor4 zh uo4 &r -zhuor5 zh uo5 &r -chuor1 ch uo1 &r -chuor2 ch uo2 &r -chuor3 ch uo3 &r -chuor4 ch uo4 &r -chuor5 ch uo5 &r -shuor1 sh uo1 &r -shuor2 sh uo2 &r -shuor3 sh uo3 &r -shuor4 sh uo4 &r -shuor5 sh uo5 &r -ruor1 r uo1 &r -ruor2 r uo2 &r -ruor3 r uo3 &r -ruor4 r uo4 &r -ruor5 r uo5 &r -zuor1 z uo1 &r -zuor2 z uo2 &r -zuor3 z uo3 &r -zuor4 z uo4 &r -zuor5 z uo5 &r -cuor1 c uo1 &r -cuor2 c uo2 &r -cuor3 c uo3 &r -cuor4 c uo4 &r -cuor5 c uo5 &r -suor1 s uo1 &r -suor2 s uo2 &r -suor3 s uo3 &r -suor4 s uo4 &r -suor5 s uo5 &r -wair1 w uai1 &r -wair2 w uai2 &r -wair3 w uai3 &r -wair4 w uai4 &r -wair5 w uai5 &r -guair1 g uai1 &r -guair2 g uai2 &r -guair3 g uai3 &r -guair4 g uai4 &r -guair5 g uai5 &r -kuair1 k uai1 &r -kuair2 k uai2 &r -kuair3 k uai3 &r -kuair4 k uai4 &r -kuair5 k uai5 &r -huair1 h uai1 &r -huair2 h uai2 &r -huair3 h uai3 &r -huair4 h uai4 &r -huair5 h uai5 &r -zhuair1 zh uai1 &r -zhuair2 zh uai2 &r -zhuair3 zh uai3 &r -zhuair4 zh uai4 &r -zhuair5 zh uai5 &r -chuair1 ch uai1 &r -chuair2 ch uai2 &r -chuair3 ch uai3 &r -chuair4 ch uai4 &r -chuair5 ch uai5 &r -shuair1 sh uai1 &r -shuair2 sh uai2 &r -shuair3 sh uai3 &r -shuair4 sh uai4 &r -shuair5 sh uai5 &r -weir1 w uei1 &r -weir2 w uei2 &r -weir3 w uei3 &r -weir4 w uei4 &r -weir5 w uei5 &r -duir1 d uei1 &r -duir2 d uei2 &r -duir3 d uei3 &r -duir4 d uei4 &r -duir5 d 
uei5 &r -tuir1 t uei1 &r -tuir2 t uei2 &r -tuir3 t uei3 &r -tuir4 t uei4 &r -tuir5 t uei5 &r -guir1 g uei1 &r -guir2 g uei2 &r -guir3 g uei3 &r -guir4 g uei4 &r -guir5 g uei5 &r -kuir1 k uei1 &r -kuir2 k uei2 &r -kuir3 k uei3 &r -kuir4 k uei4 &r -kuir5 k uei5 &r -huir1 h uei1 &r -huir2 h uei2 &r -huir3 h uei3 &r -huir4 h uei4 &r -huir5 h uei5 &r -zhuir1 zh uei1 &r -zhuir2 zh uei2 &r -zhuir3 zh uei3 &r -zhuir4 zh uei4 &r -zhuir5 zh uei5 &r -chuir1 ch uei1 &r -chuir2 ch uei2 &r -chuir3 ch uei3 &r -chuir4 ch uei4 &r -chuir5 ch uei5 &r -shuir1 sh uei1 &r -shuir2 sh uei2 &r -shuir3 sh uei3 &r -shuir4 sh uei4 &r -shuir5 sh uei5 &r -ruir1 r uei1 &r -ruir2 r uei2 &r -ruir3 r uei3 &r -ruir4 r uei4 &r -ruir5 r uei5 &r -zuir1 z uei1 &r -zuir2 z uei2 &r -zuir3 z uei3 &r -zuir4 z uei4 &r -zuir5 z uei5 &r -cuir1 c uei1 &r -cuir2 c uei2 &r -cuir3 c uei3 &r -cuir4 c uei4 &r -cuir5 c uei5 &r -suir1 s uei1 &r -suir2 s uei2 &r -suir3 s uei3 &r -suir4 s uei4 &r -suir5 s uei5 &r -wanr1 w uan1 &r -wanr2 w uan2 &r -wanr3 w uan3 &r -wanr4 w uan4 &r -wanr5 w uan5 &r -duanr1 d uan1 &r -duanr2 d uan2 &r -duanr3 d uan3 &r -duanr4 d uan4 &r -duanr5 d uan5 &r -tuanr1 t uan1 &r -tuanr2 t uan2 &r -tuanr3 t uan3 &r -tuanr4 t uan4 &r -tuanr5 t uan5 &r -nuanr1 n uan1 &r -nuanr2 n uan2 &r -nuanr3 n uan3 &r -nuanr4 n uan4 &r -nuanr5 n uan5 &r -luanr1 l uan1 &r -luanr2 l uan2 &r -luanr3 l uan3 &r -luanr4 l uan4 &r -luanr5 l uan5 &r -guanr1 g uan1 &r -guanr2 g uan2 &r -guanr3 g uan3 &r -guanr4 g uan4 &r -guanr5 g uan5 &r -kuanr1 k uan1 &r -kuanr2 k uan2 &r -kuanr3 k uan3 &r -kuanr4 k uan4 &r -kuanr5 k uan5 &r -huanr1 h uan1 &r -huanr2 h uan2 &r -huanr3 h uan3 &r -huanr4 h uan4 &r -huanr5 h uan5 &r -zhuanr1 zh uan1 &r -zhuanr2 zh uan2 &r -zhuanr3 zh uan3 &r -zhuanr4 zh uan4 &r -zhuanr5 zh uan5 &r -chuanr1 ch uan1 &r -chuanr2 ch uan2 &r -chuanr3 ch uan3 &r -chuanr4 ch uan4 &r -chuanr5 ch uan5 &r -shuanr1 sh uan1 &r -shuanr2 sh uan2 &r -shuanr3 sh uan3 &r -shuanr4 sh uan4 &r -shuanr5 sh uan5 &r -ruanr1 r 
uan1 &r -ruanr2 r uan2 &r -ruanr3 r uan3 &r -ruanr4 r uan4 &r -ruanr5 r uan5 &r -zuanr1 z uan1 &r -zuanr2 z uan2 &r -zuanr3 z uan3 &r -zuanr4 z uan4 &r -zuanr5 z uan5 &r -cuanr1 c uan1 &r -cuanr2 c uan2 &r -cuanr3 c uan3 &r -cuanr4 c uan4 &r -cuanr5 c uan5 &r -suanr1 s uan1 &r -suanr2 s uan2 &r -suanr3 s uan3 &r -suanr4 s uan4 &r -suanr5 s uan5 &r -wenr1 w uen1 &r -wenr2 w uen2 &r -wenr3 w uen3 &r -wenr4 w uen4 &r -wenr5 w uen5 &r -dunr1 d uen1 &r -dunr2 d uen2 &r -dunr3 d uen3 &r -dunr4 d uen4 &r -dunr5 d uen5 &r -tunr1 t uen1 &r -tunr2 t uen2 &r -tunr3 t uen3 &r -tunr4 t uen4 &r -tunr5 t uen5 &r -nunr1 n uen1 &r -nunr2 n uen2 &r -nunr3 n uen3 &r -nunr4 n uen4 &r -nunr5 n uen5 &r -lunr1 l uen1 &r -lunr2 l uen2 &r -lunr3 l uen3 &r -lunr4 l uen4 &r -lunr5 l uen5 &r -gunr1 g uen1 &r -gunr2 g uen2 &r -gunr3 g uen3 &r -gunr4 g uen4 &r -gunr5 g uen5 &r -kunr1 k uen1 &r -kunr2 k uen2 &r -kunr3 k uen3 &r -kunr4 k uen4 &r -kunr5 k uen5 &r -hunr1 h uen1 &r -hunr2 h uen2 &r -hunr3 h uen3 &r -hunr4 h uen4 &r -hunr5 h uen5 &r -zhunr1 zh uen1 &r -zhunr2 zh uen2 &r -zhunr3 zh uen3 &r -zhunr4 zh uen4 &r -zhunr5 zh uen5 &r -chunr1 ch uen1 &r -chunr2 ch uen2 &r -chunr3 ch uen3 &r -chunr4 ch uen4 &r -chunr5 ch uen5 &r -shunr1 sh uen1 &r -shunr2 sh uen2 &r -shunr3 sh uen3 &r -shunr4 sh uen4 &r -shunr5 sh uen5 &r -runr1 r uen1 &r -runr2 r uen2 &r -runr3 r uen3 &r -runr4 r uen4 &r -runr5 r uen5 &r -zunr1 z uen1 &r -zunr2 z uen2 &r -zunr3 z uen3 &r -zunr4 z uen4 &r -zunr5 z uen5 &r -cunr1 c uen1 &r -cunr2 c uen2 &r -cunr3 c uen3 &r -cunr4 c uen4 &r -cunr5 c uen5 &r -sunr1 s uen1 &r -sunr2 s uen2 &r -sunr3 s uen3 &r -sunr4 s uen4 &r -sunr5 s uen5 &r -wangr1 w uang1 &r -wangr2 w uang2 &r -wangr3 w uang3 &r -wangr4 w uang4 &r -wangr5 w uang5 &r -guangr1 g uang1 &r -guangr2 g uang2 &r -guangr3 g uang3 &r -guangr4 g uang4 &r -guangr5 g uang5 &r -kuangr1 k uang1 &r -kuangr2 k uang2 &r -kuangr3 k uang3 &r -kuangr4 k uang4 &r -kuangr5 k uang5 &r -huangr1 h uang1 &r -huangr2 h uang2 &r -huangr3 
h uang3 &r -huangr4 h uang4 &r -huangr5 h uang5 &r -zhuangr1 zh uang1 &r -zhuangr2 zh uang2 &r -zhuangr3 zh uang3 &r -zhuangr4 zh uang4 &r -zhuangr5 zh uang5 &r -chuangr1 ch uang1 &r -chuangr2 ch uang2 &r -chuangr3 ch uang3 &r -chuangr4 ch uang4 &r -chuangr5 ch uang5 &r -shuangr1 sh uang1 &r -shuangr2 sh uang2 &r -shuangr3 sh uang3 &r -shuangr4 sh uang4 &r -shuangr5 sh uang5 &r -wengr1 w ung1 &r -wengr2 w ung2 &r -wengr3 w ung3 &r -wengr4 w ung4 &r -wengr5 w ung5 &r -dongr1 d ung1 &r -dongr2 d ung2 &r -dongr3 d ung3 &r -dongr4 d ung4 &r -dongr5 d ung5 &r -tongr1 t ung1 &r -tongr2 t ung2 &r -tongr3 t ung3 &r -tongr4 t ung4 &r -tongr5 t ung5 &r -nongr1 n ung1 &r -nongr2 n ung2 &r -nongr3 n ung3 &r -nongr4 n ung4 &r -nongr5 n ung5 &r -longr1 l ung1 &r -longr2 l ung2 &r -longr3 l ung3 &r -longr4 l ung4 &r -longr5 l ung5 &r -gongr1 g ung1 &r -gongr2 g ung2 &r -gongr3 g ung3 &r -gongr4 g ung4 &r -gongr5 g ung5 &r -kongr1 k ung1 &r -kongr2 k ung2 &r -kongr3 k ung3 &r -kongr4 k ung4 &r -kongr5 k ung5 &r -hongr1 h ung1 &r -hongr2 h ung2 &r -hongr3 h ung3 &r -hongr4 h ung4 &r -hongr5 h ung5 &r -zhongr1 zh ung1 &r -zhongr2 zh ung2 &r -zhongr3 zh ung3 &r -zhongr4 zh ung4 &r -zhongr5 zh ung5 &r -chongr1 ch ung1 &r -chongr2 ch ung2 &r -chongr3 ch ung3 &r -chongr4 ch ung4 &r -chongr5 ch ung5 &r -rongr1 r ung1 &r -rongr2 r ung2 &r -rongr3 r ung3 &r -rongr4 r ung4 &r -rongr5 r ung5 &r -zongr1 z ung1 &r -zongr2 z ung2 &r -zongr3 z ung3 &r -zongr4 z ung4 &r -zongr5 z ung5 &r -congr1 c ung1 &r -congr2 c ung2 &r -congr3 c ung3 &r -congr4 c ung4 &r -congr5 c ung5 &r -songr1 s ung1 &r -songr2 s ung2 &r -songr3 s ung3 &r -songr4 s ung4 &r -songr5 s ung5 &r -yur1 y v1 &r -yur2 y v2 &r -yur3 y v3 &r -yur4 y v4 &r -yur5 y v5 &r -nvr1 n v1 &r -nvr2 n v2 &r -nvr3 n v3 &r -nvr4 n v4 &r -nvr5 n v5 &r -lvr1 l v1 &r -lvr2 l v2 &r -lvr3 l v3 &r -lvr4 l v4 &r -lvr5 l v5 &r -jur1 j v1 &r -jur2 j v2 &r -jur3 j v3 &r -jur4 j v4 &r -jur5 j v5 &r -qur1 q v1 &r -qur2 q v2 &r -qur3 q v3 &r -qur4 q v4 &r 
-qur5 q v5 &r -xur1 x v1 &r -xur2 x v2 &r -xur3 x v3 &r -xur4 x v4 &r -xur5 x v5 &r -yuer1 y ve1 &r -yuer2 y ve2 &r -yuer3 y ve3 &r -yuer4 y ve4 &r -yuer5 y ve5 &r -nuer1 n ve1 &r -nuer2 n ve2 &r -nuer3 n ve3 &r -nuer4 n ve4 &r -nuer5 n ve5 &r -nver1 n ve1 &r -nver2 n ve2 &r -nver3 n ve3 &r -nver4 n ve4 &r -nver5 n ve5 &r -luer1 l ve1 &r -luer2 l ve2 &r -luer3 l ve3 &r -luer4 l ve4 &r -luer5 l ve5 &r -lver1 l ve1 &r -lver2 l ve2 &r -lver3 l ve3 &r -lver4 l ve4 &r -lver5 l ve5 &r -juer1 j ve1 &r -juer2 j ve2 &r -juer3 j ve3 &r -juer4 j ve4 &r -juer5 j ve5 &r -quer1 q ve1 &r -quer2 q ve2 &r -quer3 q ve3 &r -quer4 q ve4 &r -quer5 q ve5 &r -xuer1 x ve1 &r -xuer2 x ve2 &r -xuer3 x ve3 &r -xuer4 x ve4 &r -xuer5 x ve5 &r -yuanr1 y van1 &r -yuanr2 y van2 &r -yuanr3 y van3 &r -yuanr4 y van4 &r -yuanr5 y van5 &r -juanr1 j van1 &r -juanr2 j van2 &r -juanr3 j van3 &r -juanr4 j van4 &r -juanr5 j van5 &r -quanr1 q van1 &r -quanr2 q van2 &r -quanr3 q van3 &r -quanr4 q van4 &r -quanr5 q van5 &r -xuanr1 x van1 &r -xuanr2 x van2 &r -xuanr3 x van3 &r -xuanr4 x van4 &r -xuanr5 x van5 &r -yunr1 y vn1 &r -yunr2 y vn2 &r -yunr3 y vn3 &r -yunr4 y vn4 &r -yunr5 y vn5 &r -junr1 j vn1 &r -junr2 j vn2 &r -junr3 j vn3 &r -junr4 j vn4 &r -junr5 j vn5 &r -qunr1 q vn1 &r -qunr2 q vn2 &r -qunr3 q vn3 &r -qunr4 q vn4 &r -qunr5 q vn5 &r -xunr1 x vn1 &r -xunr2 x vn2 &r -xunr3 x vn3 &r -xunr4 x vn4 &r -xunr5 x vn5 &r -yongr1 y vng1 &r -yongr2 y vng2 &r -yongr3 y vng3 &r -yongr4 y vng4 &r -yongr5 y vng5 &r -jiongr1 j vng1 &r -jiongr2 j vng2 &r -jiongr3 j vng3 &r -jiongr4 j vng4 &r -jiongr5 j vng5 &r -qiongr1 q vng1 &r -qiongr2 q vng2 &r -qiongr3 q vng3 &r -qiongr4 q vng4 &r -qiongr5 q vng5 &r -xiongr1 x vng1 &r -xiongr2 x vng2 &r -xiongr3 x vng3 &r -xiongr4 x vng4 &r -xiongr5 x vng5 &r diff --git a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/preprocess_transcription.py b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/preprocess_transcription.py deleted file mode 100644 index 
ce117d42..00000000 --- a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/preprocess_transcription.py +++ /dev/null @@ -1,257 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import pickle -import re -from pathlib import Path - -import tqdm -import yaml - -zh_pattern = re.compile("[\u4e00-\u9fa5]") - -_tones = {'', '', '', '0', '1', '2', '3', '4', '5'} - -_pauses = {'%', '$'} - -_initials = { - 'b', - 'p', - 'm', - 'f', - 'd', - 't', - 'n', - 'l', - 'g', - 'k', - 'h', - 'j', - 'q', - 'x', - 'zh', - 'ch', - 'sh', - 'r', - 'z', - 'c', - 's', -} - -_finals = { - 'ii', - 'iii', - 'a', - 'o', - 'e', - 'ea', - 'ai', - 'ei', - 'ao', - 'ou', - 'an', - 'en', - 'ang', - 'eng', - 'er', - 'i', - 'ia', - 'io', - 'ie', - 'iai', - 'iao', - 'iou', - 'ian', - 'ien', - 'iang', - 'ieng', - 'u', - 'ua', - 'uo', - 'uai', - 'uei', - 'uan', - 'uen', - 'uang', - 'ueng', - 'v', - 've', - 'van', - 'ven', - 'veng', -} - -_ernized_symbol = {'&r'} - -_specials = {'', '', '', ''} - -_phones = _initials | _finals | _ernized_symbol | _specials | _pauses - - -def is_zh(word): - global zh_pattern - match = zh_pattern.search(word) - return match is not None - - -def ernized(syllable): - return syllable[:2] != "er" and syllable[-2] == 'r' - - -def convert(syllable): - # expansion of o -> uo - syllable = re.sub(r"([bpmf])o$", r"\1uo", syllable) - # syllable = syllable.replace("bo", "buo").replace("po", 
"puo").replace("mo", "muo").replace("fo", "fuo") - # expansion for iong, ong - syllable = syllable.replace("iong", "veng").replace("ong", "ueng") - - # expansion for ing, in - syllable = syllable.replace("ing", "ieng").replace("in", "ien") - - # expansion for un, ui, iu - syllable = syllable.replace("un", "uen").replace("ui", - "uei").replace("iu", "iou") - - # rule for variants of i - syllable = syllable.replace("zi", "zii").replace("ci", "cii").replace("si", "sii")\ - .replace("zhi", "zhiii").replace("chi", "chiii").replace("shi", "shiii")\ - .replace("ri", "riii") - - # rule for y preceding i, u - syllable = syllable.replace("yi", "i").replace("yu", "v").replace("y", "i") - - # rule for w - syllable = syllable.replace("wu", "u").replace("w", "u") - - # rule for v following j, q, x - syllable = syllable.replace("ju", "jv").replace("qu", - "qv").replace("xu", "xv") - - return syllable - - -def split_syllable(syllable: str): - """Split a syllable in pinyin into a list of phones and a list of tones. - Initials have no tone, represented by '0', while finals have tones from - '1,2,3,4,5'. - - e.g. 
- - zhang -> ['zh', 'ang'], ['0', '1'] - """ - if syllable in _pauses: - # syllable, tone - return [syllable], ['0'] - - tone = syllable[-1] - syllable = convert(syllable[:-1]) - - phones = [] - tones = [] - - global _initials - if syllable[:2] in _initials: - phones.append(syllable[:2]) - tones.append('0') - phones.append(syllable[2:]) - tones.append(tone) - elif syllable[0] in _initials: - phones.append(syllable[0]) - tones.append('0') - phones.append(syllable[1:]) - tones.append(tone) - else: - phones.append(syllable) - tones.append(tone) - return phones, tones - - -def load_aishell3_transcription(line: str): - sentence_id, pinyin, text = line.strip().split("|") - syllables = pinyin.strip().split() - - results = [] - - for syllable in syllables: - if syllable in _pauses: - results.append(syllable) - elif not ernized(syllable): - results.append(syllable) - else: - results.append(syllable[:-2] + syllable[-1]) - results.append('&r5') - - phones = [] - tones = [] - for syllable in results: - p, t = split_syllable(syllable) - phones.extend(p) - tones.extend(t) - for p in phones: - assert p in _phones, p - return { - "sentence_id": sentence_id, - "text": text, - "syllables": results, - "phones": phones, - "tones": tones - } - - -def process_aishell3(dataset_root, output_dir): - dataset_root = Path(dataset_root).expanduser() - output_dir = Path(output_dir).expanduser() - output_dir.mkdir(parents=True, exist_ok=True) - - prosody_label_path = dataset_root / "label_train-set.txt" - with open(prosody_label_path, 'rt') as f: - lines = [line.strip() for line in f] - - records = lines[5:] - - processed_records = [] - for record in tqdm.tqdm(records): - new_record = load_aishell3_transcription(record) - processed_records.append(new_record) - print(new_record) - - with open(output_dir / "metadata.pickle", 'wb') as f: - pickle.dump(processed_records, f) - - with open(output_dir / "metadata.yaml", 'wt', encoding="utf-8") as f: - yaml.safe_dump( - processed_records, f, 
default_flow_style=None, allow_unicode=True) - - print("metadata done!") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Preprocess transcription of AiShell3 and save them in a compact file(yaml and pickle)." - ) - parser.add_argument( - "--input", - type=str, - default="~/datasets/aishell3/train", - help="path of the training dataset,(contains a label_train-set.txt).") - parser.add_argument( - "--output", - type=str, - help="the directory to save the processed transcription." - "If not provided, it would be the same as the input.") - args = parser.parse_args() - if args.output is None: - args.output = args.input - - process_aishell3(args.input, args.output) diff --git a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/process_wav.py b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/process_wav.py deleted file mode 100644 index 56d8e4c3..00000000 --- a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/process_wav.py +++ /dev/null @@ -1,94 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import argparse -from functools import partial -from multiprocessing import Pool -from pathlib import Path - -import librosa -import numpy as np -import soundfile as sf -from praatio import textgrid -from tqdm import tqdm - - -def get_valid_part(fpath): - f = textgrid.openTextgrid(fpath, includeEmptyIntervals=True) - - start = 0 - phone_entry_list = f.tierDict['phones'].entryList - first_entry = phone_entry_list[0] - if first_entry.label == "sil": - start = first_entry.end - - last_entry = phone_entry_list[-1] - if last_entry.label == "sp": - end = last_entry.start - else: - end = last_entry.end - return start, end - - -def process_utterance(fpath, source_dir, target_dir, alignment_dir): - rel_path = fpath.relative_to(source_dir) - opath = target_dir / rel_path - apath = (alignment_dir / rel_path).with_suffix(".TextGrid") - opath.parent.mkdir(parents=True, exist_ok=True) - - start, end = get_valid_part(apath) - wav, _ = librosa.load(fpath, sr=22050, offset=start, duration=end - start) - normalized_wav = wav / np.max(wav) * 0.999 - sf.write(opath, normalized_wav, samplerate=22050, subtype='PCM_16') - # print(f"{fpath} => {opath}") - - -def preprocess_aishell3(source_dir, target_dir, alignment_dir): - source_dir = Path(source_dir).expanduser() - target_dir = Path(target_dir).expanduser() - alignment_dir = Path(alignment_dir).expanduser() - - wav_paths = list(source_dir.rglob("*.wav")) - print(f"there are {len(wav_paths)} audio files in total") - fx = partial( - process_utterance, - source_dir=source_dir, - target_dir=target_dir, - alignment_dir=alignment_dir) - with Pool(16) as p: - list( - tqdm(p.imap(fx, wav_paths), total=len(wav_paths), unit="utterance")) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Process audio in AiShell3, trim silence according to the alignment " - "files generated by MFA, and normalize volume by peak.") - parser.add_argument( - "--input", - type=str, - default="~/datasets/aishell3/train/wav", - 
help="path of the original audio folder in aishell3.") - parser.add_argument( - "--output", - type=str, - default="~/datasets/aishell3/train/normalized_wav", - help="path of the folder to save the processed audio files.") - parser.add_argument( - "--alignment", - type=str, - default="~/datasets/aishell3/train/alignment", - help="path of the alignment files.") - args = parser.parse_args() - - preprocess_aishell3(args.input, args.output, args.alignment) diff --git a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/train.py b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/train.py deleted file mode 100644 index ea5f12da..00000000 --- a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/train.py +++ /dev/null @@ -1,263 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import time -from collections import defaultdict -from pathlib import Path - -import numpy as np -import paddle -from matplotlib import pyplot as plt -from paddle import distributed as dist -from paddle.io import DataLoader -from paddle.io import DistributedBatchSampler - -from paddlespeech.t2s.data import dataset -from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.aishell3 import AiShell3 -from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.aishell3 import collate_aishell3_examples -from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.config import get_cfg_defaults -from paddlespeech.t2s.models.tacotron2 import Tacotron2 -from paddlespeech.t2s.models.tacotron2 import Tacotron2Loss -from paddlespeech.t2s.training.cli import default_argument_parser -from paddlespeech.t2s.training.experiment import ExperimentBase -from paddlespeech.t2s.utils import display -from paddlespeech.t2s.utils import mp_tools - - -class Experiment(ExperimentBase): - def compute_losses(self, inputs, outputs): - texts, tones, mel_targets, utterance_embeds, text_lens, output_lens, stop_tokens = inputs - - mel_outputs = outputs["mel_output"] - mel_outputs_postnet = outputs["mel_outputs_postnet"] - alignments = outputs["alignments"] - - losses = self.criterion(mel_outputs, mel_outputs_postnet, mel_targets, - alignments, output_lens, text_lens) - return losses - - def train_batch(self): - start = time.time() - batch = self.read_batch() - data_loader_time = time.time() - start - - self.optimizer.clear_grad() - self.model.train() - texts, tones, mels, utterance_embeds, text_lens, output_lens, stop_tokens = batch - outputs = self.model( - texts, - text_lens, - mels, - output_lens, - tones=tones, - global_condition=utterance_embeds) - losses = self.compute_losses(batch, outputs) - loss = losses["loss"] - loss.backward() - self.optimizer.step() - iteration_time = time.time() - start - - losses_np = {k: float(v) for k, v in losses.items()} - # logging - msg = "Rank: {}, 
".format(dist.get_rank()) - msg += "step: {}, ".format(self.iteration) - msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time, - iteration_time) - msg += ', '.join('{}: {:>.6f}'.format(k, v) - for k, v in losses_np.items()) - self.logger.info(msg) - - if dist.get_rank() == 0: - for key, value in losses_np.items(): - self.visualizer.add_scalar(f"train_loss/{key}", value, - self.iteration) - - @mp_tools.rank_zero_only - @paddle.no_grad() - def valid(self): - valid_losses = defaultdict(list) - for i, batch in enumerate(self.valid_loader): - texts, tones, mels, utterance_embeds, text_lens, output_lens, stop_tokens = batch - outputs = self.model( - texts, - text_lens, - mels, - output_lens, - tones=tones, - global_condition=utterance_embeds) - losses = self.compute_losses(batch, outputs) - for key, value in losses.items(): - valid_losses[key].append(float(value)) - - attention_weights = outputs["alignments"] - self.visualizer.add_figure( - f"valid_sentence_{i}_alignments", - display.plot_alignment(attention_weights[0].numpy().T), - self.iteration) - self.visualizer.add_figure( - f"valid_sentence_{i}_target_spectrogram", - display.plot_spectrogram(mels[0].numpy().T), self.iteration) - mel_pred = outputs['mel_outputs_postnet'] - self.visualizer.add_figure( - f"valid_sentence_{i}_predicted_spectrogram", - display.plot_spectrogram(mel_pred[0].numpy().T), self.iteration) - - # write visual log - valid_losses = {k: np.mean(v) for k, v in valid_losses.items()} - - # logging - msg = "Valid: " - msg += "step: {}, ".format(self.iteration) - msg += ', '.join('{}: {:>.6f}'.format(k, v) - for k, v in valid_losses.items()) - self.logger.info(msg) - - for key, value in valid_losses.items(): - self.visualizer.add_scalar(f"valid/{key}", value, self.iteration) - - @mp_tools.rank_zero_only - @paddle.no_grad() - def eval(self): - """Evaluation of Tacotron2 in autoregressive manner.""" - self.model.eval() - mel_dir = Path(self.output_dir / ("eval_{}".format(self.iteration))) - 
mel_dir.mkdir(parents=True, exist_ok=True) - for i, batch in enumerate(self.test_loader): - texts, tones, mels, utterance_embeds, *_ = batch - outputs = self.model.infer( - texts, tones=tones, global_condition=utterance_embeds) - - display.plot_alignment(outputs["alignments"][0].numpy().T) - plt.savefig(mel_dir / f"sentence_{i}.png") - plt.close() - np.save(mel_dir / f"sentence_{i}", - outputs["mel_outputs_postnet"][0].numpy().T) - print(f"sentence_{i}") - - def setup_model(self): - config = self.config - model = Tacotron2( - vocab_size=config.model.vocab_size, - n_tones=config.model.n_tones, - d_mels=config.data.d_mels, - d_encoder=config.model.d_encoder, - encoder_conv_layers=config.model.encoder_conv_layers, - encoder_kernel_size=config.model.encoder_kernel_size, - d_prenet=config.model.d_prenet, - d_attention_rnn=config.model.d_attention_rnn, - d_decoder_rnn=config.model.d_decoder_rnn, - attention_filters=config.model.attention_filters, - attention_kernel_size=config.model.attention_kernel_size, - d_attention=config.model.d_attention, - d_postnet=config.model.d_postnet, - postnet_kernel_size=config.model.postnet_kernel_size, - postnet_conv_layers=config.model.postnet_conv_layers, - reduction_factor=config.model.reduction_factor, - p_encoder_dropout=config.model.p_encoder_dropout, - p_prenet_dropout=config.model.p_prenet_dropout, - p_attention_dropout=config.model.p_attention_dropout, - p_decoder_dropout=config.model.p_decoder_dropout, - p_postnet_dropout=config.model.p_postnet_dropout, - d_global_condition=config.model.d_global_condition, - use_stop_token=config.model.use_stop_token, ) - - if self.parallel: - model = paddle.DataParallel(model) - - grad_clip = paddle.nn.ClipGradByGlobalNorm( - config.training.grad_clip_thresh) - optimizer = paddle.optimizer.Adam( - learning_rate=config.training.lr, - parameters=model.parameters(), - weight_decay=paddle.regularizer.L2Decay( - config.training.weight_decay), - grad_clip=grad_clip) - criterion = Tacotron2Loss( - 
use_stop_token_loss=config.model.use_stop_token, - use_guided_attention_loss=config.model.use_guided_attention_loss, - sigma=config.model.guided_attention_loss_sigma) - self.model = model - self.optimizer = optimizer - self.criterion = criterion - - def setup_dataloader(self): - args = self.args - config = self.config - aishell3_dataset = AiShell3(args.data) - - valid_set, train_set = dataset.split(aishell3_dataset, - config.data.valid_size) - batch_fn = collate_aishell3_examples - - if not self.parallel: - self.train_loader = DataLoader( - train_set, - batch_size=config.data.batch_size, - shuffle=True, - drop_last=True, - collate_fn=batch_fn) - else: - sampler = DistributedBatchSampler( - train_set, - batch_size=config.data.batch_size, - shuffle=True, - drop_last=True) - self.train_loader = DataLoader( - train_set, batch_sampler=sampler, collate_fn=batch_fn) - - self.valid_loader = DataLoader( - valid_set, - batch_size=config.data.batch_size, - shuffle=False, - drop_last=False, - collate_fn=batch_fn) - - self.test_loader = DataLoader( - valid_set, - batch_size=1, - shuffle=False, - drop_last=False, - collate_fn=batch_fn) - - -def main_sp(config, args): - exp = Experiment(config, args) - exp.setup() - exp.resume_or_load() - if not args.test: - exp.run() - else: - exp.eval() - - -def main(config, args): - if args.ngpu > 1: - dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) - else: - main_sp(config, args) - - -if __name__ == "__main__": - config = get_cfg_defaults() - parser = default_argument_parser() - parser.add_argument("--test", action="store_true") - args = parser.parse_args() - if args.config: - config.merge_from_file(args.config) - if args.opts: - config.merge_from_list(args.opts) - config.freeze() - print(config) - print(args) - - main(config, args) diff --git a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/voice_cloning.py b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/voice_cloning.py deleted file mode 100644 index 4e6b8d36..00000000 
--- a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/voice_cloning.py +++ /dev/null @@ -1,166 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import os -from pathlib import Path - -import numpy as np -import paddle -import soundfile as sf -from matplotlib import pyplot as plt - -from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.aishell3 import voc_phones -from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.aishell3 import voc_tones -from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.chinese_g2p import convert_sentence -from paddlespeech.t2s.models.tacotron2 import Tacotron2 -from paddlespeech.t2s.models.waveflow import ConditionalWaveFlow -from paddlespeech.t2s.utils import display -from paddlespeech.vector.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor -from paddlespeech.vector.models.lstm_speaker_encoder import LSTMSpeakerEncoder - - -def voice_cloning(args): - # speaker encoder - p = SpeakerVerificationPreprocessor( - sampling_rate=16000, - audio_norm_target_dBFS=-30, - vad_window_length=30, - vad_moving_average_width=8, - vad_max_silence_length=6, - mel_window_length=25, - mel_window_step=10, - n_mels=40, - partial_n_frames=160, - min_pad_coverage=0.75, - partial_overlap_ratio=0.5) - print("Audio Processor Done!") - - speaker_encoder = LSTMSpeakerEncoder( - n_mels=40, num_layers=3, hidden_size=256, output_size=256) - 
speaker_encoder.set_state_dict(paddle.load(args.ge2e_params_path)) - speaker_encoder.eval() - print("GE2E Done!") - - synthesizer = Tacotron2( - vocab_size=68, - n_tones=10, - d_mels=80, - d_encoder=512, - encoder_conv_layers=3, - encoder_kernel_size=5, - d_prenet=256, - d_attention_rnn=1024, - d_decoder_rnn=1024, - attention_filters=32, - attention_kernel_size=31, - d_attention=128, - d_postnet=512, - postnet_kernel_size=5, - postnet_conv_layers=5, - reduction_factor=1, - p_encoder_dropout=0.5, - p_prenet_dropout=0.5, - p_attention_dropout=0.1, - p_decoder_dropout=0.1, - p_postnet_dropout=0.5, - d_global_condition=256, - use_stop_token=False, ) - synthesizer.set_state_dict(paddle.load(args.tacotron2_params_path)) - synthesizer.eval() - print("Tacotron2 Done!") - - # vocoder - vocoder = ConditionalWaveFlow( - upsample_factors=[16, 16], - n_flows=8, - n_layers=8, - n_group=16, - channels=128, - n_mels=80, - kernel_size=[3, 3]) - vocoder.set_state_dict(paddle.load(args.waveflow_params_path)) - vocoder.eval() - print("WaveFlow Done!") - - output_dir = Path(args.output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - - input_dir = Path(args.input_dir) - - # 因为 AISHELL-3 数据集中使用 % 和 $ 表示韵律词和韵律短语的边界,它们大约对应着较短和较长的停顿,在文本中可以使用 % 和 $ 来调节韵律。 - # 值得的注意的是,句子的有效字符集仅包含汉字和 %, $, 因此输入的句子只能包含这些字符。 - sentence = "每当你觉得%想要批评什么人的时候$你切要记着%这个世界上的人%并非都具备你禀有的条件$" - phones, tones = convert_sentence(sentence) - phones = np.array( - [voc_phones.lookup(item) for item in phones], dtype=np.int64) - tones = np.array([voc_tones.lookup(item) for item in tones], dtype=np.int64) - phones = paddle.to_tensor(phones).unsqueeze(0) - tones = paddle.to_tensor(tones).unsqueeze(0) - - for name in os.listdir(input_dir): - utt_id = name.split(".")[0] - ref_audio_path = input_dir / name - mel_sequences = p.extract_mel_partials(p.preprocess_wav(ref_audio_path)) - print("mel_sequences: ", mel_sequences.shape) - with paddle.no_grad(): - embed = speaker_encoder.embed_utterance( - 
paddle.to_tensor(mel_sequences)) - print("embed shape: ", embed.shape) - utterance_embeds = paddle.unsqueeze(embed, 0) - outputs = synthesizer.infer( - phones, tones=tones, global_condition=utterance_embeds) - mel_input = paddle.transpose(outputs["mel_outputs_postnet"], [0, 2, 1]) - alignment = outputs["alignments"][0].numpy().T - display.plot_alignment(alignment) - plt.savefig(str(output_dir / (utt_id + ".png"))) - - with paddle.no_grad(): - wav = vocoder.infer(mel_input) - wav = wav.numpy()[0] - sf.write(str(output_dir / (utt_id + ".wav")), wav, samplerate=22050) - - -def main(): - # parse args and config and redirect to train_sp - parser = argparse.ArgumentParser(description="") - parser.add_argument( - "--ge2e_params_path", type=str, help="ge2e params path.") - parser.add_argument( - "--tacotron2_params_path", type=str, help="tacotron2 params path.") - parser.add_argument( - "--waveflow_params_path", type=str, help="waveflow params path.") - - parser.add_argument( - "--ngpu", type=int, default=1, help="if ngpu=0, use cpu.") - - parser.add_argument( - "--input-dir", - type=str, - help="input dir of *.wav, the sample rate will be resample to 16k.") - parser.add_argument("--output-dir", type=str, help="output dir.") - - args = parser.parse_args() - - if args.ngpu == 0: - paddle.set_device("cpu") - elif args.ngpu > 0: - paddle.set_device("gpu") - else: - print("ngpu should >= 0 !") - - voice_cloning(args) - - -if __name__ == "__main__": - main() diff --git a/paddlespeech/t2s/exps/waveflow/ljspeech.py b/paddlespeech/t2s/exps/waveflow/ljspeech.py index 655b63da..a6efa9ec 100644 --- a/paddlespeech/t2s/exps/waveflow/ljspeech.py +++ b/paddlespeech/t2s/exps/waveflow/ljspeech.py @@ -17,8 +17,8 @@ import numpy as np import pandas from paddle.io import Dataset -from paddlespeech.t2s.data.batch import batch_spec -from paddlespeech.t2s.data.batch import batch_wav +from paddlespeech.t2s.datasets.batch import batch_spec +from paddlespeech.t2s.datasets.batch import batch_wav 
class LJSpeech(Dataset): diff --git a/paddlespeech/t2s/exps/waveflow/train.py b/paddlespeech/t2s/exps/waveflow/train.py index d500336a..cf03f5ef 100644 --- a/paddlespeech/t2s/exps/waveflow/train.py +++ b/paddlespeech/t2s/exps/waveflow/train.py @@ -19,7 +19,7 @@ from paddle import distributed as dist from paddle.io import DataLoader from paddle.io import DistributedBatchSampler -from paddlespeech.t2s.data import dataset +from paddlespeech.t2s.datasets import dataset from paddlespeech.t2s.exps.waveflow.config import get_cfg_defaults from paddlespeech.t2s.exps.waveflow.ljspeech import LJSpeech from paddlespeech.t2s.exps.waveflow.ljspeech import LJSpeechClipCollector diff --git a/paddlespeech/t2s/exps/voice_cloning/__init__.py b/paddlespeech/t2s/exps/wavernn/__init__.py similarity index 100% rename from paddlespeech/t2s/exps/voice_cloning/__init__.py rename to paddlespeech/t2s/exps/wavernn/__init__.py diff --git a/paddlespeech/t2s/exps/wavernn/synthesize.py b/paddlespeech/t2s/exps/wavernn/synthesize.py new file mode 100644 index 00000000..4357b282 --- /dev/null +++ b/paddlespeech/t2s/exps/wavernn/synthesize.py @@ -0,0 +1,108 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import argparse +import os +from pathlib import Path + +import jsonlines +import numpy as np +import paddle +import soundfile as sf +import yaml +from paddle import distributed as dist +from timer import timer +from yacs.config import CfgNode + +from paddlespeech.t2s.datasets.data_table import DataTable +from paddlespeech.t2s.models.wavernn import WaveRNN + + +def main(): + parser = argparse.ArgumentParser(description="Synthesize with WaveRNN.") + + parser.add_argument("--config", type=str, help="Vocoder config file.") + parser.add_argument("--checkpoint", type=str, help="snapshot to load.") + parser.add_argument("--test-metadata", type=str, help="dev data.") + parser.add_argument("--output-dir", type=str, help="output dir.") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") + + args = parser.parse_args() + + with open(args.config) as f: + config = CfgNode(yaml.safe_load(f)) + + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(config) + print( + f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}" + ) + + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") + + model = WaveRNN( + hop_length=config.n_shift, sample_rate=config.fs, **config["model"]) + state_dict = paddle.load(args.checkpoint) + model.set_state_dict(state_dict["main_params"]) + + model.eval() + + with jsonlines.open(args.test_metadata, 'r') as reader: + metadata = list(reader) + test_dataset = DataTable( + metadata, + fields=['utt_id', 'feats'], + converters={ + 'utt_id': None, + 'feats': np.load, + }) + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + N = 0 + T = 0 + for example in test_dataset: + utt_id = example['utt_id'] + mel = example['feats'] + mel = paddle.to_tensor(mel) # (T, C) + with timer() as t: + with paddle.no_grad(): + wav = model.generate( + c=mel, 
+ batched=config.inference.gen_batched, + target=config.inference.target, + overlap=config.inference.overlap, + mu_law=config.mu_law, + gen_display=True) + wav = wav.numpy() + N += wav.size + T += t.elapse + speed = wav.size / t.elapse + rtf = config.fs / speed + print( + f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}." + ) + sf.write(str(output_dir / (utt_id + ".wav")), wav, samplerate=config.fs) + print(f"generation speed: {N / T}Hz, RTF: {config.fs / (N / T) }") + + +if __name__ == "__main__": + main() diff --git a/paddlespeech/t2s/exps/wavernn/train.py b/paddlespeech/t2s/exps/wavernn/train.py new file mode 100644 index 00000000..8661d311 --- /dev/null +++ b/paddlespeech/t2s/exps/wavernn/train.py @@ -0,0 +1,212 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import argparse +import os +import shutil +from pathlib import Path + +import jsonlines +import numpy as np +import paddle +import yaml +from paddle import DataParallel +from paddle import distributed as dist +from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler +from paddle.optimizer import Adam +from yacs.config import CfgNode + +from paddlespeech.t2s.datasets.data_table import DataTable +from paddlespeech.t2s.datasets.vocoder_batch_fn import WaveRNNClip +from paddlespeech.t2s.models.wavernn import WaveRNN +from paddlespeech.t2s.models.wavernn import WaveRNNEvaluator +from paddlespeech.t2s.models.wavernn import WaveRNNUpdater +from paddlespeech.t2s.modules.losses import discretized_mix_logistic_loss +from paddlespeech.t2s.training.extensions.snapshot import Snapshot +from paddlespeech.t2s.training.extensions.visualizer import VisualDL +from paddlespeech.t2s.training.seeding import seed_everything +from paddlespeech.t2s.training.trainer import Trainer + + +def train_sp(args, config): + # decides device type and whether to run in parallel + # setup running environment correctly + world_size = paddle.distributed.get_world_size() + if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0: + paddle.set_device("cpu") + else: + paddle.set_device("gpu") + if world_size > 1: + paddle.distributed.init_parallel_env() + + # set the random seed, it is a must for multiprocess training + seed_everything(config.seed) + + print( + f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}", + ) + + # construct dataset for training and validation + with jsonlines.open(args.train_metadata, 'r') as reader: + train_metadata = list(reader) + train_dataset = DataTable( + data=train_metadata, + fields=["wave", "feats"], + converters={ + "wave": np.load, + "feats": np.load, + }, ) + + with jsonlines.open(args.dev_metadata, 'r') as reader: + dev_metadata = list(reader) + dev_dataset = DataTable( + data=dev_metadata, + fields=["wave", "feats"], + 
converters={ + "wave": np.load, + "feats": np.load, + }, ) + + batch_fn = WaveRNNClip( + mode=config.model.mode, + aux_context_window=config.model.aux_context_window, + hop_size=config.n_shift, + batch_max_steps=config.batch_max_steps, + bits=config.model.bits) + + # collate function and dataloader + train_sampler = DistributedBatchSampler( + train_dataset, + batch_size=config.batch_size, + shuffle=True, + drop_last=True) + dev_sampler = DistributedBatchSampler( + dev_dataset, + batch_size=config.batch_size, + shuffle=False, + drop_last=False) + print("samplers done!") + + train_dataloader = DataLoader( + train_dataset, + batch_sampler=train_sampler, + collate_fn=batch_fn, + num_workers=config.num_workers) + + dev_dataloader = DataLoader( + dev_dataset, + collate_fn=batch_fn, + batch_sampler=dev_sampler, + num_workers=config.num_workers) + + valid_generate_loader = DataLoader(dev_dataset, batch_size=1) + + print("dataloaders done!") + + model = WaveRNN( + hop_length=config.n_shift, sample_rate=config.fs, **config["model"]) + if world_size > 1: + model = DataParallel(model) + print("model done!") + + if config.model.mode == 'RAW': + criterion = paddle.nn.CrossEntropyLoss(axis=1) + elif config.model.mode == 'MOL': + criterion = discretized_mix_logistic_loss + else: + criterion = None + RuntimeError('Unknown model mode value - ', config.model.mode) + print("criterions done!") + clip = paddle.nn.ClipGradByGlobalNorm(config.grad_clip) + optimizer = Adam( + parameters=model.parameters(), + learning_rate=config.learning_rate, + grad_clip=clip) + + print("optimizer done!") + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + if dist.get_rank() == 0: + config_name = args.config.split("/")[-1] + # copy conf to output_dir + shutil.copyfile(args.config, output_dir / config_name) + + updater = WaveRNNUpdater( + model=model, + optimizer=optimizer, + criterion=criterion, + dataloader=train_dataloader, + output_dir=output_dir, + 
mode=config.model.mode) + + evaluator = WaveRNNEvaluator( + model=model, + dataloader=dev_dataloader, + criterion=criterion, + output_dir=output_dir, + valid_generate_loader=valid_generate_loader, + config=config) + + trainer = Trainer( + updater, + stop_trigger=(config.train_max_steps, "iteration"), + out=output_dir) + + if dist.get_rank() == 0: + trainer.extend( + evaluator, trigger=(config.eval_interval_steps, 'iteration')) + trainer.extend(VisualDL(output_dir), trigger=(1, 'iteration')) + trainer.extend( + Snapshot(max_size=config.num_snapshots), + trigger=(config.save_interval_steps, 'iteration')) + + print("Trainer Done!") + trainer.run() + + +def main(): + # parse args and config and redirect to train_sp + + parser = argparse.ArgumentParser(description="Train a WaveRNN model.") + parser.add_argument( + "--config", type=str, help="config file to overwrite default config.") + parser.add_argument("--train-metadata", type=str, help="training data.") + parser.add_argument("--dev-metadata", type=str, help="dev data.") + parser.add_argument("--output-dir", type=str, help="output dir.") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") + + args = parser.parse_args() + + with open(args.config, 'rt') as f: + config = CfgNode(yaml.safe_load(f)) + + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(config) + print( + f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}" + ) + + # dispatch + if args.ngpu > 1: + dist.spawn(train_sp, (args, config), nprocs=args.ngpu) + else: + train_sp(args, config) + + +if __name__ == "__main__": + main() diff --git a/paddlespeech/t2s/frontend/arpabet.py b/paddlespeech/t2s/frontend/arpabet.py index 094a2bfa..7a81b645 100644 --- a/paddlespeech/t2s/frontend/arpabet.py +++ b/paddlespeech/t2s/frontend/arpabet.py @@ -133,16 +133,11 @@ class ARPABET(Phonetics): def phoneticize(self, sentence, add_start_end=False): """ Normalize 
the input text sequence and convert it into pronunciation sequence. + Args: + sentence (str): The input text sequence. - Parameters - ----------- - sentence: str - The input text sequence. - - Returns - ---------- - List[str] - The list of pronunciation sequence. + Returns: + List[str]: The list of pronunciation sequence. """ phonemes = [ self._remove_vowels(item) for item in self.backend(sentence) @@ -156,16 +151,12 @@ class ARPABET(Phonetics): def numericalize(self, phonemes): """ Convert pronunciation sequence into pronunciation id sequence. - - Parameters - ----------- - phonemes: List[str] - The list of pronunciation sequence. + + Args: + phonemes (List[str]): The list of pronunciation sequence. - Returns - ---------- - List[int] - The list of pronunciation id sequence. + Returns: + List[int]: The list of pronunciation id sequence. """ ids = [self.vocab.lookup(item) for item in phonemes] return ids @@ -173,30 +164,23 @@ class ARPABET(Phonetics): def reverse(self, ids): """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence. - Parameters - ----------- - ids: List[int] - The list of pronunciation id sequence. + Args: + ids( List[int]): The list of pronunciation id sequence. - Returns - ---------- - List[str] - The list of pronunciation sequence. + Returns: + List[str]: + The list of pronunciation sequence. """ return [self.vocab.reverse(i) for i in ids] def __call__(self, sentence, add_start_end=False): """ Convert the input text sequence into pronunciation id sequence. - Parameters - ----------- - sentence: str - The input text sequence. + Args: + sentence (str): The input text sequence. - Returns - ---------- - List[str] - The list of pronunciation id sequence. + Returns: + List[str]: The list of pronunciation id sequence. 
""" return self.numericalize( self.phoneticize(sentence, add_start_end=add_start_end)) @@ -229,15 +213,11 @@ class ARPABETWithStress(Phonetics): def phoneticize(self, sentence, add_start_end=False): """ Normalize the input text sequence and convert it into pronunciation sequence. - Parameters - ----------- - sentence: str - The input text sequence. + Args: + sentence (str): The input text sequence. - Returns - ---------- - List[str] - The list of pronunciation sequence. + Returns: + List[str]: The list of pronunciation sequence. """ phonemes = self.backend(sentence) if add_start_end: @@ -249,47 +229,33 @@ class ARPABETWithStress(Phonetics): def numericalize(self, phonemes): """ Convert pronunciation sequence into pronunciation id sequence. - - Parameters - ----------- - phonemes: List[str] - The list of pronunciation sequence. + + Args: + phonemes (List[str]): The list of pronunciation sequence. - Returns - ---------- - List[int] - The list of pronunciation id sequence. + Returns: + List[int]: The list of pronunciation id sequence. """ ids = [self.vocab.lookup(item) for item in phonemes] return ids def reverse(self, ids): """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence. - - Parameters - ----------- - ids: List[int] - The list of pronunciation id sequence. + Args: + ids (List[int]): The list of pronunciation id sequence. - Returns - ---------- - List[str] - The list of pronunciation sequence. + Returns: + List[str]: The list of pronunciation sequence. """ return [self.vocab.reverse(i) for i in ids] def __call__(self, sentence, add_start_end=False): """ Convert the input text sequence into pronunciation id sequence. + Args: + sentence (str): The input text sequence. - Parameters - ----------- - sentence: str - The input text sequence. - - Returns - ---------- - List[str] - The list of pronunciation id sequence. + Returns: + List[str]: The list of pronunciation id sequence. 
""" return self.numericalize( self.phoneticize(sentence, add_start_end=add_start_end)) diff --git a/paddlespeech/t2s/frontend/phonectic.py b/paddlespeech/t2s/frontend/phonectic.py index 25413871..8e9f1173 100644 --- a/paddlespeech/t2s/frontend/phonectic.py +++ b/paddlespeech/t2s/frontend/phonectic.py @@ -65,14 +65,10 @@ class English(Phonetics): def phoneticize(self, sentence): """ Normalize the input text sequence and convert it into pronunciation sequence. - Parameters - ----------- - sentence: str - The input text sequence. - Returns - ---------- - List[str] - The list of pronunciation sequence. + Args: + sentence (str): The input text sequence. + Returns: + List[str]: The list of pronunciation sequence. """ start = self.vocab.start_symbol end = self.vocab.end_symbol @@ -83,11 +79,6 @@ class English(Phonetics): return phonemes def _p2id(self, phonemes: List[str]) -> np.array: - # replace unk phone with sp - phonemes = [ - phn if (phn in self.vocab_phones and phn not in self.punc) else "sp" - for phn in phonemes - ] phone_ids = [self.vocab_phones[item] for item in phonemes] return np.array(phone_ids, np.int64) @@ -102,6 +93,12 @@ class English(Phonetics): # remove start_symbol and end_symbol phones = phones[1:-1] phones = [phn for phn in phones if not phn.isspace()] + # replace unk phone with sp + phones = [ + phn + if (phn in self.vocab_phones and phn not in self.punc) else "sp" + for phn in phones + ] phones_list.append(phones) if merge_sentences: @@ -122,14 +119,10 @@ class English(Phonetics): def numericalize(self, phonemes): """ Convert pronunciation sequence into pronunciation id sequence. - Parameters - ----------- - phonemes: List[str] - The list of pronunciation sequence. - Returns - ---------- - List[int] - The list of pronunciation id sequence. + Args: + phonemes (List[str]): The list of pronunciation sequence. + Returns: + List[int]: The list of pronunciation id sequence. 
""" ids = [ self.vocab.lookup(item) for item in phonemes @@ -139,27 +132,19 @@ class English(Phonetics): def reverse(self, ids): """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence. - Parameters - ----------- - ids: List[int] - The list of pronunciation id sequence. - Returns - ---------- - List[str] - The list of pronunciation sequence. + Args: + ids (List[int]): The list of pronunciation id sequence. + Returns: + List[str]: The list of pronunciation sequence. """ return [self.vocab.reverse(i) for i in ids] def __call__(self, sentence): """ Convert the input text sequence into pronunciation id sequence. - Parameters - ----------- - sentence: str - The input text sequence. - Returns - ---------- - List[str] - The list of pronunciation id sequence. + Args: + sentence(str): The input text sequence. + Returns: + List[str]: The list of pronunciation id sequence. """ return self.numericalize(self.phoneticize(sentence)) @@ -182,28 +167,21 @@ class EnglishCharacter(Phonetics): def phoneticize(self, sentence): """ Normalize the input text sequence. - Parameters - ----------- - sentence: str - The input text sequence. - Returns - ---------- - str - A text sequence after normalize. + Args: + sentence(str): The input text sequence. + Returns: + str: A text sequence after normalize. """ words = normalize(sentence) return words def numericalize(self, sentence): """ Convert a text sequence into ids. - Parameters - ----------- - sentence: str - The input text sequence. - Returns - ---------- - List[int] - List of a character id sequence. + Args: + sentence (str): The input text sequence. + Returns: + List[int]: + List of a character id sequence. """ ids = [ self.vocab.lookup(item) for item in sentence @@ -213,27 +191,19 @@ class EnglishCharacter(Phonetics): def reverse(self, ids): """ Convert a character id sequence into text. - Parameters - ----------- - ids: List[int] - List of a character id sequence. 
- Returns - ---------- - str - The input text sequence. + Args: + ids (List[int]): List of a character id sequence. + Returns: + str: The input text sequence. """ return [self.vocab.reverse(i) for i in ids] def __call__(self, sentence): """ Normalize the input text sequence and convert it into character id sequence. - Parameters - ----------- - sentence: str - The input text sequence. - Returns - ---------- - List[int] - List of a character id sequence. + Args: + sentence (str): The input text sequence. + Returns: + List[int]: List of a character id sequence. """ return self.numericalize(self.phoneticize(sentence)) @@ -263,14 +233,10 @@ class Chinese(Phonetics): def phoneticize(self, sentence): """ Normalize the input text sequence and convert it into pronunciation sequence. - Parameters - ----------- - sentence: str - The input text sequence. - Returns - ---------- - List[str] - The list of pronunciation sequence. + Args: + sentence(str): The input text sequence. + Returns: + List[str]: The list of pronunciation sequence. """ # simplified = self.opencc_backend.convert(sentence) simplified = sentence @@ -295,28 +261,20 @@ class Chinese(Phonetics): def numericalize(self, phonemes): """ Convert pronunciation sequence into pronunciation id sequence. - Parameters - ----------- - phonemes: List[str] - The list of pronunciation sequence. - Returns - ---------- - List[int] - The list of pronunciation id sequence. + Args: + phonemes(List[str]): The list of pronunciation sequence. + Returns: + List[int]: The list of pronunciation id sequence. """ ids = [self.vocab.lookup(item) for item in phonemes] return ids def __call__(self, sentence): """ Convert the input text sequence into pronunciation id sequence. - Parameters - ----------- - sentence: str - The input text sequence. - Returns - ---------- - List[str] - The list of pronunciation id sequence. + Args: + sentence (str): The input text sequence. + Returns: + List[str]: The list of pronunciation id sequence. 
""" return self.numericalize(self.phoneticize(sentence)) @@ -328,13 +286,9 @@ class Chinese(Phonetics): def reverse(self, ids): """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence. - Parameters - ----------- - ids: List[int] - The list of pronunciation id sequence. - Returns - ---------- - List[str] - The list of pronunciation sequence. + Args: + ids (List[int]): The list of pronunciation id sequence. + Returns: + List[str]: The list of pronunciation sequence. """ return [self.vocab.reverse(i) for i in ids] diff --git a/paddlespeech/t2s/frontend/vocab.py b/paddlespeech/t2s/frontend/vocab.py index 9ef6b137..76bb3c7b 100644 --- a/paddlespeech/t2s/frontend/vocab.py +++ b/paddlespeech/t2s/frontend/vocab.py @@ -20,22 +20,12 @@ __all__ = ["Vocab"] class Vocab(object): """ Vocabulary. - Parameters - ----------- - symbols: Iterable[str] - Common symbols. - - padding_symbol: str, optional - Symbol for pad. Defaults to "". - - unk_symbol: str, optional - Symbol for unknow. Defaults to "" - - start_symbol: str, optional - Symbol for start. Defaults to "" - - end_symbol: str, optional - Symbol for end. Defaults to "" + Args: + symbols (Iterable[str]): Common symbols. + padding_symbol (str, optional): Symbol for pad. Defaults to "". + unk_symbol (str, optional): Symbol for unknow. Defaults to "" + start_symbol (str, optional): Symbol for start. Defaults to "" + end_symbol (str, optional): Symbol for end. 
Defaults to "" """ def __init__(self, diff --git a/paddlespeech/t2s/frontend/zh_normalization/chronology.py b/paddlespeech/t2s/frontend/zh_normalization/chronology.py index 8801baa0..bfa7d2b1 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/chronology.py +++ b/paddlespeech/t2s/frontend/zh_normalization/chronology.py @@ -44,12 +44,10 @@ RE_TIME_RANGE = re.compile(r'([0-1]?[0-9]|2[0-3])' def replace_time(match) -> str: """ - Parameters - ---------- - match : re.Match - Returns - ---------- - str + Args: + match (re.Match) + Returns: + str """ is_range = len(match.groups()) > 5 @@ -87,12 +85,10 @@ RE_DATE = re.compile(r'(\d{4}|\d{2})年' def replace_date(match) -> str: """ - Parameters - ---------- - match : re.Match - Returns - ---------- - str + Args: + match (re.Match) + Returns: + str """ year = match.group(1) month = match.group(3) @@ -114,12 +110,10 @@ RE_DATE2 = re.compile( def replace_date2(match) -> str: """ - Parameters - ---------- - match : re.Match - Returns - ---------- - str + Args: + match (re.Match) + Returns: + str """ year = match.group(1) month = match.group(3) diff --git a/paddlespeech/t2s/frontend/zh_normalization/num.py b/paddlespeech/t2s/frontend/zh_normalization/num.py index 1e575c08..27a2f846 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/num.py +++ b/paddlespeech/t2s/frontend/zh_normalization/num.py @@ -36,12 +36,10 @@ RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)') def replace_frac(match) -> str: """ - Parameters - ---------- - match : re.Match - Returns - ---------- - str + Args: + match (re.Match) + Returns: + str """ sign = match.group(1) nominator = match.group(2) @@ -59,12 +57,10 @@ RE_PERCENTAGE = re.compile(r'(-?)(\d+(\.\d+)?)%') def replace_percentage(match) -> str: """ - Parameters - ---------- - match : re.Match - Returns - ---------- - str + Args: + match (re.Match) + Returns: + str """ sign = match.group(1) percent = match.group(2) @@ -81,12 +77,10 @@ RE_INTEGER = re.compile(r'(-)' r'(\d+)') def 
replace_negative_num(match) -> str: """ - Parameters - ---------- - match : re.Match - Returns - ---------- - str + Args: + match (re.Match) + Returns: + str """ sign = match.group(1) number = match.group(2) @@ -103,12 +97,10 @@ RE_DEFAULT_NUM = re.compile(r'\d{3}\d*') def replace_default_num(match): """ - Parameters - ---------- - match : re.Match - Returns - ---------- - str + Args: + match (re.Match) + Returns: + str """ number = match.group(0) return verbalize_digit(number) @@ -124,12 +116,10 @@ RE_NUMBER = re.compile(r'(-?)((\d+)(\.\d+)?)' r'|(\.(\d+))') def replace_positive_quantifier(match) -> str: """ - Parameters - ---------- - match : re.Match - Returns - ---------- - str + Args: + match (re.Match) + Returns: + str """ number = match.group(1) match_2 = match.group(2) @@ -142,12 +132,10 @@ def replace_positive_quantifier(match) -> str: def replace_number(match) -> str: """ - Parameters - ---------- - match : re.Match - Returns - ---------- - str + Args: + match (re.Match) + Returns: + str """ sign = match.group(1) number = match.group(2) @@ -169,12 +157,10 @@ RE_RANGE = re.compile( def replace_range(match) -> str: """ - Parameters - ---------- - match : re.Match - Returns - ---------- - str + Args: + match (re.Match) + Returns: + str """ first, second = match.group(1), match.group(8) first = RE_NUMBER.sub(replace_number, first) @@ -222,7 +208,7 @@ def verbalize_digit(value_string: str, alt_one=False) -> str: result_symbols = [DIGITS[digit] for digit in value_string] result = ''.join(result_symbols) if alt_one: - result.replace("一", "幺") + result = result.replace("一", "幺") return result diff --git a/paddlespeech/t2s/frontend/zh_normalization/phonecode.py b/paddlespeech/t2s/frontend/zh_normalization/phonecode.py index b7b69b41..06b5d41b 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/phonecode.py +++ b/paddlespeech/t2s/frontend/zh_normalization/phonecode.py @@ -45,23 +45,19 @@ def phone2str(phone_string: str, mobile=True) -> str: def 
replace_phone(match) -> str: """ - Parameters - ---------- - match : re.Match - Returns - ---------- - str + Args: + match (re.Match) + Returns: + str """ return phone2str(match.group(0), mobile=False) def replace_mobile(match) -> str: """ - Parameters - ---------- - match : re.Match - Returns - ---------- - str + Args: + match (re.Match) + Returns: + str """ return phone2str(match.group(0)) diff --git a/paddlespeech/t2s/frontend/zh_normalization/quantifier.py b/paddlespeech/t2s/frontend/zh_normalization/quantifier.py index d3805a32..268d7229 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/quantifier.py +++ b/paddlespeech/t2s/frontend/zh_normalization/quantifier.py @@ -22,12 +22,10 @@ RE_TEMPERATURE = re.compile(r'(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)') def replace_temperature(match) -> str: """ - Parameters - ---------- - match : re.Match - Returns - ---------- - str + Args: + match (re.Match) + Returns: + str """ sign = match.group(1) temperature = match.group(2) diff --git a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py index 9794a700..f9d1b8cb 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py +++ b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py @@ -55,14 +55,10 @@ class TextNormalizer(): def _split(self, text: str, lang="zh") -> List[str]: """Split long text into sentences with sentence-splitting punctuations. - Parameters - ---------- - text : str - The input text. - Returns - ------- - List[str] - Sentences. + Args: + text (str): The input text. + Returns: + List[str]: Sentences. 
""" # Only for pure Chinese here if lang == "zh": diff --git a/paddlespeech/t2s/models/__init__.py b/paddlespeech/t2s/models/__init__.py index f268a4e3..41be7c1d 100644 --- a/paddlespeech/t2s/models/__init__.py +++ b/paddlespeech/t2s/models/__init__.py @@ -19,3 +19,4 @@ from .speedyspeech import * from .tacotron2 import * from .transformer_tts import * from .waveflow import * +from .wavernn import * diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py index 405ad957..73f5498e 100644 --- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py +++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py @@ -38,17 +38,21 @@ from paddlespeech.t2s.modules.transformer.encoder import TransformerEncoder class FastSpeech2(nn.Layer): """FastSpeech2 module. - + This is a module of FastSpeech2 described in `FastSpeech 2: Fast and High-Quality End-to-End Text to Speech`_. Instead of quantized pitch and energy, we use token-averaged value introduced in `FastPitch: Parallel Text-to-speech with Pitch Prediction`_. - + .. _`FastSpeech 2: Fast and High-Quality End-to-End Text to Speech`: https://arxiv.org/abs/2006.04558 .. _`FastPitch: Parallel Text-to-speech with Pitch Prediction`: https://arxiv.org/abs/2006.06873 + Args: + + Returns: + """ def __init__( @@ -127,136 +131,72 @@ class FastSpeech2(nn.Layer): init_enc_alpha: float=1.0, init_dec_alpha: float=1.0, ): """Initialize FastSpeech2 module. - Parameters - ---------- - idim : int - Dimension of the inputs. - odim : int - Dimension of the outputs. - adim : int - Attention dimension. - aheads : int - Number of attention heads. - elayers : int - Number of encoder layers. - eunits : int - Number of encoder hidden units. - dlayers : int - Number of decoder layers. - dunits : int - Number of decoder hidden units. - postnet_layers : int - Number of postnet layers. - postnet_chans : int - Number of postnet channels. - postnet_filts : int - Kernel size of postnet. 
- postnet_dropout_rate : float - Dropout rate in postnet. - use_scaled_pos_enc : bool - Whether to use trainable scaled pos encoding. - use_batch_norm : bool - Whether to use batch normalization in encoder prenet. - encoder_normalize_before : bool - Whether to apply layernorm layer before encoder block. - decoder_normalize_before : bool - Whether to apply layernorm layer before - decoder block. - encoder_concat_after : bool - Whether to concatenate attention layer's input and output in encoder. - decoder_concat_after : bool - Whether to concatenate attention layer's input and output in decoder. - reduction_factor : int - Reduction factor. - encoder_type : str - Encoder type ("transformer" or "conformer"). - decoder_type : str - Decoder type ("transformer" or "conformer"). - transformer_enc_dropout_rate : float - Dropout rate in encoder except attention and positional encoding. - transformer_enc_positional_dropout_rate (float): Dropout rate after encoder - positional encoding. - transformer_enc_attn_dropout_rate (float): Dropout rate in encoder - self-attention module. - transformer_dec_dropout_rate (float): Dropout rate in decoder except - attention & positional encoding. - transformer_dec_positional_dropout_rate (float): Dropout rate after decoder - positional encoding. - transformer_dec_attn_dropout_rate (float): Dropout rate in decoder - self-attention module. - conformer_pos_enc_layer_type : str - Pos encoding layer type in conformer. - conformer_self_attn_layer_type : str - Self-attention layer type in conformer - conformer_activation_type : str - Activation function type in conformer. - use_macaron_style_in_conformer : bool - Whether to use macaron style FFN. - use_cnn_in_conformer : bool - Whether to use CNN in conformer. - zero_triu : bool - Whether to use zero triu in relative self-attention module. - conformer_enc_kernel_size : int - Kernel size of encoder conformer. - conformer_dec_kernel_size : int - Kernel size of decoder conformer. 
- duration_predictor_layers : int - Number of duration predictor layers. - duration_predictor_chans : int - Number of duration predictor channels. - duration_predictor_kernel_size : int - Kernel size of duration predictor. - duration_predictor_dropout_rate : float - Dropout rate in duration predictor. - pitch_predictor_layers : int - Number of pitch predictor layers. - pitch_predictor_chans : int - Number of pitch predictor channels. - pitch_predictor_kernel_size : int - Kernel size of pitch predictor. - pitch_predictor_dropout_rate : float - Dropout rate in pitch predictor. - pitch_embed_kernel_size : float - Kernel size of pitch embedding. - pitch_embed_dropout_rate : float - Dropout rate for pitch embedding. - stop_gradient_from_pitch_predictor : bool - Whether to stop gradient from pitch predictor to encoder. - energy_predictor_layers : int - Number of energy predictor layers. - energy_predictor_chans : int - Number of energy predictor channels. - energy_predictor_kernel_size : int - Kernel size of energy predictor. - energy_predictor_dropout_rate : float - Dropout rate in energy predictor. - energy_embed_kernel_size : float - Kernel size of energy embedding. - energy_embed_dropout_rate : float - Dropout rate for energy embedding. - stop_gradient_from_energy_predictor : bool - Whether to stop gradient from energy predictor to encoder. - spk_num : Optional[int] - Number of speakers. If not None, assume that the spk_embed_dim is not None, - spk_ids will be provided as the input and use spk_embedding_table. - spk_embed_dim : Optional[int] - Speaker embedding dimension. If not None, - assume that spk_emb will be provided as the input or spk_num is not None. - spk_embed_integration_type : str - How to integrate speaker embedding. - tone_num : Optional[int] - Number of tones. If not None, assume that the - tone_ids will be provided as the input and use tone_embedding_table. - tone_embed_dim : Optional[int] - Tone embedding dimension. 
If not None, assume that tone_num is not None. - tone_embed_integration_type : str - How to integrate tone embedding. - init_type : str - How to initialize transformer parameters. - init_enc_alpha : float - Initial value of alpha in scaled pos encoding of the encoder. - init_dec_alpha : float - Initial value of alpha in scaled pos encoding of the decoder. + Args: + idim (int): Dimension of the inputs. + odim (int): Dimension of the outputs. + adim (int): Attention dimension. + aheads (int): Number of attention heads. + elayers (int): Number of encoder layers. + eunits (int): Number of encoder hidden units. + dlayers (int): Number of decoder layers. + dunits (int): Number of decoder hidden units. + postnet_layers (int): Number of postnet layers. + postnet_chans (int): Number of postnet channels. + postnet_filts (int): Kernel size of postnet. + postnet_dropout_rate (float): Dropout rate in postnet. + use_scaled_pos_enc (bool): Whether to use trainable scaled pos encoding. + use_batch_norm (bool): Whether to use batch normalization in encoder prenet. + encoder_normalize_before (bool): Whether to apply layernorm layer before encoder block. + decoder_normalize_before (bool): Whether to apply layernorm layer before decoder block. + encoder_concat_after (bool): Whether to concatenate attention layer's input and output in encoder. + decoder_concat_after (bool): Whether to concatenate attention layer's input and output in decoder. + reduction_factor (int): Reduction factor. + encoder_type (str): Encoder type ("transformer" or "conformer"). + decoder_type (str): Decoder type ("transformer" or "conformer"). + transformer_enc_dropout_rate (float): Dropout rate in encoder except attention and positional encoding. + transformer_enc_positional_dropout_rate (float): Dropout rate after encoder positional encoding. + transformer_enc_attn_dropout_rate (float): Dropout rate in encoder self-attention module. 
+ transformer_dec_dropout_rate (float): Dropout rate in decoder except attention & positional encoding. + transformer_dec_positional_dropout_rate (float): Dropout rate after decoder positional encoding. + transformer_dec_attn_dropout_rate (float): Dropout rate in decoder self-attention module. + conformer_pos_enc_layer_type (str): Pos encoding layer type in conformer. + conformer_self_attn_layer_type (str): Self-attention layer type in conformer + conformer_activation_type (str): Activation function type in conformer. + use_macaron_style_in_conformer (bool): Whether to use macaron style FFN. + use_cnn_in_conformer (bool): Whether to use CNN in conformer. + zero_triu (bool): Whether to use zero triu in relative self-attention module. + conformer_enc_kernel_size (int): Kernel size of encoder conformer. + conformer_dec_kernel_size (int): Kernel size of decoder conformer. + duration_predictor_layers (int): Number of duration predictor layers. + duration_predictor_chans (int): Number of duration predictor channels. + duration_predictor_kernel_size (int): Kernel size of duration predictor. + duration_predictor_dropout_rate (float): Dropout rate in duration predictor. + pitch_predictor_layers (int): Number of pitch predictor layers. + pitch_predictor_chans (int): Number of pitch predictor channels. + pitch_predictor_kernel_size (int): Kernel size of pitch predictor. + pitch_predictor_dropout_rate (float): Dropout rate in pitch predictor. + pitch_embed_kernel_size (float): Kernel size of pitch embedding. + pitch_embed_dropout_rate (float): Dropout rate for pitch embedding. + stop_gradient_from_pitch_predictor (bool): Whether to stop gradient from pitch predictor to encoder. + energy_predictor_layers (int): Number of energy predictor layers. + energy_predictor_chans (int): Number of energy predictor channels. + energy_predictor_kernel_size (int): Kernel size of energy predictor. + energy_predictor_dropout_rate (float): Dropout rate in energy predictor. 
+ energy_embed_kernel_size (float): Kernel size of energy embedding. + energy_embed_dropout_rate (float): Dropout rate for energy embedding. + stop_gradient_from_energy_predictor(bool): Whether to stop gradient from energy predictor to encoder. + spk_num (Optional[int]): Number of speakers. If not None, assume that the spk_embed_dim is not None, + spk_ids will be provided as the input and use spk_embedding_table. + spk_embed_dim (Optional[int]): Speaker embedding dimension. If not None, + assume that spk_emb will be provided as the input or spk_num is not None. + spk_embed_integration_type (str): How to integrate speaker embedding. + tone_num (Optional[int]): Number of tones. If not None, assume that the + tone_ids will be provided as the input and use tone_embedding_table. + tone_embed_dim (Optional[int]): Tone embedding dimension. If not None, assume that tone_num is not None. + tone_embed_integration_type (str): How to integrate tone embedding. + init_type (str): How to initialize transformer parameters. + init_enc_alpha (float): Initial value of alpha in scaled pos encoding of the encoder. + init_dec_alpha (float): Initial value of alpha in scaled pos encoding of the decoder. """ assert check_argument_types() @@ -489,45 +429,21 @@ class FastSpeech2(nn.Layer): ) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]: """Calculate forward propagation. - Parameters - ---------- - text : Tensor(int64) - Batch of padded token ids (B, Tmax). - text_lengths : Tensor(int64) - Batch of lengths of each input (B,). - speech : Tensor - Batch of padded target features (B, Lmax, odim). - speech_lengths : Tensor(int64) - Batch of the lengths of each target (B,). - durations : Tensor(int64) - Batch of padded durations (B, Tmax). - pitch : Tensor - Batch of padded token-averaged pitch (B, Tmax, 1). - energy : Tensor - Batch of padded token-averaged energy (B, Tmax, 1). - tone_id : Tensor, optional(int64) - Batch of padded tone ids (B, Tmax). 
- spk_emb : Tensor, optional - Batch of speaker embeddings (B, spk_embed_dim). - spk_id : Tnesor, optional(int64) - Batch of speaker ids (B,) - - Returns - ---------- - Tensor - mel outs before postnet - Tensor - mel outs after postnet - Tensor - duration predictor's output - Tensor - pitch predictor's output - Tensor - energy predictor's output - Tensor - speech - Tensor - speech_lengths, modified if reduction_factor > 1 + Args: + text(Tensor(int64)): Batch of padded token ids (B, Tmax). + text_lengths(Tensor(int64)): Batch of lengths of each input (B,). + speech(Tensor): Batch of padded target features (B, Lmax, odim). + speech_lengths(Tensor(int64)): Batch of the lengths of each target (B,). + durations(Tensor(int64)): Batch of padded durations (B, Tmax). + pitch(Tensor): Batch of padded token-averaged pitch (B, Tmax, 1). + energy(Tensor): Batch of padded token-averaged energy (B, Tmax, 1). + tone_id(Tensor, optional(int64)): Batch of padded tone ids (B, Tmax). + spk_emb(Tensor, optional): Batch of speaker embeddings (B, spk_embed_dim). 
+ spk_id(Tnesor, optional(int64)): Batch of speaker ids (B,) + + Returns: + + """ # input of embedding must be int64 @@ -556,8 +472,7 @@ class FastSpeech2(nn.Layer): tone_id=tone_id) # modify mod part of groundtruth if self.reduction_factor > 1: - olens = paddle.to_tensor( - [olen - olen % self.reduction_factor for olen in olens.numpy()]) + olens = olens - olens % self.reduction_factor max_olen = max(olens) ys = ys[:, :max_olen] @@ -627,7 +542,7 @@ class FastSpeech2(nn.Layer): hs = hs + e_embs + p_embs # (B, Lmax, adim) - hs = self.length_regulator(hs, d_outs, alpha) + hs = self.length_regulator(hs, d_outs, alpha, is_inference=True) else: d_outs = self.duration_predictor(hs, d_masks) # use groundtruth in training @@ -638,7 +553,7 @@ class FastSpeech2(nn.Layer): hs = hs + e_embs + p_embs # (B, Lmax, adim) - hs = self.length_regulator(hs, ds) + hs = self.length_regulator(hs, ds, is_inference=False) # forward decoder if olens is not None and not is_inference: @@ -681,34 +596,22 @@ class FastSpeech2(nn.Layer): ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: """Generate the sequence of features given the sequences of characters. - Parameters - ---------- - text : Tensor(int64) - Input sequence of characters (T,). - speech : Tensor, optional - Feature sequence to extract style (N, idim). - durations : Tensor, optional (int64) - Groundtruth of duration (T,). - pitch : Tensor, optional - Groundtruth of token-averaged pitch (T, 1). - energy : Tensor, optional - Groundtruth of token-averaged energy (T, 1). - alpha : float, optional - Alpha to control the speed. - use_teacher_forcing : bool, optional - Whether to use teacher forcing. - If true, groundtruth of duration, pitch and energy will be used. - spk_emb : Tensor, optional - peaker embedding vector (spk_embed_dim,). - spk_id : Tensor, optional(int64) - Batch of padded spk ids (1,). - tone_id : Tensor, optional(int64) - Batch of padded tone ids (T,). 
- - Returns - ---------- - Tensor - Output sequence of features (L, odim). + Args: + text(Tensor(int64)): Input sequence of characters (T,). + speech(Tensor, optional): Feature sequence to extract style (N, idim). + durations(Tensor, optional (int64)): Groundtruth of duration (T,). + pitch(Tensor, optional): Groundtruth of token-averaged pitch (T, 1). + energy(Tensor, optional): Groundtruth of token-averaged energy (T, 1). + alpha(float, optional): Alpha to control the speed. + use_teacher_forcing(bool, optional): Whether to use teacher forcing. + If true, groundtruth of duration, pitch and energy will be used. + spk_emb(Tensor, optional, optional): peaker embedding vector (spk_embed_dim,). (Default value = None) + spk_id(Tensor, optional(int64), optional): Batch of padded spk ids (1,). (Default value = None) + tone_id(Tensor, optional(int64), optional): Batch of padded tone ids (T,). (Default value = None) + + Returns: + + """ # input of embedding must be int64 x = paddle.cast(text, 'int64') @@ -762,17 +665,13 @@ class FastSpeech2(nn.Layer): def _integrate_with_spk_embed(self, hs, spk_emb): """Integrate speaker embedding with hidden states. - Parameters - ---------- - hs : Tensor - Batch of hidden state sequences (B, Tmax, adim). - spk_emb : Tensor - Batch of speaker embeddings (B, spk_embed_dim). - - Returns - ---------- - Tensor - Batch of integrated hidden state sequences (B, Tmax, adim) + Args: + hs(Tensor): Batch of hidden state sequences (B, Tmax, adim). + spk_emb(Tensor): Batch of speaker embeddings (B, spk_embed_dim). 
+ + Returns: + + """ if self.spk_embed_integration_type == "add": # apply projection and then add to hidden states @@ -781,7 +680,7 @@ class FastSpeech2(nn.Layer): elif self.spk_embed_integration_type == "concat": # concat hidden states with spk embeds and then apply projection spk_emb = F.normalize(spk_emb).unsqueeze(1).expand( - shape=[-1, hs.shape[1], -1]) + shape=[-1, paddle.shape(hs)[1], -1]) hs = self.spk_projection(paddle.concat([hs, spk_emb], axis=-1)) else: raise NotImplementedError("support only add or concat.") @@ -791,17 +690,13 @@ class FastSpeech2(nn.Layer): def _integrate_with_tone_embed(self, hs, tone_embs): """Integrate speaker embedding with hidden states. - Parameters - ---------- - hs : Tensor - Batch of hidden state sequences (B, Tmax, adim). - tone_embs : Tensor - Batch of speaker embeddings (B, Tmax, tone_embed_dim). - - Returns - ---------- - Tensor - Batch of integrated hidden state sequences (B, Tmax, adim) + Args: + hs(Tensor): Batch of hidden state sequences (B, Tmax, adim). + tone_embs(Tensor): Batch of speaker embeddings (B, Tmax, tone_embed_dim). + + Returns: + + """ if self.tone_embed_integration_type == "add": # apply projection and then add to hidden states @@ -820,24 +715,17 @@ class FastSpeech2(nn.Layer): def _source_mask(self, ilens: paddle.Tensor) -> paddle.Tensor: """Make masks for self-attention. - Parameters - ---------- - ilens : Tensor - Batch of lengths (B,). - - Returns - ------- - Tensor - Mask tensor for self-attention. - dtype=paddle.bool + Args: + ilens(Tensor): Batch of lengths (B,). - Examples - ------- - >>> ilens = [5, 3] - >>> self._source_mask(ilens) - tensor([[[1, 1, 1, 1, 1], - [1, 1, 1, 0, 0]]]) bool + Returns: + Tensor: Mask tensor for self-attention. 
dtype=paddle.bool + Examples: + >>> ilens = [5, 3] + >>> self._source_mask(ilens) + tensor([[[1, 1, 1, 1, 1], + [1, 1, 1, 0, 0]]]) bool """ x_masks = make_non_pad_mask(ilens) return x_masks.unsqueeze(-2) @@ -911,34 +799,26 @@ class StyleFastSpeech2Inference(FastSpeech2Inference): spk_emb=None, spk_id=None): """ - Parameters - ---------- - text : Tensor(int64) - Input sequence of characters (T,). - speech : Tensor, optional - Feature sequence to extract style (N, idim). - durations : paddle.Tensor/np.ndarray, optional (int64) - Groundtruth of duration (T,), this will overwrite the set of durations_scale and durations_bias - durations_scale: int/float, optional - durations_bias: int/float, optional - pitch : paddle.Tensor/np.ndarray, optional - Groundtruth of token-averaged pitch (T, 1), this will overwrite the set of pitch_scale and pitch_bias - pitch_scale: int/float, optional - In denormed HZ domain. - pitch_bias: int/float, optional - In denormed HZ domain. - energy : paddle.Tensor/np.ndarray, optional - Groundtruth of token-averaged energy (T, 1), this will overwrite the set of energy_scale and energy_bias - energy_scale: int/float, optional - In denormed domain. - energy_bias: int/float, optional - In denormed domain. - robot : bool, optional - Weather output robot style - Returns - ---------- - Tensor - Output sequence of features (L, odim). + + Args: + text(Tensor(int64)): Input sequence of characters (T,). + speech(Tensor, optional): Feature sequence to extract style (N, idim). + durations(paddle.Tensor/np.ndarray, optional (int64)): Groundtruth of duration (T,), this will overwrite the set of durations_scale and durations_bias + durations_scale(int/float, optional): + durations_bias(int/float, optional): + pitch(paddle.Tensor/np.ndarray, optional): Groundtruth of token-averaged pitch (T, 1), this will overwrite the set of pitch_scale and pitch_bias + pitch_scale(int/float, optional): In denormed HZ domain. 
+ pitch_bias(int/float, optional): In denormed HZ domain. + energy(paddle.Tensor/np.ndarray, optional): Groundtruth of token-averaged energy (T, 1), this will overwrite the set of energy_scale and energy_bias + energy_scale(int/float, optional): In denormed domain. + energy_bias(int/float, optional): In denormed domain. + robot: bool: (Default value = False) + spk_emb: (Default value = None) + spk_id: (Default value = None) + + Returns: + Tensor: logmel + """ normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference( text, @@ -1012,13 +892,9 @@ class FastSpeech2Loss(nn.Layer): def __init__(self, use_masking: bool=True, use_weighted_masking: bool=False): """Initialize feed-forward Transformer loss module. - - Parameters - ---------- - use_masking : bool - Whether to apply masking for padded part in loss calculation. - use_weighted_masking : bool - Whether to weighted masking in loss calculation. + Args: + use_masking (bool): Whether to apply masking for padded part in loss calculation. + use_weighted_masking (bool): Whether to weighted masking in loss calculation. """ assert check_argument_types() super().__init__() @@ -1049,42 +925,22 @@ class FastSpeech2Loss(nn.Layer): ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]: """Calculate forward propagation. - Parameters - ---------- - after_outs : Tensor - Batch of outputs after postnets (B, Lmax, odim). - before_outs : Tensor - Batch of outputs before postnets (B, Lmax, odim). - d_outs : Tensor - Batch of outputs of duration predictor (B, Tmax). - p_outs : Tensor - Batch of outputs of pitch predictor (B, Tmax, 1). - e_outs : Tensor - Batch of outputs of energy predictor (B, Tmax, 1). - ys : Tensor - Batch of target features (B, Lmax, odim). - ds : Tensor - Batch of durations (B, Tmax). - ps : Tensor - Batch of target token-averaged pitch (B, Tmax, 1). - es : Tensor - Batch of target token-averaged energy (B, Tmax, 1). - ilens : Tensor - Batch of the lengths of each input (B,). 
- olens : Tensor - Batch of the lengths of each target (B,). - - Returns - ---------- - Tensor - L1 loss value. - Tensor - Duration predictor loss value. - Tensor - Pitch predictor loss value. - Tensor - Energy predictor loss value. - + Args: + after_outs(Tensor): Batch of outputs after postnets (B, Lmax, odim). + before_outs(Tensor): Batch of outputs before postnets (B, Lmax, odim). + d_outs(Tensor): Batch of outputs of duration predictor (B, Tmax). + p_outs(Tensor): Batch of outputs of pitch predictor (B, Tmax, 1). + e_outs(Tensor): Batch of outputs of energy predictor (B, Tmax, 1). + ys(Tensor): Batch of target features (B, Lmax, odim). + ds(Tensor): Batch of durations (B, Tmax). + ps(Tensor): Batch of target token-averaged pitch (B, Tmax, 1). + es(Tensor): Batch of target token-averaged energy (B, Tmax, 1). + ilens(Tensor): Batch of the lengths of each input (B,). + olens(Tensor): Batch of the lengths of each target (B,). + + Returns: + + """ # apply mask to remove padded part if self.use_masking: diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py index 0dabf934..92aa9dfc 100644 --- a/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py +++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py @@ -12,8 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import logging +from pathlib import Path from paddle import distributed as dist +from paddle.io import DataLoader +from paddle.nn import Layer +from paddle.optimizer import Optimizer from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Loss from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator @@ -28,20 +32,17 @@ logger.setLevel(logging.INFO) class FastSpeech2Updater(StandardUpdater): def __init__(self, - model, - optimizer, - dataloader, + model: Layer, + optimizer: Optimizer, + dataloader: DataLoader, init_state=None, - use_masking=False, - use_weighted_masking=False, - output_dir=None): + use_masking: bool=False, + use_weighted_masking: bool=False, + output_dir: Path=None): super().__init__(model, optimizer, dataloader, init_state=None) - self.use_masking = use_masking - self.use_weighted_masking = use_weighted_masking self.criterion = FastSpeech2Loss( - use_masking=self.use_masking, - use_weighted_masking=self.use_weighted_masking) + use_masking=use_masking, use_weighted_masking=use_weighted_masking) log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) self.filehandler = logging.FileHandler(str(log_file)) @@ -107,14 +108,12 @@ class FastSpeech2Updater(StandardUpdater): class FastSpeech2Evaluator(StandardEvaluator): def __init__(self, - model, - dataloader, - use_masking=False, - use_weighted_masking=False, - output_dir=None): + model: Layer, + dataloader: DataLoader, + use_masking: bool=False, + use_weighted_masking: bool=False, + output_dir: Path=None): super().__init__(model, dataloader) - self.use_masking = use_masking - self.use_weighted_masking = use_weighted_masking log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) self.filehandler = logging.FileHandler(str(log_file)) @@ -123,8 +122,7 @@ class FastSpeech2Evaluator(StandardEvaluator): self.msg = "" self.criterion = FastSpeech2Loss( - use_masking=self.use_masking, - use_weighted_masking=self.use_weighted_masking) + use_masking=use_masking, 
use_weighted_masking=use_weighted_masking) def evaluate_core(self, batch): self.msg = "Evaluate: " diff --git a/paddlespeech/t2s/models/hifigan/hifigan.py b/paddlespeech/t2s/models/hifigan/hifigan.py index 82dd66c1..116376ec 100644 --- a/paddlespeech/t2s/models/hifigan/hifigan.py +++ b/paddlespeech/t2s/models/hifigan/hifigan.py @@ -37,35 +37,21 @@ class HiFiGANGenerator(nn.Layer): use_weight_norm: bool=True, init_type: str="xavier_uniform", ): """Initialize HiFiGANGenerator module. - Parameters - ---------- - in_channels : int - Number of input channels. - out_channels : int - Number of output channels. - channels : int - Number of hidden representation channels. - kernel_size : int - Kernel size of initial and final conv layer. - upsample_scales : list - List of upsampling scales. - upsample_kernel_sizes : list - List of kernel sizes for upsampling layers. - resblock_kernel_sizes : list - List of kernel sizes for residual blocks. - resblock_dilations : list - List of dilation list for residual blocks. - use_additional_convs : bool - Whether to use additional conv layers in residual blocks. - bias : bool - Whether to add bias parameter in convolution layers. - nonlinear_activation : str - Activation function module name. - nonlinear_activation_params : dict - Hyperparameters for activation function. - use_weight_norm : bool - Whether to use weight norm. - If set to true, it will be applied to all of the conv layers. + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + channels (int): Number of hidden representation channels. + kernel_size (int): Kernel size of initial and final conv layer. + upsample_scales (list): List of upsampling scales. + upsample_kernel_sizes (list): List of kernel sizes for upsampling layers. + resblock_kernel_sizes (list): List of kernel sizes for residual blocks. + resblock_dilations (list): List of dilation list for residual blocks. 
+ use_additional_convs (bool): Whether to use additional conv layers in residual blocks. + bias (bool): Whether to add bias parameter in convolution layers. + nonlinear_activation (str): Activation function module name. + nonlinear_activation_params (dict): Hyperparameters for activation function. + use_weight_norm (bool): Whether to use weight norm. + If set to true, it will be applied to all of the conv layers. """ super().__init__() @@ -134,14 +120,11 @@ class HiFiGANGenerator(nn.Layer): def forward(self, c): """Calculate forward propagation. - Parameters - ---------- - c : Tensor - Input tensor (B, in_channels, T). - Returns - ---------- - Tensor - Output tensor (B, out_channels, T). + + Args: + c (Tensor): Input tensor (B, in_channels, T). + Returns: + Tensor: Output tensor (B, out_channels, T). """ c = self.input_conv(c) for i in range(self.num_upsamples): @@ -196,15 +179,12 @@ class HiFiGANGenerator(nn.Layer): def inference(self, c): """Perform inference. - Parameters - ---------- - c : Tensor - Input tensor (T, in_channels). - normalize_before (bool): Whether to perform normalization. - Returns - ---------- - Tensor - Output tensor (T ** prod(upsample_scales), out_channels). + Args: + c (Tensor): Input tensor (T, in_channels). + normalize_before (bool): Whether to perform normalization. + Returns: + Tensor: + Output tensor (T ** prod(upsample_scales), out_channels). """ c = self.forward(c.transpose([1, 0]).unsqueeze(0)) return c.squeeze(0).transpose([1, 0]) @@ -229,36 +209,23 @@ class HiFiGANPeriodDiscriminator(nn.Layer): use_spectral_norm: bool=False, init_type: str="xavier_uniform", ): """Initialize HiFiGANPeriodDiscriminator module. - Parameters - ---------- - in_channels : int - Number of input channels. - out_channels : int - Number of output channels. - period : int - Period. - kernel_sizes : list - Kernel sizes of initial conv layers and the final conv layer. - channels : int - Number of initial channels. 
- downsample_scales : list - List of downsampling scales. - max_downsample_channels : int - Number of maximum downsampling channels. - use_additional_convs : bool - Whether to use additional conv layers in residual blocks. - bias : bool - Whether to add bias parameter in convolution layers. - nonlinear_activation : str - Activation function module name. - nonlinear_activation_params : dict - Hyperparameters for activation function. - use_weight_norm : bool - Whether to use weight norm. - If set to true, it will be applied to all of the conv layers. - use_spectral_norm : bool - Whether to use spectral norm. - If set to true, it will be applied to all of the conv layers. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + period (int): Period. + kernel_sizes (list): Kernel sizes of initial conv layers and the final conv layer. + channels (int): Number of initial channels. + downsample_scales (list): List of downsampling scales. + max_downsample_channels (int): Number of maximum downsampling channels. + use_additional_convs (bool): Whether to use additional conv layers in residual blocks. + bias (bool): Whether to add bias parameter in convolution layers. + nonlinear_activation (str): Activation function module name. + nonlinear_activation_params (dict): Hyperparameters for activation function. + use_weight_norm (bool): Whether to use weight norm. + If set to true, it will be applied to all of the conv layers. + use_spectral_norm (bool): Whether to use spectral norm. + If set to true, it will be applied to all of the conv layers. """ super().__init__() @@ -307,14 +274,11 @@ class HiFiGANPeriodDiscriminator(nn.Layer): def forward(self, x): """Calculate forward propagation. - Parameters - ---------- - c : Tensor - Input tensor (B, in_channels, T). - Returns - ---------- - list - List of each layer's tensors. + + Args: + c (Tensor): Input tensor (B, in_channels, T). + Returns: + list: List of each layer's tensors. 
""" # transform 1d to 2d -> (B, C, T/P, P) b, c, t = paddle.shape(x) @@ -379,13 +343,11 @@ class HiFiGANMultiPeriodDiscriminator(nn.Layer): }, init_type: str="xavier_uniform", ): """Initialize HiFiGANMultiPeriodDiscriminator module. - Parameters - ---------- - periods : list - List of periods. - discriminator_params : dict - Parameters for hifi-gan period discriminator module. - The period parameter will be overwritten. + + Args: + periods (list): List of periods. + discriminator_params (dict): Parameters for hifi-gan period discriminator module. + The period parameter will be overwritten. """ super().__init__() # initialize parameters @@ -399,14 +361,11 @@ class HiFiGANMultiPeriodDiscriminator(nn.Layer): def forward(self, x): """Calculate forward propagation. - Parameters - ---------- - x : Tensor - Input noise signal (B, 1, T). - Returns - ---------- - List - List of list of each discriminator outputs, which consists of each layer output tensors. + + Args: + x (Tensor): Input noise signal (B, 1, T). + Returns: + List: List of list of each discriminator outputs, which consists of each layer output tensors. """ outs = [] for f in self.discriminators: @@ -434,33 +393,22 @@ class HiFiGANScaleDiscriminator(nn.Layer): use_spectral_norm: bool=False, init_type: str="xavier_uniform", ): """Initilize HiFiGAN scale discriminator module. - Parameters - ---------- - in_channels : int - Number of input channels. - out_channels : int - Number of output channels. - kernel_sizes : list - List of four kernel sizes. The first will be used for the first conv layer, - and the second is for downsampling part, and the remaining two are for output layers. - channels : int - Initial number of channels for conv layer. - max_downsample_channels : int - Maximum number of channels for downsampling layers. - bias : bool - Whether to add bias parameter in convolution layers. - downsample_scales : list - List of downsampling scales. 
- nonlinear_activation : str - Activation function module name. - nonlinear_activation_params : dict - Hyperparameters for activation function. - use_weight_norm : bool - Whether to use weight norm. - If set to true, it will be applied to all of the conv layers. - use_spectral_norm : bool - Whether to use spectral norm. - If set to true, it will be applied to all of the conv layers. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + kernel_sizes (list): List of four kernel sizes. The first will be used for the first conv layer, + and the second is for downsampling part, and the remaining two are for output layers. + channels (int): Initial number of channels for conv layer. + max_downsample_channels (int): Maximum number of channels for downsampling layers. + bias (bool): Whether to add bias parameter in convolution layers. + downsample_scales (list): List of downsampling scales. + nonlinear_activation (str): Activation function module name. + nonlinear_activation_params (dict): Hyperparameters for activation function. + use_weight_norm (bool): Whether to use weight norm. + If set to true, it will be applied to all of the conv layers. + use_spectral_norm (bool): Whether to use spectral norm. + If set to true, it will be applied to all of the conv layers. """ super().__init__() @@ -546,14 +494,11 @@ class HiFiGANScaleDiscriminator(nn.Layer): def forward(self, x): """Calculate forward propagation. - Parameters - ---------- - x : Tensor - Input noise signal (B, 1, T). - Returns - ---------- - List - List of output tensors of each layer. + + Args: + x (Tensor): Input noise signal (B, 1, T). + Returns: + List: List of output tensors of each layer. """ outs = [] for f in self.layers: @@ -613,20 +558,14 @@ class HiFiGANMultiScaleDiscriminator(nn.Layer): follow_official_norm: bool=False, init_type: str="xavier_uniform", ): """Initilize HiFiGAN multi-scale discriminator module. 
- Parameters - ---------- - scales : int - Number of multi-scales. - downsample_pooling : str - Pooling module name for downsampling of the inputs. - downsample_pooling_params : dict - Parameters for the above pooling module. - discriminator_params : dict - Parameters for hifi-gan scale discriminator module. - follow_official_norm : bool - Whether to follow the norm setting of the official - implementaion. The first discriminator uses spectral norm and the other - discriminators use weight norm. + + Args: + scales (int): Number of multi-scales. + downsample_pooling (str): Pooling module name for downsampling of the inputs. + downsample_pooling_params (dict): Parameters for the above pooling module. + discriminator_params (dict): Parameters for hifi-gan scale discriminator module. + follow_official_norm (bool): Whether to follow the norm setting of the official + implementaion. The first discriminator uses spectral norm and the other discriminators use weight norm. """ super().__init__() @@ -651,14 +590,11 @@ class HiFiGANMultiScaleDiscriminator(nn.Layer): def forward(self, x): """Calculate forward propagation. - Parameters - ---------- - x : Tensor - Input noise signal (B, 1, T). - Returns - ---------- - List - List of list of each discriminator outputs, which consists of each layer output tensors. + + Args: + x (Tensor): Input noise signal (B, 1, T). + Returns: + List: List of list of each discriminator outputs, which consists of each layer output tensors. """ outs = [] for f in self.discriminators: @@ -715,24 +651,17 @@ class HiFiGANMultiScaleMultiPeriodDiscriminator(nn.Layer): }, init_type: str="xavier_uniform", ): """Initilize HiFiGAN multi-scale + multi-period discriminator module. - Parameters - ---------- - scales : int - Number of multi-scales. - scale_downsample_pooling : str - Pooling module name for downsampling of the inputs. - scale_downsample_pooling_params : dict - Parameters for the above pooling module. 
- scale_discriminator_params : dict - Parameters for hifi-gan scale discriminator module. - follow_official_norm : bool): Whether to follow the norm setting of the official - implementaion. The first discriminator uses spectral norm and the other - discriminators use weight norm. - periods : list - List of periods. - period_discriminator_params : dict - Parameters for hifi-gan period discriminator module. - The period parameter will be overwritten. + + Args: + scales (int): Number of multi-scales. + scale_downsample_pooling (str): Pooling module name for downsampling of the inputs. + scale_downsample_pooling_params (dict): Parameters for the above pooling module. + scale_discriminator_params (dict): Parameters for hifi-gan scale discriminator module. + follow_official_norm (bool): Whether to follow the norm setting of the official implementaion. + The first discriminator uses spectral norm and the other discriminators use weight norm. + periods (list): List of periods. + period_discriminator_params (dict): Parameters for hifi-gan period discriminator module. + The period parameter will be overwritten. """ super().__init__() @@ -751,16 +680,14 @@ class HiFiGANMultiScaleMultiPeriodDiscriminator(nn.Layer): def forward(self, x): """Calculate forward propagation. - Parameters - ---------- - x : Tensor - Input noise signal (B, 1, T). - Returns - ---------- - List: - List of list of each discriminator outputs, - which consists of each layer output tensors. - Multi scale and multi period ones are concatenated. + + Args: + x (Tensor): Input noise signal (B, 1, T). + Returns: + List: + List of list of each discriminator outputs, + which consists of each layer output tensors. + Multi scale and multi period ones are concatenated. 
""" msd_outs = self.msd(x) mpd_outs = self.mpd(x) diff --git a/paddlespeech/t2s/models/melgan/melgan.py b/paddlespeech/t2s/models/melgan/melgan.py index 3e90b691..6a139659 100644 --- a/paddlespeech/t2s/models/melgan/melgan.py +++ b/paddlespeech/t2s/models/melgan/melgan.py @@ -51,41 +51,26 @@ class MelGANGenerator(nn.Layer): use_causal_conv: bool=False, init_type: str="xavier_uniform", ): """Initialize MelGANGenerator module. - Parameters - ---------- - in_channels : int - Number of input channels. - out_channels : int - Number of output channels, - the number of sub-band is out_channels in multi-band melgan. - kernel_size : int - Kernel size of initial and final conv layer. - channels : int - Initial number of channels for conv layer. - bias : bool - Whether to add bias parameter in convolution layers. - upsample_scales : List[int] - List of upsampling scales. - stack_kernel_size : int - Kernel size of dilated conv layers in residual stack. - stacks : int - Number of stacks in a single residual stack. - nonlinear_activation : Optional[str], optional - Non linear activation in upsample network, by default None - nonlinear_activation_params : Dict[str, Any], optional - Parameters passed to the linear activation in the upsample network, - by default {} - pad : str - Padding function module name before dilated convolution layer. - pad_params : dict - Hyperparameters for padding function. - use_final_nonlinear_activation : nn.Layer - Activation function for the final layer. - use_weight_norm : bool - Whether to use weight norm. - If set to true, it will be applied to all of the conv layers. - use_causal_conv : bool - Whether to use causal convolution. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels, + the number of sub-band is out_channels in multi-band melgan. + kernel_size (int): Kernel size of initial and final conv layer. + channels (int): Initial number of channels for conv layer. 
+ bias (bool): Whether to add bias parameter in convolution layers. + upsample_scales (List[int]): List of upsampling scales. + stack_kernel_size (int): Kernel size of dilated conv layers in residual stack. + stacks (int): Number of stacks in a single residual stack. + nonlinear_activation (Optional[str], optional): Non linear activation in upsample network, by default None + nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to the linear activation in the upsample network, + by default {} + pad (str): Padding function module name before dilated convolution layer. + pad_params (dict): Hyperparameters for padding function. + use_final_nonlinear_activation (nn.Layer): Activation function for the final layer. + use_weight_norm (bool): Whether to use weight norm. + If set to true, it will be applied to all of the conv layers. + use_causal_conv (bool): Whether to use causal convolution. """ super().__init__() @@ -207,14 +192,11 @@ class MelGANGenerator(nn.Layer): def forward(self, c): """Calculate forward propagation. - Parameters - ---------- - c : Tensor - Input tensor (B, in_channels, T). - Returns - ---------- - Tensor - Output tensor (B, out_channels, T ** prod(upsample_scales)). + + Args: + c (Tensor): Input tensor (B, in_channels, T). + Returns: + Tensor: Output tensor (B, out_channels, T ** prod(upsample_scales)). """ out = self.melgan(c) return out @@ -260,14 +242,11 @@ class MelGANGenerator(nn.Layer): def inference(self, c): """Perform inference. - Parameters - ---------- - c : Union[Tensor, ndarray] - Input tensor (T, in_channels). - Returns - ---------- - Tensor - Output tensor (out_channels*T ** prod(upsample_scales), 1). + + Args: + c (Union[Tensor, ndarray]): Input tensor (T, in_channels). + Returns: + Tensor: Output tensor (out_channels*T ** prod(upsample_scales), 1). 
""" # pseudo batch c = c.transpose([1, 0]).unsqueeze(0) @@ -298,33 +277,22 @@ class MelGANDiscriminator(nn.Layer): pad_params: Dict[str, Any]={"mode": "reflect"}, init_type: str="xavier_uniform", ): """Initilize MelGAN discriminator module. - Parameters - ---------- - in_channels : int - Number of input channels. - out_channels : int - Number of output channels. - kernel_sizes : List[int] - List of two kernel sizes. The prod will be used for the first conv layer, - and the first and the second kernel sizes will be used for the last two layers. - For example if kernel_sizes = [5, 3], the first layer kernel size will be 5 * 3 = 15, - the last two layers' kernel size will be 5 and 3, respectively. - channels : int - Initial number of channels for conv layer. - max_downsample_channels : int - Maximum number of channels for downsampling layers. - bias : bool - Whether to add bias parameter in convolution layers. - downsample_scales : List[int] - List of downsampling scales. - nonlinear_activation : str - Activation function module name. - nonlinear_activation_params : dict - Hyperparameters for activation function. - pad : str - Padding function module name before dilated convolution layer. - pad_params : dict - Hyperparameters for padding function. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + kernel_sizes (List[int]): List of two kernel sizes. The prod will be used for the first conv layer, + and the first and the second kernel sizes will be used for the last two layers. + For example if kernel_sizes = [5, 3], the first layer kernel size will be 5 * 3 = 15, + the last two layers' kernel size will be 5 and 3, respectively. + channels (int): Initial number of channels for conv layer. + max_downsample_channels (int): Maximum number of channels for downsampling layers. + bias (bool): Whether to add bias parameter in convolution layers. + downsample_scales (List[int]): List of downsampling scales. 
+ nonlinear_activation (str): Activation function module name. + nonlinear_activation_params (dict): Hyperparameters for activation function. + pad (str): Padding function module name before dilated convolution layer. + pad_params (dict): Hyperparameters for padding function. """ super().__init__() @@ -395,14 +363,10 @@ class MelGANDiscriminator(nn.Layer): def forward(self, x): """Calculate forward propagation. - Parameters - ---------- - x : Tensor - Input noise signal (B, 1, T). - Returns - ---------- - List - List of output tensors of each layer (for feat_match_loss). + Args: + x (Tensor): Input noise signal (B, 1, T). + Returns: + List: List of output tensors of each layer (for feat_match_loss). """ outs = [] for f in self.layers: @@ -440,39 +404,24 @@ class MelGANMultiScaleDiscriminator(nn.Layer): use_weight_norm: bool=True, init_type: str="xavier_uniform", ): """Initilize MelGAN multi-scale discriminator module. - Parameters - ---------- - in_channels : int - Number of input channels. - out_channels : int - Number of output channels. - scales : int - Number of multi-scales. - downsample_pooling : str - Pooling module name for downsampling of the inputs. - downsample_pooling_params : dict - Parameters for the above pooling module. - kernel_sizes : List[int] - List of two kernel sizes. The sum will be used for the first conv layer, - and the first and the second kernel sizes will be used for the last two layers. - channels : int - Initial number of channels for conv layer. - max_downsample_channels : int - Maximum number of channels for downsampling layers. - bias : bool - Whether to add bias parameter in convolution layers. - downsample_scales : List[int] - List of downsampling scales. - nonlinear_activation : str - Activation function module name. - nonlinear_activation_params : dict - Hyperparameters for activation function. - pad : str - Padding function module name before dilated convolution layer. 
- pad_params : dict - Hyperparameters for padding function. - use_causal_conv : bool - Whether to use causal convolution. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + scales (int): Number of multi-scales. + downsample_pooling (str): Pooling module name for downsampling of the inputs. + downsample_pooling_params (dict): Parameters for the above pooling module. + kernel_sizes (List[int]): List of two kernel sizes. The sum will be used for the first conv layer, + and the first and the second kernel sizes will be used for the last two layers. + channels (int): Initial number of channels for conv layer. + max_downsample_channels (int): Maximum number of channels for downsampling layers. + bias (bool): Whether to add bias parameter in convolution layers. + downsample_scales (List[int]): List of downsampling scales. + nonlinear_activation (str): Activation function module name. + nonlinear_activation_params (dict): Hyperparameters for activation function. + pad (str): Padding function module name before dilated convolution layer. + pad_params (dict): Hyperparameters for padding function. + use_causal_conv (bool): Whether to use causal convolution. """ super().__init__() @@ -514,14 +463,10 @@ class MelGANMultiScaleDiscriminator(nn.Layer): def forward(self, x): """Calculate forward propagation. - Parameters - ---------- - x : Tensor - Input noise signal (B, 1, T). - Returns - ---------- - List - List of list of each discriminator outputs, which consists of each layer output tensors. + Args: + x (Tensor): Input noise signal (B, 1, T). + Returns: + List: List of list of each discriminator outputs, which consists of each layer output tensors. 
""" outs = [] for f in self.discriminators: diff --git a/paddlespeech/t2s/models/melgan/style_melgan.py b/paddlespeech/t2s/models/melgan/style_melgan.py index bd451e1f..40a2f100 100644 --- a/paddlespeech/t2s/models/melgan/style_melgan.py +++ b/paddlespeech/t2s/models/melgan/style_melgan.py @@ -52,37 +52,23 @@ class StyleMelGANGenerator(nn.Layer): use_weight_norm: bool=True, init_type: str="xavier_uniform", ): """Initilize Style MelGAN generator. - Parameters - ---------- - in_channels : int - Number of input noise channels. - aux_channels : int - Number of auxiliary input channels. - channels : int - Number of channels for conv layer. - out_channels : int - Number of output channels. - kernel_size : int - Kernel size of conv layers. - dilation : int - Dilation factor for conv layers. - bias : bool - Whether to add bias parameter in convolution layers. - noise_upsample_scales : list - List of noise upsampling scales. - noise_upsample_activation : str - Activation function module name for noise upsampling. - noise_upsample_activation_params : dict - Hyperparameters for the above activation function. - upsample_scales : list - List of upsampling scales. - upsample_mode : str - Upsampling mode in TADE layer. - gated_function : str - Gated function in TADEResBlock ("softmax" or "sigmoid"). - use_weight_norm : bool - Whether to use weight norm. - If set to true, it will be applied to all of the conv layers. + + Args: + in_channels (int): Number of input noise channels. + aux_channels (int): Number of auxiliary input channels. + channels (int): Number of channels for conv layer. + out_channels (int): Number of output channels. + kernel_size (int): Kernel size of conv layers. + dilation (int): Dilation factor for conv layers. + bias (bool): Whether to add bias parameter in convolution layers. + noise_upsample_scales (list): List of noise upsampling scales. + noise_upsample_activation (str): Activation function module name for noise upsampling. 
+ noise_upsample_activation_params (dict): Hyperparameters for the above activation function. + upsample_scales (list): List of upsampling scales. + upsample_mode (str): Upsampling mode in TADE layer. + gated_function (str): Gated function in TADEResBlock ("softmax" or "sigmoid"). + use_weight_norm (bool): Whether to use weight norm. + If set to true, it will be applied to all of the conv layers. """ super().__init__() @@ -147,16 +133,12 @@ class StyleMelGANGenerator(nn.Layer): def forward(self, c, z=None): """Calculate forward propagation. - Parameters - ---------- - c : Tensor - Auxiliary input tensor (B, channels, T). - z : Tensor - Input noise tensor (B, in_channels, 1). - Returns - ---------- - Tensor - Output tensor (B, out_channels, T ** prod(upsample_scales)). + + Args: + c (Tensor): Auxiliary input tensor (B, channels, T). + z (Tensor): Input noise tensor (B, in_channels, 1). + Returns: + Tensor: Output tensor (B, out_channels, T ** prod(upsample_scales)). """ # batch_max_steps(24000) == noise_upsample_factor(80) * upsample_factor(300) if z is None: @@ -211,14 +193,10 @@ class StyleMelGANGenerator(nn.Layer): def inference(self, c): """Perform inference. - Parameters - ---------- - c : Tensor - Input tensor (T, in_channels). - Returns - ---------- - Tensor - Output tensor (T ** prod(upsample_scales), out_channels). + Args: + c (Tensor): Input tensor (T, in_channels). + Returns: + Tensor: Output tensor (T ** prod(upsample_scales), out_channels). """ # (1, in_channels, T) c = c.transpose([1, 0]).unsqueeze(0) @@ -278,18 +256,13 @@ class StyleMelGANDiscriminator(nn.Layer): use_weight_norm: bool=True, init_type: str="xavier_uniform", ): """Initilize Style MelGAN discriminator. - Parameters - ---------- - repeats : int - Number of repititons to apply RWD. - window_sizes : list - List of random window sizes. - pqmf_params : list - List of list of Parameters for PQMF modules - discriminator_params : dict - Parameters for base discriminator module. 
- use_weight_nom : bool - Whether to apply weight normalization. + + Args: + repeats (int): Number of repetitions to apply RWD. + window_sizes (list): List of random window sizes. + pqmf_params (list): List of list of Parameters for PQMF modules + discriminator_params (dict): Parameters for base discriminator module. + use_weight_norm (bool): Whether to apply weight normalization. """ super().__init__() @@ -325,15 +298,11 @@ class StyleMelGANDiscriminator(nn.Layer): def forward(self, x): """Calculate forward propagation. - Parameters - ---------- - x : Tensor - Input tensor (B, 1, T). - Returns - ---------- - List - List of discriminator outputs, #items in the list will be - equal to repeats * #discriminators. + Args: + x (Tensor): Input tensor (B, 1, T). + Returns: + List: List of discriminator outputs, #items in the list will be + equal to repeats * #discriminators. """ outs = [] for _ in range(self.repeats): diff --git a/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan.py b/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan.py index 9eff4497..cc8460e4 100644 --- a/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan.py +++ b/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan.py @@ -31,51 +31,30 @@ from paddlespeech.t2s.modules.upsample import ConvInUpsampleNet class PWGGenerator(nn.Layer): """Wave Generator for Parallel WaveGAN - Parameters - ---------- - in_channels : int, optional - Number of channels of the input waveform, by default 1 - out_channels : int, optional - Number of channels of the output waveform, by default 1 - kernel_size : int, optional - Kernel size of the residual blocks inside, by default 3 - layers : int, optional - Number of residual blocks inside, by default 30 - stacks : int, optional - The number of groups to split the residual blocks into, by default 3 - Within each group, the dilation of the residual block grows - exponentially. 
- residual_channels : int, optional - Residual channel of the residual blocks, by default 64 - gate_channels : int, optional - Gate channel of the residual blocks, by default 128 - skip_channels : int, optional - Skip channel of the residual blocks, by default 64 - aux_channels : int, optional - Auxiliary channel of the residual blocks, by default 80 - aux_context_window : int, optional - The context window size of the first convolution applied to the - auxiliary input, by default 2 - dropout : float, optional - Dropout of the residual blocks, by default 0. - bias : bool, optional - Whether to use bias in residual blocks, by default True - use_weight_norm : bool, optional - Whether to use weight norm in all convolutions, by default True - use_causal_conv : bool, optional - Whether to use causal padding in the upsample network and residual - blocks, by default False - upsample_scales : List[int], optional - Upsample scales of the upsample network, by default [4, 4, 4, 4] - nonlinear_activation : Optional[str], optional - Non linear activation in upsample network, by default None - nonlinear_activation_params : Dict[str, Any], optional - Parameters passed to the linear activation in the upsample network, - by default {} - interpolate_mode : str, optional - Interpolation mode of the upsample network, by default "nearest" - freq_axis_kernel_size : int, optional - Kernel size along the frequency axis of the upsample network, by default 1 + Args: + in_channels (int, optional): Number of channels of the input waveform, by default 1 + out_channels (int, optional): Number of channels of the output waveform, by default 1 + kernel_size (int, optional): Kernel size of the residual blocks inside, by default 3 + layers (int, optional): Number of residual blocks inside, by default 30 + stacks (int, optional): The number of groups to split the residual blocks into, by default 3 + Within each group, the dilation of the residual block grows exponentially. 
+ residual_channels (int, optional): Residual channel of the residual blocks, by default 64 + gate_channels (int, optional): Gate channel of the residual blocks, by default 128 + skip_channels (int, optional): Skip channel of the residual blocks, by default 64 + aux_channels (int, optional): Auxiliary channel of the residual blocks, by default 80 + aux_context_window (int, optional): The context window size of the first convolution applied to the + auxiliary input, by default 2 + dropout (float, optional): Dropout of the residual blocks, by default 0. + bias (bool, optional): Whether to use bias in residual blocks, by default True + use_weight_norm (bool, optional): Whether to use weight norm in all convolutions, by default True + use_causal_conv (bool, optional): Whether to use causal padding in the upsample network and residual + blocks, by default False + upsample_scales (List[int], optional): Upsample scales of the upsample network, by default [4, 4, 4, 4] + nonlinear_activation (Optional[str], optional): Nonlinear activation in upsample network, by default None + nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to the nonlinear activation in the upsample network, + by default {} + interpolate_mode (str, optional): Interpolation mode of the upsample network, by default "nearest" + freq_axis_kernel_size (int, optional): Kernel size along the frequency axis of the upsample network, by default 1 """ def __init__( @@ -167,18 +146,13 @@ class PWGGenerator(nn.Layer): def forward(self, x, c): """Generate waveform. - Parameters - ---------- - x : Tensor - Shape (N, C_in, T), The input waveform. - c : Tensor - Shape (N, C_aux, T'). The auxiliary input (e.g. spectrogram). It + Args: + x(Tensor): Shape (N, C_in, T), The input waveform. + c(Tensor): Shape (N, C_aux, T'). The auxiliary input (e.g. spectrogram). It is upsampled to match the time resolution of the input. - Returns - ------- - Tensor - Shape (N, C_out, T), the generated waveform. 
+ Returns: + Tensor: Shape (N, C_out, T), the generated waveform. """ c = self.upsample_net(c) assert c.shape[-1] == x.shape[-1] @@ -218,19 +192,14 @@ class PWGGenerator(nn.Layer): self.apply(_remove_weight_norm) def inference(self, c=None): - """Waveform generation. This function is used for single instance - inference. - Parameters - ---------- - c : Tensor, optional - Shape (T', C_aux), the auxiliary input, by default None - x : Tensor, optional - Shape (T, C_in), the noise waveform, by default None - If not provided, a sample is drawn from a gaussian distribution. - Returns - ------- - Tensor - Shape (T, C_out), the generated waveform + """Waveform generation. This function is used for single instance inference. + + Args: + c(Tensor, optional): Shape (T', C_aux), the auxiliary input, by default None + x(Tensor, optional): Shape (T, C_in), the noise waveform, by default None + + Returns: + Tensor: Shape (T, C_out), the generated waveform """ # when to static, can not input x, see https://github.com/PaddlePaddle/Parakeet/pull/132/files x = paddle.randn( @@ -244,32 +213,21 @@ class PWGGenerator(nn.Layer): class PWGDiscriminator(nn.Layer): """A convolutional discriminator for audio. 
- Parameters - ---------- - in_channels : int, optional - Number of channels of the input audio, by default 1 - out_channels : int, optional - Output feature size, by default 1 - kernel_size : int, optional - Kernel size of convolutional sublayers, by default 3 - layers : int, optional - Number of layers, by default 10 - conv_channels : int, optional - Feature size of the convolutional sublayers, by default 64 - dilation_factor : int, optional - The factor with which dilation of each convolutional sublayers grows - exponentially if it is greater than 1, else the dilation of each - convolutional sublayers grows linearly, by default 1 - nonlinear_activation : str, optional - The activation after each convolutional sublayer, by default "leakyrelu" - nonlinear_activation_params : Dict[str, Any], optional - The parameters passed to the activation's initializer, by default - {"negative_slope": 0.2} - bias : bool, optional - Whether to use bias in convolutional sublayers, by default True - use_weight_norm : bool, optional - Whether to use weight normalization at all convolutional sublayers, - by default True + Args: + in_channels (int, optional): Number of channels of the input audio, by default 1 + out_channels (int, optional): Output feature size, by default 1 + kernel_size (int, optional): Kernel size of convolutional sublayers, by default 3 + layers (int, optional): Number of layers, by default 10 + conv_channels (int, optional): Feature size of the convolutional sublayers, by default 64 + dilation_factor (int, optional): The factor with which dilation of each convolutional sublayers grows + exponentially if it is greater than 1, else the dilation of each convolutional sublayers grows linearly, + by default 1 + nonlinear_activation (str, optional): The activation after each convolutional sublayer, by default "leakyrelu" + nonlinear_activation_params (Dict[str, Any], optional): The parameters passed to the activation's initializer, by default + {"negative_slope": 0.2} 
+ bias (bool, optional): Whether to use bias in convolutional sublayers, by default True + use_weight_norm (bool, optional): Whether to use weight normalization at all convolutional sublayers, + by default True """ def __init__( @@ -330,15 +288,12 @@ class PWGDiscriminator(nn.Layer): def forward(self, x): """ - Parameters - ---------- - x : Tensor - Shape (N, in_channels, num_samples), the input audio. - - Returns - ------- - Tensor - Shape (N, out_channels, num_samples), the predicted logits. + + Args: + x (Tensor): Shape (N, in_channels, num_samples), the input audio. + + Returns: + Tensor: Shape (N, out_channels, num_samples), the predicted logits. """ return self.conv_layers(x) @@ -362,39 +317,25 @@ class PWGDiscriminator(nn.Layer): class ResidualPWGDiscriminator(nn.Layer): """A wavenet-style discriminator for audio. - Parameters - ---------- - in_channels : int, optional - Number of channels of the input audio, by default 1 - out_channels : int, optional - Output feature size, by default 1 - kernel_size : int, optional - Kernel size of residual blocks, by default 3 - layers : int, optional - Number of residual blocks, by default 30 - stacks : int, optional - Number of groups of residual blocks, within which the dilation - of each residual blocks grows exponentially, by default 3 - residual_channels : int, optional - Residual channels of residual blocks, by default 64 - gate_channels : int, optional - Gate channels of residual blocks, by default 128 - skip_channels : int, optional - Skip channels of residual blocks, by default 64 - dropout : float, optional - Dropout probability of residual blocks, by default 0. 
- bias : bool, optional - Whether to use bias in residual blocks, by default True - use_weight_norm : bool, optional - Whether to use weight normalization in all convolutional layers, - by default True - use_causal_conv : bool, optional - Whether to use causal convolution in residual blocks, by default False - nonlinear_activation : str, optional - Activation after convolutions other than those in residual blocks, - by default "leakyrelu" - nonlinear_activation_params : Dict[str, Any], optional - Parameters to pass to the activation, by default {"negative_slope": 0.2} + Args: + in_channels (int, optional): Number of channels of the input audio, by default 1 + out_channels (int, optional): Output feature size, by default 1 + kernel_size (int, optional): Kernel size of residual blocks, by default 3 + layers (int, optional): Number of residual blocks, by default 30 + stacks (int, optional): Number of groups of residual blocks, within which the dilation + of each residual blocks grows exponentially, by default 3 + residual_channels (int, optional): Residual channels of residual blocks, by default 64 + gate_channels (int, optional): Gate channels of residual blocks, by default 128 + skip_channels (int, optional): Skip channels of residual blocks, by default 64 + dropout (float, optional): Dropout probability of residual blocks, by default 0. 
+ bias (bool, optional): Whether to use bias in residual blocks, by default True + use_weight_norm (bool, optional): Whether to use weight normalization in all convolutional layers, + by default True + use_causal_conv (bool, optional): Whether to use causal convolution in residual blocks, by default False + nonlinear_activation (str, optional): Activation after convolutions other than those in residual blocks, + by default "leakyrelu" + nonlinear_activation_params (Dict[str, Any], optional): Parameters to pass to the activation, + by default {"negative_slope": 0.2} """ def __init__( @@ -463,15 +404,11 @@ class ResidualPWGDiscriminator(nn.Layer): def forward(self, x): """ - Parameters - ---------- - x : Tensor - Shape (N, in_channels, num_samples), the input audio. - - Returns - ------- - Tensor - Shape (N, out_channels, num_samples), the predicted logits. + Args: + x(Tensor): Shape (N, in_channels, num_samples), the input audio. + + Returns: + Tensor: Shape (N, out_channels, num_samples), the predicted logits. 
""" x = self.first_conv(x) skip = 0 diff --git a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py index cc9e2066..42e8f743 100644 --- a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py +++ b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py @@ -14,28 +14,9 @@ import paddle from paddle import nn +from paddlespeech.t2s.modules.nets_utils import initialize from paddlespeech.t2s.modules.positional_encoding import sinusoid_position_encoding - - -def expand(encodings: paddle.Tensor, durations: paddle.Tensor) -> paddle.Tensor: - """ - encodings: (B, T, C) - durations: (B, T) - """ - batch_size, t_enc = paddle.shape(durations) - slens = paddle.sum(durations, -1) - t_dec = paddle.max(slens) - M = paddle.zeros([batch_size, t_dec, t_enc]) - for i in range(batch_size): - k = 0 - for j in range(t_enc): - d = durations[i, j] - # If the d == 0, slice action is meaningless and not supported - if d >= 1: - M[0, k:k + d, j] = 1 - k += d - encodings = paddle.matmul(M, encodings) - return encodings +from paddlespeech.t2s.modules.predictor.length_regulator import LengthRegulator class ResidualBlock(nn.Layer): @@ -175,19 +156,25 @@ class SpeedySpeechDecoder(nn.Layer): class SpeedySpeech(nn.Layer): - def __init__(self, - vocab_size, - encoder_hidden_size, - encoder_kernel_size, - encoder_dilations, - duration_predictor_hidden_size, - decoder_hidden_size, - decoder_output_size, - decoder_kernel_size, - decoder_dilations, - tone_size=None, - spk_num=None): + def __init__( + self, + vocab_size, + encoder_hidden_size, + encoder_kernel_size, + encoder_dilations, + duration_predictor_hidden_size, + decoder_hidden_size, + decoder_output_size, + decoder_kernel_size, + decoder_dilations, + tone_size=None, + spk_num=None, + init_type: str="xavier_uniform", ): super().__init__() + + # initialize parameters + initialize(self, init_type) + encoder = SpeedySpeechEncoder(vocab_size, tone_size, encoder_hidden_size, encoder_kernel_size, 
encoder_dilations, spk_num) @@ -198,6 +185,10 @@ class SpeedySpeech(nn.Layer): self.encoder = encoder self.duration_predictor = duration_predictor self.decoder = decoder + # define length regulator + self.length_regulator = LengthRegulator() + + nn.initializer.set_global_initializer(None) def forward(self, text, tones, durations, spk_id: paddle.Tensor=None): # input of embedding must be int64 @@ -212,7 +203,7 @@ class SpeedySpeech(nn.Layer): # expand encodings durations_to_expand = durations - encodings = expand(encodings, durations_to_expand) + encodings = self.length_regulator(encodings, durations_to_expand) # decode # remove positional encoding here @@ -240,7 +231,8 @@ class SpeedySpeech(nn.Layer): durations_to_expand = durations_to_expand.astype(paddle.int64) else: durations_to_expand = durations - encodings = expand(encodings, durations_to_expand) + encodings = self.length_regulator( + encodings, durations_to_expand, is_inference=True) shape = paddle.shape(encodings) t_dec, feature_size = shape[1], shape[2] diff --git a/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py b/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py index ee45cdc8..e30a3fe1 100644 --- a/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py +++ b/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py @@ -12,11 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import logging +from pathlib import Path import paddle from paddle import distributed as dist from paddle.fluid.layers import huber_loss +from paddle.io import DataLoader from paddle.nn import functional as F +from paddle.nn import Layer +from paddle.optimizer import Optimizer from paddlespeech.t2s.modules.losses import masked_l1_loss from paddlespeech.t2s.modules.losses import ssim @@ -33,11 +37,11 @@ logger.setLevel(logging.INFO) class SpeedySpeechUpdater(StandardUpdater): def __init__(self, - model, - optimizer, - dataloader, + model: Layer, + optimizer: Optimizer, + dataloader: DataLoader, init_state=None, - output_dir=None): + output_dir: Path=None): super().__init__(model, optimizer, dataloader, init_state=None) log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) @@ -103,7 +107,10 @@ class SpeedySpeechUpdater(StandardUpdater): class SpeedySpeechEvaluator(StandardEvaluator): - def __init__(self, model, dataloader, output_dir=None): + def __init__(self, + model: Layer, + dataloader: DataLoader, + output_dir: Path=None): super().__init__(model, dataloader) log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) diff --git a/paddlespeech/t2s/models/tacotron2.py b/paddlespeech/t2s/models/tacotron2.py deleted file mode 100644 index 01ea4f7d..00000000 --- a/paddlespeech/t2s/models/tacotron2.py +++ /dev/null @@ -1,1074 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import math - -import paddle -from paddle import nn -from paddle.fluid.layers import sequence_mask -from paddle.nn import functional as F -from paddle.nn import initializer as I -from tqdm import trange - -from paddlespeech.t2s.modules.conv import Conv1dBatchNorm -from paddlespeech.t2s.modules.losses import guided_attention_loss -from paddlespeech.t2s.utils import checkpoint - -__all__ = ["Tacotron2", "Tacotron2Loss"] - - -class LocationSensitiveAttention(nn.Layer): - """Location Sensitive Attention module. - - Reference: `Attention-Based Models for Speech Recognition `_ - - Parameters - ----------- - d_query: int - The feature size of query. - d_key : int - The feature size of key. - d_attention : int - The feature size of dimension. - location_filters : int - Filter size of attention convolution. - location_kernel_size : int - Kernel size of attention convolution. - """ - - def __init__(self, - d_query: int, - d_key: int, - d_attention: int, - location_filters: int, - location_kernel_size: int): - super().__init__() - - self.query_layer = nn.Linear(d_query, d_attention, bias_attr=False) - self.key_layer = nn.Linear(d_key, d_attention, bias_attr=False) - self.value = nn.Linear(d_attention, 1, bias_attr=False) - - # Location Layer - self.location_conv = nn.Conv1D( - 2, - location_filters, - kernel_size=location_kernel_size, - padding=int((location_kernel_size - 1) / 2), - bias_attr=False, - data_format='NLC') - self.location_layer = nn.Linear( - location_filters, d_attention, bias_attr=False) - - def forward(self, - query, - processed_key, - value, - attention_weights_cat, - mask=None): - """Compute context vector and attention weights. - - Parameters - ----------- - query : Tensor [shape=(batch_size, d_query)] - The queries. - processed_key : Tensor [shape=(batch_size, time_steps_k, d_attention)] - The keys after linear layer. - value : Tensor [shape=(batch_size, time_steps_k, d_key)] - The values. 
- attention_weights_cat : Tensor [shape=(batch_size, time_step_k, 2)] - Attention weights concat. - mask : Tensor, optional - The mask. Shape should be (batch_size, times_steps_k, 1). - Defaults to None. - - Returns - ---------- - attention_context : Tensor [shape=(batch_size, d_attention)] - The context vector. - attention_weights : Tensor [shape=(batch_size, time_steps_k)] - The attention weights. - """ - - processed_query = self.query_layer(paddle.unsqueeze(query, axis=[1])) - processed_attention_weights = self.location_layer( - self.location_conv(attention_weights_cat)) - # (B, T_enc, 1) - alignment = self.value( - paddle.tanh(processed_attention_weights + processed_key + - processed_query)) - - if mask is not None: - alignment = alignment + (1.0 - mask) * -1e9 - - attention_weights = F.softmax(alignment, axis=1) - attention_context = paddle.matmul( - attention_weights, value, transpose_x=True) - - attention_weights = paddle.squeeze(attention_weights, axis=-1) - attention_context = paddle.squeeze(attention_context, axis=1) - - return attention_context, attention_weights - - -class DecoderPreNet(nn.Layer): - """Decoder prenet module for Tacotron2. - - Parameters - ---------- - d_input: int - The input feature size. - - d_hidden: int - The hidden size. - - d_output: int - The output feature size. - - dropout_rate: float - The droput probability. - - """ - - def __init__(self, - d_input: int, - d_hidden: int, - d_output: int, - dropout_rate: float): - super().__init__() - - self.dropout_rate = dropout_rate - self.linear1 = nn.Linear(d_input, d_hidden, bias_attr=False) - self.linear2 = nn.Linear(d_hidden, d_output, bias_attr=False) - - def forward(self, x): - """Calculate forward propagation. - - Parameters - ---------- - x: Tensor [shape=(B, T_mel, C)] - Batch of the sequences of padded mel spectrogram. - - Returns - ------- - output: Tensor [shape=(B, T_mel, C)] - Batch of the sequences of padded hidden state. 
- - """ - - x = F.dropout(F.relu(self.linear1(x)), self.dropout_rate, training=True) - output = F.dropout( - F.relu(self.linear2(x)), self.dropout_rate, training=True) - return output - - -class DecoderPostNet(nn.Layer): - """Decoder postnet module for Tacotron2. - - Parameters - ---------- - d_mels: int - The number of mel bands. - - d_hidden: int - The hidden size of postnet. - - kernel_size: int - The kernel size of the conv layer in postnet. - - num_layers: int - The number of conv layers in postnet. - - dropout: float - The droput probability. - - """ - - def __init__(self, - d_mels: int, - d_hidden: int, - kernel_size: int, - num_layers: int, - dropout: float): - super().__init__() - self.dropout = dropout - self.num_layers = num_layers - - padding = int((kernel_size - 1) / 2) - - self.conv_batchnorms = nn.LayerList() - k = math.sqrt(1.0 / (d_mels * kernel_size)) - self.conv_batchnorms.append( - Conv1dBatchNorm( - d_mels, - d_hidden, - kernel_size=kernel_size, - padding=padding, - bias_attr=I.Uniform(-k, k), - data_format='NLC')) - - k = math.sqrt(1.0 / (d_hidden * kernel_size)) - self.conv_batchnorms.extend([ - Conv1dBatchNorm( - d_hidden, - d_hidden, - kernel_size=kernel_size, - padding=padding, - bias_attr=I.Uniform(-k, k), - data_format='NLC') for i in range(1, num_layers - 1) - ]) - - self.conv_batchnorms.append( - Conv1dBatchNorm( - d_hidden, - d_mels, - kernel_size=kernel_size, - padding=padding, - bias_attr=I.Uniform(-k, k), - data_format='NLC')) - - def forward(self, x): - """Calculate forward propagation. - - Parameters - ---------- - x: Tensor [shape=(B, T_mel, C)] - Output sequence of features from decoder. - - Returns - ------- - output: Tensor [shape=(B, T_mel, C)] - Output sequence of features after postnet. 
- - """ - - for i in range(len(self.conv_batchnorms) - 1): - x = F.dropout( - F.tanh(self.conv_batchnorms[i](x)), - self.dropout, - training=self.training) - output = F.dropout( - self.conv_batchnorms[self.num_layers - 1](x), - self.dropout, - training=self.training) - return output - - -class Tacotron2Encoder(nn.Layer): - """Tacotron2 encoder module for Tacotron2. - - Parameters - ---------- - d_hidden: int - The hidden size in encoder module. - - conv_layers: int - The number of conv layers. - - kernel_size: int - The kernel size of conv layers. - - p_dropout: float - The droput probability. - """ - - def __init__(self, - d_hidden: int, - conv_layers: int, - kernel_size: int, - p_dropout: float): - super().__init__() - - k = math.sqrt(1.0 / (d_hidden * kernel_size)) - self.conv_batchnorms = nn.LayerList([ - Conv1dBatchNorm( - d_hidden, - d_hidden, - kernel_size, - stride=1, - padding=int((kernel_size - 1) / 2), - bias_attr=I.Uniform(-k, k), - data_format='NLC') for i in range(conv_layers) - ]) - self.p_dropout = p_dropout - - self.hidden_size = int(d_hidden / 2) - self.lstm = nn.LSTM( - d_hidden, self.hidden_size, direction="bidirectional") - - def forward(self, x, input_lens=None): - """Calculate forward propagation of tacotron2 encoder. - - Parameters - ---------- - x: Tensor [shape=(B, T, C)] - Input embeddings. - - text_lens: Tensor [shape=(B,)], optional - Batch of lengths of each text input batch. Defaults to None. - - Returns - ------- - output : Tensor [shape=(B, T, C)] - Batch of the sequences of padded hidden states. - - """ - for conv_batchnorm in self.conv_batchnorms: - x = F.dropout( - F.relu(conv_batchnorm(x)), - self.p_dropout, - training=self.training) - - output, _ = self.lstm(inputs=x, sequence_length=input_lens) - return output - - -class Tacotron2Decoder(nn.Layer): - """Tacotron2 decoder module for Tacotron2. - - Parameters - ---------- - d_mels: int - The number of mel bands. - - reduction_factor: int - The reduction factor of tacotron. 
- - d_encoder: int - The hidden size of encoder. - - d_prenet: int - The hidden size in decoder prenet. - - d_attention_rnn: int - The attention rnn layer hidden size. - - d_decoder_rnn: int - The decoder rnn layer hidden size. - - d_attention: int - The hidden size of the linear layer in location sensitive attention. - - attention_filters: int - The filter size of the conv layer in location sensitive attention. - - attention_kernel_size: int - The kernel size of the conv layer in location sensitive attention. - - p_prenet_dropout: float - The droput probability in decoder prenet. - - p_attention_dropout: float - The droput probability in location sensitive attention. - - p_decoder_dropout: float - The droput probability in decoder. - - use_stop_token: bool - Whether to use a binary classifier for stop token prediction. - Defaults to False - """ - - def __init__(self, - d_mels: int, - reduction_factor: int, - d_encoder: int, - d_prenet: int, - d_attention_rnn: int, - d_decoder_rnn: int, - d_attention: int, - attention_filters: int, - attention_kernel_size: int, - p_prenet_dropout: float, - p_attention_dropout: float, - p_decoder_dropout: float, - use_stop_token: bool=False): - super().__init__() - self.d_mels = d_mels - self.reduction_factor = reduction_factor - self.d_encoder = d_encoder - self.d_attention_rnn = d_attention_rnn - self.d_decoder_rnn = d_decoder_rnn - self.p_attention_dropout = p_attention_dropout - self.p_decoder_dropout = p_decoder_dropout - - self.prenet = DecoderPreNet( - d_mels * reduction_factor, - d_prenet, - d_prenet, - dropout_rate=p_prenet_dropout) - - # attention_rnn takes attention's context vector has an - # auxiliary input - self.attention_rnn = nn.LSTMCell(d_prenet + d_encoder, d_attention_rnn) - - self.attention_layer = LocationSensitiveAttention( - d_attention_rnn, d_encoder, d_attention, attention_filters, - attention_kernel_size) - - # decoder_rnn takes prenet's output and attention_rnn's input - # as input - self.decoder_rnn = 
nn.LSTMCell(d_attention_rnn + d_encoder, - d_decoder_rnn) - self.linear_projection = nn.Linear(d_decoder_rnn + d_encoder, - d_mels * reduction_factor) - - self.use_stop_token = use_stop_token - if use_stop_token: - self.stop_layer = nn.Linear(d_decoder_rnn + d_encoder, 1) - - # states - temporary attributes - self.attention_hidden = None - self.attention_cell = None - - self.decoder_hidden = None - self.decoder_cell = None - - self.attention_weights = None - self.attention_weights_cum = None - self.attention_context = None - - self.key = None - self.mask = None - self.processed_key = None - - def _initialize_decoder_states(self, key): - """init states be used in decoder - """ - batch_size, encoder_steps, _ = key.shape - - self.attention_hidden = paddle.zeros( - shape=[batch_size, self.d_attention_rnn], dtype=key.dtype) - self.attention_cell = paddle.zeros( - shape=[batch_size, self.d_attention_rnn], dtype=key.dtype) - - self.decoder_hidden = paddle.zeros( - shape=[batch_size, self.d_decoder_rnn], dtype=key.dtype) - self.decoder_cell = paddle.zeros( - shape=[batch_size, self.d_decoder_rnn], dtype=key.dtype) - - self.attention_weights = paddle.zeros( - shape=[batch_size, encoder_steps], dtype=key.dtype) - self.attention_weights_cum = paddle.zeros( - shape=[batch_size, encoder_steps], dtype=key.dtype) - self.attention_context = paddle.zeros( - shape=[batch_size, self.d_encoder], dtype=key.dtype) - - self.key = key # [B, T, C] - # pre-compute projected keys to improve efficiency - self.processed_key = self.attention_layer.key_layer(key) # [B, T, C] - - def _decode(self, query): - """decode one time step - """ - cell_input = paddle.concat([query, self.attention_context], axis=-1) - - # The first lstm layer (or spec encoder lstm) - _, (self.attention_hidden, self.attention_cell) = self.attention_rnn( - cell_input, (self.attention_hidden, self.attention_cell)) - self.attention_hidden = F.dropout( - self.attention_hidden, - self.p_attention_dropout, - 
training=self.training) - - # Loaction sensitive attention - attention_weights_cat = paddle.stack( - [self.attention_weights, self.attention_weights_cum], axis=-1) - self.attention_context, self.attention_weights = self.attention_layer( - self.attention_hidden, self.processed_key, self.key, - attention_weights_cat, self.mask) - self.attention_weights_cum += self.attention_weights - - # The second lstm layer (or spec decoder lstm) - decoder_input = paddle.concat( - [self.attention_hidden, self.attention_context], axis=-1) - _, (self.decoder_hidden, self.decoder_cell) = self.decoder_rnn( - decoder_input, (self.decoder_hidden, self.decoder_cell)) - self.decoder_hidden = F.dropout( - self.decoder_hidden, - p=self.p_decoder_dropout, - training=self.training) - - # decode output one step - decoder_hidden_attention_context = paddle.concat( - [self.decoder_hidden, self.attention_context], axis=-1) - decoder_output = self.linear_projection( - decoder_hidden_attention_context) - if self.use_stop_token: - stop_logit = self.stop_layer(decoder_hidden_attention_context) - return decoder_output, self.attention_weights, stop_logit - return decoder_output, self.attention_weights - - def forward(self, keys, querys, mask): - """Calculate forward propagation of tacotron2 decoder. - - Parameters - ---------- - keys: Tensor[shape=(B, T_key, C)] - Batch of the sequences of padded output from encoder. - - querys: Tensor[shape(B, T_query, C)] - Batch of the sequences of padded mel spectrogram. - - mask: Tensor - Mask generated with text length. Shape should be (B, T_key, 1). - - Returns - ------- - mel_output: Tensor [shape=(B, T_query, C)] - Output sequence of features. - - alignments: Tensor [shape=(B, T_query, T_key)] - Attention weights. 
- """ - self._initialize_decoder_states(keys) - self.mask = mask - - querys = paddle.reshape( - querys, - [querys.shape[0], querys.shape[1] // self.reduction_factor, -1]) - start_step = paddle.zeros( - shape=[querys.shape[0], 1, querys.shape[-1]], dtype=querys.dtype) - querys = paddle.concat([start_step, querys], axis=1) - - querys = self.prenet(querys) - - mel_outputs, alignments = [], [] - stop_logits = [] - # Ignore the last time step - while len(mel_outputs) < querys.shape[1] - 1: - query = querys[:, len(mel_outputs), :] - if self.use_stop_token: - mel_output, attention_weights, stop_logit = self._decode(query) - else: - mel_output, attention_weights = self._decode(query) - mel_outputs.append(mel_output) - alignments.append(attention_weights) - if self.use_stop_token: - stop_logits.append(stop_logit) - - alignments = paddle.stack(alignments, axis=1) - mel_outputs = paddle.stack(mel_outputs, axis=1) - if self.use_stop_token: - stop_logits = paddle.concat(stop_logits, axis=1) - return mel_outputs, alignments, stop_logits - return mel_outputs, alignments - - def infer(self, key, max_decoder_steps=1000): - """Calculate forward propagation of tacotron2 decoder. - - Parameters - ---------- - keys: Tensor [shape=(B, T_key, C)] - Batch of the sequences of padded output from encoder. - - max_decoder_steps: int, optional - Number of max step when synthesize. Defaults to 1000. - - Returns - ------- - mel_output: Tensor [shape=(B, T_mel, C)] - Output sequence of features. - - alignments: Tensor [shape=(B, T_mel, T_key)] - Attention weights. 
class Tacotron2(nn.Layer):
    """Tacotron2 model for end-to-end text-to-speech (E2E-TTS).

    This is a model of Spectrogram prediction network in Tacotron2 described
    in `Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram
    Predictions <https://arxiv.org/abs/1712.05884>`_,
    which converts the sequence of characters
    into the sequence of mel spectrogram.

    Parameters
    ----------
    vocab_size : int
        Vocabulary size of phones of the model.

    n_tones: int
        Vocabulary size of tones of the model. Defaults to None. If provided
        (and non-zero), the model has an extra tone embedding.

    d_mels: int
        Number of mel bands.

    d_encoder: int
        Hidden size in encoder module.

    encoder_conv_layers: int
        Number of conv layers in encoder.

    encoder_kernel_size: int
        Kernel size of conv layers in encoder.

    d_prenet: int
        Hidden size in decoder prenet.

    d_attention_rnn: int
        Attention rnn layer hidden size in decoder.

    d_decoder_rnn: int
        Decoder rnn layer hidden size in decoder.

    attention_filters: int
        Filter size of the conv layer in location sensitive attention.

    attention_kernel_size: int
        Kernel size of the conv layer in location sensitive attention.

    d_attention: int
        Hidden size of the linear layer in location sensitive attention.

    d_postnet: int
        Hidden size of postnet.

    postnet_kernel_size: int
        Kernel size of the conv layer in postnet.

    postnet_conv_layers: int
        Number of conv layers in postnet.

    reduction_factor: int
        Reduction factor of tacotron2.

    p_encoder_dropout: float
        Dropout probability in encoder.

    p_prenet_dropout: float
        Dropout probability in decoder prenet.

    p_attention_dropout: float
        Dropout probability in location sensitive attention.

    p_decoder_dropout: float
        Dropout probability in decoder.

    p_postnet_dropout: float
        Dropout probability in postnet.

    d_global_condition: int
        Feature size of global condition. Defaults to None. If provided, The
        model assumes a global condition that is concatenated to the encoder
        outputs.

    use_stop_token: bool
        Whether to include a binary classifier to predict the stop token.
        Defaults to False.
    """

    def __init__(self,
                 vocab_size,
                 n_tones=None,
                 d_mels: int=80,
                 d_encoder: int=512,
                 encoder_conv_layers: int=3,
                 encoder_kernel_size: int=5,
                 d_prenet: int=256,
                 d_attention_rnn: int=1024,
                 d_decoder_rnn: int=1024,
                 attention_filters: int=32,
                 attention_kernel_size: int=31,
                 d_attention: int=128,
                 d_postnet: int=512,
                 postnet_kernel_size: int=5,
                 postnet_conv_layers: int=5,
                 reduction_factor: int=1,
                 p_encoder_dropout: float=0.5,
                 p_prenet_dropout: float=0.5,
                 p_attention_dropout: float=0.1,
                 p_decoder_dropout: float=0.1,
                 p_postnet_dropout: float=0.5,
                 d_global_condition=None,
                 use_stop_token=False):
        super().__init__()

        std = math.sqrt(2.0 / (vocab_size + d_encoder))
        val = math.sqrt(3.0) * std  # uniform bounds for std
        self.embedding = nn.Embedding(
            vocab_size, d_encoder, weight_attr=I.Uniform(-val, val))
        # `toned` must mirror exactly whether the tone embedding exists.
        # The previous `n_tones is not None` disagreed with `if n_tones:` for
        # n_tones == 0, leaving `toned` True without an `embedding_tones`.
        self.toned = bool(n_tones)
        if self.toned:
            self.embedding_tones = nn.Embedding(
                n_tones,
                d_encoder,
                padding_idx=0,
                weight_attr=I.Uniform(-0.1 * val, 0.1 * val))

        self.encoder = Tacotron2Encoder(d_encoder, encoder_conv_layers,
                                        encoder_kernel_size, p_encoder_dropout)

        # input augmentation scheme: concat global condition to the encoder output
        if d_global_condition is not None:
            d_encoder += d_global_condition
        self.decoder = Tacotron2Decoder(
            d_mels,
            reduction_factor,
            d_encoder,
            d_prenet,
            d_attention_rnn,
            d_decoder_rnn,
            d_attention,
            attention_filters,
            attention_kernel_size,
            p_prenet_dropout,
            p_attention_dropout,
            p_decoder_dropout,
            use_stop_token=use_stop_token)
        self.postnet = DecoderPostNet(
            d_mels=d_mels * reduction_factor,
            d_hidden=d_postnet,
            kernel_size=postnet_kernel_size,
            num_layers=postnet_conv_layers,
            dropout=p_postnet_dropout)

    def forward(self,
                text_inputs,
                text_lens,
                mels,
                output_lens=None,
                tones=None,
                global_condition=None):
        """Calculate forward propagation of tacotron2.

        Parameters
        ----------
        text_inputs: Tensor [shape=(B, T_text)]
            Batch of the sequences of padded character ids.

        text_lens: Tensor [shape=(B,)]
            Batch of lengths of each text input batch.

        mels: Tensor [shape(B, T_mel, C)]
            Batch of the sequences of padded mel spectrogram.

        output_lens: Tensor [shape=(B,)], optional
            Batch of lengths of each mels batch. Defaults to None.

        tones: Tensor [shape=(B, T_text)]
            Batch of sequences of padded tone ids. Used only when the model
            was built with a tone embedding.

        global_condition: Tensor [shape(B, C)]
            Batch of global conditions. Defaults to None. If the
            `d_global_condition` of the model is not None, this input should be
            provided.

        Returns
        -------
        outputs : Dict[str, Tensor]
            - mel_output: output sequence of features (B, T_mel, C);
            - mel_outputs_postnet: output sequence of features after postnet (B, T_mel, C);
            - alignments: attention weights (B, T_mel, T_text);
            - stop_logits: output sequence of stop logits (B, T_mel). This key
              is only present when the decoder uses a stop token.
        """
        # input of embedding must be int64
        text_inputs = paddle.cast(text_inputs, 'int64')
        embedded_inputs = self.embedding(text_inputs)
        if self.toned:
            embedded_inputs += self.embedding_tones(tones)

        encoder_outputs = self.encoder(embedded_inputs, text_lens)

        if global_condition is not None:
            # Broadcast the per-utterance condition over every encoder step.
            global_condition = global_condition.unsqueeze(1)
            global_condition = paddle.expand(global_condition,
                                             [-1, encoder_outputs.shape[1], -1])
            encoder_outputs = paddle.concat([encoder_outputs, global_condition],
                                            -1)

        # [B, T_enc, 1]
        mask = sequence_mask(
            text_lens, dtype=encoder_outputs.dtype).unsqueeze(-1)
        if self.decoder.use_stop_token:
            mel_outputs, alignments, stop_logits = self.decoder(
                encoder_outputs, mels, mask=mask)
        else:
            mel_outputs, alignments = self.decoder(
                encoder_outputs, mels, mask=mask)
        # The postnet predicts a residual that refines the decoder output.
        mel_outputs_postnet = self.postnet(mel_outputs)
        mel_outputs_postnet = mel_outputs + mel_outputs_postnet

        if output_lens is not None:
            # [B, T_dec, 1]
            mask = sequence_mask(output_lens).unsqueeze(-1)
            mel_outputs = mel_outputs * mask  # [B, T, C]
            mel_outputs_postnet = mel_outputs_postnet * mask  # [B, T, C]
        outputs = {
            "mel_output": mel_outputs,
            "mel_outputs_postnet": mel_outputs_postnet,
            "alignments": alignments
        }
        if self.decoder.use_stop_token:
            outputs["stop_logits"] = stop_logits

        return outputs

    @paddle.no_grad()
    def infer(self,
              text_inputs,
              max_decoder_steps=1000,
              tones=None,
              global_condition=None):
        """Generate the mel spectrogram of features given the sequences of character ids.

        Parameters
        ----------
        text_inputs: Tensor [shape=(B, T_text)]
            Batch of the sequences of padded character ids.

        max_decoder_steps: int, optional
            Number of max step when synthesize. Defaults to 1000.

        tones: Tensor [shape=(B, T_text)], optional
            Batch of sequences of padded tone ids. Used only when the model
            was built with a tone embedding.

        global_condition: Tensor [shape(B, C)], optional
            Batch of global conditions, concatenated to the encoder outputs
            when given.

        Returns
        -------
        outputs : Dict[str, Tensor]
            - mel_output: output sequence of spectrogram (B, T_mel, C);
            - mel_outputs_postnet: output sequence of spectrogram after postnet (B, T_mel, C);
            - alignments: attention weights (B, T_mel, T_text);
            - stop_logits: output sequence of stop logits (B, T_mel). This key
              is only present when the decoder uses a stop token.
        """
        # input of embedding must be int64
        text_inputs = paddle.cast(text_inputs, 'int64')
        embedded_inputs = self.embedding(text_inputs)
        if self.toned:
            embedded_inputs += self.embedding_tones(tones)
        encoder_outputs = self.encoder(embedded_inputs)

        if global_condition is not None:
            global_condition = global_condition.unsqueeze(1)
            global_condition = paddle.expand(global_condition,
                                             [-1, encoder_outputs.shape[1], -1])
            encoder_outputs = paddle.concat([encoder_outputs, global_condition],
                                            -1)
        if self.decoder.use_stop_token:
            mel_outputs, alignments, stop_logits = self.decoder.infer(
                encoder_outputs, max_decoder_steps=max_decoder_steps)
        else:
            mel_outputs, alignments = self.decoder.infer(
                encoder_outputs, max_decoder_steps=max_decoder_steps)

        mel_outputs_postnet = self.postnet(mel_outputs)
        mel_outputs_postnet = mel_outputs + mel_outputs_postnet

        outputs = {
            "mel_output": mel_outputs,
            "mel_outputs_postnet": mel_outputs_postnet,
            "alignments": alignments
        }
        if self.decoder.use_stop_token:
            outputs["stop_logits"] = stop_logits

        return outputs

    @classmethod
    def from_pretrained(cls, config, checkpoint_path):
        """Build a Tacotron2 model from a pretrained model.

        Parameters
        ----------
        config: yacs.config.CfgNode
            model configs

        checkpoint_path: Path or str
            the path of pretrained model checkpoint, without extension name

        Returns
        -------
        Tacotron2
            The model built from pretrained result.
        """
        model = cls(vocab_size=config.model.vocab_size,
                    n_tones=config.model.n_tones,
                    d_mels=config.data.n_mels,
                    d_encoder=config.model.d_encoder,
                    encoder_conv_layers=config.model.encoder_conv_layers,
                    encoder_kernel_size=config.model.encoder_kernel_size,
                    d_prenet=config.model.d_prenet,
                    d_attention_rnn=config.model.d_attention_rnn,
                    d_decoder_rnn=config.model.d_decoder_rnn,
                    attention_filters=config.model.attention_filters,
                    attention_kernel_size=config.model.attention_kernel_size,
                    d_attention=config.model.d_attention,
                    d_postnet=config.model.d_postnet,
                    postnet_kernel_size=config.model.postnet_kernel_size,
                    postnet_conv_layers=config.model.postnet_conv_layers,
                    reduction_factor=config.model.reduction_factor,
                    p_encoder_dropout=config.model.p_encoder_dropout,
                    p_prenet_dropout=config.model.p_prenet_dropout,
                    p_attention_dropout=config.model.p_attention_dropout,
                    p_decoder_dropout=config.model.p_decoder_dropout,
                    p_postnet_dropout=config.model.p_postnet_dropout,
                    d_global_condition=config.model.d_global_condition,
                    use_stop_token=config.model.use_stop_token)
        checkpoint.load_parameters(model, checkpoint_path=checkpoint_path)
        return model
- """ - super().__init__() - self.spec_criterion = nn.MSELoss() - self.use_stop_token_loss = use_stop_token_loss - self.use_guided_attention_loss = use_guided_attention_loss - self.attn_criterion = guided_attention_loss - self.stop_criterion = nn.BCEWithLogitsLoss() - self.sigma = sigma - - def forward(self, - mel_outputs, - mel_outputs_postnet, - mel_targets, - attention_weights=None, - slens=None, - plens=None, - stop_logits=None): - """Calculate tacotron2 loss. - - Parameters - ---------- - mel_outputs: Tensor [shape=(B, T_mel, C)] - Output mel spectrogram sequence. - - mel_outputs_postnet: Tensor [shape(B, T_mel, C)] - Output mel spectrogram sequence after postnet. - - mel_targets: Tensor [shape=(B, T_mel, C)] - Target mel spectrogram sequence. - - attention_weights: Tensor [shape=(B, T_mel, T_enc)] - Attention weights. This should be provided when - `use_guided_attention_loss` is True. - - slens: Tensor [shape=(B,)] - Number of frames of mel spectrograms. This should be provided when - `use_guided_attention_loss` is True. - - plens: Tensor [shape=(B, )] - Number of text or phone ids of each utterance. This should be - provided when `use_guided_attention_loss` is True. - - stop_logits: Tensor [shape=(B, T_mel)] - Stop logits of each mel spectrogram frame. This should be provided - when `use_stop_token_loss` is True. - - Returns - ------- - losses : Dict[str, Tensor] - - loss: the sum of the other three losses; - - mel_loss: MSE loss compute by mel_targets and mel_outputs; - - post_mel_loss: MSE loss compute by mel_targets and mel_outputs_postnet; - - guided_attn_loss: Guided attention loss for attention weights; - - stop_loss: Binary cross entropy loss for stop token prediction. 
- """ - mel_loss = self.spec_criterion(mel_outputs, mel_targets) - post_mel_loss = self.spec_criterion(mel_outputs_postnet, mel_targets) - total_loss = mel_loss + post_mel_loss - if self.use_guided_attention_loss: - gal_loss = self.attn_criterion(attention_weights, slens, plens, - self.sigma) - total_loss += gal_loss - if self.use_stop_token_loss: - T_dec = mel_targets.shape[1] - stop_labels = F.one_hot(slens - 1, num_classes=T_dec) - stop_token_loss = self.stop_criterion(stop_logits, stop_labels) - total_loss += stop_token_loss - - losses = { - "loss": total_loss, - "mel_loss": mel_loss, - "post_mel_loss": post_mel_loss - } - if self.use_guided_attention_loss: - losses["guided_attn_loss"] = gal_loss - if self.use_stop_token_loss: - losses["stop_loss"] = stop_token_loss - return losses diff --git a/paddlespeech/t2s/data/__init__.py b/paddlespeech/t2s/models/tacotron2/__init__.py similarity index 77% rename from paddlespeech/t2s/data/__init__.py rename to paddlespeech/t2s/models/tacotron2/__init__.py index c605205d..ea63257c 100644 --- a/paddlespeech/t2s/data/__init__.py +++ b/paddlespeech/t2s/models/tacotron2/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,7 +11,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""t2s's infrastructure for data processing. 
class Tacotron2(nn.Layer):
    """Tacotron2 module for end-to-end text-to-speech.

    This is a module of Spectrogram prediction network in Tacotron2 described
    in `Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`_,
    which converts the sequence of characters into the sequence of Mel-filterbanks.

    .. _`Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`:
       https://arxiv.org/abs/1712.05884

    """

    def __init__(
            self,
            # network structure related
            idim: int,
            odim: int,
            embed_dim: int=512,
            elayers: int=1,
            eunits: int=512,
            econv_layers: int=3,
            econv_chans: int=512,
            econv_filts: int=5,
            atype: str="location",
            adim: int=512,
            aconv_chans: int=32,
            aconv_filts: int=15,
            cumulate_att_w: bool=True,
            dlayers: int=2,
            dunits: int=1024,
            prenet_layers: int=2,
            prenet_units: int=256,
            postnet_layers: int=5,
            postnet_chans: int=512,
            postnet_filts: int=5,
            output_activation: Optional[str]=None,
            use_batch_norm: bool=True,
            use_concate: bool=True,
            use_residual: bool=False,
            reduction_factor: int=1,
            # extra embedding related
            spk_num: Optional[int]=None,
            lang_num: Optional[int]=None,
            spk_embed_dim: Optional[int]=None,
            spk_embed_integration_type: str="concat",
            dropout_rate: float=0.5,
            zoneout_rate: float=0.1,
            # training related
            init_type: str="xavier_uniform", ):
        """Initialize Tacotron2 module.
        Args:
            idim (int): Dimension of the inputs.
            odim (int): Dimension of the outputs.
            embed_dim (int): Dimension of the token embedding.
            elayers (int): Number of encoder blstm layers.
            eunits (int): Number of encoder blstm units.
            econv_layers (int): Number of encoder conv layers.
            econv_filts (int): Number of encoder conv filter size.
            econv_chans (int): Number of encoder conv filter channels.
            atype (str): Attention type; one of "location", "forward" or
                "forward_ta".
            dlayers (int): Number of decoder lstm layers.
            dunits (int): Number of decoder lstm units.
            prenet_layers (int): Number of prenet layers.
            prenet_units (int): Number of prenet units.
            postnet_layers (int): Number of postnet layers.
            postnet_filts (int): Number of postnet filter size.
            postnet_chans (int): Number of postnet filter channels.
            output_activation (Optional[str]): Name of activation function for
                outputs; looked up on `paddle.nn.functional`.
            adim (int): Number of dimension of mlp in attention.
            aconv_chans (int): Number of attention conv filter channels.
            aconv_filts (int): Number of attention conv filter size.
            cumulate_att_w (bool): Whether to cumulate previous attention weight.
                Forced to False for the "forward" and "forward_ta" attentions.
            use_batch_norm (bool): Whether to use batch normalization.
            use_concate (bool): Whether to concat enc outputs w/ dec lstm outputs.
            use_residual (bool): Forwarded to the encoder (presumably enables
                residual connections there; confirm against `Encoder`).
            reduction_factor (int): Reduction factor.
            spk_num (Optional[int]): Number of speakers. If set to > 1, assume that the
                sids will be provided as the input and use sid embedding layer.
            lang_num (Optional[int]): Number of languages. If set to > 1, assume that the
                lids will be provided as the input and use lid embedding layer.
            spk_embed_dim (Optional[int]): Speaker embedding dimension. If set to > 0,
                assume that spk_emb will be provided as the input.
            spk_embed_integration_type (str): How to integrate speaker embedding
                ("concat" or "add").
            dropout_rate (float): Dropout rate.
            zoneout_rate (float): Zoneout rate.
            init_type (str): Initialization scheme name passed to `initialize`.

        Raises:
            ValueError: If `output_activation` is not found on
                `paddle.nn.functional`, or `spk_embed_integration_type` is
                unsupported while `spk_embed_dim` is set.
            NotImplementedError: If `atype` is not a supported attention type.
        """
        assert check_argument_types()
        super().__init__()

        # store hyperparameters
        self.idim = idim
        self.odim = odim
        # the last input id is reserved as the end-of-sequence token
        self.eos = idim - 1
        self.cumulate_att_w = cumulate_att_w
        self.reduction_factor = reduction_factor

        # define activation function for the final output
        if output_activation is None:
            self.output_activation_fn = None
        elif hasattr(F, output_activation):
            self.output_activation_fn = getattr(F, output_activation)
        else:
            raise ValueError(f"there is no such an activation function. "
                             f"({output_activation})")

        # set padding idx
        padding_idx = 0
        self.padding_idx = padding_idx

        # initialize parameters
        # NOTE(review): called before any sub-layer exists and undone by the
        # `set_global_initializer(None)` at the end of this method, so it
        # presumably installs a global initializer - confirm in nets_utils.
        initialize(self, init_type)

        # define network modules
        self.enc = Encoder(
            idim=idim,
            embed_dim=embed_dim,
            elayers=elayers,
            eunits=eunits,
            econv_layers=econv_layers,
            econv_chans=econv_chans,
            econv_filts=econv_filts,
            use_batch_norm=use_batch_norm,
            use_residual=use_residual,
            dropout_rate=dropout_rate,
            padding_idx=padding_idx, )

        self.spk_num = None
        if spk_num is not None and spk_num > 1:
            self.spk_num = spk_num
            self.sid_emb = nn.Embedding(spk_num, eunits)
        self.lang_num = None
        if lang_num is not None and lang_num > 1:
            self.lang_num = lang_num
            self.lid_emb = nn.Embedding(lang_num, eunits)

        self.spk_embed_dim = None
        if spk_embed_dim is not None and spk_embed_dim > 0:
            self.spk_embed_dim = spk_embed_dim
            self.spk_embed_integration_type = spk_embed_integration_type
        # decoder input width depends on how the speaker embedding is merged
        if self.spk_embed_dim is None:
            dec_idim = eunits
        elif self.spk_embed_integration_type == "concat":
            dec_idim = eunits + spk_embed_dim
        elif self.spk_embed_integration_type == "add":
            dec_idim = eunits
            self.projection = nn.Linear(self.spk_embed_dim, eunits)
        else:
            raise ValueError(f"{spk_embed_integration_type} is not supported.")

        if atype == "location":
            att = AttLoc(dec_idim, dunits, adim, aconv_chans, aconv_filts)
        elif atype == "forward":
            att = AttForward(dec_idim, dunits, adim, aconv_chans, aconv_filts)
            if self.cumulate_att_w:
                logging.warning("cumulation of attention weights is disabled "
                                "in forward attention.")
                self.cumulate_att_w = False
        elif atype == "forward_ta":
            att = AttForwardTA(dec_idim, dunits, adim, aconv_chans, aconv_filts,
                               odim)
            if self.cumulate_att_w:
                logging.warning("cumulation of attention weights is disabled "
                                "in forward attention.")
                self.cumulate_att_w = False
        else:
            raise NotImplementedError("Support only location or forward")
        self.dec = Decoder(
            idim=dec_idim,
            odim=odim,
            att=att,
            dlayers=dlayers,
            dunits=dunits,
            prenet_layers=prenet_layers,
            prenet_units=prenet_units,
            postnet_layers=postnet_layers,
            postnet_chans=postnet_chans,
            postnet_filts=postnet_filts,
            output_activation_fn=self.output_activation_fn,
            cumulate_att_w=self.cumulate_att_w,
            use_batch_norm=use_batch_norm,
            use_concate=use_concate,
            dropout_rate=dropout_rate,
            zoneout_rate=zoneout_rate,
            reduction_factor=reduction_factor, )

        nn.initializer.set_global_initializer(None)

    def forward(
            self,
            text: paddle.Tensor,
            text_lengths: paddle.Tensor,
            speech: paddle.Tensor,
            speech_lengths: paddle.Tensor,
            spk_emb: Optional[paddle.Tensor]=None,
            spk_id: Optional[paddle.Tensor]=None,
            lang_id: Optional[paddle.Tensor]=None
    ) -> Tuple[paddle.Tensor, ...]:
        """Calculate forward propagation.

        Args:
            text (Tensor(int64)): Batch of padded character ids (B, T_text).
            text_lengths (Tensor(int64)): Batch of lengths of each input batch (B,).
            speech (Tensor): Batch of padded target features (B, T_feats, odim).
            speech_lengths (Tensor(int64)): Batch of the lengths of each target (B,).
            spk_emb (Optional[Tensor]): Batch of speaker embeddings (B, spk_embed_dim).
            spk_id (Optional[Tensor]): Batch of speaker IDs (B, 1).
            lang_id (Optional[Tensor]): Batch of language IDs (B, 1).

        Returns:
            An 8-tuple; the loss itself is computed by the caller:
            * after_outs, before_outs, logits, att_ws: the four values returned
              by the decoder (presumably post-postnet features, pre-postnet
              features, stop logits and attention weights - confirm in
              `Decoder`).
            * ys (Tensor): Target features, trimmed to a multiple of the
              reduction factor when it is > 1.
            * stop_labels (Tensor): Float stop-token targets.
            * olens (Tensor): Target lengths matching `ys`.
            * olens_in (Tensor): `olens` divided by the reduction factor (equal
              to `olens` when the reduction factor is 1).

        """
        text = text[:, :text_lengths.max()]
        speech = speech[:, :speech_lengths.max()]

        # Add eos at the last of sequence
        xs = F.pad(text, [0, 0, 0, 1], "constant", self.padding_idx)
        for i, l in enumerate(text_lengths):
            xs[i, l] = self.eos
        ilens = text_lengths + 1

        ys = speech
        olens = speech_lengths

        # make labels for stop prediction
        stop_labels = make_pad_mask(olens - 1)
        # a bool tensor cannot be sliced, so cast to float32 first
        stop_labels = paddle.cast(stop_labels, dtype='float32')
        stop_labels = F.pad(stop_labels, [0, 0, 0, 1], "constant", 1.0)

        # calculate tacotron2 outputs
        after_outs, before_outs, logits, att_ws = self._forward(
            xs=xs,
            ilens=ilens,
            ys=ys,
            olens=olens,
            spk_emb=spk_emb,
            spk_id=spk_id,
            lang_id=lang_id, )

        # modify mod part of groundtruth
        if self.reduction_factor > 1:
            assert olens.ge(self.reduction_factor).all(
            ), "Output length must be greater than or equal to reduction factor."
            olens = olens - olens % self.reduction_factor
            max_out = max(olens)
            ys = ys[:, :max_out]
            stop_labels = stop_labels[:, :max_out]
            # Mark the (new) last frame of every utterance as a stop frame.
            # `paddle.scatter(x, index, updates, overwrite)` has no `dim`
            # argument, so the torch-style call used before does not set
            # these entries; assign them per sample like the eos loop above.
            for i, l in enumerate(olens):
                stop_labels[i, l - 1] = 1.0
            olens_in = olens // self.reduction_factor
        else:
            olens_in = olens
        return after_outs, before_outs, logits, ys, stop_labels, olens, att_ws, olens_in

    def _forward(
            self,
            xs: paddle.Tensor,
            ilens: paddle.Tensor,
            ys: paddle.Tensor,
            olens: paddle.Tensor,
            spk_emb: paddle.Tensor,
            spk_id: paddle.Tensor,
            lang_id: paddle.Tensor,
    ) -> Tuple[paddle.Tensor, ...]:
        """Encode `xs`, add the optional speaker/language conditioning and decode.

        Returns whatever `self.dec(hs, hlens, ys)` returns; `forward` unpacks
        it into four values. `olens` is accepted but not used here.
        """
        hs, hlens = self.enc(xs, ilens)
        if self.spk_num is not None:
            sid_embs = self.sid_emb(spk_id.reshape([-1]))
            hs = hs + sid_embs.unsqueeze(1)
        if self.lang_num is not None:
            lid_embs = self.lid_emb(lang_id.reshape([-1]))
            hs = hs + lid_embs.unsqueeze(1)
        if self.spk_embed_dim is not None:
            hs = self._integrate_with_spk_embed(hs, spk_emb)

        return self.dec(hs, hlens, ys)

    def inference(
            self,
            text: paddle.Tensor,
            speech: Optional[paddle.Tensor]=None,
            spk_emb: Optional[paddle.Tensor]=None,
            spk_id: Optional[paddle.Tensor]=None,
            lang_id: Optional[paddle.Tensor]=None,
            threshold: float=0.5,
            minlenratio: float=0.0,
            maxlenratio: float=10.0,
            use_att_constraint: bool=False,
            backward_window: int=1,
            forward_window: int=3,
            use_teacher_forcing: bool=False, ) -> Dict[str, paddle.Tensor]:
        """Generate the sequence of features given the sequences of characters.

        Args:
            text (Tensor(int64)): Input sequence of characters (T_text,).
            speech (Optional[Tensor]): Target feature sequence; only used (and
                then required) when `use_teacher_forcing` is True.
            spk_emb (Optional[Tensor]): Speaker embedding (spk_embed_dim,).
            spk_id (Optional[Tensor]): Speaker ID (1,).
            lang_id (Optional[Tensor]): Language ID (1,).
            threshold (float): Threshold in inference.
            minlenratio (float): Minimum length ratio in inference.
            maxlenratio (float): Maximum length ratio in inference.
            use_att_constraint (bool): Whether to apply attention constraint.
            backward_window (int): Backward window in attention constraint.
            forward_window (int): Forward window in attention constraint.
            use_teacher_forcing (bool): Whether to use teacher forcing.

        Returns:
            Dict[str, Tensor]
                Output dict including the following items:
                * feat_gen (Tensor): Output sequence of features (T_feats, odim).
                * prob (Tensor): Output sequence of stop probabilities (T_feats,).
                    Not present when `use_teacher_forcing` is True.
                * att_w (Tensor): Attention weights (T_feats, T).

        """
        x = text
        y = speech

        # add eos at the last of sequence
        x = F.pad(x, [0, 1], "constant", self.eos)

        # inference with teacher forcing
        if use_teacher_forcing:
            assert speech is not None, "speech must be provided with teacher forcing."

            # add a batch axis of size 1 so the training path can be reused
            xs, ys = x.unsqueeze(0), y.unsqueeze(0)
            spk_emb = None if spk_emb is None else spk_emb.unsqueeze(0)
            ilens = paddle.shape(xs)[1]
            olens = paddle.shape(ys)[1]
            outs, _, _, att_ws = self._forward(
                xs=xs,
                ilens=ilens,
                ys=ys,
                olens=olens,
                spk_emb=spk_emb,
                spk_id=spk_id,
                lang_id=lang_id, )

            return dict(feat_gen=outs[0], att_w=att_ws[0])

        # inference
        h = self.enc.inference(x)

        if self.spk_num is not None:
            sid_emb = self.sid_emb(spk_id.reshape([-1]))
            h = h + sid_emb
        if self.lang_num is not None:
            lid_emb = self.lid_emb(lang_id.reshape([-1]))
            h = h + lid_emb
        if self.spk_embed_dim is not None:
            hs, spk_emb = h.unsqueeze(0), spk_emb.unsqueeze(0)
            h = self._integrate_with_spk_embed(hs, spk_emb)[0]
        out, prob, att_w = self.dec.inference(
            h,
            threshold=threshold,
            minlenratio=minlenratio,
            maxlenratio=maxlenratio,
            use_att_constraint=use_att_constraint,
            backward_window=backward_window,
            forward_window=forward_window, )

        return dict(feat_gen=out, prob=prob, att_w=att_w)

    def _integrate_with_spk_embed(self,
                                  hs: paddle.Tensor,
                                  spk_emb: paddle.Tensor) -> paddle.Tensor:
        """Integrate speaker embedding with hidden states.

        Args:
            hs (Tensor): Batch of hidden state sequences (B, Tmax, eunits).
            spk_emb (Tensor): Batch of speaker embeddings (B, spk_embed_dim).

        Returns:
            Tensor: Batch of integrated hidden state sequences (B, Tmax, eunits) if
                integration_type is "add" else (B, Tmax, eunits + spk_embed_dim).

        Raises:
            NotImplementedError: If the integration type is neither "add" nor
                "concat".

        """
        if self.spk_embed_integration_type == "add":
            # apply projection and then add to hidden states
            spk_emb = self.projection(F.normalize(spk_emb))
            hs = hs + spk_emb.unsqueeze(1)
        elif self.spk_embed_integration_type == "concat":
            # concat hidden states with spk embeds
            spk_emb = F.normalize(spk_emb).unsqueeze(1).expand(
                shape=[-1, paddle.shape(hs)[1], -1])
            hs = paddle.concat([hs, spk_emb], axis=-1)
        else:
            raise NotImplementedError("support only add or concat.")

        return hs


class Tacotron2Inference(nn.Layer):
    """Wrap an acoustic model and a normalizer for inference.

    `forward` runs `model.inference` and applies `normalizer.inverse` to the
    generated features.
    """

    def __init__(self, normalizer, model):
        super().__init__()
        self.normalizer = normalizer
        self.acoustic_model = model

    def forward(self, text, spk_id=None, spk_emb=None):
        """Return `normalizer.inverse` of the "feat_gen" output for `text`."""
        out = self.acoustic_model.inference(
            text, spk_id=spk_id, spk_emb=spk_emb)
        normalized_mel = out["feat_gen"]
        logmel = self.normalizer.inverse(normalized_mel)
        return logmel
class Tacotron2Updater(StandardUpdater):
    """Training updater for Tacotron2.

    One `update_core` call runs the model on a batch, combines the Tacotron2
    losses (plus the optional guided attention loss), takes one optimizer
    step and reports the individual loss values.
    """

    def __init__(self,
                 model: Layer,
                 optimizer: Optimizer,
                 dataloader: DataLoader,
                 init_state=None,
                 use_masking: bool=True,
                 use_weighted_masking: bool=False,
                 bce_pos_weight: float=5.0,
                 loss_type: str="L1+L2",
                 use_guided_attn_loss: bool=True,
                 guided_attn_loss_sigma: float=0.4,
                 guided_attn_loss_lambda: float=1.0,
                 output_dir: Path=None):
        """Build the loss modules and the per-rank file logger.

        Args:
            model (Layer): Model to train.
            optimizer (Optimizer): Optimizer stepping the model parameters.
            dataloader (DataLoader): Source of training batches.
            init_state: Initial updater state, forwarded to `StandardUpdater`.
            use_masking (bool): Forwarded to `Tacotron2Loss`.
            use_weighted_masking (bool): Forwarded to `Tacotron2Loss`.
            bce_pos_weight (float): Forwarded to `Tacotron2Loss`.
            loss_type (str): One of "L1+L2", "L1" or "L2".
            use_guided_attn_loss (bool): Whether to add the guided attention loss.
            guided_attn_loss_sigma (float): `sigma` of `GuidedAttentionLoss`.
            guided_attn_loss_lambda (float): `alpha` of `GuidedAttentionLoss`.
            output_dir (Path): Directory for the `worker_<rank>.log` file. When
                None, no log file is attached.
        """
        # Forward the caller's state; it used to be replaced by None.
        super().__init__(model, optimizer, dataloader, init_state=init_state)

        self.loss_type = loss_type
        self.use_guided_attn_loss = use_guided_attn_loss

        self.taco2_loss = Tacotron2Loss(
            use_masking=use_masking,
            use_weighted_masking=use_weighted_masking,
            bce_pos_weight=bce_pos_weight, )
        if self.use_guided_attn_loss:
            self.attn_loss = GuidedAttentionLoss(
                sigma=guided_attn_loss_sigma,
                alpha=guided_attn_loss_lambda, )

        # `output_dir` defaults to None, in which case `output_dir / ...`
        # would raise; only attach a file handler when a directory is given.
        self.filehandler = None
        if output_dir is not None:
            log_file = Path(output_dir) / 'worker_{}.log'.format(
                dist.get_rank())
            self.filehandler = logging.FileHandler(str(log_file))
            logger.addHandler(self.filehandler)
        self.logger = logger
        self.msg = ""

    def update_core(self, batch):
        """Run one optimization step on `batch` and report the losses.

        Raises:
            ValueError: If `self.loss_type` is not "L1+L2", "L1" or "L2".
        """
        self.msg = "Rank: {}, ".format(dist.get_rank())
        losses_dict = {}
        # spk_id / spk_emb are optional batch entries; when a speaker
        # embedding is present it takes precedence over the speaker id.
        spk_id = batch["spk_id"] if "spk_id" in batch else None
        spk_emb = batch["spk_emb"] if "spk_emb" in batch else None
        if spk_emb is not None:
            spk_id = None

        after_outs, before_outs, logits, ys, stop_labels, olens, att_ws, olens_in = self.model(
            text=batch["text"],
            text_lengths=batch["text_lengths"],
            speech=batch["speech"],
            speech_lengths=batch["speech_lengths"],
            spk_id=spk_id,
            spk_emb=spk_emb)

        # calculate taco2 loss
        l1_loss, mse_loss, bce_loss = self.taco2_loss(
            after_outs=after_outs,
            before_outs=before_outs,
            logits=logits,
            ys=ys,
            stop_labels=stop_labels,
            olens=olens)

        if self.loss_type == "L1+L2":
            loss = l1_loss + mse_loss + bce_loss
        elif self.loss_type == "L1":
            loss = l1_loss + bce_loss
        elif self.loss_type == "L2":
            loss = mse_loss + bce_loss
        else:
            raise ValueError(f"unknown --loss-type {self.loss_type}")

        # calculate attention loss
        if self.use_guided_attn_loss:
            # NOTE: length of output for auto-regressive
            # input will be changed when r > 1
            # (+1 on the text lengths accounts for the eos the model appends)
            attn_loss = self.attn_loss(
                att_ws=att_ws, ilens=batch["text_lengths"] + 1, olens=olens_in)
            loss = loss + attn_loss

        optimizer = self.optimizer
        optimizer.clear_grad()
        loss.backward()
        optimizer.step()

        report("train/l1_loss", float(l1_loss))
        report("train/mse_loss", float(mse_loss))
        report("train/bce_loss", float(bce_loss))
        # `attn_loss` only exists when the guided attention loss is enabled;
        # touching it unconditionally raised UnboundLocalError otherwise.
        if self.use_guided_attn_loss:
            report("train/attn_loss", float(attn_loss))
        report("train/loss", float(loss))

        losses_dict["l1_loss"] = float(l1_loss)
        losses_dict["mse_loss"] = float(mse_loss)
        losses_dict["bce_loss"] = float(bce_loss)
        if self.use_guided_attn_loss:
            losses_dict["attn_loss"] = float(attn_loss)
        losses_dict["loss"] = float(loss)
        self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
                              for k, v in losses_dict.items())
guided_attn_loss_lambda: float=1.0, + output_dir=None): + super().__init__(model, dataloader) + + self.loss_type = loss_type + self.use_guided_attn_loss = use_guided_attn_loss + + self.taco2_loss = Tacotron2Loss( + use_masking=use_masking, + use_weighted_masking=use_weighted_masking, + bce_pos_weight=bce_pos_weight, ) + if self.use_guided_attn_loss: + self.attn_loss = GuidedAttentionLoss( + sigma=guided_attn_loss_sigma, + alpha=guided_attn_loss_lambda, ) + + log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) + self.filehandler = logging.FileHandler(str(log_file)) + logger.addHandler(self.filehandler) + self.logger = logger + self.msg = "" + + def evaluate_core(self, batch): + self.msg = "Evaluate: " + losses_dict = {} + # spk_id!=None in multiple spk fastspeech2 + spk_id = batch["spk_id"] if "spk_id" in batch else None + spk_emb = batch["spk_emb"] if "spk_emb" in batch else None + if spk_emb is not None: + spk_id = None + + after_outs, before_outs, logits, ys, stop_labels, olens, att_ws, olens_in = self.model( + text=batch["text"], + text_lengths=batch["text_lengths"], + speech=batch["speech"], + speech_lengths=batch["speech_lengths"], + spk_id=spk_id, + spk_emb=spk_emb) + + # calculate taco2 loss + l1_loss, mse_loss, bce_loss = self.taco2_loss( + after_outs=after_outs, + before_outs=before_outs, + logits=logits, + ys=ys, + stop_labels=stop_labels, + olens=olens) + + if self.loss_type == "L1+L2": + loss = l1_loss + mse_loss + bce_loss + elif self.loss_type == "L1": + loss = l1_loss + bce_loss + elif self.loss_type == "L2": + loss = mse_loss + bce_loss + else: + raise ValueError(f"unknown --loss-type {self.loss_type}") + + # calculate attention loss + if self.use_guided_attn_loss: + # NOTE: length of output for auto-regressive + # input will be changed when r > 1 + attn_loss = self.attn_loss( + att_ws=att_ws, ilens=batch["text_lengths"] + 1, olens=olens_in) + loss = loss + attn_loss + + report("eval/l1_loss", float(l1_loss)) + report("eval/mse_loss", 
float(mse_loss)) + report("eval/bce_loss", float(bce_loss)) + report("eval/attn_loss", float(attn_loss)) + report("eval/loss", float(loss)) + + losses_dict["l1_loss"] = float(l1_loss) + losses_dict["mse_loss"] = float(mse_loss) + losses_dict["bce_loss"] = float(bce_loss) + losses_dict["attn_loss"] = float(attn_loss) + losses_dict["loss"] = float(loss) + self.msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_dict.items()) + self.logger.info(self.msg) diff --git a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py index ae6d7365..92754c30 100644 --- a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py +++ b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py @@ -48,127 +48,67 @@ class TransformerTTS(nn.Layer): .. _`Neural Speech Synthesis with Transformer Network`: https://arxiv.org/pdf/1809.08895.pdf - Parameters - ---------- - idim : int - Dimension of the inputs. - odim : int - Dimension of the outputs. - embed_dim : int, optional - Dimension of character embedding. - eprenet_conv_layers : int, optional - Number of encoder prenet convolution layers. - eprenet_conv_chans : int, optional - Number of encoder prenet convolution channels. - eprenet_conv_filts : int, optional - Filter size of encoder prenet convolution. - dprenet_layers : int, optional - Number of decoder prenet layers. - dprenet_units : int, optional - Number of decoder prenet hidden units. - elayers : int, optional - Number of encoder layers. - eunits : int, optional - Number of encoder hidden units. - adim : int, optional - Number of attention transformation dimensions. - aheads : int, optional - Number of heads for multi head attention. - dlayers : int, optional - Number of decoder layers. - dunits : int, optional - Number of decoder hidden units. - postnet_layers : int, optional - Number of postnet layers. - postnet_chans : int, optional - Number of postnet channels. 
- postnet_filts : int, optional - Filter size of postnet. - use_scaled_pos_enc : pool, optional - Whether to use trainable scaled positional encoding. - use_batch_norm : bool, optional - Whether to use batch normalization in encoder prenet. - encoder_normalize_before : bool, optional - Whether to perform layer normalization before encoder block. - decoder_normalize_before : bool, optional - Whether to perform layer normalization before decoder block. - encoder_concat_after : bool, optional - Whether to concatenate attention layer's input and output in encoder. - decoder_concat_after : bool, optional - Whether to concatenate attention layer's input and output in decoder. - positionwise_layer_type : str, optional - Position-wise operation type. - positionwise_conv_kernel_size : int, optional - Kernel size in position wise conv 1d. - reduction_factor : int, optional - Reduction factor. - spk_embed_dim : int, optional - Number of speaker embedding dimenstions. - spk_embed_integration_type : str, optional - How to integrate speaker embedding. - use_gst : str, optional - Whether to use global style token. - gst_tokens : int, optional - The number of GST embeddings. - gst_heads : int, optional - The number of heads in GST multihead attention. - gst_conv_layers : int, optional - The number of conv layers in GST. - gst_conv_chans_list : Sequence[int], optional - List of the number of channels of conv layers in GST. - gst_conv_kernel_size : int, optional - Kernal size of conv layers in GST. - gst_conv_stride : int, optional - Stride size of conv layers in GST. - gst_gru_layers : int, optional - The number of GRU layers in GST. - gst_gru_units : int, optional - The number of GRU units in GST. - transformer_lr : float, optional - Initial value of learning rate. - transformer_warmup_steps : int, optional - Optimizer warmup steps. - transformer_enc_dropout_rate : float, optional - Dropout rate in encoder except attention and positional encoding. 
- transformer_enc_positional_dropout_rate : float, optional - Dropout rate after encoder positional encoding. - transformer_enc_attn_dropout_rate : float, optional - Dropout rate in encoder self-attention module. - transformer_dec_dropout_rate : float, optional - Dropout rate in decoder except attention & positional encoding. - transformer_dec_positional_dropout_rate : float, optional - Dropout rate after decoder positional encoding. - transformer_dec_attn_dropout_rate : float, optional - Dropout rate in deocoder self-attention module. - transformer_enc_dec_attn_dropout_rate : float, optional - Dropout rate in encoder-deocoder attention module. - init_type : str, optional - How to initialize transformer parameters. - init_enc_alpha : float, optional - Initial value of alpha in scaled pos encoding of the encoder. - init_dec_alpha : float, optional - Initial value of alpha in scaled pos encoding of the decoder. - eprenet_dropout_rate : float, optional - Dropout rate in encoder prenet. - dprenet_dropout_rate : float, optional - Dropout rate in decoder prenet. - postnet_dropout_rate : float, optional - Dropout rate in postnet. - use_masking : bool, optional - Whether to apply masking for padded part in loss calculation. - use_weighted_masking : bool, optional - Whether to apply weighted masking in loss calculation. - bce_pos_weight : float, optional - Positive sample weight in bce calculation (only for use_masking=true). - loss_type : str, optional - How to calculate loss. - use_guided_attn_loss : bool, optional - Whether to use guided attention loss. - num_heads_applied_guided_attn : int, optional - Number of heads in each layer to apply guided attention loss. - num_layers_applied_guided_attn : int, optional - Number of layers to apply guided attention loss. - List of module names to apply guided attention loss. + Args: + idim (int): Dimension of the inputs. + odim (int): Dimension of the outputs. + embed_dim (int, optional): Dimension of character embedding. 
+ eprenet_conv_layers (int, optional): Number of encoder prenet convolution layers. + eprenet_conv_chans (int, optional): Number of encoder prenet convolution channels. + eprenet_conv_filts (int, optional): Filter size of encoder prenet convolution. + dprenet_layers (int, optional): Number of decoder prenet layers. + dprenet_units (int, optional): Number of decoder prenet hidden units. + elayers (int, optional): Number of encoder layers. + eunits (int, optional): Number of encoder hidden units. + adim (int, optional): Number of attention transformation dimensions. + aheads (int, optional): Number of heads for multi head attention. + dlayers (int, optional): Number of decoder layers. + dunits (int, optional): Number of decoder hidden units. + postnet_layers (int, optional): Number of postnet layers. + postnet_chans (int, optional): Number of postnet channels. + postnet_filts (int, optional): Filter size of postnet. + use_scaled_pos_enc (pool, optional): Whether to use trainable scaled positional encoding. + use_batch_norm (bool, optional): Whether to use batch normalization in encoder prenet. + encoder_normalize_before (bool, optional): Whether to perform layer normalization before encoder block. + decoder_normalize_before (bool, optional): Whether to perform layer normalization before decoder block. + encoder_concat_after (bool, optional): Whether to concatenate attention layer's input and output in encoder. + decoder_concat_after (bool, optional): Whether to concatenate attention layer's input and output in decoder. + positionwise_layer_type (str, optional): Position-wise operation type. + positionwise_conv_kernel_size (int, optional): Kernel size in position wise conv 1d. + reduction_factor (int, optional): Reduction factor. + spk_embed_dim (int, optional): Number of speaker embedding dimenstions. + spk_embed_integration_type (str, optional): How to integrate speaker embedding. + use_gst (str, optional): Whether to use global style token. 
+ gst_tokens (int, optional): The number of GST embeddings. + gst_heads (int, optional): The number of heads in GST multihead attention. + gst_conv_layers (int, optional): The number of conv layers in GST. + gst_conv_chans_list (Sequence[int], optional): List of the number of channels of conv layers in GST. + gst_conv_kernel_size (int, optional): Kernal size of conv layers in GST. + gst_conv_stride (int, optional): Stride size of conv layers in GST. + gst_gru_layers (int, optional): The number of GRU layers in GST. + gst_gru_units (int, optional): The number of GRU units in GST. + transformer_lr (float, optional): Initial value of learning rate. + transformer_warmup_steps (int, optional): Optimizer warmup steps. + transformer_enc_dropout_rate (float, optional): Dropout rate in encoder except attention and positional encoding. + transformer_enc_positional_dropout_rate (float, optional): Dropout rate after encoder positional encoding. + transformer_enc_attn_dropout_rate (float, optional): Dropout rate in encoder self-attention module. + transformer_dec_dropout_rate (float, optional): Dropout rate in decoder except attention & positional encoding. + transformer_dec_positional_dropout_rate (float, optional): Dropout rate after decoder positional encoding. + transformer_dec_attn_dropout_rate (float, optional): Dropout rate in deocoder self-attention module. + transformer_enc_dec_attn_dropout_rate (float, optional): Dropout rate in encoder-deocoder attention module. + init_type (str, optional): How to initialize transformer parameters. + init_enc_alpha (float, optional): Initial value of alpha in scaled pos encoding of the encoder. + init_dec_alpha (float, optional): Initial value of alpha in scaled pos encoding of the decoder. + eprenet_dropout_rate (float, optional): Dropout rate in encoder prenet. + dprenet_dropout_rate (float, optional): Dropout rate in decoder prenet. + postnet_dropout_rate (float, optional): Dropout rate in postnet. 
+ use_masking (bool, optional): Whether to apply masking for padded part in loss calculation. + use_weighted_masking (bool, optional): Whether to apply weighted masking in loss calculation. + bce_pos_weight (float, optional): Positive sample weight in bce calculation (only for use_masking=true). + loss_type (str, optional): How to calculate loss. + use_guided_attn_loss (bool, optional): Whether to use guided attention loss. + num_heads_applied_guided_attn (int, optional): Number of heads in each layer to apply guided attention loss. + num_layers_applied_guided_attn (int, optional): Number of layers to apply guided attention loss. + List of module names to apply guided attention loss. """ def __init__( @@ -398,25 +338,16 @@ class TransformerTTS(nn.Layer): ) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]: """Calculate forward propagation. - Parameters - ---------- - text : Tensor(int64) - Batch of padded character ids (B, Tmax). - text_lengths : Tensor(int64) - Batch of lengths of each input batch (B,). - speech : Tensor - Batch of padded target features (B, Lmax, odim). - speech_lengths : Tensor(int64) - Batch of the lengths of each target (B,). - spk_emb : Tensor, optional - Batch of speaker embeddings (B, spk_embed_dim). - - Returns - ---------- - Tensor - Loss scalar value. - Dict - Statistics to be monitored. + Args: + text(Tensor(int64)): Batch of padded character ids (B, Tmax). + text_lengths(Tensor(int64)): Batch of lengths of each input batch (B,). + speech(Tensor): Batch of padded target features (B, Lmax, odim). + speech_lengths(Tensor(int64)): Batch of the lengths of each target (B,). + spk_emb(Tensor, optional): Batch of speaker embeddings (B, spk_embed_dim). + + Returns: + Tensor: Loss scalar value. + Dict: Statistics to be monitored. 
""" # input of embedding must be int64 @@ -433,12 +364,10 @@ class TransformerTTS(nn.Layer): olens = paddle.cast(speech_lengths, 'int64') # make labels for stop prediction - labels = make_pad_mask(olens - 1) - labels = numpy.pad( - labels.numpy(), ((0, 0), (0, 1)), 'constant', constant_values=1.0) - labels = paddle.to_tensor(labels) - labels = paddle.cast(labels, dtype="float32") - # labels = F.pad(labels, [0, 1], "constant", 1.0) + stop_labels = make_pad_mask(olens - 1) + # bool 类型无法切片 + stop_labels = paddle.cast(stop_labels, dtype='float32') + stop_labels = F.pad(stop_labels, [0, 0, 0, 1], "constant", 1.0) # calculate transformer outputs after_outs, before_outs, logits = self._forward(xs, ilens, ys, olens, @@ -447,12 +376,15 @@ class TransformerTTS(nn.Layer): # modifiy mod part of groundtruth if self.reduction_factor > 1: - olens = paddle.to_tensor( - [olen - olen % self.reduction_factor for olen in olens.numpy()]) + olens = olens - olens % self.reduction_factor max_olen = max(olens) ys = ys[:, :max_olen] - labels = labels[:, :max_olen] - labels[:, -1] = 1.0 # make sure at least one frame has 1 + stop_labels = stop_labels[:, :max_olen] + stop_labels[:, -1] = 1.0 # make sure at least one frame has 1 + olens_in = olens // self.reduction_factor + else: + olens_in = olens + need_dict = {} need_dict['encoder'] = self.encoder need_dict['decoder'] = self.decoder @@ -462,7 +394,7 @@ class TransformerTTS(nn.Layer): 'num_layers_applied_guided_attn'] = self.num_layers_applied_guided_attn need_dict['use_scaled_pos_enc'] = self.use_scaled_pos_enc - return after_outs, before_outs, logits, ys, labels, olens, ilens, need_dict + return after_outs, before_outs, logits, ys, stop_labels, olens, olens_in, need_dict def _forward( self, @@ -488,8 +420,7 @@ class TransformerTTS(nn.Layer): # thin out frames for reduction factor (B, Lmax, odim) -> (B, Lmax//r, odim) if self.reduction_factor > 1: ys_in = ys[:, self.reduction_factor - 1::self.reduction_factor] - olens_in = olens.new( - 
[olen // self.reduction_factor for olen in olens]) + olens_in = olens // self.reduction_factor else: ys_in, olens_in = ys, olens @@ -525,31 +456,19 @@ class TransformerTTS(nn.Layer): ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: """Generate the sequence of features given the sequences of characters. - Parameters - ---------- - text : Tensor(int64) - Input sequence of characters (T,). - speech : Tensor, optional - Feature sequence to extract style (N, idim). - spk_emb : Tensor, optional - Speaker embedding vector (spk_embed_dim,). - threshold : float, optional - Threshold in inference. - minlenratio : float, optional - Minimum length ratio in inference. - maxlenratio : float, optional - Maximum length ratio in inference. - use_teacher_forcing : bool, optional - Whether to use teacher forcing. - - Returns - ---------- - Tensor - Output sequence of features (L, odim). - Tensor - Output sequence of stop probabilities (L,). - Tensor - Encoder-decoder (source) attention weights (#layers, #heads, L, T). + Args: + text(Tensor(int64)): Input sequence of characters (T,). + speech(Tensor, optional): Feature sequence to extract style (N, idim). + spk_emb(Tensor, optional): Speaker embedding vector (spk_embed_dim,). + threshold(float, optional): Threshold in inference. + minlenratio(float, optional): Minimum length ratio in inference. + maxlenratio(float, optional): Maximum length ratio in inference. + use_teacher_forcing(bool, optional): Whether to use teacher forcing. + + Returns: + Tensor: Output sequence of features (L, odim). + Tensor: Output sequence of stop probabilities (L,). + Tensor: Encoder-decoder (source) attention weights (#layers, #heads, L, T). """ # input of embedding must be int64 @@ -671,23 +590,17 @@ class TransformerTTS(nn.Layer): def _source_mask(self, ilens: paddle.Tensor) -> paddle.Tensor: """Make masks for self-attention. - Parameters - ---------- - ilens : Tensor - Batch of lengths (B,). + Args: + ilens(Tensor): Batch of lengths (B,). 
- Returns - ------- - Tensor - Mask tensor for self-attention. - dtype=paddle.bool + Returns: + Tensor: Mask tensor for self-attention. dtype=paddle.bool - Examples - ------- - >>> ilens = [5, 3] - >>> self._source_mask(ilens) - tensor([[[1, 1, 1, 1, 1], - [1, 1, 1, 0, 0]]]) bool + Examples: + >>> ilens = [5, 3] + >>> self._source_mask(ilens) + tensor([[[1, 1, 1, 1, 1], + [1, 1, 1, 0, 0]]]) bool """ x_masks = make_non_pad_mask(ilens) @@ -696,30 +609,25 @@ class TransformerTTS(nn.Layer): def _target_mask(self, olens: paddle.Tensor) -> paddle.Tensor: """Make masks for masked self-attention. - Parameters - ---------- - olens : LongTensor - Batch of lengths (B,). - - Returns - ---------- - Tensor - Mask tensor for masked self-attention. - - Examples - ---------- - >>> olens = [5, 3] - >>> self._target_mask(olens) - tensor([[[1, 0, 0, 0, 0], - [1, 1, 0, 0, 0], - [1, 1, 1, 0, 0], - [1, 1, 1, 1, 0], - [1, 1, 1, 1, 1]], - [[1, 0, 0, 0, 0], - [1, 1, 0, 0, 0], - [1, 1, 1, 0, 0], - [1, 1, 1, 0, 0], - [1, 1, 1, 0, 0]]], dtype=paddle.uint8) + Args: + olens (Tensor(int64)): Batch of lengths (B,). + + Returns: + Tensor: Mask tensor for masked self-attention. + + Examples: + >>> olens = [5, 3] + >>> self._target_mask(olens) + tensor([[[1, 0, 0, 0, 0], + [1, 1, 0, 0, 0], + [1, 1, 1, 0, 0], + [1, 1, 1, 1, 0], + [1, 1, 1, 1, 1]], + [[1, 0, 0, 0, 0], + [1, 1, 0, 0, 0], + [1, 1, 1, 0, 0], + [1, 1, 1, 0, 0], + [1, 1, 1, 0, 0]]], dtype=paddle.uint8) """ y_masks = make_non_pad_mask(olens) @@ -731,17 +639,12 @@ class TransformerTTS(nn.Layer): spk_emb: paddle.Tensor) -> paddle.Tensor: """Integrate speaker embedding with hidden states. - Parameters - ---------- - hs : Tensor - Batch of hidden state sequences (B, Tmax, adim). - spk_emb : Tensor - Batch of speaker embeddings (B, spk_embed_dim). + Args: + hs(Tensor): Batch of hidden state sequences (B, Tmax, adim). + spk_emb(Tensor): Batch of speaker embeddings (B, spk_embed_dim). 
- Returns - ---------- - Tensor - Batch of integrated hidden state sequences (B, Tmax, adim). + Returns: + Tensor: Batch of integrated hidden state sequences (B, Tmax, adim). """ if self.spk_embed_integration_type == "add": @@ -769,318 +672,3 @@ class TransformerTTSInference(nn.Layer): normalized_mel = self.acoustic_model.inference(text)[0] logmel = self.normalizer.inverse(normalized_mel) return logmel - - -class TransformerTTSLoss(nn.Layer): - """Loss function module for Tacotron2.""" - - def __init__(self, - use_masking=True, - use_weighted_masking=False, - bce_pos_weight=5.0): - """Initialize Tactoron2 loss module. - - Parameters - ---------- - use_masking : bool - Whether to apply masking for padded part in loss calculation. - use_weighted_masking : bool - Whether to apply weighted masking in loss calculation. - bce_pos_weight : float - Weight of positive sample of stop token. - - """ - super().__init__() - assert (use_masking != use_weighted_masking) or not use_masking - self.use_masking = use_masking - self.use_weighted_masking = use_weighted_masking - - # define criterions - reduction = "none" if self.use_weighted_masking else "mean" - self.l1_criterion = nn.L1Loss(reduction=reduction) - self.mse_criterion = nn.MSELoss(reduction=reduction) - self.bce_criterion = nn.BCEWithLogitsLoss( - reduction=reduction, pos_weight=paddle.to_tensor(bce_pos_weight)) - - def forward(self, after_outs, before_outs, logits, ys, labels, olens): - """Calculate forward propagation. - - Parameters - ---------- - after_outs : Tensor - Batch of outputs after postnets (B, Lmax, odim). - before_outs : Tensor - Batch of outputs before postnets (B, Lmax, odim). - logits : Tensor - Batch of stop logits (B, Lmax). - ys : Tensor - Batch of padded target features (B, Lmax, odim). - labels : LongTensor - Batch of the sequences of stop token labels (B, Lmax). - olens : LongTensor - Batch of the lengths of each target (B,). - - Returns - ---------- - Tensor - L1 loss value. 
- Tensor - Mean square error loss value. - Tensor - Binary cross entropy loss value. - - """ - # make mask and apply it - if self.use_masking: - masks = make_non_pad_mask(olens).unsqueeze(-1) - ys = ys.masked_select(masks.broadcast_to(ys.shape)) - after_outs = after_outs.masked_select( - masks.broadcast_to(after_outs.shape)) - before_outs = before_outs.masked_select( - masks.broadcast_to(before_outs.shape)) - # Operator slice does not have kernel for data_type[bool] - tmp_masks = paddle.cast(masks, dtype='int64') - tmp_masks = tmp_masks[:, :, 0] - tmp_masks = paddle.cast(tmp_masks, dtype='bool') - labels = labels.masked_select(tmp_masks.broadcast_to(labels.shape)) - logits = logits.masked_select(tmp_masks.broadcast_to(logits.shape)) - - # calculate loss - l1_loss = self.l1_criterion(after_outs, ys) + self.l1_criterion( - before_outs, ys) - mse_loss = self.mse_criterion(after_outs, ys) + self.mse_criterion( - before_outs, ys) - bce_loss = self.bce_criterion(logits, labels) - - # make weighted mask and apply it - if self.use_weighted_masking: - masks = make_non_pad_mask(olens).unsqueeze(-1) - weights = masks.float() / masks.sum(dim=1, keepdim=True).float() - out_weights = weights.div(ys.shape[0] * ys.shape[2]) - logit_weights = weights.div(ys.shape[0]) - - # apply weight - l1_loss = l1_loss.multiply(out_weights) - l1_loss = l1_loss.masked_select( - masks.broadcast_to(l1_loss.shape)).sum() - - mse_loss = mse_loss.multiply(out_weights) - mse_loss = mse_loss.masked_select( - masks.broadcast_to(mse_loss.shape)).sum() - - bce_loss = bce_loss.multiply(logit_weights.squeeze(-1)) - bce_loss = bce_loss.masked_select( - masks.squeeze(-1).broadcast_to(bce_loss.shape)).sum() - - return l1_loss, mse_loss, bce_loss - - -class GuidedAttentionLoss(nn.Layer): - """Guided attention loss function module. 
- - This module calculates the guided attention loss described - in `Efficiently Trainable Text-to-Speech System Based - on Deep Convolutional Networks with Guided Attention`_, - which forces the attention to be diagonal. - - .. _`Efficiently Trainable Text-to-Speech System - Based on Deep Convolutional Networks with Guided Attention`: - https://arxiv.org/abs/1710.08969 - - """ - - def __init__(self, sigma=0.4, alpha=1.0, reset_always=True): - """Initialize guided attention loss module. - - Parameters - ---------- - sigma : float, optional - Standard deviation to control how close attention to a diagonal. - alpha : float, optional - Scaling coefficient (lambda). - reset_always : bool, optional - Whether to always reset masks. - - """ - super(GuidedAttentionLoss, self).__init__() - self.sigma = sigma - self.alpha = alpha - self.reset_always = reset_always - self.guided_attn_masks = None - self.masks = None - - def _reset_masks(self): - self.guided_attn_masks = None - self.masks = None - - def forward(self, att_ws, ilens, olens): - """Calculate forward propagation. - - Parameters - ---------- - att_ws : Tensor - Batch of attention weights (B, T_max_out, T_max_in). - ilens : LongTensor - Batch of input lenghts (B,). - olens : LongTensor - Batch of output lenghts (B,). - - Returns - ---------- - Tensor - Guided attention loss value. 
- - """ - if self.guided_attn_masks is None: - self.guided_attn_masks = self._make_guided_attention_masks(ilens, - olens) - if self.masks is None: - self.masks = self._make_masks(ilens, olens) - losses = self.guided_attn_masks * att_ws - loss = paddle.mean( - losses.masked_select(self.masks.broadcast_to(losses.shape))) - if self.reset_always: - self._reset_masks() - return self.alpha * loss - - def _make_guided_attention_masks(self, ilens, olens): - n_batches = len(ilens) - max_ilen = max(ilens) - max_olen = max(olens) - guided_attn_masks = paddle.zeros((n_batches, max_olen, max_ilen)) - - for idx, (ilen, olen) in enumerate(zip(ilens, olens)): - - ilen = int(ilen) - olen = int(olen) - guided_attn_masks[idx, :olen, : - ilen] = self._make_guided_attention_mask( - ilen, olen, self.sigma) - return guided_attn_masks - - @staticmethod - def _make_guided_attention_mask(ilen, olen, sigma): - """Make guided attention mask. - - Examples - ---------- - >>> guided_attn_mask =_make_guided_attention(5, 5, 0.4) - >>> guided_attn_mask.shape - [5, 5] - >>> guided_attn_mask - tensor([[0.0000, 0.1175, 0.3935, 0.6753, 0.8647], - [0.1175, 0.0000, 0.1175, 0.3935, 0.6753], - [0.3935, 0.1175, 0.0000, 0.1175, 0.3935], - [0.6753, 0.3935, 0.1175, 0.0000, 0.1175], - [0.8647, 0.6753, 0.3935, 0.1175, 0.0000]]) - >>> guided_attn_mask =_make_guided_attention(3, 6, 0.4) - >>> guided_attn_mask.shape - [6, 3] - >>> guided_attn_mask - tensor([[0.0000, 0.2934, 0.7506], - [0.0831, 0.0831, 0.5422], - [0.2934, 0.0000, 0.2934], - [0.5422, 0.0831, 0.0831], - [0.7506, 0.2934, 0.0000], - [0.8858, 0.5422, 0.0831]]) - - """ - grid_x, grid_y = paddle.meshgrid( - paddle.arange(olen), paddle.arange(ilen)) - grid_x = grid_x.cast(dtype=paddle.float32) - grid_y = grid_y.cast(dtype=paddle.float32) - return 1.0 - paddle.exp(-( - (grid_y / ilen - grid_x / olen)**2) / (2 * (sigma**2))) - - @staticmethod - def _make_masks(ilens, olens): - """Make masks indicating non-padded part. 
- - Parameters - ---------- - ilens (LongTensor or List): Batch of lengths (B,). - olens (LongTensor or List): Batch of lengths (B,). - - Returns - ---------- - Tensor - Mask tensor indicating non-padded part. - - Examples - ---------- - >>> ilens, olens = [5, 2], [8, 5] - >>> _make_mask(ilens, olens) - tensor([[[1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1]], - - [[1, 1, 0, 0, 0], - [1, 1, 0, 0, 0], - [1, 1, 0, 0, 0], - [1, 1, 0, 0, 0], - [1, 1, 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0]]], dtype=paddle.uint8) - - """ - # (B, T_in) - in_masks = make_non_pad_mask(ilens) - # (B, T_out) - out_masks = make_non_pad_mask(olens) - # (B, T_out, T_in) - - return paddle.logical_and( - out_masks.unsqueeze(-1), in_masks.unsqueeze(-2)) - - -class GuidedMultiHeadAttentionLoss(GuidedAttentionLoss): - """Guided attention loss function module for multi head attention. - - Parameters - ---------- - sigma : float, optional - Standard deviation to controlGuidedAttentionLoss - how close attention to a diagonal. - alpha : float, optional - Scaling coefficient (lambda). - reset_always : bool, optional - Whether to always reset masks. - - """ - - def forward(self, att_ws, ilens, olens): - """Calculate forward propagation. - - Parameters - ---------- - att_ws : Tensor - Batch of multi head attention weights (B, H, T_max_out, T_max_in). - ilens : Tensor - Batch of input lenghts (B,). - olens : Tensor - Batch of output lenghts (B,). - - Returns - ---------- - Tensor - Guided attention loss value. 
- - """ - if self.guided_attn_masks is None: - self.guided_attn_masks = ( - self._make_guided_attention_masks(ilens, olens).unsqueeze(1)) - if self.masks is None: - self.masks = self._make_masks(ilens, olens).unsqueeze(1) - losses = self.guided_attn_masks * att_ws - loss = paddle.mean( - losses.masked_select(self.masks.broadcast_to(losses.shape))) - if self.reset_always: - self._reset_masks() - - return self.alpha * loss diff --git a/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py b/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py index f16cf4dd..dff908e0 100644 --- a/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py +++ b/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py @@ -12,13 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. import logging +from pathlib import Path from typing import Sequence import paddle from paddle import distributed as dist +from paddle.io import DataLoader +from paddle.nn import Layer +from paddle.optimizer import Optimizer -from paddlespeech.t2s.models.transformer_tts import GuidedMultiHeadAttentionLoss -from paddlespeech.t2s.models.transformer_tts import TransformerTTSLoss +from paddlespeech.t2s.modules.losses import GuidedMultiHeadAttentionLoss +from paddlespeech.t2s.modules.losses import Tacotron2Loss as TransformerTTSLoss from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator from paddlespeech.t2s.training.reporter import report from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater @@ -32,38 +36,34 @@ logger.setLevel(logging.INFO) class TransformerTTSUpdater(StandardUpdater): def __init__( self, - model, - optimizer, - dataloader, + model: Layer, + optimizer: Optimizer, + dataloader: DataLoader, init_state=None, - use_masking=False, - use_weighted_masking=False, - output_dir=None, - bce_pos_weight=5.0, + use_masking: bool=False, + use_weighted_masking: 
bool=False, + output_dir: Path=None, + bce_pos_weight: float=5.0, loss_type: str="L1", use_guided_attn_loss: bool=True, modules_applied_guided_attn: Sequence[str]=("encoder-decoder"), guided_attn_loss_sigma: float=0.4, guided_attn_loss_lambda: float=1.0, ): super().__init__(model, optimizer, dataloader, init_state=None) - self.use_masking = use_masking - self.use_weighted_masking = use_weighted_masking - self.bce_pos_weight = bce_pos_weight + self.loss_type = loss_type self.use_guided_attn_loss = use_guided_attn_loss - self.guided_attn_loss_sigma = guided_attn_loss_sigma - self.guided_attn_loss_lambda = guided_attn_loss_lambda self.modules_applied_guided_attn = modules_applied_guided_attn self.criterion = TransformerTTSLoss( - use_masking=self.use_masking, - use_weighted_masking=self.use_weighted_masking, - bce_pos_weight=self.bce_pos_weight) + use_masking=use_masking, + use_weighted_masking=use_weighted_masking, + bce_pos_weight=bce_pos_weight) if self.use_guided_attn_loss: self.attn_criterion = GuidedMultiHeadAttentionLoss( - sigma=self.guided_attn_loss_sigma, - alpha=self.guided_attn_loss_lambda, ) + sigma=guided_attn_loss_sigma, + alpha=guided_attn_loss_lambda, ) log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) self.filehandler = logging.FileHandler(str(log_file)) @@ -75,7 +75,7 @@ class TransformerTTSUpdater(StandardUpdater): self.msg = "Rank: {}, ".format(dist.get_rank()) losses_dict = {} - after_outs, before_outs, logits, ys, labels, olens, ilens, need_dict = self.model( + after_outs, before_outs, logits, ys, stop_labels, olens, olens_in, need_dict = self.model( text=batch["text"], text_lengths=batch["text_lengths"], speech=batch["speech"], @@ -86,7 +86,7 @@ class TransformerTTSUpdater(StandardUpdater): before_outs=before_outs, logits=logits, ys=ys, - labels=labels, + stop_labels=stop_labels, olens=olens) report("train/bce_loss", float(bce_loss)) @@ -120,7 +120,10 @@ class TransformerTTSUpdater(StandardUpdater): break # (B, H*L, T_in, T_in) 
att_ws = paddle.concat(att_ws, axis=1) - enc_attn_loss = self.attn_criterion(att_ws, ilens, ilens) + enc_attn_loss = self.attn_criterion( + att_ws=att_ws, + ilens=batch["text_lengths"] + 1, + olens=batch["text_lengths"] + 1) loss = loss + enc_attn_loss report("train/enc_attn_loss", float(enc_attn_loss)) losses_dict["enc_attn_loss"] = float(enc_attn_loss) @@ -137,7 +140,8 @@ class TransformerTTSUpdater(StandardUpdater): break # (B, H*L, T_out, T_out) att_ws = paddle.concat(att_ws, axis=1) - dec_attn_loss = self.attn_criterion(att_ws, olens, olens) + dec_attn_loss = self.attn_criterion( + att_ws=att_ws, ilens=olens_in, olens=olens_in) report("train/dec_attn_loss", float(dec_attn_loss)) losses_dict["dec_attn_loss"] = float(dec_attn_loss) loss = loss + dec_attn_loss @@ -154,7 +158,10 @@ class TransformerTTSUpdater(StandardUpdater): break # (B, H*L, T_out, T_in) att_ws = paddle.concat(att_ws, axis=1) - enc_dec_attn_loss = self.attn_criterion(att_ws, ilens, olens) + enc_dec_attn_loss = self.attn_criterion( + att_ws=att_ws, + ilens=batch["text_lengths"] + 1, + olens=olens_in) report("train/enc_dec_attn_loss", float(enc_dec_attn_loss)) losses_dict["enc_dec_attn_loss"] = float(enc_dec_attn_loss) loss = loss + enc_dec_attn_loss @@ -182,37 +189,33 @@ class TransformerTTSUpdater(StandardUpdater): class TransformerTTSEvaluator(StandardEvaluator): def __init__( self, - model, - dataloader, + model: Layer, + dataloader: DataLoader, init_state=None, - use_masking=False, - use_weighted_masking=False, - output_dir=None, - bce_pos_weight=5.0, + use_masking: bool=False, + use_weighted_masking: bool=False, + output_dir: Path=None, + bce_pos_weight: float=5.0, loss_type: str="L1", use_guided_attn_loss: bool=True, modules_applied_guided_attn: Sequence[str]=("encoder-decoder"), guided_attn_loss_sigma: float=0.4, guided_attn_loss_lambda: float=1.0, ): super().__init__(model, dataloader) - self.use_masking = use_masking - self.use_weighted_masking = use_weighted_masking - 
self.bce_pos_weight = bce_pos_weight + self.loss_type = loss_type self.use_guided_attn_loss = use_guided_attn_loss - self.guided_attn_loss_sigma = guided_attn_loss_sigma - self.guided_attn_loss_lambda = guided_attn_loss_lambda self.modules_applied_guided_attn = modules_applied_guided_attn self.criterion = TransformerTTSLoss( - use_masking=self.use_masking, - use_weighted_masking=self.use_weighted_masking, - bce_pos_weight=self.bce_pos_weight) + use_masking=use_masking, + use_weighted_masking=use_weighted_masking, + bce_pos_weight=bce_pos_weight) if self.use_guided_attn_loss: self.attn_criterion = GuidedMultiHeadAttentionLoss( - sigma=self.guided_attn_loss_sigma, - alpha=self.guided_attn_loss_lambda, ) + sigma=guided_attn_loss_sigma, + alpha=guided_attn_loss_lambda, ) log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) self.filehandler = logging.FileHandler(str(log_file)) @@ -223,7 +226,7 @@ class TransformerTTSEvaluator(StandardEvaluator): def evaluate_core(self, batch): self.msg = "Evaluate: " losses_dict = {} - after_outs, before_outs, logits, ys, labels, olens, ilens, need_dict = self.model( + after_outs, before_outs, logits, ys, stop_labels, olens, olens_in, need_dict = self.model( text=batch["text"], text_lengths=batch["text_lengths"], speech=batch["speech"], @@ -234,7 +237,7 @@ class TransformerTTSEvaluator(StandardEvaluator): before_outs=before_outs, logits=logits, ys=ys, - labels=labels, + stop_labels=stop_labels, olens=olens) report("eval/bce_loss", float(bce_loss)) @@ -268,7 +271,10 @@ class TransformerTTSEvaluator(StandardEvaluator): break # (B, H*L, T_in, T_in) att_ws = paddle.concat(att_ws, axis=1) - enc_attn_loss = self.attn_criterion(att_ws, ilens, ilens) + enc_attn_loss = self.attn_criterion( + att_ws=att_ws, + ilens=batch["text_lengths"] + 1, + olens=batch["text_lengths"] + 1) loss = loss + enc_attn_loss report("train/enc_attn_loss", float(enc_attn_loss)) losses_dict["enc_attn_loss"] = float(enc_attn_loss) @@ -285,7 +291,8 @@ class 
TransformerTTSEvaluator(StandardEvaluator): break # (B, H*L, T_out, T_out) att_ws = paddle.concat(att_ws, axis=1) - dec_attn_loss = self.attn_criterion(att_ws, olens, olens) + dec_attn_loss = self.attn_criterion( + att_ws=att_ws, ilens=olens_in, olens=olens_in) report("eval/dec_attn_loss", float(dec_attn_loss)) losses_dict["dec_attn_loss"] = float(dec_attn_loss) loss = loss + dec_attn_loss @@ -303,7 +310,10 @@ class TransformerTTSEvaluator(StandardEvaluator): break # (B, H*L, T_out, T_in) att_ws = paddle.concat(att_ws, axis=1) - enc_dec_attn_loss = self.attn_criterion(att_ws, ilens, olens) + enc_dec_attn_loss = self.attn_criterion( + att_ws=att_ws, + ilens=batch["text_lengths"] + 1, + olens=olens_in) report("eval/enc_dec_attn_loss", float(enc_dec_attn_loss)) losses_dict["enc_dec_attn_loss"] = float(enc_dec_attn_loss) loss = loss + enc_dec_attn_loss diff --git a/paddlespeech/t2s/models/waveflow.py b/paddlespeech/t2s/models/waveflow.py index e519e0c5..52e6005b 100644 --- a/paddlespeech/t2s/models/waveflow.py +++ b/paddlespeech/t2s/models/waveflow.py @@ -30,20 +30,14 @@ __all__ = ["WaveFlow", "ConditionalWaveFlow", "WaveFlowLoss"] def fold(x, n_group): - r"""Fold audio or spectrogram's temporal dimension in to groups. + """Fold audio or spectrogram's temporal dimension in to groups. - Parameters - ---------- - x : Tensor [shape=(\*, time_steps) - The input tensor. + Args: + x(Tensor): The input tensor. shape=(*, time_steps) + n_group(int): The size of a group. - n_group : int - The size of a group. - - Returns - --------- - Tensor : [shape=(\*, time_steps // n_group, group)] - Folded tensor. + Returns: + Tensor: Folded tensor. shape=(*, time_steps // n_group, group) """ spatial_shape = list(x.shape[:-1]) time_steps = paddle.shape(x)[-1] @@ -58,27 +52,23 @@ class UpsampleNet(nn.LayerList): It consists of several conv2dtranspose layers which perform deconvolution on mel and time dimension. 
- Parameters - ---------- - upscale_factors : List[int], optional - Time upsampling factors for each Conv2DTranspose Layer. - - The ``UpsampleNet`` contains ``len(upscale_factor)`` Conv2DTranspose - Layers. Each upscale_factor is used as the ``stride`` for the - corresponding Conv2DTranspose. Defaults to [16, 16], this the default - upsampling factor is 256. + Args: + upscale_factors(List[int], optional): Time upsampling factors for each Conv2DTranspose Layer. + The ``UpsampleNet`` contains ``len(upscale_factor)`` Conv2DTranspose + Layers. Each upscale_factor is used as the ``stride`` for the + corresponding Conv2DTranspose. Defaults to [16, 16], this the default + upsampling factor is 256. - Notes - ------ - ``np.prod(upscale_factors)`` should equals the ``hop_length`` of the stft - transformation used to extract spectrogram features from audio. + Notes: + ``np.prod(upscale_factors)`` should equals the ``hop_length`` of the stft + transformation used to extract spectrogram features from audio. - For example, ``16 * 16 = 256``, then the spectrogram extracted with a stft - transformation whose ``hop_length`` equals 256 is suitable. + For example, ``16 * 16 = 256``, then the spectrogram extracted with a stft + transformation whose ``hop_length`` equals 256 is suitable. - See Also - --------- - ``librosa.core.stft`` + See Also + + ``librosa.core.stft`` """ def __init__(self, upsample_factors): @@ -101,25 +91,18 @@ class UpsampleNet(nn.LayerList): self.upsample_factors = upsample_factors def forward(self, x, trim_conv_artifact=False): - r"""Forward pass of the ``UpsampleNet``. + """Forward pass of the ``UpsampleNet`` - Parameters - ----------- - x : Tensor [shape=(batch_size, input_channels, time_steps)] - The input spectrogram. + Args: + x(Tensor): The input spectrogram. shape=(batch_size, input_channels, time_steps) + trim_conv_artifact(bool, optional, optional): Trim deconvolution artifact at each layer. Defaults to False. 
- trim_conv_artifact : bool, optional - Trim deconvolution artifact at each layer. Defaults to False. + Returns: + Tensor: The upsampled spectrogram. shape=(batch_size, input_channels, time_steps * upsample_factor) - Returns - -------- - Tensor: [shape=(batch_size, input_channels, time_steps \* upsample_factor)] - The upsampled spectrogram. - - Notes - -------- - If trim_conv_artifact is ``True``, the output time steps is less - than ``time_steps \* upsample_factors``. + Notes: + If trim_conv_artifact is ``True``, the output time steps is less + than ``time_steps * upsample_factors``. """ x = paddle.unsqueeze(x, 1) # (B, C, T) -> (B, 1, C, T) for layer in self: @@ -139,19 +122,11 @@ class ResidualBlock(nn.Layer): same paddign in width dimension. It also has projection for the condition and output. - Parameters - ---------- - channels : int - Feature size of the input. - - cond_channels : int - Featuer size of the condition. - - kernel_size : Tuple[int] - Kernel size of the Convolution2d applied to the input. - - dilations : int - Dilations of the Convolution2d applied to the input. + Args: + channels (int): Feature size of the input. + cond_channels (int): Featuer size of the condition. + kernel_size (Tuple[int]): Kernel size of the Convolution2d applied to the input. + dilations (int): Dilations of the Convolution2d applied to the input. """ def __init__(self, channels, cond_channels, kernel_size, dilations): @@ -197,21 +172,13 @@ class ResidualBlock(nn.Layer): def forward(self, x, condition): """Compute output for a whole folded sequence. - Parameters - ---------- - x : Tensor [shape=(batch_size, channel, height, width)] - The input. - - condition : Tensor [shape=(batch_size, condition_channel, height, width)] - The local condition. + Args: + x (Tensor): The input. [shape=(batch_size, channel, height, width)] + condition (Tensor [shape=(batch_size, condition_channel, height, width)]): The local condition. 
- Returns - ------- - res : Tensor [shape=(batch_size, channel, height, width)] - The residual output. - - skip : Tensor [shape=(batch_size, channel, height, width)] - The skip output. + Returns: + res (Tensor): The residual output. [shape=(batch_size, channel, height, width)] + skip (Tensor): The skip output. [shape=(batch_size, channel, height, width)] """ x_in = x x = self.conv(x) @@ -248,21 +215,14 @@ class ResidualBlock(nn.Layer): def add_input(self, x_row, condition_row): """Compute the output for a row and update the buffer. - Parameters - ---------- - x_row : Tensor [shape=(batch_size, channel, 1, width)] - A row of the input. - - condition_row : Tensor [shape=(batch_size, condition_channel, 1, width)] - A row of the condition. + Args: + x_row (Tensor): A row of the input. shape=(batch_size, channel, 1, width) + condition_row (Tensor): A row of the condition. shape=(batch_size, condition_channel, 1, width) - Returns - ------- - res : Tensor [shape=(batch_size, channel, 1, width)] - A row of the the residual output. + Returns: + res (Tensor): A row of the the residual output. shape=(batch_size, channel, 1, width) + skip (Tensor): A row of the skip output. shape=(batch_size, channel, 1, width) - skip : Tensor [shape=(batch_size, channel, 1, width)] - A row of the skip output. """ x_row_in = x_row if len(paddle.shape(self._conv_buffer)) == 1: @@ -297,27 +257,15 @@ class ResidualBlock(nn.Layer): class ResidualNet(nn.LayerList): """A stack of several ResidualBlocks. It merges condition at each layer. - Parameters - ---------- - n_layer : int - Number of ResidualBlocks in the ResidualNet. - - residual_channels : int - Feature size of each ResidualBlocks. - - condition_channels : int - Feature size of the condition. + Args: + n_layer (int): Number of ResidualBlocks in the ResidualNet. + residual_channels (int): Feature size of each ResidualBlocks. + condition_channels (int): Feature size of the condition. 
+ kernel_size (Tuple[int]): Kernel size of each ResidualBlock. + dilations_h (List[int]): Dilation in height dimension of every ResidualBlock. - kernel_size : Tuple[int] - Kernel size of each ResidualBlock. - - dilations_h : List[int] - Dilation in height dimension of every ResidualBlock. - - Raises - ------ - ValueError - If the length of dilations_h does not equals n_layers. + Raises: + ValueError: If the length of dilations_h does not equals n_layers. """ def __init__(self, @@ -339,18 +287,13 @@ class ResidualNet(nn.LayerList): def forward(self, x, condition): """Comput the output of given the input and the condition. - Parameters - ----------- - x : Tensor [shape=(batch_size, channel, height, width)] - The input. - - condition : Tensor [shape=(batch_size, condition_channel, height, width)] - The local condition. - - Returns - -------- - Tensor : [shape=(batch_size, channel, height, width)] - The output, which is an aggregation of all the skip outputs. + Args: + x (Tensor): The input. shape=(batch_size, channel, height, width) + condition (Tensor): The local condition. shape=(batch_size, condition_channel, height, width) + + Returns: + Tensor : The output, which is an aggregation of all the skip outputs. shape=(batch_size, channel, height, width) + """ skip_connections = [] for layer in self: @@ -368,21 +311,14 @@ class ResidualNet(nn.LayerList): def add_input(self, x_row, condition_row): """Compute the output for a row and update the buffers. - Parameters - ---------- - x_row : Tensor [shape=(batch_size, channel, 1, width)] - A row of the input. - - condition_row : Tensor [shape=(batch_size, condition_channel, 1, width)] - A row of the condition. - - Returns - ------- - res : Tensor [shape=(batch_size, channel, 1, width)] - A row of the the residual output. - - skip : Tensor [shape=(batch_size, channel, 1, width)] - A row of the skip output. + Args: + x_row (Tensor): A row of the input. 
shape=(batch_size, channel, 1, width) + condition_row (Tensor): A row of the condition. shape=(batch_size, condition_channel, 1, width) + + Returns: + res (Tensor): A row of the the residual output. shape=(batch_size, channel, 1, width) + skip (Tensor): A row of the skip output. shape=(batch_size, channel, 1, width) + """ skip_connections = [] for layer in self: @@ -400,22 +336,12 @@ class Flow(nn.Layer): probability density estimation. The ``inverse`` method implements the sampling. - Parameters - ---------- - n_layers : int - Number of ResidualBlocks in the Flow. - - channels : int - Feature size of the ResidualBlocks. - - mel_bands : int - Feature size of the mel spectrogram (mel bands). - - kernel_size : Tuple[int] - Kernel size of each ResisualBlocks in the Flow. - - n_group : int - Number of timesteps to the folded into a group. + Args: + n_layers (int): Number of ResidualBlocks in the Flow. + channels (int): Feature size of the ResidualBlocks. + mel_bands (int): Feature size of the mel spectrogram (mel bands). + kernel_size (Tuple[int]): Kernel size of each ResisualBlocks in the Flow. + n_group (int): Number of timesteps to the folded into a group. """ dilations_dict = { 8: [1, 1, 1, 1, 1, 1, 1, 1], @@ -466,26 +392,16 @@ class Flow(nn.Layer): """Probability density estimation. It is done by inversely transform a sample from p(X) into a sample from p(Z). - Parameters - ----------- - x : Tensor [shape=(batch, 1, height, width)] - A input sample of the distribution p(X). - - condition : Tensor [shape=(batch, condition_channel, height, width)] - The local condition. - - Returns - -------- - z (Tensor): shape(batch, 1, height, width), the transformed sample. - - Tuple[Tensor, Tensor] - The parameter of the transformation. - - logs (Tensor): shape(batch, 1, height - 1, width), the log scale - of the transformation from x to z. - - b (Tensor): shape(batch, 1, height - 1, width), the shift of the - transformation from x to z. 
+ Args: + x (Tensor): A input sample of the distribution p(X). shape=(batch, 1, height, width) + condition (Tensor): The local condition. shape=(batch, condition_channel, height, width) + + Returns: + z (Tensor): shape(batch, 1, height, width), the transformed sample. + Tuple[Tensor, Tensor]: + The parameter of the transformation. + logs (Tensor): shape(batch, 1, height - 1, width), the log scale of the transformation from x to z. + b (Tensor): shape(batch, 1, height - 1, width), the shift of the transformation from x to z. """ # (B, C, H-1, W) logs, b = self._predict_parameters(x[:, :, :-1, :], @@ -516,27 +432,12 @@ class Flow(nn.Layer): """Sampling from the the distrition p(X). It is done by sample form p(Z) and transform the sample. It is a auto regressive transformation. - Parameters - ----------- - z : Tensor [shape=(batch, 1, height, width)] - A sample of the distribution p(Z). - - condition : Tensor [shape=(batch, condition_channel, height, width)] - The local condition. - - Returns - --------- - x : Tensor [shape=(batch, 1, height, width)] - The transformed sample. - - Tuple[Tensor, Tensor] - The parameter of the transformation. - - logs (Tensor): shape(batch, 1, height - 1, width), the log scale - of the transformation from x to z. - - b (Tensor): shape(batch, 1, height - 1, width), the shift of the - transformation from x to z. + Args: + z(Tensor): A sample of the distribution p(Z). shape=(batch, 1, time_steps + condition(Tensor): The local condition. shape=(batch, condition_channel, time_steps) + Returns: + Tensor: + The transformed sample. shape=(batch, 1, height, width) """ z_0 = z[:, :, :1, :] x = paddle.zeros_like(z) @@ -560,25 +461,13 @@ class WaveFlow(nn.LayerList): """An Deep Reversible layer that is composed of severel auto regressive flows. - Parameters - ----------- - n_flows : int - Number of flows in the WaveFlow model. - - n_layers : int - Number of ResidualBlocks in each Flow. - - n_group : int - Number of timesteps to fold as a group. 
- - channels : int - Feature size of each ResidualBlock. - - mel_bands : int - Feature size of mel spectrogram (mel bands). - - kernel_size : Union[int, List[int]] - Kernel size of the convolution layer in each ResidualBlock. + Args: + n_flows (int): Number of flows in the WaveFlow model. + n_layers (int): Number of ResidualBlocks in each Flow. + n_group (int): Number of timesteps to fold as a group. + channels (int): Feature size of each ResidualBlock. + mel_bands (int): Feature size of mel spectrogram (mel bands). + kernel_size (Union[int, List[int]]): Kernel size of the convolution layer in each ResidualBlock. """ def __init__(self, n_flows, n_layers, n_group, channels, mel_bands, @@ -628,22 +517,13 @@ class WaveFlow(nn.LayerList): """Probability density estimation of random variable x given the condition. - Parameters - ----------- - x : Tensor [shape=(batch_size, time_steps)] - The audio. - - condition : Tensor [shape=(batch_size, condition channel, time_steps)] - The local condition (mel spectrogram here). - - Returns - -------- - z : Tensor [shape=(batch_size, time_steps)] - The transformed random variable. - - log_det_jacobian: Tensor [shape=(1,)] - The log determinant of the jacobian of the transformation from x - to z. + Args: + x (Tensor): The audio. shape=(batch_size, time_steps) + condition (Tensor): The local condition (mel spectrogram here). shape=(batch_size, condition channel, time_steps) + + Returns: + Tensor: The transformed random variable. shape=(batch_size, time_steps) + Tensor: The log determinant of the jacobian of the transformation from x to z. shape=(1,) """ # x: (B, T) # condition: (B, C, T) upsampled condition @@ -678,18 +558,13 @@ class WaveFlow(nn.LayerList): Each Flow transform .. math:: `z_{i-1}` to .. math:: `z_{i}` in an autoregressive manner. - Parameters - ---------- - z : Tensor [shape=(batch, 1, time_steps] - A sample of the distribution p(Z). 
- - condition : Tensor [shape=(batch, condition_channel, time_steps)] - The local condition. + Args: + z (Tensor): A sample of the distribution p(Z). shape=(batch, 1, time_steps + condition (Tensor): The local condition. shape=(batch, condition_channel, time_steps) - Returns - -------- - x : Tensor [shape=(batch_size, time_steps)] - The transformed sample (audio here). + Returns: + Tensor: The transformed sample (audio here). shape=(batch_size, time_steps) + """ z, condition = self._trim(z, condition) @@ -714,29 +589,15 @@ class WaveFlow(nn.LayerList): class ConditionalWaveFlow(nn.LayerList): """ConditionalWaveFlow, a UpsampleNet with a WaveFlow model. - Parameters - ---------- - upsample_factors : List[int] - Upsample factors for the upsample net. - - n_flows : int - Number of flows in the WaveFlow model. - - n_layers : int - Number of ResidualBlocks in each Flow. - - n_group : int - Number of timesteps to fold as a group. - - channels : int - Feature size of each ResidualBlock. - - n_mels : int - Feature size of mel spectrogram (mel bands). - - kernel_size : Union[int, List[int]] - Kernel size of the convolution layer in each ResidualBlock. - """ + Args: + upsample_factors (List[int]): Upsample factors for the upsample net. + n_flows (int): Number of flows in the WaveFlow model. + n_layers (int): Number of ResidualBlocks in each Flow. + n_group (int): Number of timesteps to fold as a group. + channels (int): Feature size of each ResidualBlock. + n_mels (int): Feature size of mel spectrogram (mel bands). + kernel_size (Union[int, List[int]]): Kernel size of the convolution layer in each ResidualBlock. + """ def __init__(self, upsample_factors: List[int], @@ -760,22 +621,13 @@ class ConditionalWaveFlow(nn.LayerList): """Compute the transformed random variable z (x to z) and the log of the determinant of the jacobian of the transformation from x to z. - Parameters - ---------- - audio : Tensor [shape=(B, T)] - The audio. + Args: + audio(Tensor): The audio. 
shape=(B, T) + mel(Tensor): The mel spectrogram. shape=(B, C_mel, T_mel) - mel : Tensor [shape=(B, C_mel, T_mel)] - The mel spectrogram. - - Returns - ------- - z : Tensor [shape=(B, T)] - The inversely transformed random variable z (x to z) - - log_det_jacobian: Tensor [shape=(1,)] - the log of the determinant of the jacobian of the transformation - from x to z. + Returns: + Tensor: The inversely transformed random variable z (x to z). shape=(B, T) + Tensor: the log of the determinant of the jacobian of the transformation from x to z. shape=(1,) """ condition = self.encoder(mel) z, log_det_jacobian = self.decoder(audio, condition) @@ -783,17 +635,13 @@ class ConditionalWaveFlow(nn.LayerList): @paddle.no_grad() def infer(self, mel): - r"""Generate raw audio given mel spectrogram. + """Generate raw audio given mel spectrogram. - Parameters - ---------- - mel : Tensor [shape=(B, C_mel, T_mel)] - Mel spectrogram (in log-magnitude). + Args: + mel(np.ndarray): Mel spectrogram of an utterance(in log-magnitude). shape=(C_mel, T_mel) - Returns - ------- - Tensor : [shape=(B, T)] - The synthesized audio, where``T <= T_mel \* upsample_factors``. + Returns: + Tensor: The synthesized audio, where``T <= T_mel * upsample_factors``. shape=(B, T) """ start = time.time() condition = self.encoder(mel, trim_conv_artifact=True) # (B, C, T) @@ -808,15 +656,11 @@ class ConditionalWaveFlow(nn.LayerList): def predict(self, mel): """Generate raw audio given mel spectrogram. - Parameters - ---------- - mel : np.ndarray [shape=(C_mel, T_mel)] - Mel spectrogram of an utterance(in log-magnitude). + Args: + mel(np.ndarray): Mel spectrogram of an utterance(in log-magnitude). shape=(C_mel, T_mel) - Returns - ------- - np.ndarray [shape=(T,)] - The synthesized audio. + Returns: + np.ndarray: The synthesized audio. 
shape=(T,) """ mel = paddle.to_tensor(mel) mel = paddle.unsqueeze(mel, 0) @@ -828,18 +672,12 @@ class ConditionalWaveFlow(nn.LayerList): def from_pretrained(cls, config, checkpoint_path): """Build a ConditionalWaveFlow model from a pretrained model. - Parameters - ---------- - config: yacs.config.CfgNode - model configs + Args: + config(yacs.config.CfgNode): model configs + checkpoint_path(Path or str): the path of pretrained model checkpoint, without extension name - checkpoint_path: Path or str - the path of pretrained model checkpoint, without extension name - - Returns - ------- - ConditionalWaveFlow - The model built from pretrained result. + Returns: + ConditionalWaveFlow The model built from pretrained result. """ model = cls(upsample_factors=config.model.upsample_factors, n_flows=config.model.n_flows, @@ -855,11 +693,9 @@ class ConditionalWaveFlow(nn.LayerList): class WaveFlowLoss(nn.Layer): """Criterion of a WaveFlow model. - Parameters - ---------- - sigma : float - The standard deviation of the gaussian noise used in WaveFlow, by - default 1.0. + Args: + sigma (float): The standard deviation of the gaussian noise used in WaveFlow, + by default 1.0. """ def __init__(self, sigma=1.0): @@ -871,19 +707,13 @@ class WaveFlowLoss(nn.Layer): """Compute the loss given the transformed random variable z and the log_det_jacobian of transformation from x to z. - Parameters - ---------- - z : Tensor [shape=(B, T)] - The transformed random variable (x to z). - - log_det_jacobian : Tensor [shape=(1,)] - The log of the determinant of the jacobian matrix of the - transformation from x to z. + Args: + z(Tensor): The transformed random variable (x to z). shape=(B, T) + log_det_jacobian(Tensor): The log of the determinant of the jacobian matrix of the + transformation from x to z. shape=(1,) - Returns - ------- - Tensor [shape=(1,)] - The loss. + Returns: + Tensor: The loss. 
shape=(1,) """ loss = paddle.sum(z * z) / (2 * self.sigma * self.sigma ) - log_det_jacobian @@ -895,15 +725,12 @@ class ConditionalWaveFlow2Infer(ConditionalWaveFlow): def forward(self, mel): """Generate raw audio given mel spectrogram. - Parameters - ---------- - mel : np.ndarray [shape=(C_mel, T_mel)] - Mel spectrogram of an utterance(in log-magnitude). - - Returns - ------- - np.ndarray [shape=(T,)] - The synthesized audio. + Args: + mel (np.ndarray): Mel spectrogram of an utterance(in log-magnitude). shape=(C_mel, T_mel) + + Returns: + np.ndarray: The synthesized audio. shape=(T,) + """ audio = self.predict(mel) return audio diff --git a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/__init__.py b/paddlespeech/t2s/models/wavernn/__init__.py similarity index 91% rename from paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/__init__.py rename to paddlespeech/t2s/models/wavernn/__init__.py index abf198b9..80ffd068 100644 --- a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/__init__.py +++ b/paddlespeech/t2s/models/wavernn/__init__.py @@ -11,3 +11,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from .wavernn import * +from .wavernn_updater import * diff --git a/paddlespeech/t2s/models/wavernn/wavernn.py b/paddlespeech/t2s/models/wavernn/wavernn.py new file mode 100644 index 00000000..1320ffa3 --- /dev/null +++ b/paddlespeech/t2s/models/wavernn/wavernn.py @@ -0,0 +1,577 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import sys +import time +from typing import List + +import numpy as np +import paddle +from paddle import nn +from paddle.nn import functional as F + +from paddlespeech.t2s.audio.codec import decode_mu_law +from paddlespeech.t2s.modules.losses import sample_from_discretized_mix_logistic +from paddlespeech.t2s.modules.nets_utils import initialize +from paddlespeech.t2s.modules.upsample import Stretch2D + + +class ResBlock(nn.Layer): + def __init__(self, dims): + super().__init__() + self.conv1 = nn.Conv1D(dims, dims, kernel_size=1, bias_attr=False) + self.conv2 = nn.Conv1D(dims, dims, kernel_size=1, bias_attr=False) + self.batch_norm1 = nn.BatchNorm1D(dims) + self.batch_norm2 = nn.BatchNorm1D(dims) + + def forward(self, x): + ''' + conv -> bn -> relu -> conv -> bn + residual connection + ''' + residual = x + x = self.conv1(x) + x = self.batch_norm1(x) + x = F.relu(x) + x = self.conv2(x) + x = self.batch_norm2(x) + return x + residual + + +class MelResNet(nn.Layer): + def __init__(self, + res_blocks: int=10, + compute_dims: int=128, + res_out_dims: int=128, + aux_channels: int=80, + aux_context_window: int=0): + super().__init__() + k_size = aux_context_window * 2 + 1 + # pay attention here, the dim reduces aux_context_window * 2 + self.conv_in = nn.Conv1D( + aux_channels, compute_dims, kernel_size=k_size, bias_attr=False) + self.batch_norm = nn.BatchNorm1D(compute_dims) + self.layers = nn.LayerList() + for _ in range(res_blocks): + self.layers.append(ResBlock(compute_dims)) + self.conv_out = nn.Conv1D(compute_dims, res_out_dims, kernel_size=1) + + def 
forward(self, x): + ''' + Args: + x (Tensor): Input tensor (B, in_dims, T). + Returns: + Tensor: Output tensor (B, res_out_dims, T). + ''' + + x = self.conv_in(x) + x = self.batch_norm(x) + x = F.relu(x) + for f in self.layers: + x = f(x) + x = self.conv_out(x) + return x + + +class UpsampleNetwork(nn.Layer): + def __init__(self, + aux_channels: int=80, + upsample_scales: List[int]=[4, 5, 3, 5], + compute_dims: int=128, + res_blocks: int=10, + res_out_dims: int=128, + aux_context_window: int=2): + super().__init__() + # total_scale is the total Up sampling multiple + total_scale = np.prod(upsample_scales) + # TODO pad*total_scale is numpy.int64 + self.indent = int(aux_context_window * total_scale) + self.resnet = MelResNet( + res_blocks=res_blocks, + aux_channels=aux_channels, + compute_dims=compute_dims, + res_out_dims=res_out_dims, + aux_context_window=aux_context_window) + self.resnet_stretch = Stretch2D(total_scale, 1) + self.up_layers = nn.LayerList() + for scale in upsample_scales: + k_size = (1, scale * 2 + 1) + padding = (0, scale) + stretch = Stretch2D(scale, 1) + + conv = nn.Conv2D( + 1, 1, kernel_size=k_size, padding=padding, bias_attr=False) + weight_ = paddle.full_like(conv.weight, 1. / k_size[1]) + conv.weight.set_value(weight_) + self.up_layers.append(stretch) + self.up_layers.append(conv) + + def forward(self, m): + ''' + Args: + c (Tensor): Input tensor (B, C_aux, T). + Returns: + Tensor: Output tensor (B, (T - 2 * pad) * prob(upsample_scales), C_aux). + Tensor: Output tensor (B, (T - 2 * pad) * prob(upsample_scales), res_out_dims). 
+ ''' + # aux: [B, C_aux, T] + # -> [B, res_out_dims, T - 2 * aux_context_window] + # -> [B, 1, res_out_dims, T - 2 * aux_context_window] + aux = self.resnet(m).unsqueeze(1) + # aux: [B, 1, res_out_dims, T - 2 * aux_context_window] + # -> [B, 1, res_out_dims, (T - 2 * pad) * prob(upsample_scales)] + aux = self.resnet_stretch(aux) + # aux: [B, 1, res_out_dims, T * prob(upsample_scales)] + # -> [B, res_out_dims, T * prob(upsample_scales)] + aux = aux.squeeze(1) + # m: [B, C_aux, T] -> [B, 1, C_aux, T] + m = m.unsqueeze(1) + for f in self.up_layers: + m = f(m) + # m: [B, 1, C_aux, T*prob(upsample_scales)] + # -> [B, C_aux, T * prob(upsample_scales)] + # -> [B, C_aux, (T - 2 * pad) * prob(upsample_scales)] + m = m.squeeze(1)[:, :, self.indent:-self.indent] + # m: [B, (T - 2 * pad) * prob(upsample_scales), C_aux] + # aux: [B, (T - 2 * pad) * prob(upsample_scales), res_out_dims] + return m.transpose([0, 2, 1]), aux.transpose([0, 2, 1]) + + +class WaveRNN(nn.Layer): + def __init__( + self, + rnn_dims: int=512, + fc_dims: int=512, + bits: int=9, + aux_context_window: int=2, + upsample_scales: List[int]=[4, 5, 3, 5], + aux_channels: int=80, + compute_dims: int=128, + res_out_dims: int=128, + res_blocks: int=10, + hop_length: int=300, + sample_rate: int=24000, + mode='RAW', + init_type: str="xavier_uniform", ): + ''' + Args: + rnn_dims (int, optional): Hidden dims of RNN Layers. + fc_dims (int, optional): Dims of FC Layers. + bits (int, optional): bit depth of signal. + aux_context_window (int, optional): The context window size of the first convolution applied to the + auxiliary input, by default 2 + upsample_scales (List[int], optional): Upsample scales of the upsample network. + aux_channels (int, optional): Auxiliary channel of the residual blocks. + compute_dims (int, optional): Dims of Conv1D in MelResNet. + res_out_dims (int, optional): Dims of output in MelResNet. + res_blocks (int, optional): Number of residual blocks. 
+ mode (str, optional): Output mode of the WaveRNN vocoder. + `MOL` for Mixture of Logistic Distribution, and `RAW` for quantized bits as the model's output. + init_type (str): How to initialize parameters. + ''' + super().__init__() + self.mode = mode + self.aux_context_window = aux_context_window + if self.mode == 'RAW': + self.n_classes = 2**bits + elif self.mode == 'MOL': + self.n_classes = 10 * 3 + else: + RuntimeError('Unknown model mode value - ', self.mode) + + # List of rnns to call 'flatten_parameters()' on + self._to_flatten = [] + + self.rnn_dims = rnn_dims + self.aux_dims = res_out_dims // 4 + self.hop_length = hop_length + self.sample_rate = sample_rate + + # initialize parameters + initialize(self, init_type) + + self.upsample = UpsampleNetwork( + aux_channels=aux_channels, + upsample_scales=upsample_scales, + compute_dims=compute_dims, + res_blocks=res_blocks, + res_out_dims=res_out_dims, + aux_context_window=aux_context_window) + self.I = nn.Linear(aux_channels + self.aux_dims + 1, rnn_dims) + + self.rnn1 = nn.GRU(rnn_dims, rnn_dims) + self.rnn2 = nn.GRU(rnn_dims + self.aux_dims, rnn_dims) + + self._to_flatten += [self.rnn1, self.rnn2] + + self.fc1 = nn.Linear(rnn_dims + self.aux_dims, fc_dims) + self.fc2 = nn.Linear(fc_dims + self.aux_dims, fc_dims) + self.fc3 = nn.Linear(fc_dims, self.n_classes) + + # Avoid fragmentation of RNN parameters and associated warning + self._flatten_parameters() + + nn.initializer.set_global_initializer(None) + + def forward(self, x, c): + ''' + Args: + x (Tensor): wav sequence, [B, T] + c (Tensor): mel spectrogram [B, C_aux, T'] + + T = (T' - 2 * aux_context_window ) * hop_length + Returns: + Tensor: [B, T, n_classes] + ''' + # Although we `_flatten_parameters()` on init, when using DataParallel + # the model gets replicated, making it no longer guaranteed that the + # weights are contiguous in GPU memory. 
Hence, we must call it again + self._flatten_parameters() + + bsize = paddle.shape(x)[0] + h1 = paddle.zeros([1, bsize, self.rnn_dims]) + h2 = paddle.zeros([1, bsize, self.rnn_dims]) + # c: [B, T, C_aux] + # aux: [B, T, res_out_dims] + c, aux = self.upsample(c) + + aux_idx = [self.aux_dims * i for i in range(5)] + a1 = aux[:, :, aux_idx[0]:aux_idx[1]] + a2 = aux[:, :, aux_idx[1]:aux_idx[2]] + a3 = aux[:, :, aux_idx[2]:aux_idx[3]] + a4 = aux[:, :, aux_idx[3]:aux_idx[4]] + + x = paddle.concat([x.unsqueeze(-1), c, a1], axis=2) + x = self.I(x) + res = x + x, _ = self.rnn1(x, h1) + + x = x + res + res = x + x = paddle.concat([x, a2], axis=2) + x, _ = self.rnn2(x, h2) + + x = x + res + x = paddle.concat([x, a3], axis=2) + x = F.relu(self.fc1(x)) + + x = paddle.concat([x, a4], axis=2) + x = F.relu(self.fc2(x)) + + return self.fc3(x) + + @paddle.no_grad() + def generate(self, + c, + batched: bool=True, + target: int=12000, + overlap: int=600, + mu_law: bool=True, + gen_display: bool=False): + """ + Args: + c(Tensor): input mels, (T', C_aux) + batched(bool): generate in batch or not + target(int): target number of samples to be generated in each batch entry + overlap(int): number of samples for crossfading between batches + mu_law(bool) + Returns: + wav sequence: Output (T' * prod(upsample_scales), out_channels, C_out). 
+ """ + + self.eval() + + mu_law = mu_law if self.mode == 'RAW' else False + + output = [] + start = time.time() + + # pseudo batch + # (T, C_aux) -> (1, C_aux, T) + c = paddle.transpose(c, [1, 0]).unsqueeze(0) + T = paddle.shape(c)[-1] + wave_len = T * self.hop_length + # TODO remove two transpose op by modifying function pad_tensor + c = self.pad_tensor( + c.transpose([0, 2, 1]), pad=self.aux_context_window, + side='both').transpose([0, 2, 1]) + + c, aux = self.upsample(c) + + if batched: + # (num_folds, target + 2 * overlap, features) + c = self.fold_with_overlap(c, target, overlap) + aux = self.fold_with_overlap(aux, target, overlap) + + # for dygraph to static graph, if use seq_len of `b_size, seq_len, _ = paddle.shape(c)` in for + # will not get TensorArray + # see https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/04_dygraph_to_static/case_analysis_cn.html#list-lodtensorarray + # b_size, seq_len, _ = paddle.shape(c) + b_size = paddle.shape(c)[0] + seq_len = paddle.shape(c)[1] + + h1 = paddle.zeros([b_size, self.rnn_dims]) + h2 = paddle.zeros([b_size, self.rnn_dims]) + x = paddle.zeros([b_size, 1]) + + d = self.aux_dims + aux_split = [aux[:, :, d * i:d * (i + 1)] for i in range(4)] + + for i in range(seq_len): + m_t = c[:, i, :] + # for dygraph to static graph + # a1_t, a2_t, a3_t, a4_t = (a[:, i, :] for a in aux_split) + a1_t = aux_split[0][:, i, :] + a2_t = aux_split[1][:, i, :] + a3_t = aux_split[2][:, i, :] + a4_t = aux_split[3][:, i, :] + x = paddle.concat([x, m_t, a1_t], axis=1) + x = self.I(x) + # use GRUCell here + h1, _ = self.rnn1[0].cell(x, h1) + x = x + h1 + inp = paddle.concat([x, a2_t], axis=1) + # use GRUCell here + h2, _ = self.rnn2[0].cell(inp, h2) + + x = x + h2 + x = paddle.concat([x, a3_t], axis=1) + x = F.relu(self.fc1(x)) + + x = paddle.concat([x, a4_t], axis=1) + x = F.relu(self.fc2(x)) + + logits = self.fc3(x) + + if self.mode == 'MOL': + sample = sample_from_discretized_mix_logistic( + logits.unsqueeze(0).transpose([0, 2, 
1])) + output.append(sample.reshape([-1])) + x = sample.transpose([1, 0, 2]) + + elif self.mode == 'RAW': + posterior = F.softmax(logits, axis=1) + distrib = paddle.distribution.Categorical(posterior) + # corresponding operate [np.floor((fx + 1) / 2 * mu + 0.5)] in enocde_mu_law + # distrib.sample([1])[0].cast('float32'): [0, 2**bits-1] + # sample: [-1, 1] + sample = 2 * distrib.sample([1])[0].cast('float32') / ( + self.n_classes - 1.) - 1. + output.append(sample) + x = sample.unsqueeze(-1) + else: + raise RuntimeError('Unknown model mode value - ', self.mode) + + if gen_display: + if i % 1000 == 0: + self.gen_display(i, int(seq_len), int(b_size), start) + + output = paddle.stack(output).transpose([1, 0]) + + if mu_law: + output = decode_mu_law(output, self.n_classes, False) + + if batched: + output = self.xfade_and_unfold(output, target, overlap) + else: + output = output[0] + + # Fade-out at the end to avoid signal cutting out suddenly + fade_out = paddle.linspace(1, 0, 10 * self.hop_length) + output = output[:wave_len] + output[-10 * self.hop_length:] *= fade_out + + self.train() + + # 增加 C_out 维度 + return output.unsqueeze(-1) + + def _flatten_parameters(self): + [m.flatten_parameters() for m in self._to_flatten] + + def pad_tensor(self, x, pad, side='both'): + ''' + Args: + x(Tensor): mel, [1, n_frames, 80] + pad(int): + side(str, optional): (Default value = 'both') + + Returns: + Tensor + ''' + b, t, _ = paddle.shape(x) + # for dygraph to static graph + c = x.shape[-1] + total = t + 2 * pad if side == 'both' else t + pad + padded = paddle.zeros([b, total, c]) + if side == 'before' or side == 'both': + padded[:, pad:pad + t, :] = x + elif side == 'after': + padded[:, :t, :] = x + return padded + + def fold_with_overlap(self, x, target, overlap): + ''' + Fold the tensor with overlap for quick batched inference. + Overlap will be used for crossfading in xfade_and_unfold() + + Args: + x(Tensor): Upsampled conditioning features. 
mels or aux + shape=(1, T, features) + mels: [1, T, 80] + aux: [1, T, 128] + target(int): Target timesteps for each index of batch + overlap(int): Timesteps for both xfade and rnn warmup + + Returns: + Tensor: + shape=(num_folds, target + 2 * overlap, features) + num_flods = (time_seq - overlap) // (target + overlap) + mel: [num_folds, target + 2 * overlap, 80] + aux: [num_folds, target + 2 * overlap, 128] + + Details: + x = [[h1, h2, ... hn]] + Where each h is a vector of conditioning features + Eg: target=2, overlap=1 with x.size(1)=10 + + folded = [[h1, h2, h3, h4], + [h4, h5, h6, h7], + [h7, h8, h9, h10]] + ''' + + _, total_len, features = paddle.shape(x) + + # Calculate variables needed + num_folds = (total_len - overlap) // (target + overlap) + extended_len = num_folds * (overlap + target) + overlap + remaining = total_len - extended_len + + # Pad if some time steps poking out + if remaining != 0: + num_folds += 1 + padding = target + 2 * overlap - remaining + x = self.pad_tensor(x, padding, side='after') + + folded = paddle.zeros([num_folds, target + 2 * overlap, features]) + + # Get the values for the folded tensor + for i in range(num_folds): + start = i * (target + overlap) + end = start + target + 2 * overlap + folded[i] = x[0][start:end, :] + return folded + + def xfade_and_unfold(self, y, target: int=12000, overlap: int=600): + ''' Applies a crossfade and unfolds into a 1d array. 
+ + Args: + y (Tensor): + Batched sequences of audio samples + shape=(num_folds, target + 2 * overlap) + dtype=paddle.float32 + overlap (int): Timesteps for both xfade and rnn warmup + + Returns: + Tensor + audio samples in a 1d array + shape=(total_len) + dtype=paddle.float32 + + Details: + y = [[seq1], + [seq2], + [seq3]] + + Apply a gain envelope at both ends of the sequences + + y = [[seq1_in, seq1_target, seq1_out], + [seq2_in, seq2_target, seq2_out], + [seq3_in, seq3_target, seq3_out]] + + Stagger and add up the groups of samples: + + [seq1_in, seq1_target, (seq1_out + seq2_in), seq2_target, ...] + + ''' + # num_folds = (total_len - overlap) // (target + overlap) + num_folds, length = paddle.shape(y) + target = length - 2 * overlap + total_len = num_folds * (target + overlap) + overlap + + # Need some silence for the run warmup + slience_len = overlap // 2 + fade_len = overlap - slience_len + slience = paddle.zeros([slience_len], dtype=paddle.float32) + linear = paddle.ones([fade_len], dtype=paddle.float32) + + # Equal power crossfade + # fade_in increase from 0 to 1, fade_out reduces from 1 to 0 + t = paddle.linspace(-1, 1, fade_len, dtype=paddle.float32) + fade_in = paddle.sqrt(0.5 * (1 + t)) + fade_out = paddle.sqrt(0.5 * (1 - t)) + # Concat the silence to the fades + fade_out = paddle.concat([linear, fade_out]) + fade_in = paddle.concat([slience, fade_in]) + + # Apply the gain to the overlap samples + y[:, :overlap] *= fade_in + y[:, -overlap:] *= fade_out + + unfolded = paddle.zeros([total_len], dtype=paddle.float32) + + # Loop to add up all the samples + for i in range(num_folds): + start = i * (target + overlap) + end = start + target + 2 * overlap + unfolded[start:end] += y[i] + + return unfolded + + def gen_display(self, i, seq_len, b_size, start): + gen_rate = (i + 1) / (time.time() - start) * b_size / 1000 + pbar = self.progbar(i, seq_len) + msg = f'| {pbar} {i*b_size}/{seq_len*b_size} | Batch Size: {b_size} | Gen Rate: {gen_rate:.1f}kHz | ' + 
sys.stdout.write(f"\r{msg}") + + def progbar(self, i, n, size=16): + done = int(i * size) // n + bar = '' + for i in range(size): + bar += '█' if i <= done else '░' + return bar + + +class WaveRNNInference(nn.Layer): + def __init__(self, normalizer, wavernn): + super().__init__() + self.normalizer = normalizer + self.wavernn = wavernn + + def forward(self, + logmel, + batched: bool=True, + target: int=12000, + overlap: int=600, + mu_law: bool=True, + gen_display: bool=False): + normalized_mel = self.normalizer(logmel) + + wav = self.wavernn.generate( + normalized_mel, ) + # batched=batched, + # target=target, + # overlap=overlap, + # mu_law=mu_law, + # gen_display=gen_display) + + return wav diff --git a/paddlespeech/t2s/models/wavernn/wavernn_updater.py b/paddlespeech/t2s/models/wavernn/wavernn_updater.py new file mode 100644 index 00000000..b2756d00 --- /dev/null +++ b/paddlespeech/t2s/models/wavernn/wavernn_updater.py @@ -0,0 +1,201 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import logging +from pathlib import Path + +import paddle +import soundfile as sf +from paddle import distributed as dist +from paddle.io import DataLoader +from paddle.nn import Layer +from paddle.optimizer import Optimizer + +from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator +from paddlespeech.t2s.training.reporter import report +from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater +logging.basicConfig( + format='%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s', + datefmt='[%Y-%m-%d %H:%M:%S]') +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +def calculate_grad_norm(parameters, norm_type: str=2): + ''' + calculate grad norm of mdoel's parameters + parameters: + model's parameters + norm_type: str + Returns + ------------ + Tensor + grad_norm + ''' + + grad_list = [ + paddle.to_tensor(p.grad) for p in parameters if p.grad is not None + ] + norm_list = paddle.stack( + [paddle.norm(grad, norm_type) for grad in grad_list]) + total_norm = paddle.norm(norm_list) + return total_norm + + +# for save name in gen_valid_samples() +ITERATION = 0 + + +class WaveRNNUpdater(StandardUpdater): + def __init__(self, + model: Layer, + optimizer: Optimizer, + criterion: Layer, + dataloader: DataLoader, + init_state=None, + output_dir: Path=None, + mode='RAW'): + super().__init__(model, optimizer, dataloader, init_state=None) + + self.criterion = criterion + # self.scheduler = scheduler + + log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) + self.filehandler = logging.FileHandler(str(log_file)) + logger.addHandler(self.filehandler) + self.logger = logger + self.msg = "" + self.mode = mode + + def update_core(self, batch): + + self.msg = "Rank: {}, ".format(dist.get_rank()) + losses_dict = {} + # parse batch + self.model.train() + self.optimizer.clear_grad() + + wav, y, mel = batch + + y_hat = self.model(wav, mel) + if self.mode == 'RAW': + y_hat = y_hat.transpose([0, 2, 
1]).unsqueeze(-1) + elif self.mode == 'MOL': + y_hat = paddle.cast(y, dtype='float32') + + y = y.unsqueeze(-1) + loss = self.criterion(y_hat, y) + loss.backward() + grad_norm = float( + calculate_grad_norm(self.model.parameters(), norm_type=2)) + + self.optimizer.step() + + report("train/loss", float(loss)) + report("train/grad_norm", float(grad_norm)) + + losses_dict["loss"] = float(loss) + losses_dict["grad_norm"] = float(grad_norm) + self.msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_dict.items()) + global ITERATION + ITERATION = self.state.iteration + 1 + + +class WaveRNNEvaluator(StandardEvaluator): + def __init__(self, + model: Layer, + criterion: Layer, + dataloader: Optimizer, + output_dir: Path=None, + valid_generate_loader=None, + config=None): + super().__init__(model, dataloader) + + log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) + self.filehandler = logging.FileHandler(str(log_file)) + logger.addHandler(self.filehandler) + self.logger = logger + self.msg = "" + + self.criterion = criterion + self.valid_generate_loader = valid_generate_loader + self.config = config + self.mode = config.model.mode + + self.valid_samples_dir = output_dir / "valid_samples" + self.valid_samples_dir.mkdir(parents=True, exist_ok=True) + + def evaluate_core(self, batch): + self.msg = "Evaluate: " + losses_dict = {} + # parse batch + wav, y, mel = batch + y_hat = self.model(wav, mel) + + if self.mode == 'RAW': + y_hat = y_hat.transpose([0, 2, 1]).unsqueeze(-1) + elif self.mode == 'MOL': + y_hat = paddle.cast(y, dtype='float32') + + y = y.unsqueeze(-1) + loss = self.criterion(y_hat, y) + report("eval/loss", float(loss)) + + losses_dict["loss"] = float(loss) + + self.msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_dict.items()) + self.logger.info(self.msg) + + def gen_valid_samples(self): + + for i, item in enumerate(self.valid_generate_loader): + if i >= self.config.generate_num: + break + print( + '\n| Generating: 
{}/{}'.format(i + 1, self.config.generate_num)) + + mel = item['feats'] + wav = item['wave'] + wav = wav.squeeze(0) + + origin_save_path = self.valid_samples_dir / '{}_steps_{}_target.wav'.format( + self.iteration, i) + sf.write(origin_save_path, wav.numpy(), samplerate=self.config.fs) + + if self.config.inference.gen_batched: + batch_str = 'gen_batched_target{}_overlap{}'.format( + self.config.inference.target, self.config.inference.overlap) + else: + batch_str = 'gen_not_batched' + gen_save_path = str(self.valid_samples_dir / + '{}_steps_{}_{}.wav'.format(self.iteration, i, + batch_str)) + # (1, T, C_aux) -> (T, C_aux) + mel = mel.squeeze(0) + gen_sample = self.model.generate( + mel, self.config.inference.gen_batched, + self.config.inference.target, self.config.inference.overlap, + self.config.mu_law) + sf.write( + gen_save_path, gen_sample.numpy(), samplerate=self.config.fs) + + def __call__(self, trainer=None): + summary = self.evaluate() + for k, v in summary.items(): + report(k, v) + # gen samples at then end of evaluate + self.iteration = ITERATION + if self.iteration % self.config.gen_eval_samples_interval_steps == 0: + self.gen_valid_samples() diff --git a/paddlespeech/t2s/modules/causal_conv.py b/paddlespeech/t2s/modules/causal_conv.py index c0d4f955..3abccc15 100644 --- a/paddlespeech/t2s/modules/causal_conv.py +++ b/paddlespeech/t2s/modules/causal_conv.py @@ -41,14 +41,10 @@ class CausalConv1D(nn.Layer): def forward(self, x): """Calculate forward propagation. - Parameters - ---------- - x : Tensor - Input tensor (B, in_channels, T). - Returns - ---------- - Tensor - Output tensor (B, out_channels, T). + Args: + x (Tensor): Input tensor (B, in_channels, T). + Returns: + Tensor: Output tensor (B, out_channels, T). """ return self.conv(self.pad(x))[:, :, :x.shape[2]] @@ -70,13 +66,9 @@ class CausalConv1DTranspose(nn.Layer): def forward(self, x): """Calculate forward propagation. - Parameters - ---------- - x : Tensor - Input tensor (B, in_channels, T_in). 
- Returns - ---------- - Tensor - Output tensor (B, out_channels, T_out). + Args: + x (Tensor): Input tensor (B, in_channels, T_in). + Returns: + Tensor: Output tensor (B, out_channels, T_out). """ return self.deconv(x)[:, :, :-self.stride] diff --git a/paddlespeech/t2s/modules/conformer/convolution.py b/paddlespeech/t2s/modules/conformer/convolution.py index e4a6c8c6..185c62fb 100644 --- a/paddlespeech/t2s/modules/conformer/convolution.py +++ b/paddlespeech/t2s/modules/conformer/convolution.py @@ -18,12 +18,10 @@ from paddle import nn class ConvolutionModule(nn.Layer): """ConvolutionModule in Conformer model. - Parameters - ---------- - channels : int - The number of channels of conv layers. - kernel_size : int - Kernerl size of conv layers. + + Args: + channels (int): The number of channels of conv layers. + kernel_size (int): Kernerl size of conv layers. """ def __init__(self, channels, kernel_size, activation=nn.ReLU(), bias=True): @@ -59,14 +57,11 @@ class ConvolutionModule(nn.Layer): def forward(self, x): """Compute convolution module. - Parameters - ---------- - x : paddle.Tensor - Input tensor (#batch, time, channels). - Returns - ---------- - paddle.Tensor - Output tensor (#batch, time, channels). + + Args: + x (Tensor): Input tensor (#batch, time, channels). + Returns: + Tensor: Output tensor (#batch, time, channels). """ # exchange the temporal dimension and the feature dimension x = x.transpose([0, 2, 1]) diff --git a/paddlespeech/t2s/modules/conformer/encoder_layer.py b/paddlespeech/t2s/modules/conformer/encoder_layer.py index 2949dc37..61c32612 100644 --- a/paddlespeech/t2s/modules/conformer/encoder_layer.py +++ b/paddlespeech/t2s/modules/conformer/encoder_layer.py @@ -21,38 +21,29 @@ from paddlespeech.t2s.modules.layer_norm import LayerNorm class EncoderLayer(nn.Layer): """Encoder layer module. - Parameters - ---------- - size : int - Input dimension. - self_attn : nn.Layer - Self-attention module instance. 
- `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance - can be used as the argument. - feed_forward : nn.Layer - Feed-forward module instance. - `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance - can be used as the argument. - feed_forward_macaron : nn.Layer - Additional feed-forward module instance. - `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance - can be used as the argument. - conv_module : nn.Layer - Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - dropout_rate : float - Dropout rate. - normalize_before : bool - Whether to use layer_norm before the first block. - concat_after : bool - Whether to concat attention layer's input and output. - if True, additional linear will be applied. - i.e. x -> x + linear(concat(x, att(x))) - if False, no additional linear will be applied. i.e. x -> x + att(x) - stochastic_depth_rate : float - Proability to skip this layer. - During training, the layer may skip residual computation and return input - as-is with given probability. + + Args: + size (int): Input dimension. + self_attn (nn.Layer): Self-attention module instance. + `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance + can be used as the argument. + feed_forward (nn.Layer): Feed-forward module instance. + `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance + can be used as the argument. + feed_forward_macaron (nn.Layer): Additional feed-forward module instance. + `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance + can be used as the argument. + conv_module (nn.Layer): Convolution module instance. + `ConvlutionModule` instance can be used as the argument. + dropout_rate (float): Dropout rate. + normalize_before (bool): Whether to use layer_norm before the first block. + concat_after (bool): Whether to concat attention layer's input and output. + if True, additional linear will be applied. 
+ i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. x -> x + att(x) + stochastic_depth_rate (float): Proability to skip this layer. + During training, the layer may skip residual computation and return input + as-is with given probability. """ def __init__( @@ -93,22 +84,17 @@ class EncoderLayer(nn.Layer): def forward(self, x_input, mask, cache=None): """Compute encoded features. - Parameters - ---------- - x_input : Union[Tuple, paddle.Tensor] - Input tensor w/ or w/o pos emb. - - w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)]. - - w/o pos emb: Tensor (#batch, time, size). - mask : paddle.Tensor - Mask tensor for the input (#batch, time). - cache paddle.Tensor - Cache tensor of the input (#batch, time - 1, size). - Returns - ---------- - paddle.Tensor - Output tensor (#batch, time, size). - paddle.Tensor - Mask tensor (#batch, time). + + Args: + x_input(Union[Tuple, Tensor]): Input tensor w/ or w/o pos emb. + - w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)]. + - w/o pos emb: Tensor (#batch, time, size). + mask(Tensor): Mask tensor for the input (#batch, time). + cache (Tensor): + + Returns: + Tensor: Output tensor (#batch, time, size). + Tensor: Mask tensor (#batch, time). """ if isinstance(x_input, tuple): x, pos_emb = x_input[0], x_input[1] diff --git a/paddlespeech/t2s/modules/conv.py b/paddlespeech/t2s/modules/conv.py index 68766d5e..aa875bd5 100644 --- a/paddlespeech/t2s/modules/conv.py +++ b/paddlespeech/t2s/modules/conv.py @@ -40,36 +40,29 @@ class Conv1dCell(nn.Conv1D): 2. padding must be a causal padding (recpetive_field - 1, 0). Thus, these arguments are removed from the ``__init__`` method of this class. - - Parameters - ---------- - in_channels: int - The feature size of the input. - out_channels: int - The feature size of the output. - kernel_size: int or Tuple[int] - The size of the kernel. 
- dilation: int or Tuple[int] - The dilation of the convolution, by default 1 - weight_attr: ParamAttr, Initializer, str or bool, optional - The parameter attribute of the convolution kernel, by default None. - bias_attr: ParamAttr, Initializer, str or bool, optional - The parameter attribute of the bias. If ``False``, this layer does not - have a bias, by default None. - - Examples - -------- - >>> cell = Conv1dCell(3, 4, kernel_size=5) - >>> inputs = [paddle.randn([4, 3]) for _ in range(16)] - >>> outputs = [] - >>> cell.eval() - >>> cell.start_sequence() - >>> for xt in inputs: - >>> outputs.append(cell.add_input(xt)) - >>> len(outputs)) - 16 - >>> outputs[0].shape - [4, 4] + + Args: + in_channels (int): The feature size of the input. + out_channels (int): The feature size of the output. + kernel_size (int or Tuple[int]): The size of the kernel. + dilation (int or Tuple[int]): The dilation of the convolution, by default 1 + weight_attr (ParamAttr, Initializer, str or bool, optional) : The parameter attribute of the convolution kernel, + by default None. + bias_attr (ParamAttr, Initializer, str or bool, optional):The parameter attribute of the bias. + If ``False``, this layer does not have a bias, by default None. + + Examples: + >>> cell = Conv1dCell(3, 4, kernel_size=5) + >>> inputs = [paddle.randn([4, 3]) for _ in range(16)] + >>> outputs = [] + >>> cell.eval() + >>> cell.start_sequence() + >>> for xt in inputs: + >>> outputs.append(cell.add_input(xt)) + >>> len(outputs)) + 16 + >>> outputs[0].shape + [4, 4] """ def __init__(self, @@ -103,15 +96,13 @@ class Conv1dCell(nn.Conv1D): def start_sequence(self): """Prepare the layer for a series of incremental forward. - Warnings - --------- - This method should be called before a sequence of calls to - ``add_input``. + Warnings: + This method should be called before a sequence of calls to + ``add_input``. - Raises - ------ - Exception - If this method is called when the layer is in training mode. 
+ Raises: + Exception + If this method is called when the layer is in training mode. """ if self.training: raise Exception("only use start_sequence in evaluation") @@ -130,10 +121,9 @@ class Conv1dCell(nn.Conv1D): def initialize_buffer(self, x_t): """Initialize the buffer for the step input. - Parameters - ---------- - x_t : Tensor [shape=(batch_size, in_channels)] - The step input. + Args: + x_t (Tensor): The step input. shape=(batch_size, in_channels) + """ batch_size, _ = x_t.shape self._buffer = paddle.zeros( @@ -143,26 +133,22 @@ class Conv1dCell(nn.Conv1D): def update_buffer(self, x_t): """Shift the buffer by one step. - Parameters - ---------- - x_t : Tensor [shape=(batch_size, in_channels)] - The step input. + Args: + x_t (Tensor): The step input. shape=(batch_size, in_channels) + """ self._buffer = paddle.concat( [self._buffer[:, :, 1:], paddle.unsqueeze(x_t, -1)], -1) def add_input(self, x_t): """Add step input and compute step output. - - Parameters - ----------- - x_t : Tensor [shape=(batch_size, in_channels)] - The step input. - - Returns - ------- - y_t :Tensor [shape=(batch_size, out_channels)] - The step output. + + Args: + x_t (Tensor): The step input. shape=(batch_size, in_channels) + + Returns: + y_t (Tensor): The step output. shape=(batch_size, out_channels) + """ batch_size = x_t.shape[0] if self.receptive_field > 1: @@ -186,33 +172,26 @@ class Conv1dCell(nn.Conv1D): class Conv1dBatchNorm(nn.Layer): """A Conv1D Layer followed by a BatchNorm1D. - Parameters - ---------- - in_channels : int - The feature size of the input. - out_channels : int - The feature size of the output. - kernel_size : int - The size of the convolution kernel. - stride : int, optional - The stride of the convolution, by default 1. - padding : int, str or Tuple[int], optional - The padding of the convolution. 
- If int, a symmetrical padding is applied before convolution; - If str, it should be "same" or "valid"; - If Tuple[int], its length should be 2, meaning - ``(pad_before, pad_after)``, by default 0. - weight_attr : ParamAttr, Initializer, str or bool, optional - The parameter attribute of the convolution kernel, by default None. - bias_attr : ParamAttr, Initializer, str or bool, optional - The parameter attribute of the bias of the convolution, by default - None. - data_format : str ["NCL" or "NLC"], optional - The data layout of the input, by default "NCL" - momentum : float, optional - The momentum of the BatchNorm1D layer, by default 0.9 - epsilon : [type], optional - The epsilon of the BatchNorm1D layer, by default 1e-05 + Args: + in_channels (int): The feature size of the input. + out_channels (int): The feature size of the output. + kernel_size (int): The size of the convolution kernel. + stride (int, optional): The stride of the convolution, by default 1. + padding (int, str or Tuple[int], optional): + The padding of the convolution. + If int, a symmetrical padding is applied before convolution; + If str, it should be "same" or "valid"; + If Tuple[int], its length should be 2, meaning + ``(pad_before, pad_after)``, by default 0. + weight_attr (ParamAttr, Initializer, str or bool, optional): + The parameter attribute of the convolution kernel, + by default None. + bias_attr (ParamAttr, Initializer, str or bool, optional): + The parameter attribute of the bias of the convolution, + by defaultNone. + data_format (str ["NCL" or "NLC"], optional): The data layout of the input, by default "NCL" + momentum (float, optional): The momentum of the BatchNorm1D layer, by default 0.9 + epsilon (float, optional): The epsilon of the BatchNorm1D layer, by default 1e-05 """ def __init__(self, @@ -244,16 +223,15 @@ class Conv1dBatchNorm(nn.Layer): def forward(self, x): """Forward pass of the Conv1dBatchNorm layer. 
- - Parameters - ---------- - x : Tensor [shape=(B, C_in, T_in) or (B, T_in, C_in)] - The input tensor. Its data layout depends on ``data_format``. - - Returns - ------- - Tensor [shape=(B, C_out, T_out) or (B, T_out, C_out)] - The output tensor. + + Args: + x (Tensor): The input tensor. Its data layout depends on ``data_format``. + shape=(B, C_in, T_in) or (B, T_in, C_in) + + Returns: + Tensor: The output tensor. + shape=(B, C_out, T_out) or (B, T_out, C_out) + """ x = self.conv(x) x = self.bn(x) diff --git a/paddlespeech/t2s/modules/geometry.py b/paddlespeech/t2s/modules/geometry.py index a3d56f7d..01eb5ad0 100644 --- a/paddlespeech/t2s/modules/geometry.py +++ b/paddlespeech/t2s/modules/geometry.py @@ -17,24 +17,18 @@ import paddle def shuffle_dim(x, axis, perm=None): """Permute input tensor along aixs given the permutation or randomly. + + Args: + x (Tensor): The input tensor. + axis (int): The axis to shuffle. + perm (List[int], ndarray, optional): + The order to reorder the tensor along the ``axis``-th dimension. + It is a permutation of ``[0, d)``, where d is the size of the + ``axis``-th dimension of the input tensor. If not provided, + a random permutation is used. Defaults to None. - Parameters - ---------- - x : Tensor - The input tensor. - axis : int - The axis to shuffle. - perm : List[int], ndarray, optional - The order to reorder the tensor along the ``axis``-th dimension. - - It is a permutation of ``[0, d)``, where d is the size of the - ``axis``-th dimension of the input tensor. If not provided, - a random permutation is used. Defaults to None. - - Returns - --------- - Tensor - The shuffled tensor, which has the same shape as x does. + Returns: + Tensor: The shuffled tensor, which has the same shape as x does. 
""" size = x.shape[axis] if perm is not None and len(perm) != size: diff --git a/paddlespeech/t2s/modules/layer_norm.py b/paddlespeech/t2s/modules/layer_norm.py index 4edd22c9..088b98e0 100644 --- a/paddlespeech/t2s/modules/layer_norm.py +++ b/paddlespeech/t2s/modules/layer_norm.py @@ -18,13 +18,9 @@ from paddle import nn class LayerNorm(nn.LayerNorm): """Layer normalization module. - - Parameters - ---------- - nout : int - Output dim size. - dim : int - Dimension to be normalized. + Args: + nout (int): Output dim size. + dim (int): Dimension to be normalized. """ def __init__(self, nout, dim=-1): @@ -35,15 +31,11 @@ class LayerNorm(nn.LayerNorm): def forward(self, x): """Apply layer normalization. - Parameters - ---------- - x : paddle.Tensor - Input tensor. + Args: + x (Tensor):Input tensor. - Returns - ---------- - paddle.Tensor - Normalized tensor. + Returns: + Tensor: Normalized tensor. """ if self.dim == -1: diff --git a/paddlespeech/t2s/modules/losses.py b/paddlespeech/t2s/modules/losses.py index 569e96ad..93644e24 100644 --- a/paddlespeech/t2s/modules/losses.py +++ b/paddlespeech/t2s/modules/losses.py @@ -14,12 +14,419 @@ import math import librosa +import numpy as np import paddle from paddle import nn from paddle.fluid.layers import sequence_mask from paddle.nn import functional as F from scipy import signal +from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask + + +# Losses for WaveRNN +def log_sum_exp(x): + """ numerically stable log_sum_exp implementation that prevents overflow """ + # TF ordering + axis = len(x.shape) - 1 + m = paddle.max(x, axis=axis) + m2 = paddle.max(x, axis=axis, keepdim=True) + return m + paddle.log(paddle.sum(paddle.exp(x - m2), axis=axis)) + + +# It is adapted from https://github.com/r9y9/wavenet_vocoder/blob/master/wavenet_vocoder/mixture.py +def discretized_mix_logistic_loss(y_hat, + y, + num_classes=65536, + log_scale_min=None, + reduce=True): + if log_scale_min is None: + log_scale_min = float(np.log(1e-14)) + 
y_hat = y_hat.transpose([0, 2, 1]) + assert y_hat.dim() == 3 + assert y_hat.shape[1] % 3 == 0 + nr_mix = y_hat.shape[1] // 3 + + # (B x T x C) + y_hat = y_hat.transpose([0, 2, 1]) + + # unpack parameters. (B, T, num_mixtures) x 3 + logit_probs = y_hat[:, :, :nr_mix] + means = y_hat[:, :, nr_mix:2 * nr_mix] + log_scales = paddle.clip( + y_hat[:, :, 2 * nr_mix:3 * nr_mix], min=log_scale_min) + + # B x T x 1 -> B x T x num_mixtures + y = y.expand_as(means) + centered_y = paddle.cast(y, dtype=paddle.get_default_dtype()) - means + inv_stdv = paddle.exp(-log_scales) + plus_in = inv_stdv * (centered_y + 1. / (num_classes - 1)) + cdf_plus = F.sigmoid(plus_in) + min_in = inv_stdv * (centered_y - 1. / (num_classes - 1)) + cdf_min = F.sigmoid(min_in) + + # log probability for edge case of 0 (before scaling) + # equivalent: torch.log(F.sigmoid(plus_in)) + # softplus: log(1+ e^{-x}) + log_cdf_plus = plus_in - F.softplus(plus_in) + + # log probability for edge case of 255 (before scaling) + # equivalent: (1 - F.sigmoid(min_in)).log() + log_one_minus_cdf_min = -F.softplus(min_in) + + # probability for all other cases + cdf_delta = cdf_plus - cdf_min + + mid_in = inv_stdv * centered_y + # log probability in the center of the bin, to be used in extreme cases + # (not actually used in our code) + log_pdf_mid = mid_in - log_scales - 2. * F.softplus(mid_in) + + # TODO: cdf_delta <= 1e-5 actually can happen. How can we choose the value + # for num_classes=65536 case? 1e-7? not sure.. + inner_inner_cond = cdf_delta > 1e-5 + + inner_inner_cond = paddle.cast( + inner_inner_cond, dtype=paddle.get_default_dtype()) + + # inner_inner_out = inner_inner_cond * \ + # paddle.log(paddle.clip(cdf_delta, min=1e-12)) + \ + # (1. - inner_inner_cond) * (log_pdf_mid - np.log((num_classes - 1) / 2)) + + inner_inner_out = inner_inner_cond * paddle.log( + paddle.clip(cdf_delta, min=1e-12)) + (1. 
- inner_inner_cond) * ( + log_pdf_mid - np.log((num_classes - 1) / 2)) + + inner_cond = y > 0.999 + + inner_cond = paddle.cast(inner_cond, dtype=paddle.get_default_dtype()) + + inner_out = inner_cond * log_one_minus_cdf_min + (1. - inner_cond + ) * inner_inner_out + cond = y < -0.999 + cond = paddle.cast(cond, dtype=paddle.get_default_dtype()) + + log_probs = cond * log_cdf_plus + (1. - cond) * inner_out + log_probs = log_probs + F.log_softmax(logit_probs, -1) + + if reduce: + return -paddle.mean(log_sum_exp(log_probs)) + else: + return -log_sum_exp(log_probs).unsqueeze(-1) + + +def sample_from_discretized_mix_logistic(y, log_scale_min=None): + """ + Sample from discretized mixture of logistic distributions + + Args: + y(Tensor): (B, C, T) + log_scale_min(float, optional): (Default value = None) + + Returns: + Tensor: sample in range of [-1, 1]. + """ + if log_scale_min is None: + log_scale_min = float(np.log(1e-14)) + + assert y.shape[1] % 3 == 0 + nr_mix = y.shape[1] // 3 + + # (B, T, C) + y = y.transpose([0, 2, 1]) + logit_probs = y[:, :, :nr_mix] + + # sample mixture indicator from softmax + temp = paddle.uniform( + logit_probs.shape, dtype=logit_probs.dtype, min=1e-5, max=1.0 - 1e-5) + temp = logit_probs - paddle.log(-paddle.log(temp)) + argmax = paddle.argmax(temp, axis=-1) + + # (B, T) -> (B, T, nr_mix) + one_hot = F.one_hot(argmax, nr_mix) + one_hot = paddle.cast(one_hot, dtype=paddle.get_default_dtype()) + + # select logistic parameters + means = paddle.sum(y[:, :, nr_mix:2 * nr_mix] * one_hot, axis=-1) + log_scales = paddle.clip( + paddle.sum(y[:, :, 2 * nr_mix:3 * nr_mix] * one_hot, axis=-1), + min=log_scale_min) + # sample from logistic & clip to interval + # we don't actually round to the nearest 8bit value when sampling + u = paddle.uniform(means.shape, min=1e-5, max=1.0 - 1e-5) + x = means + paddle.exp(log_scales) * (paddle.log(u) - paddle.log(1. - u)) + x = paddle.clip(x, min=-1., max=-1.) 
+ + return x + + +# Loss for new Tacotron2 +class GuidedAttentionLoss(nn.Layer): + """Guided attention loss function module. + + This module calculates the guided attention loss described + in `Efficiently Trainable Text-to-Speech System Based + on Deep Convolutional Networks with Guided Attention`_, + which forces the attention to be diagonal. + + .. _`Efficiently Trainable Text-to-Speech System + Based on Deep Convolutional Networks with Guided Attention`: + https://arxiv.org/abs/1710.08969 + + """ + + def __init__(self, sigma=0.4, alpha=1.0, reset_always=True): + """Initialize guided attention loss module. + + Args: + sigma (float, optional): Standard deviation to control how close attention to a diagonal. + alpha (float, optional): Scaling coefficient (lambda). + reset_always (bool, optional): Whether to always reset masks. + + """ + super().__init__() + self.sigma = sigma + self.alpha = alpha + self.reset_always = reset_always + self.guided_attn_masks = None + self.masks = None + + def _reset_masks(self): + self.guided_attn_masks = None + self.masks = None + + def forward(self, att_ws, ilens, olens): + """Calculate forward propagation. + + Args: + att_ws(Tensor): Batch of attention weights (B, T_max_out, T_max_in). + ilens(Tensor(int64)): Batch of input lenghts (B,). + olens(Tensor(int64)): Batch of output lenghts (B,). + + Returns: + Tensor: Guided attention loss value. 
+ + """ + if self.guided_attn_masks is None: + self.guided_attn_masks = self._make_guided_attention_masks(ilens, + olens) + if self.masks is None: + self.masks = self._make_masks(ilens, olens) + losses = self.guided_attn_masks * att_ws + loss = paddle.mean( + losses.masked_select(self.masks.broadcast_to(losses.shape))) + if self.reset_always: + self._reset_masks() + return self.alpha * loss + + def _make_guided_attention_masks(self, ilens, olens): + n_batches = len(ilens) + max_ilen = max(ilens) + max_olen = max(olens) + guided_attn_masks = paddle.zeros((n_batches, max_olen, max_ilen)) + + for idx, (ilen, olen) in enumerate(zip(ilens, olens)): + guided_attn_masks[idx, :olen, : + ilen] = self._make_guided_attention_mask( + ilen, olen, self.sigma) + return guided_attn_masks + + @staticmethod + def _make_guided_attention_mask(ilen, olen, sigma): + """Make guided attention mask. + + Examples + ---------- + >>> guided_attn_mask =_make_guided_attention(5, 5, 0.4) + >>> guided_attn_mask.shape + [5, 5] + >>> guided_attn_mask + tensor([[0.0000, 0.1175, 0.3935, 0.6753, 0.8647], + [0.1175, 0.0000, 0.1175, 0.3935, 0.6753], + [0.3935, 0.1175, 0.0000, 0.1175, 0.3935], + [0.6753, 0.3935, 0.1175, 0.0000, 0.1175], + [0.8647, 0.6753, 0.3935, 0.1175, 0.0000]]) + >>> guided_attn_mask =_make_guided_attention(3, 6, 0.4) + >>> guided_attn_mask.shape + [6, 3] + >>> guided_attn_mask + tensor([[0.0000, 0.2934, 0.7506], + [0.0831, 0.0831, 0.5422], + [0.2934, 0.0000, 0.2934], + [0.5422, 0.0831, 0.0831], + [0.7506, 0.2934, 0.0000], + [0.8858, 0.5422, 0.0831]]) + + """ + grid_x, grid_y = paddle.meshgrid( + paddle.arange(olen), paddle.arange(ilen)) + grid_x = grid_x.cast(dtype=paddle.float32) + grid_y = grid_y.cast(dtype=paddle.float32) + return 1.0 - paddle.exp(-( + (grid_y / ilen - grid_x / olen)**2) / (2 * (sigma**2))) + + @staticmethod + def _make_masks(ilens, olens): + """Make masks indicating non-padded part. + + Args: + ilens(Tensor(int64) or List): Batch of lengths (B,). 
+ olens(Tensor(int64) or List): Batch of lengths (B,). + + Returns: + Tensor: Mask tensor indicating non-padded part. + + Examples: + >>> ilens, olens = [5, 2], [8, 5] + >>> _make_mask(ilens, olens) + tensor([[[1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1]], + + [[1, 1, 0, 0, 0], + [1, 1, 0, 0, 0], + [1, 1, 0, 0, 0], + [1, 1, 0, 0, 0], + [1, 1, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0]]], dtype=paddle.uint8) + + """ + # (B, T_in) + in_masks = make_non_pad_mask(ilens) + # (B, T_out) + out_masks = make_non_pad_mask(olens) + # (B, T_out, T_in) + + return paddle.logical_and( + out_masks.unsqueeze(-1), in_masks.unsqueeze(-2)) + + +class GuidedMultiHeadAttentionLoss(GuidedAttentionLoss): + """Guided attention loss function module for multi head attention. + + Args: + sigma (float, optional): Standard deviation to controlGuidedAttentionLoss + how close attention to a diagonal. + alpha (float, optional): Scaling coefficient (lambda). + reset_always (bool, optional): Whether to always reset masks. + + """ + + def forward(self, att_ws, ilens, olens): + """Calculate forward propagation. + + Args: + att_ws(Tensor): Batch of multi head attention weights (B, H, T_max_out, T_max_in). + ilens(Tensor): Batch of input lenghts (B,). + olens(Tensor): Batch of output lenghts (B,). + + Returns: + Tensor: Guided attention loss value. 
+ + """ + if self.guided_attn_masks is None: + self.guided_attn_masks = ( + self._make_guided_attention_masks(ilens, olens).unsqueeze(1)) + if self.masks is None: + self.masks = self._make_masks(ilens, olens).unsqueeze(1) + losses = self.guided_attn_masks * att_ws + loss = paddle.mean( + losses.masked_select(self.masks.broadcast_to(losses.shape))) + if self.reset_always: + self._reset_masks() + + return self.alpha * loss + + +class Tacotron2Loss(nn.Layer): + """Loss function module for Tacotron2.""" + + def __init__(self, + use_masking=True, + use_weighted_masking=False, + bce_pos_weight=20.0): + """Initialize Tactoron2 loss module. + + Args: + use_masking (bool): Whether to apply masking for padded part in loss calculation. + use_weighted_masking (bool): Whether to apply weighted masking in loss calculation. + bce_pos_weight (float): Weight of positive sample of stop token. + """ + super().__init__() + assert (use_masking != use_weighted_masking) or not use_masking + self.use_masking = use_masking + self.use_weighted_masking = use_weighted_masking + + # define criterions + reduction = "none" if self.use_weighted_masking else "mean" + self.l1_criterion = nn.L1Loss(reduction=reduction) + self.mse_criterion = nn.MSELoss(reduction=reduction) + self.bce_criterion = nn.BCEWithLogitsLoss( + reduction=reduction, pos_weight=paddle.to_tensor(bce_pos_weight)) + + def forward(self, after_outs, before_outs, logits, ys, stop_labels, olens): + """Calculate forward propagation. + + Args: + after_outs(Tensor): Batch of outputs after postnets (B, Lmax, odim). + before_outs(Tensor): Batch of outputs before postnets (B, Lmax, odim). + logits(Tensor): Batch of stop logits (B, Lmax). + ys(Tensor): Batch of padded target features (B, Lmax, odim). + stop_labels(Tensor(int64)): Batch of the sequences of stop token labels (B, Lmax). + olens(Tensor(int64)): + + Returns: + Tensor: L1 loss value. + Tensor: Mean square error loss value. + Tensor: Binary cross entropy loss value. 
+ """ + # make mask and apply it + if self.use_masking: + masks = make_non_pad_mask(olens).unsqueeze(-1) + ys = ys.masked_select(masks.broadcast_to(ys.shape)) + after_outs = after_outs.masked_select( + masks.broadcast_to(after_outs.shape)) + before_outs = before_outs.masked_select( + masks.broadcast_to(before_outs.shape)) + stop_labels = stop_labels.masked_select( + masks[:, :, 0].broadcast_to(stop_labels.shape)) + logits = logits.masked_select( + masks[:, :, 0].broadcast_to(logits.shape)) + + # calculate loss + l1_loss = self.l1_criterion(after_outs, ys) + self.l1_criterion( + before_outs, ys) + mse_loss = self.mse_criterion(after_outs, ys) + self.mse_criterion( + before_outs, ys) + bce_loss = self.bce_criterion(logits, stop_labels) + + # make weighted mask and apply it + if self.use_weighted_masking: + masks = make_non_pad_mask(olens).unsqueeze(-1) + weights = masks.float() / masks.sum(axis=1, keepdim=True).float() + out_weights = weights.divide( + paddle.shape(ys)[0] * paddle.shape(ys)[2]) + logit_weights = weights.divide(paddle.shape(ys)[0]) + + # apply weight + l1_loss = l1_loss.multiply(out_weights) + l1_loss = l1_loss.masked_select(masks.broadcast_to(l1_loss)).sum() + mse_loss = mse_loss.multiply(out_weights) + mse_loss = mse_loss.masked_select( + masks.broadcast_to(mse_loss)).sum() + bce_loss = bce_loss.multiply(logit_weights.squeeze(-1)) + bce_loss = bce_loss.masked_select( + masks.squeeze(-1).broadcast_to(bce_loss)).sum() + + return l1_loss, mse_loss, bce_loss + # Loss for Tacotron2 def attention_guide(dec_lens, enc_lens, N, T, g, dtype=None): @@ -65,28 +472,20 @@ def stft(x, center=True, pad_mode='reflect'): """Perform STFT and convert to magnitude spectrogram. - Parameters - ---------- - x : Tensor - Input signal tensor (B, T). - fft_size : int - FFT size. - hop_size : int - Hop size. - win_length : int - window : str, optional - window : str - Name of window function, see `scipy.signal.get_window` for more - details. Defaults to "hann". 
- center : bool, optional - center (bool, optional): Whether to pad `x` to make that the - :math:`t \times hop\_length` at the center of :math:`t`-th frame. Default: `True`. - pad_mode : str, optional - Choose padding pattern when `center` is `True`. - Returns - ---------- - Tensor: - Magnitude spectrogram (B, #frames, fft_size // 2 + 1). + Args: + x(Tensor): Input signal tensor (B, T). + fft_size(int): FFT size. + hop_size(int): Hop size. + win_length(int, optional): window : str, optional (Default value = None) + window(str, optional): Name of window function, see `scipy.signal.get_window` for more + details. Defaults to "hann". + center(bool, optional, optional): center (bool, optional): Whether to pad `x` to make that the + :math:`t \times hop\\_length` at the center of :math:`t`-th frame. Default: `True`. + pad_mode(str, optional, optional): (Default value = 'reflect') + hop_length: (Default value = None) + + Returns: + Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1). """ # calculate window window = signal.get_window(window, win_length, fftbins=True) @@ -116,16 +515,11 @@ class SpectralConvergenceLoss(nn.Layer): def forward(self, x_mag, y_mag): """Calculate forward propagation. - Parameters - ---------- - x_mag : Tensor - Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). - y_mag : Tensor) - Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). - Returns - ---------- - Tensor - Spectral convergence loss value. + Args: + x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). + y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). + Returns: + Tensor: Spectral convergence loss value. """ return paddle.norm( y_mag - x_mag, p="fro") / paddle.clip( @@ -142,16 +536,11 @@ class LogSTFTMagnitudeLoss(nn.Layer): def forward(self, x_mag, y_mag): """Calculate forward propagation. 
- Parameters - ---------- - x_mag : Tensor - Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). - y_mag : Tensor - Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). - Returns - ---------- - Tensor - Log STFT magnitude loss value. + Args: + x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). + y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). + Returns: + Tensor: Log STFT magnitude loss value. """ return F.l1_loss( paddle.log(paddle.clip(y_mag, min=self.epsilon)), @@ -177,18 +566,12 @@ class STFTLoss(nn.Layer): def forward(self, x, y): """Calculate forward propagation. - Parameters - ---------- - x : Tensor - Predicted signal (B, T). - y : Tensor - Groundtruth signal (B, T). - Returns - ---------- - Tensor - Spectral convergence loss value. - Tensor - Log STFT magnitude loss value. + Args: + x (Tensor): Predicted signal (B, T). + y (Tensor): Groundtruth signal (B, T). + Returns: + Tensor: Spectral convergence loss value. + Tensor: Log STFT magnitude loss value. """ x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, self.window) @@ -210,16 +593,11 @@ class MultiResolutionSTFTLoss(nn.Layer): win_lengths=[600, 1200, 240], window="hann", ): """Initialize Multi resolution STFT loss module. - Parameters - ---------- - fft_sizes : list - List of FFT sizes. - hop_sizes : list - List of hop sizes. - win_lengths : list - List of window lengths. - window : str - Window function type. + Args: + fft_sizes (list): List of FFT sizes. + hop_sizes (list): List of hop sizes. + win_lengths (list): List of window lengths. + window (str): Window function type. """ super().__init__() assert len(fft_sizes) == len(hop_sizes) == len(win_lengths) @@ -229,18 +607,13 @@ class MultiResolutionSTFTLoss(nn.Layer): def forward(self, x, y): """Calculate forward propagation. - Parameters - ---------- - x : Tensor - Predicted signal (B, T) or (B, #subband, T). 
- y : Tensor - Groundtruth signal (B, T) or (B, #subband, T). - Returns - ---------- - Tensor - Multi resolution spectral convergence loss value. - Tensor - Multi resolution log STFT magnitude loss value. + + Args: + x (Tensor): Predicted signal (B, T) or (B, #subband, T). + y (Tensor): Groundtruth signal (B, T) or (B, #subband, T). + Returns: + Tensor: Multi resolution spectral convergence loss value. + Tensor: Multi resolution log STFT magnitude loss value. """ if len(x.shape) == 3: # (B, C, T) -> (B x C, T) @@ -277,14 +650,10 @@ class GeneratorAdversarialLoss(nn.Layer): def forward(self, outputs): """Calcualate generator adversarial loss. - Parameters - ---------- - outputs: Tensor or List - Discriminator outputs or list of discriminator outputs. - Returns - ---------- - Tensor - Generator adversarial loss value. + Args: + outputs (Tensor or List): Discriminator outputs or list of discriminator outputs. + Returns: + Tensor: Generator adversarial loss value. """ if isinstance(outputs, (tuple, list)): adv_loss = 0.0 @@ -324,20 +693,15 @@ class DiscriminatorAdversarialLoss(nn.Layer): def forward(self, outputs_hat, outputs): """Calcualate discriminator adversarial loss. - Parameters - ---------- - outputs_hat : Tensor or list - Discriminator outputs or list of - discriminator outputs calculated from generator outputs. - outputs : Tensor or list - Discriminator outputs or list of - discriminator outputs calculated from groundtruth. - Returns - ---------- - Tensor - Discriminator real loss value. - Tensor - Discriminator fake loss value. + + Args: + outputs_hat (Tensor or list): Discriminator outputs or list of + discriminator outputs calculated from generator outputs. + outputs (Tensor or list): Discriminator outputs or list of + discriminator outputs calculated from groundtruth. + Returns: + Tensor: Discriminator real loss value. + Tensor: Discriminator fake loss value. 
""" if isinstance(outputs, (tuple, list)): real_loss = 0.0 @@ -420,40 +784,32 @@ def ssim(img1, img2, window_size=11, size_average=True): def weighted_mean(input, weight): """Weighted mean. It can also be used as masked mean. - Parameters - ----------- - input : Tensor - The input tensor. - weight : Tensor - The weight tensor with broadcastable shape with the input. - - Returns - ---------- - Tensor [shape=(1,)] - Weighted mean tensor with the same dtype as input. + Args: + input(Tensor): The input tensor. + weight(Tensor): The weight tensor with broadcastable shape with the input. + + Returns: + Tensor: Weighted mean tensor with the same dtype as input. shape=(1,) + """ weight = paddle.cast(weight, input.dtype) - broadcast_ratio = input.size / weight.size + # paddle.Tensor.size is different with torch.size() and has been overrided in s2t.__init__ + broadcast_ratio = input.numel() / weight.numel() return paddle.sum(input * weight) / (paddle.sum(weight) * broadcast_ratio) def masked_l1_loss(prediction, target, mask): """Compute maksed L1 loss. - Parameters - ---------- - prediction : Tensor - The prediction. - target : Tensor - The target. The shape should be broadcastable to ``prediction``. - mask : Tensor - The mask. The shape should be broadcatable to the broadcasted shape of - ``prediction`` and ``target``. - - Returns - ------- - Tensor [shape=(1,)] - The masked L1 loss. + Args: + prediction(Tensor): The prediction. + target(Tensor): The target. The shape should be broadcastable to ``prediction``. + mask(Tensor): The mask. The shape should be broadcatable to the broadcasted shape of + ``prediction`` and ``target``. + + Returns: + Tensor: The masked L1 loss. shape=(1,) + """ abs_error = F.l1_loss(prediction, target, reduction='none') loss = weighted_mean(abs_error, mask) @@ -526,14 +882,11 @@ class MelSpectrogram(nn.Layer): def forward(self, x): """Calculate Mel-spectrogram. - Parameters - ---------- - x : Tensor - Input waveform tensor (B, T) or (B, 1, T). 
- Returns - ---------- - Tensor - Mel-spectrogram (B, #mels, #frames). + Args: + + x (Tensor): Input waveform tensor (B, T) or (B, 1, T). + Returns: + Tensor: Mel-spectrogram (B, #mels, #frames). """ if len(x.shape) == 3: # (B, C, T) -> (B*C, T) @@ -598,16 +951,12 @@ class MelSpectrogramLoss(nn.Layer): def forward(self, y_hat, y): """Calculate Mel-spectrogram loss. - Parameters - ---------- - y_hat : Tensor - Generated single tensor (B, 1, T). - y : Tensor - Groundtruth single tensor (B, 1, T). - Returns - ---------- - Tensor - Mel-spectrogram loss value. + Args: + y_hat(Tensor): Generated single tensor (B, 1, T). + y(Tensor): Groundtruth single tensor (B, 1, T). + + Returns: + Tensor: Mel-spectrogram loss value. """ mel_hat = self.mel_spectrogram(y_hat) mel = self.mel_spectrogram(y) @@ -632,18 +981,14 @@ class FeatureMatchLoss(nn.Layer): def forward(self, feats_hat, feats): """Calcualate feature matching loss. - Parameters - ---------- - feats_hat : list - List of list of discriminator outputs - calcuated from generater outputs. - feats : list - List of list of discriminator outputs - calcuated from groundtruth. - Returns - ---------- - Tensor - Feature matching loss value. + + Args: + feats_hat(list): List of list of discriminator outputs + calcuated from generater outputs. + feats(list): List of list of discriminator outputs + + Returns: + Tensor: Feature matching loss value. """ feat_match_loss = 0.0 diff --git a/paddlespeech/t2s/modules/nets_utils.py b/paddlespeech/t2s/modules/nets_utils.py index 3822b33d..4207d316 100644 --- a/paddlespeech/t2s/modules/nets_utils.py +++ b/paddlespeech/t2s/modules/nets_utils.py @@ -20,27 +20,21 @@ from typeguard import check_argument_types def pad_list(xs, pad_value): """Perform padding for the list of tensors. - Parameters - ---------- - xs : List[Tensor] - List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. - pad_value : float) - Value for padding. - - Returns - ---------- - Tensor - Padded tensor (B, Tmax, `*`). 
- - Examples - ---------- - >>> x = [paddle.ones([4]), paddle.ones([2]), paddle.ones([1])] - >>> x - [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])] - >>> pad_list(x, 0) - tensor([[1., 1., 1., 1.], - [1., 1., 0., 0.], - [1., 0., 0., 0.]]) + Args: + xs (List[Tensor]): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. + pad_value (float): Value for padding. + + Returns: + Tensor: Padded tensor (B, Tmax, `*`). + + Examples: + >>> x = [paddle.ones([4]), paddle.ones([2]), paddle.ones([1])] + >>> x + [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])] + >>> pad_list(x, 0) + tensor([[1., 1., 1., 1.], + [1., 1., 0., 0.], + [1., 0., 0., 0.]]) """ n_batch = len(xs) max_len = max(x.shape[0] for x in xs) @@ -55,25 +49,20 @@ def pad_list(xs, pad_value): def make_pad_mask(lengths, length_dim=-1): """Make mask tensor containing indices of padded part. - Parameters - ---------- - lengths : LongTensor - Batch of lengths (B,). - - Returns - ---------- - Tensor(bool) - Mask tensor containing indices of padded part bool. - - Examples - ---------- - With only lengths. - - >>> lengths = [5, 3, 2] - >>> make_non_pad_mask(lengths) - masks = [[0, 0, 0, 0 ,0], - [0, 0, 0, 1, 1], - [0, 0, 1, 1, 1]] + Args: + lengths (Tensor(int64)): Batch of lengths (B,). + + Returns: + Tensor(bool): Mask tensor containing indices of padded part bool. + + Examples: + With only lengths. + + >>> lengths = [5, 3, 2] + >>> make_non_pad_mask(lengths) + masks = [[0, 0, 0, 0 ,0], + [0, 0, 0, 1, 1], + [0, 0, 1, 1, 1]] """ if length_dim == 0: raise ValueError("length_dim cannot be 0: {}".format(length_dim)) @@ -91,31 +80,24 @@ def make_pad_mask(lengths, length_dim=-1): def make_non_pad_mask(lengths, length_dim=-1): """Make mask tensor containing indices of non-padded part. - Parameters - ---------- - lengths : LongTensor or List - Batch of lengths (B,). - xs : Tensor, optional - The reference tensor. - If set, masks will be the same shape as this tensor. 
- length_dim : int, optional - Dimension indicator of the above tensor. - See the example. - - Returns - ---------- - Tensor(bool) - mask tensor containing indices of padded part bool. - - Examples - ---------- - With only lengths. - - >>> lengths = [5, 3, 2] - >>> make_non_pad_mask(lengths) - masks = [[1, 1, 1, 1 ,1], - [1, 1, 1, 0, 0], - [1, 1, 0, 0, 0]] + Args: + lengths (Tensor(int64) or List): Batch of lengths (B,). + xs (Tensor, optional): The reference tensor. + If set, masks will be the same shape as this tensor. + length_dim (int, optional): Dimension indicator of the above tensor. + See the example. + + Returns: + Tensor(bool): mask tensor containing indices of padded part bool. + + Examples: + With only lengths. + + >>> lengths = [5, 3, 2] + >>> make_non_pad_mask(lengths) + masks = [[1, 1, 1, 1 ,1], + [1, 1, 1, 0, 0], + [1, 1, 0, 0, 0]] """ return paddle.logical_not(make_pad_mask(lengths, length_dim)) @@ -127,12 +109,9 @@ def initialize(model: nn.Layer, init: str): Custom initialization routines can be implemented into submodules - Parameters - ---------- - model : nn.Layer - Target. - init : str - Method of initialization. + Args: + model (nn.Layer): Target. + init (str): Method of initialization. """ assert check_argument_types() diff --git a/paddlespeech/t2s/modules/pqmf.py b/paddlespeech/t2s/modules/pqmf.py index fb850a4d..9860da90 100644 --- a/paddlespeech/t2s/modules/pqmf.py +++ b/paddlespeech/t2s/modules/pqmf.py @@ -24,20 +24,16 @@ def design_prototype_filter(taps=62, cutoff_ratio=0.142, beta=9.0): """Design prototype filter for PQMF. This method is based on `A Kaiser window approach for the design of prototype filters of cosine modulated filterbanks`_. - Parameters - ---------- - taps : int - The number of filter taps. - cutoff_ratio : float - Cut-off frequency ratio. - beta : float - Beta coefficient for kaiser window. - Returns - ---------- - ndarray - Impluse response of prototype filter (taps + 1,). - .. 
_`A Kaiser window approach for the design of prototype filters of cosine modulated filterbanks`: - https://ieeexplore.ieee.org/abstract/document/681427 + + Args: + taps (int): The number of filter taps. + cutoff_ratio (float): Cut-off frequency ratio. + beta (float): Beta coefficient for kaiser window. + Returns: + ndarray: + Impluse response of prototype filter (taps + 1,). + .. _`A Kaiser window approach for the design of prototype filters of cosine modulated filterbanks`: + https://ieeexplore.ieee.org/abstract/document/681427 """ # check the arguments are valid assert taps % 2 == 0, "The number of taps mush be even number." @@ -68,16 +64,12 @@ class PQMF(nn.Layer): """Initilize PQMF module. The cutoff_ratio and beta parameters are optimized for #subbands = 4. See dicussion in https://github.com/kan-bayashi/ParallelWaveGAN/issues/195. - Parameters - ---------- - subbands : int - The number of subbands. - taps : int - The number of filter taps. - cutoff_ratio : float - Cut-off frequency ratio. - beta : float - Beta coefficient for kaiser window. + + Args: + subbands (int): The number of subbands. + taps (int): The number of filter taps. + cutoff_ratio (float): Cut-off frequency ratio. + beta (float): Beta coefficient for kaiser window. """ super().__init__() @@ -110,28 +102,20 @@ class PQMF(nn.Layer): def analysis(self, x): """Analysis with PQMF. - Parameters - ---------- - x : Tensor - Input tensor (B, 1, T). - Returns - ---------- - Tensor - Output tensor (B, subbands, T // subbands). + Args: + x (Tensor): Input tensor (B, 1, T). + Returns: + Tensor: Output tensor (B, subbands, T // subbands). """ x = F.conv1d(self.pad_fn(x), self.analysis_filter) return F.conv1d(x, self.updown_filter, stride=self.subbands) def synthesis(self, x): """Synthesis with PQMF. - Parameters - ---------- - x : Tensor - Input tensor (B, subbands, T // subbands). - Returns - ---------- - Tensor - Output tensor (B, 1, T). + Args: + x (Tensor): Input tensor (B, subbands, T // subbands). 
+ Returns: + Tensor: Output tensor (B, 1, T). """ x = F.conv1d_transpose( x, self.updown_filter * self.subbands, stride=self.subbands) diff --git a/paddlespeech/t2s/modules/predictor/duration_predictor.py b/paddlespeech/t2s/modules/predictor/duration_predictor.py index 6b7c6a6b..33ed575b 100644 --- a/paddlespeech/t2s/modules/predictor/duration_predictor.py +++ b/paddlespeech/t2s/modules/predictor/duration_predictor.py @@ -49,20 +49,13 @@ class DurationPredictor(nn.Layer): offset=1.0): """Initilize duration predictor module. - Parameters - ---------- - idim : int - Input dimension. - n_layers : int, optional - Number of convolutional layers. - n_chans : int, optional - Number of channels of convolutional layers. - kernel_size : int, optional - Kernel size of convolutional layers. - dropout_rate : float, optional - Dropout rate. - offset : float, optional - Offset value to avoid nan in log domain. + Args: + idim (int):Input dimension. + n_layers (int, optional): Number of convolutional layers. + n_chans (int, optional): Number of channels of convolutional layers. + kernel_size (int, optional): Kernel size of convolutional layers. + dropout_rate (float, optional): Dropout rate. + offset (float, optional): Offset value to avoid nan in log domain. """ super().__init__() @@ -105,35 +98,23 @@ class DurationPredictor(nn.Layer): def forward(self, xs, x_masks=None): """Calculate forward propagation. + Args: + xs(Tensor): Batch of input sequences (B, Tmax, idim). + x_masks(ByteTensor, optional, optional): Batch of masks indicating padded part (B, Tmax). (Default value = None) - Parameters - ---------- - xs : Tensor - Batch of input sequences (B, Tmax, idim). - x_masks : ByteTensor, optional - Batch of masks indicating padded part (B, Tmax). - - Returns - ---------- - Tensor - Batch of predicted durations in log domain (B, Tmax). + Returns: + Tensor: Batch of predicted durations in log domain (B, Tmax). 
""" return self._forward(xs, x_masks, False) def inference(self, xs, x_masks=None): """Inference duration. + Args: + xs(Tensor): Batch of input sequences (B, Tmax, idim). + x_masks(Tensor(bool), optional, optional): Batch of masks indicating padded part (B, Tmax). (Default value = None) - Parameters - ---------- - xs : Tensor - Batch of input sequences (B, Tmax, idim). - x_masks : Tensor(bool), optional - Batch of masks indicating padded part (B, Tmax). - - Returns - ---------- - Tensor - Batch of predicted durations in linear domain int64 (B, Tmax). + Returns: + Tensor: Batch of predicted durations in linear domain int64 (B, Tmax). """ return self._forward(xs, x_masks, True) @@ -147,13 +128,9 @@ class DurationPredictorLoss(nn.Layer): def __init__(self, offset=1.0, reduction="mean"): """Initilize duration predictor loss module. - - Parameters - ---------- - offset : float, optional - Offset value to avoid nan in log domain. - reduction : str - Reduction type in loss calculation. + Args: + offset (float, optional): Offset value to avoid nan in log domain. + reduction (str): Reduction type in loss calculation. """ super().__init__() self.criterion = nn.MSELoss(reduction=reduction) @@ -162,21 +139,15 @@ class DurationPredictorLoss(nn.Layer): def forward(self, outputs, targets): """Calculate forward propagation. - Parameters - ---------- - outputs : Tensor - Batch of prediction durations in log domain (B, T) - targets : Tensor - Batch of groundtruth durations in linear domain (B, T) - - Returns - ---------- - Tensor - Mean squared error loss value. - - Note - ---------- - `outputs` is in log domain but `targets` is in linear domain. + Args: + outputs(Tensor): Batch of prediction durations in log domain (B, T) + targets(Tensor): Batch of groundtruth durations in linear domain (B, T) + + Returns: + Tensor: Mean squared error loss value. + + Note: + `outputs` is in log domain but `targets` is in linear domain. 
""" # NOTE: outputs is in log domain while targets in linear targets = paddle.log(targets.cast(dtype='float32') + self.offset) diff --git a/paddlespeech/t2s/modules/predictor/length_regulator.py b/paddlespeech/t2s/modules/predictor/length_regulator.py index f1ecfb7c..62d707d2 100644 --- a/paddlespeech/t2s/modules/predictor/length_regulator.py +++ b/paddlespeech/t2s/modules/predictor/length_regulator.py @@ -13,6 +13,7 @@ # limitations under the License. # Modified from espnet(https://github.com/espnet/espnet) """Length regulator related modules.""" +import numpy as np import paddle from paddle import nn @@ -34,15 +35,35 @@ class LengthRegulator(nn.Layer): def __init__(self, pad_value=0.0): """Initilize length regulator module. - Parameters - ---------- - pad_value : float, optional - Value used for padding. + Args: + pad_value (float, optional): Value used for padding. """ super().__init__() self.pad_value = pad_value + # expand_numpy is faster than expand + def expand_numpy(self, encodings: paddle.Tensor, + durations: paddle.Tensor) -> paddle.Tensor: + """ + encodings: (B, T, C) + durations: (B, T) + """ + batch_size, t_enc = durations.shape + durations = durations.numpy() + slens = np.sum(durations, -1) + t_dec = np.max(slens) + M = np.zeros([batch_size, t_dec, t_enc]) + for i in range(batch_size): + k = 0 + for j in range(t_enc): + d = durations[i, j] + M[i, k:k + d, j] = 1 + k += d + M = paddle.to_tensor(M, dtype=encodings.dtype) + encodings = paddle.matmul(M, encodings) + return encodings + def expand(self, encodings: paddle.Tensor, durations: paddle.Tensor) -> paddle.Tensor: """ @@ -50,39 +71,37 @@ class LengthRegulator(nn.Layer): durations: (B, T) """ batch_size, t_enc = paddle.shape(durations) - slens = durations.sum(-1) - t_dec = slens.max() + slens = paddle.sum(durations, -1) + t_dec = paddle.max(slens) M = paddle.zeros([batch_size, t_dec, t_enc]) for i in range(batch_size): k = 0 for j in range(t_enc): d = durations[i, j] + # If the d == 0, slice action 
is meaningless and not supported in paddle if d >= 1: M[i, k:k + d, j] = 1 k += d encodings = paddle.matmul(M, encodings) return encodings - def forward(self, xs, ds, alpha=1.0): + def forward(self, xs, ds, alpha=1.0, is_inference=False): """Calculate forward propagation. - Parameters - ---------- - xs : Tensor - Batch of sequences of char or phoneme embeddings (B, Tmax, D). - ds : Tensor(int64) - Batch of durations of each frame (B, T). - alpha : float, optional - Alpha value to control speed of speech. + Args: + xs (Tensor): Batch of sequences of char or phoneme embeddings (B, Tmax, D). + ds (Tensor(int64)): Batch of durations of each frame (B, T). + alpha (float, optional): Alpha value to control speed of speech. - Returns - ---------- - Tensor - replicated input tensor based on durations (B, T*, D). + Returns: + Tensor: replicated input tensor based on durations (B, T*, D). """ if alpha != 1.0: assert alpha > 0 ds = paddle.round(ds.cast(dtype=paddle.float32) * alpha) ds = ds.cast(dtype=paddle.int64) - return self.expand(xs, ds) + if is_inference: + return self.expand(xs, ds) + else: + return self.expand_numpy(xs, ds) diff --git a/paddlespeech/t2s/modules/predictor/variance_predictor.py b/paddlespeech/t2s/modules/predictor/variance_predictor.py index 417fca82..8afbf257 100644 --- a/paddlespeech/t2s/modules/predictor/variance_predictor.py +++ b/paddlespeech/t2s/modules/predictor/variance_predictor.py @@ -42,18 +42,12 @@ class VariancePredictor(nn.Layer): dropout_rate: float=0.5, ): """Initilize duration predictor module. - Parameters - ---------- - idim : int - Input dimension. - n_layers : int, optional - Number of convolutional layers. - n_chans : int, optional - Number of channels of convolutional layers. - kernel_size : int, optional - Kernel size of convolutional layers. - dropout_rate : float, optional - Dropout rate. + Args: + idim (int): Input dimension. + n_layers (int, optional): Number of convolutional layers. 
+ n_chans (int, optional): Number of channels of convolutional layers. + kernel_size (int, optional): Kernel size of convolutional layers. + dropout_rate (float, optional): Dropout rate. """ assert check_argument_types() super().__init__() @@ -79,17 +73,12 @@ class VariancePredictor(nn.Layer): x_masks: paddle.Tensor=None) -> paddle.Tensor: """Calculate forward propagation. - Parameters - ---------- - xs : Tensor - Batch of input sequences (B, Tmax, idim). - x_masks : Tensor(bool), optional - Batch of masks indicating padded part (B, Tmax, 1). + Args: + xs (Tensor): Batch of input sequences (B, Tmax, idim). + x_masks (Tensor(bool), optional): Batch of masks indicating padded part (B, Tmax, 1). - Returns - ---------- - Tensor - Batch of predicted sequences (B, Tmax, 1). + Returns: + Tensor: Batch of predicted sequences (B, Tmax, 1). """ # (B, idim, Tmax) xs = xs.transpose([0, 2, 1]) diff --git a/paddlespeech/t2s/modules/residual_block.py b/paddlespeech/t2s/modules/residual_block.py index a96a8946..efbfce27 100644 --- a/paddlespeech/t2s/modules/residual_block.py +++ b/paddlespeech/t2s/modules/residual_block.py @@ -28,26 +28,16 @@ class WaveNetResidualBlock(nn.Layer): unit and parametric redidual and skip connections. For more details, refer to `WaveNet: A Generative Model for Raw Audio `_. - Parameters - ---------- - kernel_size : int, optional - Kernel size of the 1D convolution, by default 3 - residual_channels : int, optional - Feature size of the resiaudl output(and also the input), by default 64 - gate_channels : int, optional - Output feature size of the 1D convolution, by default 128 - skip_channels : int, optional - Feature size of the skip output, by default 64 - aux_channels : int, optional - Feature size of the auxiliary input (e.g. spectrogram), by default 80 - dropout : float, optional - Probability of the dropout before the 1D convolution, by default 0. 
- dilation : int, optional - Dilation of the 1D convolution, by default 1 - bias : bool, optional - Whether to use bias in the 1D convolution, by default True - use_causal_conv : bool, optional - Whether to use causal padding for the 1D convolution, by default False + Args: + kernel_size (int, optional): Kernel size of the 1D convolution, by default 3 + residual_channels (int, optional): Feature size of the resiaudl output(and also the input), by default 64 + gate_channels (int, optional): Output feature size of the 1D convolution, by default 128 + skip_channels (int, optional): Feature size of the skip output, by default 64 + aux_channels (int, optional): Feature size of the auxiliary input (e.g. spectrogram), by default 80 + dropout (float, optional): Probability of the dropout before the 1D convolution, by default 0. + dilation (int, optional): Dilation of the 1D convolution, by default 1 + bias (bool, optional): Whether to use bias in the 1D convolution, by default True + use_causal_conv (bool, optional): Whether to use causal padding for the 1D convolution, by default False """ def __init__(self, @@ -90,21 +80,15 @@ class WaveNetResidualBlock(nn.Layer): def forward(self, x, c): """ - Parameters - ---------- - x : Tensor - Shape (N, C_res, T), the input features. - c : Tensor - Shape (N, C_aux, T), the auxiliary input. - - Returns - ------- - res : Tensor - Shape (N, C_res, T), the residual output, which is used as the - input of the next ResidualBlock in a stack of ResidualBlocks. - skip : Tensor - Shape (N, C_skip, T), the skip output, which is collected among - each layer in a stack of ResidualBlocks. + Args: + x (Tensor): the input features. Shape (N, C_res, T) + c (Tensor): the auxiliary input. Shape (N, C_aux, T) + + Returns: + res (Tensor): Shape (N, C_res, T), the residual output, which is used as the + input of the next ResidualBlock in a stack of ResidualBlocks. 
+ skip (Tensor): Shape (N, C_skip, T), the skip output, which is collected among + each layer in a stack of ResidualBlocks. """ x_input = x x = F.dropout(x, self.dropout, training=self.training) @@ -136,22 +120,14 @@ class HiFiGANResidualBlock(nn.Layer): nonlinear_activation_params: Dict[str, Any]={"negative_slope": 0.1}, ): """Initialize HiFiGANResidualBlock module. - Parameters - ---------- - kernel_size : int - Kernel size of dilation convolution layer. - channels : int - Number of channels for convolution layer. - dilations : List[int] - List of dilation factors. - use_additional_convs : bool - Whether to use additional convolution layers. - bias : bool - Whether to add bias parameter in convolution layers. - nonlinear_activation : str - Activation function module name. - nonlinear_activation_params : dict - Hyperparameters for activation function. + Args: + kernel_size (int): Kernel size of dilation convolution layer. + channels (int): Number of channels for convolution layer. + dilations (List[int]): List of dilation factors. + use_additional_convs (bool): Whether to use additional convolution layers. + bias (bool): Whether to add bias parameter in convolution layers. + nonlinear_activation (str): Activation function module name. + nonlinear_activation_params (dict): Hyperparameters for activation function. """ super().__init__() @@ -190,14 +166,10 @@ class HiFiGANResidualBlock(nn.Layer): def forward(self, x): """Calculate forward propagation. - Parameters - ---------- - x : Tensor - Input tensor (B, channels, T). - Returns - ---------- - Tensor - Output tensor (B, channels, T). + Args: + x (Tensor): Input tensor (B, channels, T). + Returns: + Tensor: Output tensor (B, channels, T). 
""" for idx in range(len(self.convs1)): xt = self.convs1[idx](x) diff --git a/paddlespeech/t2s/modules/residual_stack.py b/paddlespeech/t2s/modules/residual_stack.py index c885dfe9..0d949b56 100644 --- a/paddlespeech/t2s/modules/residual_stack.py +++ b/paddlespeech/t2s/modules/residual_stack.py @@ -37,26 +37,17 @@ class ResidualStack(nn.Layer): pad_params: Dict[str, Any]={"mode": "reflect"}, use_causal_conv: bool=False, ): """Initialize ResidualStack module. - Parameters - ---------- - kernel_size : int - Kernel size of dilation convolution layer. - channels : int - Number of channels of convolution layers. - dilation : int - Dilation factor. - bias : bool - Whether to add bias parameter in convolution layers. - nonlinear_activation : str - Activation function module name. - nonlinear_activation_params : Dict[str,Any] - Hyperparameters for activation function. - pad : str - Padding function module name before dilated convolution layer. - pad_params : Dict[str, Any] - Hyperparameters for padding function. - use_causal_conv : bool - Whether to use causal convolution. + + Args: + kernel_size (int): Kernel size of dilation convolution layer. + channels (int): Number of channels of convolution layers. + dilation (int): Dilation factor. + bias (bool): Whether to add bias parameter in convolution layers. + nonlinear_activation (str): Activation function module name. + nonlinear_activation_params (Dict[str,Any]): Hyperparameters for activation function. + pad (str): Padding function module name before dilated convolution layer. + pad_params (Dict[str, Any]): Hyperparameters for padding function. + use_causal_conv (bool): Whether to use causal convolution. """ super().__init__() # for compatibility @@ -102,13 +93,10 @@ class ResidualStack(nn.Layer): def forward(self, c): """Calculate forward propagation. - Parameters - ---------- - c : Tensor - Input tensor (B, channels, T). - Returns - ---------- - Tensor - Output tensor (B, chennels, T). 
+ + Args: + c (Tensor): Input tensor (B, channels, T). + Returns: + Tensor: Output tensor (B, chennels, T). """ return self.stack(c) + self.skip_layer(c) diff --git a/paddlespeech/t2s/modules/style_encoder.py b/paddlespeech/t2s/modules/style_encoder.py index 9d4b83a2..49091eac 100644 --- a/paddlespeech/t2s/modules/style_encoder.py +++ b/paddlespeech/t2s/modules/style_encoder.py @@ -30,33 +30,21 @@ class StyleEncoder(nn.Layer): .. _`Style Tokens: Unsupervised Style Modeling, Control and Transfer in End-to-End Speech Synthesis`: https://arxiv.org/abs/1803.09017 - - Parameters - ---------- - idim : int, optional - Dimension of the input mel-spectrogram. - gst_tokens : int, optional - The number of GST embeddings. - gst_token_dim : int, optional - Dimension of each GST embedding. - gst_heads : int, optional - The number of heads in GST multihead attention. - conv_layers : int, optional - The number of conv layers in the reference encoder. - conv_chans_list : Sequence[int], optional - List of the number of channels of conv layers in the referece encoder. - conv_kernel_size : int, optional - Kernal size of conv layers in the reference encoder. - conv_stride : int, optional - Stride size of conv layers in the reference encoder. - gru_layers : int, optional - The number of GRU layers in the reference encoder. - gru_units : int, optional - The number of GRU units in the reference encoder. - - Todo - ---------- - * Support manual weight specification in inference. + + Args: + idim (int, optional): Dimension of the input mel-spectrogram. + gst_tokens (int, optional): The number of GST embeddings. + gst_token_dim (int, optional): Dimension of each GST embedding. + gst_heads (int, optional): The number of heads in GST multihead attention. + conv_layers (int, optional): The number of conv layers in the reference encoder. + conv_chans_list (Sequence[int], optional): List of the number of channels of conv layers in the referece encoder. 
+ conv_kernel_size (int, optional): Kernal size of conv layers in the reference encoder. + conv_stride (int, optional): Stride size of conv layers in the reference encoder. + gru_layers (int, optional): The number of GRU layers in the reference encoder. + gru_units (int, optional):The number of GRU units in the reference encoder. + + Todo: + * Support manual weight specification in inference. """ @@ -93,15 +81,11 @@ class StyleEncoder(nn.Layer): def forward(self, speech: paddle.Tensor) -> paddle.Tensor: """Calculate forward propagation. - Parameters - ---------- - speech : Tensor - Batch of padded target features (B, Lmax, odim). + Args: + speech (Tensor): Batch of padded target features (B, Lmax, odim). - Returns - ---------- - Tensor: - Style token embeddings (B, token_dim). + Returns: + Tensor: Style token embeddings (B, token_dim). """ ref_embs = self.ref_enc(speech) @@ -118,23 +102,15 @@ class ReferenceEncoder(nn.Layer): .. _`Style Tokens: Unsupervised Style Modeling, Control and Transfer in End-to-End Speech Synthesis`: https://arxiv.org/abs/1803.09017 - - Parameters - ---------- - idim : int, optional - Dimension of the input mel-spectrogram. - conv_layers : int, optional - The number of conv layers in the reference encoder. - conv_chans_list: : Sequence[int], optional - List of the number of channels of conv layers in the referece encoder. - conv_kernel_size : int, optional - Kernal size of conv layers in the reference encoder. - conv_stride : int, optional - Stride size of conv layers in the reference encoder. - gru_layers : int, optional - The number of GRU layers in the reference encoder. - gru_units : int, optional - The number of GRU units in the reference encoder. + + Args: + idim (int, optional): Dimension of the input mel-spectrogram. + conv_layers (int, optional): The number of conv layers in the reference encoder. + conv_chans_list: (Sequence[int], optional): List of the number of channels of conv layers in the referece encoder. 
+ conv_kernel_size (int, optional): Kernal size of conv layers in the reference encoder. + conv_stride (int, optional): Stride size of conv layers in the reference encoder. + gru_layers (int, optional): The number of GRU layers in the reference encoder. + gru_units (int, optional): The number of GRU units in the reference encoder. """ @@ -191,16 +167,11 @@ class ReferenceEncoder(nn.Layer): def forward(self, speech: paddle.Tensor) -> paddle.Tensor: """Calculate forward propagation. + Args: + speech (Tensor): Batch of padded target features (B, Lmax, idim). - Parameters - ---------- - speech : Tensor - Batch of padded target features (B, Lmax, idim). - - Return - ---------- - Tensor - Reference embedding (B, gru_units) + Returns: + Tensor: Reference embedding (B, gru_units) """ batch_size = speech.shape[0] @@ -228,19 +199,12 @@ class StyleTokenLayer(nn.Layer): .. _`Style Tokens: Unsupervised Style Modeling, Control and Transfer in End-to-End Speech Synthesis`: https://arxiv.org/abs/1803.09017 - - Parameters - ---------- - ref_embed_dim : int, optional - Dimension of the input reference embedding. - gst_tokens : int, optional - The number of GST embeddings. - gst_token_dim : int, optional - Dimension of each GST embedding. - gst_heads : int, optional - The number of heads in GST multihead attention. - dropout_rate : float, optional - Dropout rate in multi-head attention. + Args: + ref_embed_dim (int, optional): Dimension of the input reference embedding. + gst_tokens (int, optional): The number of GST embeddings. + gst_token_dim (int, optional): Dimension of each GST embedding. + gst_heads (int, optional): The number of heads in GST multihead attention. + dropout_rate (float, optional): Dropout rate in multi-head attention. """ @@ -271,15 +235,11 @@ class StyleTokenLayer(nn.Layer): def forward(self, ref_embs: paddle.Tensor) -> paddle.Tensor: """Calculate forward propagation. - Parameters - ---------- - ref_embs : Tensor - Reference embeddings (B, ref_embed_dim). 
+ Args: + ref_embs (Tensor): Reference embeddings (B, ref_embed_dim). - Returns - ---------- - Tensor - Style token embeddings (B, gst_token_dim). + Returns: + Tensor: Style token embeddings (B, gst_token_dim). """ batch_size = ref_embs.shape[0] diff --git a/paddlespeech/t2s/modules/tacotron2/attentions.py b/paddlespeech/t2s/modules/tacotron2/attentions.py new file mode 100644 index 00000000..a6fde742 --- /dev/null +++ b/paddlespeech/t2s/modules/tacotron2/attentions.py @@ -0,0 +1,454 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Attention modules for RNN.""" +import paddle +import paddle.nn.functional as F +from paddle import nn + +from paddlespeech.t2s.modules.masked_fill import masked_fill +from paddlespeech.t2s.modules.nets_utils import make_pad_mask + + +def _apply_attention_constraint(e, + last_attended_idx, + backward_window=1, + forward_window=3): + """Apply monotonic attention constraint. + + This function apply the monotonic attention constraint + introduced in `Deep Voice 3: Scaling + Text-to-Speech with Convolutional Sequence Learning`_. + + Args: + e(Tensor): Attention energy before applying softmax (1, T). + last_attended_idx(int): The index of the inputs of the last attended [0, T]. + backward_window(int, optional, optional): Backward window size in attention constraint. (Default value = 1) + forward_window(int, optional, optional): Forward window size in attetion constraint. 
(Default value = 3) + + Returns: + Tensor: Monotonic constrained attention energy (1, T). + + .. _`Deep Voice 3: Scaling Text-to-Speech with Convolutional Sequence Learning`: + https://arxiv.org/abs/1710.07654 + + """ + if paddle.shape(e)[0] != 1: + raise NotImplementedError( + "Batch attention constraining is not yet supported.") + backward_idx = last_attended_idx - backward_window + forward_idx = last_attended_idx + forward_window + if backward_idx > 0: + e[:, :backward_idx] = -float("inf") + if forward_idx < paddle.shape(e)[1]: + e[:, forward_idx:] = -float("inf") + return e + + +class AttLoc(nn.Layer): + """location-aware attention module. + + Reference: Attention-Based Models for Speech Recognition + (https://arxiv.org/pdf/1506.07503.pdf) + + Args: + eprojs (int): projection-units of encoder + dunits (int): units of decoder + att_dim (int): attention dimension + aconv_chans (int): channels of attention convolution + aconv_filts (int): filter size of attention convolution + han_mode (bool): flag to swith on mode of hierarchical attention and not store pre_compute_enc_h + """ + + def __init__(self, + eprojs, + dunits, + att_dim, + aconv_chans, + aconv_filts, + han_mode=False): + super().__init__() + self.mlp_enc = nn.Linear(eprojs, att_dim) + self.mlp_dec = nn.Linear(dunits, att_dim, bias_attr=False) + self.mlp_att = nn.Linear(aconv_chans, att_dim, bias_attr=False) + self.loc_conv = nn.Conv2D( + 1, + aconv_chans, + (1, 2 * aconv_filts + 1), + padding=(0, aconv_filts), + bias_attr=False, ) + self.gvec = nn.Linear(att_dim, 1) + + self.dunits = dunits + self.eprojs = eprojs + self.att_dim = att_dim + self.h_length = None + self.enc_h = None + self.pre_compute_enc_h = None + self.mask = None + self.han_mode = han_mode + + def reset(self): + """reset states""" + self.h_length = None + self.enc_h = None + self.pre_compute_enc_h = None + self.mask = None + + def forward( + self, + enc_hs_pad, + enc_hs_len, + dec_z, + att_prev, + scaling=2.0, + last_attended_idx=None, + 
backward_window=1, + forward_window=3, ): + """Calculate AttLoc forward propagation. + Args: + enc_hs_pad(Tensor): padded encoder hidden state (B, T_max, D_enc) + enc_hs_len(Tensor): padded encoder hidden state length (B) + dec_z(Tensor dec_z): decoder hidden state (B, D_dec) + att_prev(Tensor): previous attention weight (B, T_max) + scaling(float, optional): scaling parameter before applying softmax (Default value = 2.0) + forward_window(Tensor, optional): forward window size when constraining attention (Default value = 3) + last_attended_idx(int, optional): index of the inputs of the last attended (Default value = None) + backward_window(int, optional): backward window size in attention constraint (Default value = 1) + forward_window(int, optional): forward window size in attetion constraint (Default value = 3) + Returns: + Tensor: attention weighted encoder state (B, D_enc) + Tensor: previous attention weights (B, T_max) + """ + batch = paddle.shape(enc_hs_pad)[0] + # pre-compute all h outside the decoder loop + if self.pre_compute_enc_h is None or self.han_mode: + # (utt, frame, hdim) + self.enc_h = enc_hs_pad + self.h_length = paddle.shape(self.enc_h)[1] + # (utt, frame, att_dim) + self.pre_compute_enc_h = self.mlp_enc(self.enc_h) + + if dec_z is None: + dec_z = paddle.zeros([batch, self.dunits]) + else: + dec_z = dec_z.reshape([batch, self.dunits]) + + # initialize attention weight with uniform dist. 
+ if paddle.sum(att_prev) == 0: + # if no bias, 0 0-pad goes 0 + att_prev = 1.0 - make_pad_mask(enc_hs_len) + att_prev = att_prev / enc_hs_len.unsqueeze(-1) + + # att_prev: (utt, frame) -> (utt, 1, 1, frame) + # -> (utt, att_conv_chans, 1, frame) + att_conv = self.loc_conv(att_prev.reshape([batch, 1, 1, self.h_length])) + # att_conv: (utt, att_conv_chans, 1, frame) -> (utt, frame, att_conv_chans) + att_conv = att_conv.squeeze(2).transpose([0, 2, 1]) + # att_conv: (utt, frame, att_conv_chans) -> (utt, frame, att_dim) + att_conv = self.mlp_att(att_conv) + # dec_z_tiled: (utt, frame, att_dim) + dec_z_tiled = self.mlp_dec(dec_z).reshape([batch, 1, self.att_dim]) + + # dot with gvec + # (utt, frame, att_dim) -> (utt, frame) + e = paddle.tanh(att_conv + self.pre_compute_enc_h + dec_z_tiled) + e = self.gvec(e).squeeze(2) + + # NOTE: consider zero padding when compute w. + if self.mask is None: + self.mask = make_pad_mask(enc_hs_len) + + e = masked_fill(e, self.mask, -float("inf")) + # apply monotonic attention constraint (mainly for TTS) + if last_attended_idx is not None: + e = _apply_attention_constraint(e, last_attended_idx, + backward_window, forward_window) + + w = F.softmax(scaling * e, axis=1) + + # weighted sum over frames + # utt x hdim + c = paddle.sum( + self.enc_h * w.reshape([batch, self.h_length, 1]), axis=1) + return c, w + + +class AttForward(nn.Layer): + """Forward attention module. 
+ Reference + ---------- + Forward attention in sequence-to-sequence acoustic modeling for speech synthesis + (https://arxiv.org/pdf/1807.06736.pdf) + + Args: + eprojs (int): projection-units of encoder + dunits (int): units of decoder + att_dim (int): attention dimension + aconv_chans (int): channels of attention convolution + aconv_filts (int): filter size of attention convolution + """ + + def __init__(self, eprojs, dunits, att_dim, aconv_chans, aconv_filts): + super().__init__() + self.mlp_enc = nn.Linear(eprojs, att_dim) + self.mlp_dec = nn.Linear(dunits, att_dim, bias_attr=False) + self.mlp_att = nn.Linear(aconv_chans, att_dim, bias_attr=False) + self.loc_conv = nn.Conv2D( + 1, + aconv_chans, + (1, 2 * aconv_filts + 1), + padding=(0, aconv_filts), + bias_attr=False, ) + self.gvec = nn.Linear(att_dim, 1) + self.dunits = dunits + self.eprojs = eprojs + self.att_dim = att_dim + self.h_length = None + self.enc_h = None + self.pre_compute_enc_h = None + self.mask = None + + def reset(self): + """reset states""" + self.h_length = None + self.enc_h = None + self.pre_compute_enc_h = None + self.mask = None + + def forward( + self, + enc_hs_pad, + enc_hs_len, + dec_z, + att_prev, + scaling=1.0, + last_attended_idx=None, + backward_window=1, + forward_window=3, ): + """Calculate AttForward forward propagation. 
+ + Args: + enc_hs_pad(Tensor): padded encoder hidden state (B, T_max, D_enc) + enc_hs_len(list): padded encoder hidden state length (B,) + dec_z(Tensor): decoder hidden state (B, D_dec) + att_prev(Tensor): attention weights of previous step (B, T_max) + scaling(float, optional): scaling parameter before applying softmax (Default value = 1.0) + last_attended_idx(int, optional): index of the inputs of the last attended (Default value = None) + backward_window(int, optional): backward window size in attention constraint (Default value = 1) + forward_window(int, optional): (Default value = 3) + + Returns: + Tensor: attention weighted encoder state (B, D_enc) + Tensor: previous attention weights (B, T_max) + """ + batch = len(enc_hs_pad) + # pre-compute all h outside the decoder loop + if self.pre_compute_enc_h is None: + self.enc_h = enc_hs_pad # utt x frame x hdim + self.h_length = paddle.shape(self.enc_h)[1] + # utt x frame x att_dim + self.pre_compute_enc_h = self.mlp_enc(self.enc_h) + + if dec_z is None: + dec_z = paddle.zeros([batch, self.dunits]) + else: + dec_z = dec_z.reshape([batch, self.dunits]) + + if att_prev is None: + # initial attention will be [1, 0, 0, ...] + att_prev = paddle.zeros([*paddle.shape(enc_hs_pad)[:2]]) + att_prev[:, 0] = 1.0 + + # att_prev: utt x frame -> utt x 1 x 1 x frame + # -> utt x att_conv_chans x 1 x frame + att_conv = self.loc_conv(att_prev.reshape([batch, 1, 1, self.h_length])) + # att_conv: utt x att_conv_chans x 1 x frame -> utt x frame x att_conv_chans + att_conv = att_conv.squeeze(2).transpose([0, 2, 1]) + # att_conv: utt x frame x att_conv_chans -> utt x frame x att_dim + att_conv = self.mlp_att(att_conv) + + # dec_z_tiled: utt x frame x att_dim + dec_z_tiled = self.mlp_dec(dec_z).unsqueeze(1) + + # dot with gvec + # utt x frame x att_dim -> utt x frame + e = self.gvec( + paddle.tanh(self.pre_compute_enc_h + dec_z_tiled + + att_conv)).squeeze(2) + + # NOTE: consider zero padding when compute w. 
+ if self.mask is None: + self.mask = make_pad_mask(enc_hs_len) + e = masked_fill(e, self.mask, -float("inf")) + + # apply monotonic attention constraint (mainly for TTS) + if last_attended_idx is not None: + e = _apply_attention_constraint(e, last_attended_idx, + backward_window, forward_window) + + w = F.softmax(scaling * e, axis=1) + + # forward attention + att_prev_shift = F.pad(att_prev, (0, 0, 1, 0))[:, :-1] + + w = (att_prev + att_prev_shift) * w + # NOTE: clip is needed to avoid nan gradient + w = F.normalize(paddle.clip(w, 1e-6), p=1, axis=1) + + # weighted sum over flames + # utt x hdim + # NOTE use bmm instead of sum(*) + c = paddle.sum(self.enc_h * w.unsqueeze(-1), axis=1) + + return c, w + + +class AttForwardTA(nn.Layer): + """Forward attention with transition agent module. + Reference: + Forward attention in sequence-to-sequence acoustic modeling for speech synthesis + (https://arxiv.org/pdf/1807.06736.pdf) + + Args: + eunits (int): units of encoder + dunits (int): units of decoder + att_dim (int): attention dimension + aconv_chans (int): channels of attention convolution + aconv_filts (int): filter size of attention convolution + odim (int): output dimension + """ + + def __init__(self, eunits, dunits, att_dim, aconv_chans, aconv_filts, odim): + super().__init__() + self.mlp_enc = nn.Linear(eunits, att_dim) + self.mlp_dec = nn.Linear(dunits, att_dim, bias_attr=False) + self.mlp_ta = nn.Linear(eunits + dunits + odim, 1) + self.mlp_att = nn.Linear(aconv_chans, att_dim, bias_attr=False) + self.loc_conv = nn.Conv2D( + 1, + aconv_chans, + (1, 2 * aconv_filts + 1), + padding=(0, aconv_filts), + bias_attr=False, ) + self.gvec = nn.Linear(att_dim, 1) + self.dunits = dunits + self.eunits = eunits + self.att_dim = att_dim + self.h_length = None + self.enc_h = None + self.pre_compute_enc_h = None + self.mask = None + self.trans_agent_prob = 0.5 + + def reset(self): + self.h_length = None + self.enc_h = None + self.pre_compute_enc_h = None + self.mask = None + 
self.trans_agent_prob = 0.5 + + def forward( + self, + enc_hs_pad, + enc_hs_len, + dec_z, + att_prev, + out_prev, + scaling=1.0, + last_attended_idx=None, + backward_window=1, + forward_window=3, ): + """Calculate AttForwardTA forward propagation. + + Args: + enc_hs_pad(Tensor): padded encoder hidden state (B, Tmax, eunits) + enc_hs_len(list Tensor): padded encoder hidden state length (B,) + dec_z(Tensor): decoder hidden state (B, dunits) + att_prev(Tensor): attention weights of previous step (B, T_max) + out_prev(Tensor): decoder outputs of previous step (B, odim) + scaling(float, optional): scaling parameter before applying softmax (Default value = 1.0) + last_attended_idx(int, optional): index of the inputs of the last attended (Default value = None) + backward_window(int, optional): backward window size in attention constraint (Default value = 1) + forward_window(int, optional): (Default value = 3) + + Returns: + Tensor: attention weighted encoder state (B, dunits) + Tensor: previous attention weights (B, Tmax) + """ + batch = len(enc_hs_pad) + # pre-compute all h outside the decoder loop + if self.pre_compute_enc_h is None: + self.enc_h = enc_hs_pad # utt x frame x hdim + self.h_length = paddle.shape(self.enc_h)[1] + # utt x frame x att_dim + self.pre_compute_enc_h = self.mlp_enc(self.enc_h) + + if dec_z is None: + dec_z = paddle.zeros([batch, self.dunits]) + else: + dec_z = dec_z.reshape([batch, self.dunits]) + + if att_prev is None: + # initial attention will be [1, 0, 0, ...] 
+ att_prev = paddle.zeros([*paddle.shape(enc_hs_pad)[:2]]) + att_prev[:, 0] = 1.0 + + # att_prev: utt x frame -> utt x 1 x 1 x frame + # -> utt x att_conv_chans x 1 x frame + att_conv = self.loc_conv(att_prev.reshape([batch, 1, 1, self.h_length])) + # att_conv: utt x att_conv_chans x 1 x frame -> utt x frame x att_conv_chans + att_conv = att_conv.squeeze(2).transpose([0, 2, 1]) + # att_conv: utt x frame x att_conv_chans -> utt x frame x att_dim + att_conv = self.mlp_att(att_conv) + + # dec_z_tiled: utt x frame x att_dim + dec_z_tiled = self.mlp_dec(dec_z).reshape([batch, 1, self.att_dim]) + + # dot with gvec + # utt x frame x att_dim -> utt x frame + e = self.gvec( + paddle.tanh(att_conv + self.pre_compute_enc_h + + dec_z_tiled)).squeeze(2) + + # NOTE consider zero padding when compute w. + if self.mask is None: + self.mask = make_pad_mask(enc_hs_len) + e = masked_fill(e, self.mask, -float("inf")) + + # apply monotonic attention constraint (mainly for TTS) + if last_attended_idx is not None: + e = _apply_attention_constraint(e, last_attended_idx, + backward_window, forward_window) + + w = F.softmax(scaling * e, axis=1) + + # forward attention + # att_prev_shift = F.pad(att_prev.unsqueeze(0), (1, 0), data_format='NCL').squeeze(0)[:, :-1] + att_prev_shift = F.pad(att_prev, (0, 0, 1, 0))[:, :-1] + w = (self.trans_agent_prob * att_prev + + (1 - self.trans_agent_prob) * att_prev_shift) * w + # NOTE: clip is needed to avoid nan gradient + w = F.normalize(paddle.clip(w, 1e-6), p=1, axis=1) + + # weighted sum over flames + # utt x hdim + # NOTE use bmm instead of sum(*) + c = paddle.sum( + self.enc_h * w.reshape([batch, self.h_length, 1]), axis=1) + + # update transition agent prob + self.trans_agent_prob = F.sigmoid( + self.mlp_ta(paddle.concat([c, out_prev, dec_z], axis=1))) + + return c, w diff --git a/paddlespeech/t2s/modules/tacotron2/decoder.py b/paddlespeech/t2s/modules/tacotron2/decoder.py index 691bb3ee..ebdfa387 100644 --- 
a/paddlespeech/t2s/modules/tacotron2/decoder.py +++ b/paddlespeech/t2s/modules/tacotron2/decoder.py @@ -13,10 +13,12 @@ # limitations under the License. # Modified from espnet(https://github.com/espnet/espnet) """Tacotron2 decoder related modules.""" +import paddle import paddle.nn.functional as F -import six from paddle import nn +from paddlespeech.t2s.modules.tacotron2.attentions import AttForwardTA + class Prenet(nn.Layer): """Prenet module for decoder of Spectrogram prediction network. @@ -42,21 +44,16 @@ class Prenet(nn.Layer): def __init__(self, idim, n_layers=2, n_units=256, dropout_rate=0.5): """Initialize prenet module. - Parameters - ---------- - idim : int - Dimension of the inputs. - odim : int - Dimension of the outputs. - n_layers : int, optional - The number of prenet layers. - n_units : int, optional - The number of prenet units. + Args: + idim (int): Dimension of the inputs. + odim (int): Dimension of the outputs. + n_layers (int, optional): The number of prenet layers. + n_units (int, optional): The number of prenet units. """ super().__init__() self.dropout_rate = dropout_rate self.prenet = nn.LayerList() - for layer in six.moves.range(n_layers): + for layer in range(n_layers): n_inputs = idim if layer == 0 else n_units self.prenet.append( nn.Sequential(nn.Linear(n_inputs, n_units), nn.ReLU())) @@ -64,18 +61,14 @@ class Prenet(nn.Layer): def forward(self, x): """Calculate forward propagation. - Parameters - ---------- - x : Tensor - Batch of input tensors (B, ..., idim). + Args: + x (Tensor): Batch of input tensors (B, ..., idim). - Returns - ---------- - Tensor - Batch of output tensors (B, ..., odim). + Returns: + Tensor: Batch of output tensors (B, ..., odim). """ - for i in six.moves.range(len(self.prenet)): + for i in range(len(self.prenet)): # F.dropout 引入了随机, tacotron2 的 dropout 是不能去掉的 x = F.dropout(self.prenet[i](x)) return x @@ -107,26 +100,18 @@ class Postnet(nn.Layer): use_batch_norm=True, ): """Initialize postnet module. 
- Parameters - ---------- - idim : int - Dimension of the inputs. - odim : int - Dimension of the outputs. - n_layers : int, optional - The number of layers. - n_filts : int, optional - The number of filter size. - n_units : int, optional - The number of filter channels. - use_batch_norm : bool, optional - Whether to use batch normalization.. - dropout_rate : float, optional - Dropout rate.. + Args: + idim (int): Dimension of the inputs. + odim (int): Dimension of the outputs. + n_layers (int, optional): The number of layers. + n_filts (int, optional): The number of filter size. + n_units (int, optional): The number of filter channels. + use_batch_norm (bool, optional): Whether to use batch normalization.. + dropout_rate (float, optional): Dropout rate.. """ super().__init__() self.postnet = nn.LayerList() - for layer in six.moves.range(n_layers - 1): + for layer in range(n_layers - 1): ichans = odim if layer == 0 else n_chans ochans = odim if layer == n_layers - 1 else n_chans if use_batch_norm: @@ -182,17 +167,520 @@ class Postnet(nn.Layer): def forward(self, xs): """Calculate forward propagation. - Parameters - ---------- - xs : Tensor - Batch of the sequences of padded input tensors (B, idim, Tmax). - - Returns - ---------- - Tensor - Batch of padded output tensor. (B, odim, Tmax). - + Args: + xs (Tensor): Batch of the sequences of padded input tensors (B, idim, Tmax). + Returns: + Tensor: Batch of padded output tensor. (B, odim, Tmax). """ - for i in six.moves.range(len(self.postnet)): + for i in range(len(self.postnet)): xs = self.postnet[i](xs) return xs + + +class ZoneOutCell(nn.Layer): + """ZoneOut Cell module. + This is a module of zoneout described in + `Zoneout: Regularizing RNNs by Randomly Preserving Hidden Activations`_. + This code is modified from `eladhoffer/seq2seq.pytorch`_. + Examples + ---------- + >>> lstm = paddle.nn.LSTMCell(16, 32) + >>> lstm = ZoneOutCell(lstm, 0.5) + .. 
_`Zoneout: Regularizing RNNs by Randomly Preserving Hidden Activations`: + https://arxiv.org/abs/1606.01305 + .. _`eladhoffer/seq2seq.pytorch`: + https://github.com/eladhoffer/seq2seq.pytorch + """ + + def __init__(self, cell, zoneout_rate=0.1): + """Initialize zone out cell module. + + Args: + cell (nn.Layer): Paddle recurrent cell module + e.g. `paddle.nn.LSTMCell`. + zoneout_rate (float, optional): Probability of zoneout from 0.0 to 1.0. + """ + super().__init__() + self.cell = cell + self.hidden_size = cell.hidden_size + self.zoneout_rate = zoneout_rate + if zoneout_rate > 1.0 or zoneout_rate < 0.0: + raise ValueError( + "zoneout probability must be in the range from 0.0 to 1.0.") + + def forward(self, inputs, hidden): + """Calculate forward propagation. + + Args: + inputs (Tensor): Batch of input tensor (B, input_size). + hidden (tuple): + - Tensor: Batch of initial hidden states (B, hidden_size). + - Tensor: Batch of initial cell states (B, hidden_size). + Returns: + Tensor: + Batch of next hidden states (B, hidden_size). + tuple: + - Tensor: Batch of next hidden states (B, hidden_size). + - Tensor: Batch of next cell states (B, hidden_size). + """ + # we only use the second output of LSTMCell in paddle + _, next_hidden = self.cell(inputs, hidden) + next_hidden = self._zoneout(hidden, next_hidden, self.zoneout_rate) + # to have the same output format with LSTMCell in paddle + return next_hidden[0], next_hidden + + def _zoneout(self, h, next_h, prob): + # apply recursively + if isinstance(h, tuple): + num_h = len(h) + if not isinstance(prob, tuple): + prob = tuple([prob] * num_h) + return tuple( + [self._zoneout(h[i], next_h[i], prob[i]) for i in range(num_h)]) + if self.training: + mask = paddle.bernoulli(paddle.ones([*paddle.shape(h)]) * prob) + return mask * h + (1 - mask) * next_h + else: + return prob * h + (1 - prob) * next_h + + +class Decoder(nn.Layer): + """Decoder module of Spectrogram prediction network. 
+ This is a module of decoder of Spectrogram prediction network in Tacotron2, + which described in `Natural TTS + Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`_. + The decoder generates the sequence of + features from the sequence of the hidden states. + .. _`Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`: + https://arxiv.org/abs/1712.05884 + """ + + def __init__( + self, + idim, + odim, + att, + dlayers=2, + dunits=1024, + prenet_layers=2, + prenet_units=256, + postnet_layers=5, + postnet_chans=512, + postnet_filts=5, + output_activation_fn=None, + cumulate_att_w=True, + use_batch_norm=True, + use_concate=True, + dropout_rate=0.5, + zoneout_rate=0.1, + reduction_factor=1, ): + """Initialize Tacotron2 decoder module. + + Args: + idim (int): Dimension of the inputs. + odim (int): Dimension of the outputs. + att (nn.Layer): Instance of attention class. + dlayers (int, optional): The number of decoder lstm layers. + dunits (int, optional): The number of decoder lstm units. + prenet_layers (int, optional): The number of prenet layers. + prenet_units (int, optional): The number of prenet units. + postnet_layers (int, optional): The number of postnet layers. + postnet_filts (int, optional): The number of postnet filter size. + postnet_chans (int, optional): The number of postnet filter channels. + output_activation_fn (nn.Layer, optional): Activation function for outputs. + cumulate_att_w (bool, optional): Whether to cumulate previous attention weight. + use_batch_norm (bool, optional): Whether to use batch normalization. + use_concate : bool, optional + Whether to concatenate encoder embedding with decoder lstm outputs. + dropout_rate : float, optional + Dropout rate. + zoneout_rate : float, optional + Zoneout rate. + reduction_factor : int, optional + Reduction factor. 
+ """ + super().__init__() + + # store the hyperparameters + self.idim = idim + self.odim = odim + self.att = att + self.output_activation_fn = output_activation_fn + self.cumulate_att_w = cumulate_att_w + self.use_concate = use_concate + self.reduction_factor = reduction_factor + + # check attention type + if isinstance(self.att, AttForwardTA): + self.use_att_extra_inputs = True + else: + self.use_att_extra_inputs = False + + # define lstm network + prenet_units = prenet_units if prenet_layers != 0 else odim + self.lstm = nn.LayerList() + for layer in range(dlayers): + iunits = idim + prenet_units if layer == 0 else dunits + lstm = nn.LSTMCell(iunits, dunits) + if zoneout_rate > 0.0: + lstm = ZoneOutCell(lstm, zoneout_rate) + self.lstm.append(lstm) + + # define prenet + if prenet_layers > 0: + self.prenet = Prenet( + idim=odim, + n_layers=prenet_layers, + n_units=prenet_units, + dropout_rate=dropout_rate, ) + else: + self.prenet = None + + # define postnet + if postnet_layers > 0: + self.postnet = Postnet( + idim=idim, + odim=odim, + n_layers=postnet_layers, + n_chans=postnet_chans, + n_filts=postnet_filts, + use_batch_norm=use_batch_norm, + dropout_rate=dropout_rate, ) + else: + self.postnet = None + + # define projection layers + iunits = idim + dunits if use_concate else dunits + self.feat_out = nn.Linear( + iunits, odim * reduction_factor, bias_attr=False) + self.prob_out = nn.Linear(iunits, reduction_factor) + + def _zero_state(self, hs): + init_hs = paddle.zeros([paddle.shape(hs)[0], self.lstm[0].hidden_size]) + return init_hs + + def forward(self, hs, hlens, ys): + """Calculate forward propagation. + + Args: + hs (Tensor): Batch of the sequences of padded hidden states (B, Tmax, idim). + hlens (Tensor(int64) padded): Batch of lengths of each input batch (B,). + ys (Tensor): Batch of the sequences of padded target features (B, Lmax, odim). + + Returns: + Tensor: Batch of output tensors after postnet (B, Lmax, odim). 
+ Tensor: Batch of output tensors before postnet (B, Lmax, odim). + Tensor: Batch of logits of stop prediction (B, Lmax). + Tensor: Batch of attention weights (B, Lmax, Tmax). + + Note: + This computation is performed in teacher-forcing manner. + """ + # thin out frames (B, Lmax, odim) -> (B, Lmax/r, odim) + if self.reduction_factor > 1: + ys = ys[:, self.reduction_factor - 1::self.reduction_factor] + + # length list should be list of int + # hlens = list(map(int, hlens)) + + # initialize hidden states of decoder + c_list = [self._zero_state(hs)] + z_list = [self._zero_state(hs)] + for _ in range(1, len(self.lstm)): + c_list.append(self._zero_state(hs)) + z_list.append(self._zero_state(hs)) + prev_out = paddle.zeros([paddle.shape(hs)[0], self.odim]) + + # initialize attention + prev_att_ws = [] + prev_att_w = paddle.zeros(paddle.shape(hlens)) + prev_att_ws.append(prev_att_w) + self.att.reset() + + # loop for an output sequence + outs, logits, att_ws = [], [], [] + for y in ys.transpose([1, 0, 2]): + if self.use_att_extra_inputs: + att_c, att_w = self.att(hs, hlens, z_list[0], prev_att_ws[-1], + prev_out) + else: + att_c, att_w = self.att(hs, hlens, z_list[0], prev_att_ws[-1]) + prenet_out = self.prenet( + prev_out) if self.prenet is not None else prev_out + xs = paddle.concat([att_c, prenet_out], axis=1) + # we only use the second output of LSTMCell in paddle + _, next_hidden = self.lstm[0](xs, (z_list[0], c_list[0])) + z_list[0], c_list[0] = next_hidden + for i in range(1, len(self.lstm)): + # we only use the second output of LSTMCell in paddle + _, next_hidden = self.lstm[i](z_list[i - 1], + (z_list[i], c_list[i])) + z_list[i], c_list[i] = next_hidden + zcs = (paddle.concat([z_list[-1], att_c], axis=1) + if self.use_concate else z_list[-1]) + outs.append( + self.feat_out(zcs).reshape([paddle.shape(hs)[0], self.odim, -1 + ])) + logits.append(self.prob_out(zcs)) + att_ws.append(att_w) + # teacher forcing + prev_out = y + if self.cumulate_att_w and 
paddle.sum(prev_att_w) != 0: + prev_att_w = prev_att_w + att_w # Note: error when use += + else: + prev_att_w = att_w + prev_att_ws.append(prev_att_w) + # (B, Lmax) + logits = paddle.concat(logits, axis=1) + # (B, odim, Lmax) + before_outs = paddle.concat(outs, axis=2) + # (B, Lmax, Tmax) + att_ws = paddle.stack(att_ws, axis=1) + + if self.reduction_factor > 1: + # (B, odim, Lmax) + before_outs = before_outs.reshape( + [paddle.shape(before_outs)[0], self.odim, -1]) + + if self.postnet is not None: + # (B, odim, Lmax) + after_outs = before_outs + self.postnet(before_outs) + else: + after_outs = before_outs + # (B, Lmax, odim) + before_outs = before_outs.transpose([0, 2, 1]) + # (B, Lmax, odim) + after_outs = after_outs.transpose([0, 2, 1]) + logits = logits + + # apply activation function for scaling + if self.output_activation_fn is not None: + before_outs = self.output_activation_fn(before_outs) + after_outs = self.output_activation_fn(after_outs) + + return after_outs, before_outs, logits, att_ws + + def inference( + self, + h, + threshold=0.5, + minlenratio=0.0, + maxlenratio=10.0, + use_att_constraint=False, + backward_window=None, + forward_window=None, ): + """Generate the sequence of features given the sequences of characters. + Args: + h(Tensor): Input sequence of encoder hidden states (T, C). + threshold(float, optional, optional): Threshold to stop generation. (Default value = 0.5) + minlenratio(float, optional, optional): Minimum length ratio. If set to 1.0 and the length of input is 10, + the minimum length of outputs will be 10 * 1 = 10. (Default value = 0.0) + maxlenratio(float, optional, optional): Minimum length ratio. If set to 10 and the length of input is 10, + the maximum length of outputs will be 10 * 10 = 100. (Default value = 0.0) + use_att_constraint(bool, optional): Whether to apply attention constraint introduced in `Deep Voice 3`_. (Default value = False) + backward_window(int, optional): Backward window size in attention constraint. 
(Default value = None) + forward_window(int, optional): (Default value = None) + + Returns: + Tensor: Output sequence of features (L, odim). + Tensor: Output sequence of stop probabilities (L,). + Tensor: Attention weights (L, T). + + Note: + This computation is performed in auto-regressive manner. + .. _`Deep Voice 3`: https://arxiv.org/abs/1710.07654 + """ + # setup + + assert len(paddle.shape(h)) == 2 + hs = h.unsqueeze(0) + ilens = paddle.shape(h)[0] + # 本来 maxlen 和 minlen 外面有 int(),防止动转静的问题此处删除 + maxlen = paddle.shape(h)[0] * maxlenratio + minlen = paddle.shape(h)[0] * minlenratio + # 本来是直接使用 threshold 的,此处为了防止动转静的问题把 threshold 转成 tensor + threshold = paddle.ones([1]) * threshold + + # initialize hidden states of decoder + c_list = [self._zero_state(hs)] + z_list = [self._zero_state(hs)] + for _ in range(1, len(self.lstm)): + c_list.append(self._zero_state(hs)) + z_list.append(self._zero_state(hs)) + prev_out = paddle.zeros([1, self.odim]) + + # initialize attention + prev_att_ws = [] + prev_att_w = paddle.zeros([ilens]) + prev_att_ws.append(prev_att_w) + + self.att.reset() + + # setup for attention constraint + if use_att_constraint: + last_attended_idx = 0 + else: + last_attended_idx = None + + # loop for an output sequence + idx = 0 + outs, att_ws, probs = [], [], [] + prob = paddle.zeros([1]) + while True: + # updated index + idx += self.reduction_factor + + # decoder calculation + if self.use_att_extra_inputs: + att_c, att_w = self.att( + hs, + ilens, + z_list[0], + prev_att_ws[-1], + prev_out, + last_attended_idx=last_attended_idx, + backward_window=backward_window, + forward_window=forward_window, ) + else: + att_c, att_w = self.att( + hs, + ilens, + z_list[0], + prev_att_ws[-1], + last_attended_idx=last_attended_idx, + backward_window=backward_window, + forward_window=forward_window, ) + + att_ws.append(att_w) + prenet_out = self.prenet( + prev_out) if self.prenet is not None else prev_out + xs = paddle.concat([att_c, prenet_out], axis=1) + # we only 
use the second output of LSTMCell in paddle + _, next_hidden = self.lstm[0](xs, (z_list[0], c_list[0])) + + z_list[0], c_list[0] = next_hidden + for i in range(1, len(self.lstm)): + # we only use the second output of LSTMCell in paddle + _, next_hidden = self.lstm[i](z_list[i - 1], + (z_list[i], c_list[i])) + z_list[i], c_list[i] = next_hidden + zcs = (paddle.concat([z_list[-1], att_c], axis=1) + if self.use_concate else z_list[-1]) + # [(1, odim, r), ...] + outs.append(self.feat_out(zcs).reshape([1, self.odim, -1])) + + prob = F.sigmoid(self.prob_out(zcs))[0] + probs.append(prob) + + if self.output_activation_fn is not None: + prev_out = self.output_activation_fn( + outs[-1][:, :, -1]) # (1, odim) + else: + prev_out = outs[-1][:, :, -1] # (1, odim) + if self.cumulate_att_w and paddle.sum(prev_att_w) != 0: + prev_att_w = prev_att_w + att_w # Note: error when use += + else: + prev_att_w = att_w + prev_att_ws.append(prev_att_w) + if use_att_constraint: + last_attended_idx = int(att_w.argmax()) + + # tacotron2 ljspeech 动转静的问题应该是这里没有正确判断 prob >= threshold 导致的 + if prob >= threshold or idx >= maxlen: + # check mininum length + if idx < minlen: + continue + break + """ + 仅解开 665~667 行的代码块,动转静时会卡死,但是动态图时可以正确生成音频,证明模型没问题 + 同时解开 665~667 行 和 668 ~ 670 行的代码块,动转静时不会卡死,但是生成的音频末尾有多余的噪声 + 证明动转静没有进入 prob >= threshold 的判断,但是静态图可以进入 prob >= threshold 并退出循环 + 动转静时是通过 idx >= maxlen 退出循环(所以没有这个逻辑的时候会一直循环,也就是卡死), + 没有在模型判断该结束的时候结束,而是在超出最大长度时结束,所以合成的音频末尾有很长的额外预测的噪声 + 动转静用 prob <= threshold 的条件可以退出循环(虽然结果不正确),证明条件参数的类型本身没问题,可能是 prob 有问题 + """ + # if prob >= threshold: + # print("prob >= threshold") + # break + # elif idx >= maxlen: + # print("idx >= maxlen") + # break + + # (1, odim, L) + outs = paddle.concat(outs, axis=2) + if self.postnet is not None: + # (1, odim, L) + outs = outs + self.postnet(outs) + # (L, odim) + outs = outs.transpose([0, 2, 1]).squeeze(0) + probs = paddle.concat(probs, axis=0) + att_ws = paddle.concat(att_ws, axis=0) + + if self.output_activation_fn is not None: + 
outs = self.output_activation_fn(outs) + + return outs, probs, att_ws + + def calculate_all_attentions(self, hs, hlens, ys): + """Calculate all of the attention weights. + + Args: + hs (Tensor): Batch of the sequences of padded hidden states (B, Tmax, idim). + hlens (Tensor(int64)): Batch of lengths of each input batch (B,). + ys (Tensor): Batch of the sequences of padded target features (B, Lmax, odim). + + Returns: + numpy.ndarray: + Batch of attention weights (B, Lmax, Tmax). + + Note: + This computation is performed in teacher-forcing manner. + """ + # thin out frames (B, Lmax, odim) -> (B, Lmax/r, odim) + if self.reduction_factor > 1: + ys = ys[:, self.reduction_factor - 1::self.reduction_factor] + + # length list should be list of int + hlens = list(map(int, hlens)) + + # initialize hidden states of decoder + c_list = [self._zero_state(hs)] + z_list = [self._zero_state(hs)] + for _ in range(1, len(self.lstm)): + c_list.append(self._zero_state(hs)) + z_list.append(self._zero_state(hs)) + prev_out = paddle.zeros([paddle.shape(hs)[0], self.odim]) + + # initialize attention + prev_att_w = None + self.att.reset() + + # loop for an output sequence + att_ws = [] + for y in ys.transpose([1, 0, 2]): + if self.use_att_extra_inputs: + att_c, att_w = self.att(hs, hlens, z_list[0], prev_att_w, + prev_out) + else: + att_c, att_w = self.att(hs, hlens, z_list[0], prev_att_w) + att_ws.append(att_w) + prenet_out = self.prenet( + prev_out) if self.prenet is not None else prev_out + xs = paddle.concat([att_c, prenet_out], axis=1) + # we only use the second output of LSTMCell in paddle + _, next_hidden = self.lstm[0](xs, (z_list[0], c_list[0])) + z_list[0], c_list[0] = next_hidden + for i in range(1, len(self.lstm)): + z_list[i], c_list[i] = self.lstm[i](z_list[i - 1], + (z_list[i], c_list[i])) + # teacher forcing + prev_out = y + if self.cumulate_att_w and prev_att_w is not None: + # Note: error when use += + prev_att_w = prev_att_w + att_w + else: + prev_att_w = att_w + # (B, 
Lmax, Tmax) + att_ws = paddle.stack(att_ws, axis=1) + + return att_ws diff --git a/paddlespeech/t2s/modules/tacotron2/encoder.py b/paddlespeech/t2s/modules/tacotron2/encoder.py index f1889061..db102a11 100644 --- a/paddlespeech/t2s/modules/tacotron2/encoder.py +++ b/paddlespeech/t2s/modules/tacotron2/encoder.py @@ -14,7 +14,6 @@ # Modified from espnet(https://github.com/espnet/espnet) """Tacotron2 encoder related modules.""" import paddle -import six from paddle import nn @@ -46,31 +45,18 @@ class Encoder(nn.Layer): dropout_rate=0.5, padding_idx=0, ): """Initialize Tacotron2 encoder module. - - Parameters - ---------- - idim : int - Dimension of the inputs. - input_layer : str - Input layer type. - embed_dim : int, optional - Dimension of character embedding. - elayers : int, optional - The number of encoder blstm layers. - eunits : int, optional - The number of encoder blstm units. - econv_layers : int, optional - The number of encoder conv layers. - econv_filts : int, optional - The number of encoder conv filter size. - econv_chans : int, optional - The number of encoder conv filter channels. - use_batch_norm : bool, optional - Whether to use batch normalization. - use_residual : bool, optional - Whether to use residual connection. - dropout_rate : float, optional - Dropout rate. + Args: + idim (int): Dimension of the inputs. + input_layer (str): Input layer type. + embed_dim (int, optional): Dimension of character embedding. + elayers (int, optional): The number of encoder blstm layers. + eunits (int, optional): The number of encoder blstm units. + econv_layers (int, optional): The number of encoder conv layers. + econv_filts (int, optional): The number of encoder conv filter size. + econv_chans (int, optional): The number of encoder conv filter channels. + use_batch_norm (bool, optional): Whether to use batch normalization. + use_residual (bool, optional): Whether to use residual connection. + dropout_rate (float, optional): Dropout rate. 
""" super().__init__() @@ -88,7 +74,7 @@ class Encoder(nn.Layer): if econv_layers > 0: self.convs = nn.LayerList() - for layer in six.moves.range(econv_layers): + for layer in range(econv_layers): ichans = (embed_dim if layer == 0 and input_layer == "embed" else econv_chans) if use_batch_norm: @@ -130,6 +116,7 @@ class Encoder(nn.Layer): direction='bidirectional', bias_ih_attr=True, bias_hh_attr=True) + self.blstm.flatten_parameters() else: self.blstm = None @@ -139,26 +126,19 @@ class Encoder(nn.Layer): def forward(self, xs, ilens=None): """Calculate forward propagation. - Parameters - ---------- - xs : Tensor - Batch of the padded sequence. Either character ids (B, Tmax) - or acoustic feature (B, Tmax, idim * encoder_reduction_factor). - Padded value should be 0. - ilens : LongTensor - Batch of lengths of each input batch (B,). - - Returns - ---------- - Tensor - Batch of the sequences of encoder states(B, Tmax, eunits). - LongTensor - Batch of lengths of each sequence (B,) + Args: + xs (Tensor): Batch of the padded sequence. Either character ids (B, Tmax) + or acoustic feature (B, Tmax, idim * encoder_reduction_factor). + Padded value should be 0. + ilens (Tensor(int64)): Batch of lengths of each input batch (B,). + Returns: + Tensor: Batch of the sequences of encoder states(B, Tmax, eunits). 
+ Tensor(int64): Batch of lengths of each sequence (B,) """ xs = self.embed(xs).transpose([0, 2, 1]) if self.convs is not None: - for i in six.moves.range(len(self.convs)): + for i in range(len(self.convs)): if self.use_residual: xs += self.convs[i](xs) else: @@ -168,10 +148,11 @@ class Encoder(nn.Layer): if not isinstance(ilens, paddle.Tensor): ilens = paddle.to_tensor(ilens) xs = xs.transpose([0, 2, 1]) - self.blstm.flatten_parameters() + # for dygraph to static graph + # self.blstm.flatten_parameters() # (B, Tmax, C) - xs, _ = self.blstm(xs) - # hlens 是什么 + # see https://www.paddlepaddle.org.cn/documentation/docs/zh/faq/train_cn.html#paddletorch-nn-utils-rnn-pack-padded-sequencetorch-nn-utils-rnn-pad-packed-sequenceapi + xs, _ = self.blstm(xs, sequence_length=ilens) hlens = ilens return xs, hlens @@ -179,19 +160,15 @@ class Encoder(nn.Layer): def inference(self, x): """Inference. - Parameters - ---------- - x : Tensor - The sequeunce of character ids (T,) - or acoustic feature (T, idim * encoder_reduction_factor). + Args: + x (Tensor): The sequeunce of character ids (T,) + or acoustic feature (T, idim * encoder_reduction_factor). - Returns - ---------- - Tensor - The sequences of encoder states(T, eunits). + Returns: + Tensor: The sequences of encoder states(T, eunits). """ xs = x.unsqueeze(0) - ilens = paddle.to_tensor([x.shape[0]]) + ilens = paddle.shape(x)[0] return self.forward(xs, ilens)[0][0] diff --git a/paddlespeech/t2s/modules/tade_res_block.py b/paddlespeech/t2s/modules/tade_res_block.py index 1ca4e6d8..b2275e23 100644 --- a/paddlespeech/t2s/modules/tade_res_block.py +++ b/paddlespeech/t2s/modules/tade_res_block.py @@ -59,18 +59,12 @@ class TADELayer(nn.Layer): def forward(self, x, c): """Calculate forward propagation. - Parameters - ---------- - x : Tensor - Input tensor (B, in_channels, T). - c : Tensor - Auxiliary input tensor (B, aux_channels, T). - Returns - ---------- - Tensor - Output tensor (B, in_channels, T * upsample_factor). 
- Tensor - Upsampled aux tensor (B, in_channels, T * upsample_factor). + Args: + x (Tensor): Input tensor (B, in_channels, T). + c (Tensor): Auxiliary input tensor (B, aux_channels, T). + Returns: + Tensor: Output tensor (B, in_channels, T * upsample_factor). + Tensor: Upsampled aux tensor (B, in_channels, T * upsample_factor). """ x = self.norm(x) @@ -142,18 +136,13 @@ class TADEResBlock(nn.Layer): def forward(self, x, c): """Calculate forward propagation. - Parameters - ---------- - x : Tensor - Input tensor (B, in_channels, T). - c : Tensor - Auxiliary input tensor (B, aux_channels, T). - Returns - ---------- - Tensor - Output tensor (B, in_channels, T * upsample_factor). - Tensor - Upsampled auxirialy tensor (B, in_channels, T * upsample_factor). + Args: + + x (Tensor): Input tensor (B, in_channels, T). + c (Tensor): Auxiliary input tensor (B, aux_channels, T). + Returns: + Tensor: Output tensor (B, in_channels, T * upsample_factor). + Tensor: Upsampled auxirialy tensor (B, in_channels, T * upsample_factor). """ residual = x x, c = self.tade1(x, c) diff --git a/paddlespeech/t2s/modules/transformer/attention.py b/paddlespeech/t2s/modules/transformer/attention.py index 34386f2a..cdb95b21 100644 --- a/paddlespeech/t2s/modules/transformer/attention.py +++ b/paddlespeech/t2s/modules/transformer/attention.py @@ -24,15 +24,10 @@ from paddlespeech.t2s.modules.masked_fill import masked_fill class MultiHeadedAttention(nn.Layer): """Multi-Head Attention layer. - - Parameters - ---------- - n_head : int - The number of heads. - n_feat : int - The number of features. - dropout_rate : float - Dropout rate. + Args: + n_head (int): The number of heads. + n_feat (int): The number of features. + dropout_rate (float): Dropout rate. """ def __init__(self, n_head, n_feat, dropout_rate): @@ -52,23 +47,15 @@ class MultiHeadedAttention(nn.Layer): def forward_qkv(self, query, key, value): """Transform query, key and value. 
- Parameters - ---------- - query : paddle.Tensor - query tensor (#batch, time1, size). - key : paddle.Tensor - Key tensor (#batch, time2, size). - value : paddle.Tensor - Value tensor (#batch, time2, size). - - Returns - ---------- - paddle.Tensor - Transformed query tensor (#batch, n_head, time1, d_k). - paddle.Tensor - Transformed key tensor (#batch, n_head, time2, d_k). - paddle.Tensor - Transformed value tensor (#batch, n_head, time2, d_k). + Args: + query(Tensor): query tensor (#batch, time1, size). + key(Tensor): Key tensor (#batch, time2, size). + value(Tensor): Value tensor (#batch, time2, size). + + Returns: + Tensor: Transformed query tensor (#batch, n_head, time1, d_k). + Tensor: Transformed key tensor (#batch, n_head, time2, d_k). + Tensor: Transformed value tensor (#batch, n_head, time2, d_k). """ n_batch = paddle.shape(query)[0] @@ -89,20 +76,13 @@ class MultiHeadedAttention(nn.Layer): def forward_attention(self, value, scores, mask=None): """Compute attention context vector. - Parameters - ---------- - value : paddle.Tensor - Transformed value (#batch, n_head, time2, d_k). - scores : paddle.Tensor - Attention score (#batch, n_head, time1, time2). - mask : paddle.Tensor - Mask (#batch, 1, time2) or (#batch, time1, time2). - - Returns - ---------- - paddle.Tensor: - Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). + Args: + value(Tensor): Transformed value (#batch, n_head, time2, d_k). + scores(Tensor): Attention score (#batch, n_head, time1, time2). + mask(Tensor, optional): Mask (#batch, 1, time2) or (#batch, time1, time2). (Default value = None) + + Returns: + Tensor: Transformed value (#batch, time1, d_model) weighted by the attention score (#batch, time1, time2). """ n_batch = paddle.shape(value)[0] softmax = paddle.nn.Softmax(axis=-1) @@ -132,21 +112,14 @@ class MultiHeadedAttention(nn.Layer): def forward(self, query, key, value, mask=None): """Compute scaled dot product attention. 
- Parameters - ---------- - query : paddle.Tensor - Query tensor (#batch, time1, size). - key : paddle.Tensor - Key tensor (#batch, time2, size). - value : paddle.Tensor - Value tensor (#batch, time2, size). - mask : paddle.Tensor - Mask tensor (#batch, 1, time2) or (#batch, time1, time2). - - Returns - ---------- - paddle.Tensor - Output tensor (#batch, time1, d_model). + Args: + query(Tensor): Query tensor (#batch, time1, size). + key(Tensor): Key tensor (#batch, time2, size). + value(Tensor): Value tensor (#batch, time2, size). + mask(Tensor, optional): Mask tensor (#batch, 1, time2) or (#batch, time1, time2). (Default value = None) + + Returns: + Tensor: Output tensor (#batch, time1, d_model). """ q, k, v = self.forward_qkv(query, key, value) scores = paddle.matmul(q, k.transpose( @@ -159,16 +132,12 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention): """Multi-Head Attention layer with relative position encoding (new implementation). Details can be found in https://github.com/espnet/espnet/pull/2816. Paper: https://arxiv.org/abs/1901.02860 - Parameters - ---------- - n_head : int - The number of heads. - n_feat : int - The number of features. - dropout_rate : float - Dropout rate. - zero_triu : bool - Whether to zero the upper triangular part of attention matrix. + + Args: + n_head (int): The number of heads. + n_feat (int): The number of features. + dropout_rate (float): Dropout rate. + zero_triu (bool): Whether to zero the upper triangular part of attention matrix. """ def __init__(self, n_head, n_feat, dropout_rate, zero_triu=False): @@ -191,15 +160,11 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention): def rel_shift(self, x): """Compute relative positional encoding. - Parameters - ---------- - x : paddle.Tensor - Input tensor (batch, head, time1, 2*time1-1). - time1 means the length of query vector. - Returns - ---------- - paddle.Tensor - Output tensor. + Args: + x(Tensor): Input tensor (batch, head, time1, 2*time1-1). 
+ + Returns: + Tensor:Output tensor. """ b, h, t1, t2 = paddle.shape(x) zero_pad = paddle.zeros((b, h, t1, 1)) @@ -216,24 +181,16 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention): def forward(self, query, key, value, pos_emb, mask): """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Parameters - ---------- - query : paddle.Tensor - Query tensor (#batch, time1, size). - key : paddle.Tensor - Key tensor (#batch, time2, size). - value : paddle.Tensor - Value tensor (#batch, time2, size). - pos_emb : paddle.Tensor - Positional embedding tensor - (#batch, 2*time1-1, size). - mask : paddle.Tensor - Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - Returns - ---------- - paddle.Tensor - Output tensor (#batch, time1, d_model). + + Args: + query(Tensor): Query tensor (#batch, time1, size). + key(Tensor): Key tensor (#batch, time2, size). + value(Tensor): Value tensor (#batch, time2, size). + pos_emb(Tensor): Positional embedding tensor (#batch, 2*time1-1, size). + mask(Tensor): Mask tensor (#batch, 1, time2) or (#batch, time1, time2). + + Returns: + Tensor: Output tensor (#batch, time1, d_model). """ q, k, v = self.forward_qkv(query, key, value) # (batch, time1, head, d_k) diff --git a/paddlespeech/t2s/modules/transformer/decoder.py b/paddlespeech/t2s/modules/transformer/decoder.py index fe2949f4..a8db7345 100644 --- a/paddlespeech/t2s/modules/transformer/decoder.py +++ b/paddlespeech/t2s/modules/transformer/decoder.py @@ -36,51 +36,32 @@ from paddlespeech.t2s.modules.transformer.repeat import repeat class Decoder(nn.Layer): """Transfomer decoder module. - Parameters - ---------- - odim : int - Output diminsion. - self_attention_layer_type : str - Self-attention layer type. - attention_dim : int - Dimention of attention. - attention_heads : int - The number of heads of multi head attention. - conv_wshare : int - The number of kernel of convolution. Only used in - self_attention_layer_type == "lightconv*" or "dynamiconv*". 
- conv_kernel_length : Union[int, str]) - Kernel size str of convolution - (e.g. 71_71_71_71_71_71). Only used in self_attention_layer_type == "lightconv*" or "dynamiconv*". - conv_usebias : bool - Whether to use bias in convolution. Only used in - self_attention_layer_type == "lightconv*" or "dynamiconv*". - linear_units : int - The number of units of position-wise feed forward. - num_blocks : int - The number of decoder blocks. - dropout_rate : float - Dropout rate. - positional_dropout_rate : float - Dropout rate after adding positional encoding. - self_attention_dropout_rate : float - Dropout rate in self-attention. - src_attention_dropout_rate : float - Dropout rate in source-attention. - input_layer : (Union[str, nn.Layer]) - Input layer type. - use_output_layer : bool - Whether to use output layer. - pos_enc_class : nn.Layer - Positional encoding module class. - `PositionalEncoding `or `ScaledPositionalEncoding` - normalize_before : bool - Whether to use layer_norm before the first block. - concat_after : bool - Whether to concat attention layer's input and output. - if True, additional linear will be applied. - i.e. x -> x + linear(concat(x, att(x))) - if False, no additional linear will be applied. i.e. x -> x + att(x) + Args: + odim (int): Output dimension. + self_attention_layer_type (str): Self-attention layer type. + attention_dim (int): Dimension of attention. + attention_heads (int): The number of heads of multi head attention. + conv_wshare (int): The number of kernel of convolution. Only used in + self_attention_layer_type == "lightconv*" or "dynamiconv*". + conv_kernel_length (Union[int, str]): Kernel size str of convolution + (e.g. 71_71_71_71_71_71). Only used in self_attention_layer_type == "lightconv*" or "dynamiconv*". + conv_usebias (bool): Whether to use bias in convolution. Only used in + self_attention_layer_type == "lightconv*" or "dynamiconv*". + linear_units (int): The number of units of position-wise feed forward. 
+ num_blocks (int): The number of decoder blocks. + dropout_rate (float): Dropout rate. + positional_dropout_rate (float): Dropout rate after adding positional encoding. + self_attention_dropout_rate (float): Dropout rate in self-attention. + src_attention_dropout_rate (float): Dropout rate in source-attention. + input_layer (Union[str, nn.Layer]): Input layer type. + use_output_layer (bool): Whether to use output layer. + pos_enc_class (nn.Layer): Positional encoding module class. + `PositionalEncoding `or `ScaledPositionalEncoding` + normalize_before (bool): Whether to use layer_norm before the first block. + concat_after (bool): Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. x -> x + att(x) """ @@ -161,27 +142,18 @@ class Decoder(nn.Layer): def forward(self, tgt, tgt_mask, memory, memory_mask): """Forward decoder. - - Parameters - ---------- - tgt : paddle.Tensor - Input token ids, int64 (#batch, maxlen_out) if input_layer == "embed". - In the other case, input tensor (#batch, maxlen_out, odim). - tgt_mask : paddle.Tensor - Input token mask (#batch, maxlen_out). - memory : paddle.Tensor - Encoded memory, float32 (#batch, maxlen_in, feat). - memory_mask : paddle.Tensor - Encoded memory mask (#batch, maxlen_in). - - Returns - ---------- - paddle.Tensor - Decoded token score before softmax (#batch, maxlen_out, odim) - if use_output_layer is True. In the other case,final block outputs - (#batch, maxlen_out, attention_dim). - paddle.Tensor - Score mask before softmax (#batch, maxlen_out). + Args: + tgt(Tensor): Input token ids, int64 (#batch, maxlen_out) if input_layer == "embed". + In the other case, input tensor (#batch, maxlen_out, odim). + tgt_mask(Tensor): Input token mask (#batch, maxlen_out). + memory(Tensor): Encoded memory, float32 (#batch, maxlen_in, feat). 
+ memory_mask(Tensor): Encoded memory mask (#batch, maxlen_in). + + Returns: + Tensor: + Decoded token score before softmax (#batch, maxlen_out, odim) if use_output_layer is True. + In the other case,final block outputs (#batch, maxlen_out, attention_dim). + Tensor: Score mask before softmax (#batch, maxlen_out). """ x = self.embed(tgt) @@ -196,23 +168,15 @@ class Decoder(nn.Layer): def forward_one_step(self, tgt, tgt_mask, memory, cache=None): """Forward one step. - Parameters - ---------- - tgt : paddle.Tensor - Input token ids, int64 (#batch, maxlen_out). - tgt_mask : paddle.Tensor - Input token mask (#batch, maxlen_out). - memory : paddle.Tensor - Encoded memory, float32 (#batch, maxlen_in, feat). - cache : (List[paddle.Tensor]) - List of cached tensors. - Each tensor shape should be (#batch, maxlen_out - 1, size). - Returns - ---------- - paddle.Tensor - Output tensor (batch, maxlen_out, odim). - List[paddle.Tensor] - List of cache tensors of each decoder layer. + Args: + tgt(Tensor): Input token ids, int64 (#batch, maxlen_out). + tgt_mask(Tensor): Input token mask (#batch, maxlen_out). + memory(Tensor): Encoded memory, float32 (#batch, maxlen_in, feat). + cache((List[Tensor]), optional): List of cached tensors. (Default value = None) + + Returns: + Tensor: Output tensor (batch, maxlen_out, odim). + List[Tensor]: List of cache tensors of each decoder layer. """ x = self.embed(tgt) @@ -254,20 +218,14 @@ class Decoder(nn.Layer): xs: paddle.Tensor) -> Tuple[paddle.Tensor, List[Any]]: """Score new token batch (required). - Parameters - ---------- - ys : paddle.Tensor - paddle.int64 prefix tokens (n_batch, ylen). - states : List[Any] - Scorer states for prefix tokens. - xs : paddle.Tensor - The encoder feature that generates ys (n_batch, xlen, n_feat). + Args: + ys(Tensor): paddle.int64 prefix tokens (n_batch, ylen). + states(List[Any]): Scorer states for prefix tokens. + xs(Tensor): The encoder feature that generates ys (n_batch, xlen, n_feat). 
- Returns - ---------- - tuple[paddle.Tensor, List[Any]] - Tuple ofbatchfied scores for next token with shape of `(n_batch, n_vocab)` - and next state list for ys. + Returns: + tuple[Tensor, List[Any]]: + Tuple of batchified scores for next token with shape of `(n_batch, n_vocab)` and next state list for ys. """ # merge states diff --git a/paddlespeech/t2s/modules/transformer/decoder_layer.py b/paddlespeech/t2s/modules/transformer/decoder_layer.py index 44978f1e..9a13cd79 100644 --- a/paddlespeech/t2s/modules/transformer/decoder_layer.py +++ b/paddlespeech/t2s/modules/transformer/decoder_layer.py @@ -22,28 +22,21 @@ from paddlespeech.t2s.modules.layer_norm import LayerNorm class DecoderLayer(nn.Layer): """Single decoder layer module. - Parameters - ---------- - size : int - Input dimension. - self_attn : nn.Layer - Self-attention module instance. - `MultiHeadedAttention` instance can be used as the argument. - src_attn : nn.Layer - Self-attention module instance. - `MultiHeadedAttention` instance can be used as the argument. - feed_forward : nn.Layer - Feed-forward module instance. - `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument. - dropout_rate : float - Dropout rate. - normalize_before : bool - Whether to use layer_norm before the first block. - concat_after : bool - Whether to concat attention layer's input and output. - if True, additional linear will be applied. - i.e. x -> x + linear(concat(x, att(x))) - if False, no additional linear will be applied. i.e. x -> x + att(x) + + Args: + size (int): Input dimension. + self_attn (nn.Layer): Self-attention module instance. + `MultiHeadedAttention` instance can be used as the argument. + src_attn (nn.Layer): Self-attention module instance. + `MultiHeadedAttention` instance can be used as the argument. + feed_forward (nn.Layer): Feed-forward module instance. + `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument. 
+ dropout_rate (float): Dropout rate. + normalize_before (bool): Whether to use layer_norm before the first block. + concat_after (bool): Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. x -> x + att(x) """ @@ -75,30 +68,22 @@ class DecoderLayer(nn.Layer): def forward(self, tgt, tgt_mask, memory, memory_mask, cache=None): """Compute decoded features. - Parameters - ---------- - tgt : paddle.Tensor - Input tensor (#batch, maxlen_out, size). - tgt_mask : paddle.Tensor - Mask for input tensor (#batch, maxlen_out). - memory : paddle.Tensor - Encoded memory, float32 (#batch, maxlen_in, size). - memory_mask : paddle.Tensor - Encoded memory mask (#batch, maxlen_in). - cache : List[paddle.Tensor] - List of cached tensors. - Each tensor shape should be (#batch, maxlen_out - 1, size). - - Returns - ---------- - paddle.Tensor - Output tensor(#batch, maxlen_out, size). - paddle.Tensor - Mask for output tensor (#batch, maxlen_out). - paddle.Tensor - Encoded memory (#batch, maxlen_in, size). - paddle.Tensor - Encoded memory mask (#batch, maxlen_in). + Args: + tgt(Tensor): Input tensor (#batch, maxlen_out, size). + tgt_mask(Tensor): Mask for input tensor (#batch, maxlen_out). + memory(Tensor): Encoded memory, float32 (#batch, maxlen_in, size). + memory_mask(Tensor): Encoded memory mask (#batch, maxlen_in). + cache(List[Tensor], optional): List of cached tensors. + Each tensor shape should be (#batch, maxlen_out - 1, size). (Default value = None) + Returns: + Tensor + Output tensor(#batch, maxlen_out, size). + Tensor + Mask for output tensor (#batch, maxlen_out). + Tensor + Encoded memory (#batch, maxlen_in, size). + Tensor + Encoded memory mask (#batch, maxlen_in). 
""" residual = tgt diff --git a/paddlespeech/t2s/modules/transformer/embedding.py b/paddlespeech/t2s/modules/transformer/embedding.py index 40ab03ee..d9339d20 100644 --- a/paddlespeech/t2s/modules/transformer/embedding.py +++ b/paddlespeech/t2s/modules/transformer/embedding.py @@ -22,18 +22,12 @@ from paddle import nn class PositionalEncoding(nn.Layer): """Positional encoding. - Parameters - ---------- - d_model : int - Embedding dimension. - dropout_rate : float - Dropout rate. - max_len : int - Maximum input length. - reverse : bool - Whether to reverse the input position. - type : str - dtype of param + Args: + d_model (int): Embedding dimension. + dropout_rate (float): Dropout rate. + max_len (int): Maximum input length. + reverse (bool): Whether to reverse the input position. + type (str): dtype of param """ def __init__(self, @@ -73,15 +67,11 @@ class PositionalEncoding(nn.Layer): def forward(self, x: paddle.Tensor): """Add positional encoding. - Parameters - ---------- - x : paddle.Tensor - Input tensor (batch, time, `*`). + Args: + x (Tensor): Input tensor (batch, time, `*`). - Returns - ---------- - paddle.Tensor - Encoded tensor (batch, time, `*`). + Returns: + Tensor: Encoded tensor (batch, time, `*`). """ self.extend_pe(x) T = paddle.shape(x)[1] @@ -91,19 +81,13 @@ class PositionalEncoding(nn.Layer): class ScaledPositionalEncoding(PositionalEncoding): """Scaled positional encoding module. - See Sec. 3.2 https://arxiv.org/abs/1809.08895 - Parameters - ---------- - d_model : int - Embedding dimension. - dropout_rate : float - Dropout rate. - max_len : int - Maximum input length. - dtype : str - dtype of param + Args: + d_model (int): Embedding dimension. + dropout_rate (float): Dropout rate. + max_len (int): Maximum input length. 
+ dtype (str): dtype of param """ def __init__(self, d_model, dropout_rate, max_len=5000, dtype="float32"): @@ -126,14 +110,10 @@ class ScaledPositionalEncoding(PositionalEncoding): def forward(self, x): """Add positional encoding. - Parameters - ---------- - x : paddle.Tensor - Input tensor (batch, time, `*`). - Returns - ---------- - paddle.Tensor - Encoded tensor (batch, time, `*`). + Args: + x (Tensor): Input tensor (batch, time, `*`). + Returns: + Tensor: Encoded tensor (batch, time, `*`). """ self.extend_pe(x) T = paddle.shape(x)[1] @@ -145,14 +125,11 @@ class RelPositionalEncoding(nn.Layer): """Relative positional encoding module (new implementation). Details can be found in https://github.com/espnet/espnet/pull/2816. See : Appendix B in https://arxiv.org/abs/1901.02860 - Parameters - ---------- - d_model : int - Embedding dimension. - dropout_rate : float - Dropout rate. - max_len : int - Maximum input length. + + Args: + d_model (int): Embedding dimension. + dropout_rate (float): Dropout rate. + max_len (int): Maximum input length. """ def __init__(self, d_model, dropout_rate, max_len=5000, dtype="float32"): @@ -197,14 +174,10 @@ class RelPositionalEncoding(nn.Layer): def forward(self, x: paddle.Tensor): """Add positional encoding. - Parameters - ---------- - x : paddle.Tensor - Input tensor (batch, time, `*`). - Returns - ---------- - paddle.Tensor - Encoded tensor (batch, time, `*`). + Args: + x (Tensor):Input tensor (batch, time, `*`). + Returns: + Tensor: Encoded tensor (batch, time, `*`). """ self.extend_pe(x) x = x * self.xscale diff --git a/paddlespeech/t2s/modules/transformer/encoder.py b/paddlespeech/t2s/modules/transformer/encoder.py index 8bf71b41..2b3ee788 100644 --- a/paddlespeech/t2s/modules/transformer/encoder.py +++ b/paddlespeech/t2s/modules/transformer/encoder.py @@ -37,62 +37,37 @@ from paddlespeech.t2s.modules.transformer.subsampling import Conv2dSubsampling class BaseEncoder(nn.Layer): """Base Encoder module. 
- Parameters - ---------- - idim : int - Input dimension. - attention_dim : int - Dimention of attention. - attention_heads : int - The number of heads of multi head attention. - linear_units : int - The number of units of position-wise feed forward. - num_blocks : int - The number of decoder blocks. - dropout_rate : float - Dropout rate. - positional_dropout_rate : float - Dropout rate after adding positional encoding. - attention_dropout_rate : float - Dropout rate in attention. - input_layer : Union[str, nn.Layer] - Input layer type. - normalize_before : bool - Whether to use layer_norm before the first block. - concat_after : bool - Whether to concat attention layer's input and output. - if True, additional linear will be applied. - i.e. x -> x + linear(concat(x, att(x))) - if False, no additional linear will be applied. i.e. x -> x + att(x) - positionwise_layer_type : str - "linear", "conv1d", or "conv1d-linear". - positionwise_conv_kernel_size : int - Kernel size of positionwise conv1d layer. - macaron_style : bool - Whether to use macaron style for positionwise layer. - pos_enc_layer_type : str - Encoder positional encoding layer type. - selfattention_layer_type : str - Encoder attention layer type. - activation_type : str - Encoder activation function type. - use_cnn_module : bool - Whether to use convolution module. - zero_triu : bool - Whether to zero the upper triangular part of attention matrix. - cnn_module_kernel : int - Kernerl size of convolution module. - padding_idx : int - Padding idx for input_layer=embed. - stochastic_depth_rate : float - Maximum probability to skip the encoder layer. - intermediate_layers : Union[List[int], None] - indices of intermediate CTC layer. - indices start from 1. - if not None, intermediate outputs are returned (which changes return type - signature.) - encoder_type: str - "transformer", or "conformer". + Args: + idim (int): Input dimension. + attention_dim (int): Dimention of attention. 
+ attention_heads (int): The number of heads of multi head attention. + linear_units (int): The number of units of position-wise feed forward. + num_blocks (int): The number of decoder blocks. + dropout_rate (float): Dropout rate. + positional_dropout_rate (float): Dropout rate after adding positional encoding. + attention_dropout_rate (float): Dropout rate in attention. + input_layer (Union[str, nn.Layer]): Input layer type. + normalize_before (bool): Whether to use layer_norm before the first block. + concat_after (bool): Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. x -> x + att(x) + positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear". + positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer. + macaron_style (bool): Whether to use macaron style for positionwise layer. + pos_enc_layer_type (str): Encoder positional encoding layer type. + selfattention_layer_type (str): Encoder attention layer type. + activation_type (str): Encoder activation function type. + use_cnn_module (bool): Whether to use convolution module. + zero_triu (bool): Whether to zero the upper triangular part of attention matrix. + cnn_module_kernel (int): Kernerl size of convolution module. + padding_idx (int): Padding idx for input_layer=embed. + stochastic_depth_rate (float): Maximum probability to skip the encoder layer. + intermediate_layers (Union[List[int], None]): indices of intermediate CTC layer. + indices start from 1. + if not None, intermediate outputs are returned (which changes return type + signature.) + encoder_type (str): "transformer", or "conformer". """ def __init__(self, @@ -290,19 +265,13 @@ class BaseEncoder(nn.Layer): def forward(self, xs, masks): """Encode input sequence. - Parameters - ---------- - xs : paddle.Tensor - Input tensor (#batch, time, idim). 
- masks : paddle.Tensor - Mask tensor (#batch, 1, time). - - Returns - ---------- - paddle.Tensor - Output tensor (#batch, time, attention_dim). - paddle.Tensor - Mask tensor (#batch, 1, time). + Args: + xs (Tensor): Input tensor (#batch, time, idim). + masks (Tensor): Mask tensor (#batch, 1, time). + + Returns: + Tensor: Output tensor (#batch, time, attention_dim). + Tensor: Mask tensor (#batch, 1, time). """ xs = self.embed(xs) xs, masks = self.encoders(xs, masks) @@ -313,45 +282,28 @@ class BaseEncoder(nn.Layer): class TransformerEncoder(BaseEncoder): """Transformer encoder module. - Parameters - ---------- - idim : int - Input dimension. - attention_dim : int - Dimention of attention. - attention_heads : int - The number of heads of multi head attention. - linear_units : int - The number of units of position-wise feed forward. - num_blocks : int - The number of decoder blocks. - dropout_rate : float - Dropout rate. - positional_dropout_rate : float - Dropout rate after adding positional encoding. - attention_dropout_rate : float - Dropout rate in attention. - input_layer : Union[str, paddle.nn.Layer] - Input layer type. - pos_enc_layer_type : str - Encoder positional encoding layer type. - normalize_before : bool - Whether to use layer_norm before the first block. - concat_after : bool - Whether to concat attention layer's input and output. - if True, additional linear will be applied. - i.e. x -> x + linear(concat(x, att(x))) - if False, no additional linear will be applied. i.e. x -> x + att(x) - positionwise_layer_type : str - "linear", "conv1d", or "conv1d-linear". - positionwise_conv_kernel_size : int - Kernel size of positionwise conv1d layer. - selfattention_layer_type : str - Encoder attention layer type. - activation_type : str - Encoder activation function type. - padding_idx : int - Padding idx for input_layer=embed. + + Args: + idim (int): Input dimension. + attention_dim (int): Dimention of attention. 
+ attention_heads (int): The number of heads of multi head attention. + linear_units (int): The number of units of position-wise feed forward. + num_blocks (int): The number of decoder blocks. + dropout_rate (float): Dropout rate. + positional_dropout_rate (float): Dropout rate after adding positional encoding. + attention_dropout_rate (float): Dropout rate in attention. + input_layer (Union[str, paddle.nn.Layer]): Input layer type. + pos_enc_layer_type (str): Encoder positional encoding layer type. + normalize_before (bool): Whether to use layer_norm before the first block. + concat_after (bool): Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. x -> x + att(x) + positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear". + positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer. + selfattention_layer_type (str): Encoder attention layer type. + activation_type (str): Encoder activation function type. + padding_idx (int): Padding idx for input_layer=embed. """ def __init__( @@ -397,19 +349,13 @@ class TransformerEncoder(BaseEncoder): def forward(self, xs, masks): """Encode input sequence. - Parameters - ---------- - xs : paddle.Tensor - Input tensor (#batch, time, idim). - masks : paddle.Tensor - Mask tensor (#batch, 1, time). - - Returns - ---------- - paddle.Tensor - Output tensor (#batch, time, attention_dim). - paddle.Tensor - Mask tensor (#batch, 1, time). + Args: + xs(Tensor): Input tensor (#batch, time, idim). + masks(Tensor): Mask tensor (#batch, 1, time). + + Returns: + Tensor: Output tensor (#batch, time, attention_dim). + Tensor:Mask tensor (#batch, 1, time). """ xs = self.embed(xs) xs, masks = self.encoders(xs, masks) @@ -420,23 +366,15 @@ class TransformerEncoder(BaseEncoder): def forward_one_step(self, xs, masks, cache=None): """Encode input frame. 
- Parameters - ---------- - xs : paddle.Tensor - Input tensor. - masks : paddle.Tensor - Mask tensor. - cache : List[paddle.Tensor] - List of cache tensors. - - Returns - ---------- - paddle.Tensor - Output tensor. - paddle.Tensor - Mask tensor. - List[paddle.Tensor] - List of new cache tensors. + Args: + xs (Tensor): Input tensor. + masks (Tensor): Mask tensor. + cache (List[Tensor]): List of cache tensors. + + Returns: + Tensor: Output tensor. + Tensor: Mask tensor. + List[Tensor]: List of new cache tensors. """ xs = self.embed(xs) @@ -453,60 +391,35 @@ class TransformerEncoder(BaseEncoder): class ConformerEncoder(BaseEncoder): """Conformer encoder module. - Parameters - ---------- - idim : int - Input dimension. - attention_dim : int - Dimention of attention. - attention_heads : int - The number of heads of multi head attention. - linear_units : int - The number of units of position-wise feed forward. - num_blocks : int - The number of decoder blocks. - dropout_rate : float - Dropout rate. - positional_dropout_rate : float - Dropout rate after adding positional encoding. - attention_dropout_rate : float - Dropout rate in attention. - input_layer : Union[str, nn.Layer] - Input layer type. - normalize_before : bool - Whether to use layer_norm before the first block. - concat_after : bool - Whether to concat attention layer's input and output. - if True, additional linear will be applied. - i.e. x -> x + linear(concat(x, att(x))) - if False, no additional linear will be applied. i.e. x -> x + att(x) - positionwise_layer_type : str - "linear", "conv1d", or "conv1d-linear". - positionwise_conv_kernel_size : int - Kernel size of positionwise conv1d layer. - macaron_style : bool - Whether to use macaron style for positionwise layer. - pos_enc_layer_type : str - Encoder positional encoding layer type. - selfattention_layer_type : str - Encoder attention layer type. - activation_type : str - Encoder activation function type. 
- use_cnn_module : bool - Whether to use convolution module. - zero_triu : bool - Whether to zero the upper triangular part of attention matrix. - cnn_module_kernel : int - Kernerl size of convolution module. - padding_idx : int - Padding idx for input_layer=embed. - stochastic_depth_rate : float - Maximum probability to skip the encoder layer. - intermediate_layers : Union[List[int], None] - indices of intermediate CTC layer. - indices start from 1. - if not None, intermediate outputs are returned (which changes return type - signature.) + + Args: + idim (int): Input dimension. + attention_dim (int): Dimention of attention. + attention_heads (int): The number of heads of multi head attention. + linear_units (int): The number of units of position-wise feed forward. + num_blocks (int): The number of decoder blocks. + dropout_rate (float): Dropout rate. + positional_dropout_rate (float): Dropout rate after adding positional encoding. + attention_dropout_rate (float): Dropout rate in attention. + input_layer (Union[str, nn.Layer]): Input layer type. + normalize_before (bool): Whether to use layer_norm before the first block. + concat_after (bool):Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. x -> x + att(x) + positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear". + positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer. + macaron_style (bool): Whether to use macaron style for positionwise layer. + pos_enc_layer_type (str): Encoder positional encoding layer type. + selfattention_layer_type (str): Encoder attention layer type. + activation_type (str): Encoder activation function type. + use_cnn_module (bool): Whether to use convolution module. + zero_triu (bool): Whether to zero the upper triangular part of attention matrix. 
+ cnn_module_kernel (int): Kernel size of convolution module. + padding_idx (int): Padding idx for input_layer=embed. + stochastic_depth_rate (float): Maximum probability to skip the encoder layer. + intermediate_layers (Union[List[int], None]): Indices of intermediate CTC layer. Indices start from 1. + if not None, intermediate outputs are returned (which changes return type signature.) """ def __init__( @@ -563,18 +476,13 @@ class ConformerEncoder(BaseEncoder): def forward(self, xs, masks): """Encode input sequence. - Parameters - ---------- - xs : paddle.Tensor - Input tensor (#batch, time, idim). - masks : paddle.Tensor - Mask tensor (#batch, 1, time). - Returns - ---------- - paddle.Tensor - Output tensor (#batch, time, attention_dim). - paddle.Tensor - Mask tensor (#batch, 1, time). + + Args: + xs (Tensor): Input tensor (#batch, time, idim). + masks (Tensor): Mask tensor (#batch, 1, time). + Returns: + Tensor: Output tensor (#batch, time, attention_dim). + Tensor: Mask tensor (#batch, 1, time). """ if isinstance(self.embed, (Conv2dSubsampling)): xs, masks = self.embed(xs, masks) diff --git a/paddlespeech/t2s/modules/transformer/encoder_layer.py b/paddlespeech/t2s/modules/transformer/encoder_layer.py index f55ded3d..72372b69 100644 --- a/paddlespeech/t2s/modules/transformer/encoder_layer.py +++ b/paddlespeech/t2s/modules/transformer/encoder_layer.py @@ -20,25 +20,18 @@ from paddle import nn class EncoderLayer(nn.Layer): """Encoder layer module. - Parameters - ---------- - size : int - Input dimension. - self_attn : nn.Layer - Self-attention module instance. - `MultiHeadedAttention` instance can be used as the argument. - feed_forward : nn.Layer - Feed-forward module instance. - `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument. - dropout_rate : float - Dropout rate. - normalize_before : bool - Whether to use layer_norm before the first block. 
- concat_after : bool - Whether to concat attention layer's input and output. - if True, additional linear will be applied. - i.e. x -> x + linear(concat(x, att(x))) - if False, no additional linear will be applied. i.e. x -> x + att(x) + Args: + size (int): Input dimension. + self_attn (nn.Layer): Self-attention module instance. + `MultiHeadedAttention` instance can be used as the argument. + feed_forward (nn.Layer): Feed-forward module instance. + `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument. + dropout_rate (float): Dropout rate. + normalize_before (bool): Whether to use layer_norm before the first block. + concat_after (bool): Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. x -> x + att(x) """ def __init__( @@ -65,21 +58,14 @@ class EncoderLayer(nn.Layer): def forward(self, x, mask, cache=None): """Compute encoded features. - Parameters - ---------- - x_input : paddle.Tensor - Input tensor (#batch, time, size). - mask : paddle.Tensor - Mask tensor for the input (#batch, time). - cache : paddle.Tensor - Cache tensor of the input (#batch, time - 1, size). + Args: + x(Tensor): Input tensor (#batch, time, size). + mask(Tensor): Mask tensor for the input (#batch, time). + cache(Tensor, optional): Cache tensor of the input (#batch, time - 1, size). - Returns - ---------- - paddle.Tensor - Output tensor (#batch, time, size). - paddle.Tensor - Mask tensor (#batch, time). + Returns: + Tensor: Output tensor (#batch, time, size). + Tensor: Mask tensor (#batch, time). 
""" residual = x if self.normalize_before: diff --git a/paddlespeech/t2s/modules/transformer/lightconv.py b/paddlespeech/t2s/modules/transformer/lightconv.py index ccf84c8a..9bcc1acf 100644 --- a/paddlespeech/t2s/modules/transformer/lightconv.py +++ b/paddlespeech/t2s/modules/transformer/lightconv.py @@ -30,20 +30,13 @@ class LightweightConvolution(nn.Layer): This implementation is based on https://github.com/pytorch/fairseq/tree/master/fairseq - Parameters - ---------- - wshare : int - the number of kernel of convolution - n_feat : int - the number of features - dropout_rate : float - dropout_rate - kernel_size : int - kernel size (length) - use_kernel_mask : bool - Use causal mask or not for convolution kernel - use_bias : bool - Use bias term or not. + Args: + wshare (int): the number of kernel of convolution + n_feat (int): the number of features + dropout_rate (float): dropout_rate + kernel_size (int): kernel size (length) + use_kernel_mask (bool): Use causal mask or not for convolution kernel + use_bias (bool): Use bias term or not. """ @@ -100,21 +93,14 @@ class LightweightConvolution(nn.Layer): This function takes query, key and value but uses only query. This is just for compatibility with self-attention layer (attention.py) - Parameters - ---------- - query : paddle.Tensor - (batch, time1, d_model) input tensor - key : paddle.Tensor - (batch, time2, d_model) NOT USED - value : paddle.Tensor - (batch, time2, d_model) NOT USED - mask : paddle.Tensor - (batch, time1, time2) mask - - Return - ---------- - x : paddle.Tensor - (batch, time1, d_model) ouput + Args: + query (Tensor): input tensor. (batch, time1, d_model) + key (Tensor): NOT USED. (batch, time2, d_model) + value (Tensor): NOT USED. (batch, time2, d_model) + mask (Tensor): (batch, time1, time2) mask + + Return: + Tensor: output. 
(batch, time1, d_model) """ # linear -> GLU -> lightconv -> linear diff --git a/paddlespeech/t2s/modules/transformer/mask.py b/paddlespeech/t2s/modules/transformer/mask.py index fd97b004..c10e6add 100644 --- a/paddlespeech/t2s/modules/transformer/mask.py +++ b/paddlespeech/t2s/modules/transformer/mask.py @@ -17,19 +17,16 @@ import paddle def subsequent_mask(size, dtype=paddle.bool): """Create mask for subsequent steps (size, size). - Parameters - ---------- - size : int - size of mask - dtype : paddle.dtype - result dtype - Return - ---------- - paddle.Tensor - >>> subsequent_mask(3) - [[1, 0, 0], - [1, 1, 0], - [1, 1, 1]] + + Args: + size (int): size of mask + dtype (paddle.dtype): result dtype + Return: + Tensor: + >>> subsequent_mask(3) + [[1, 0, 0], + [1, 1, 0], + [1, 1, 1]] """ ret = paddle.ones([size, size], dtype=dtype) return paddle.tril(ret) @@ -37,19 +34,13 @@ def subsequent_mask(size, dtype=paddle.bool): def target_mask(ys_in_pad, ignore_id, dtype=paddle.bool): """Create mask for decoder self-attention. - Parameters - ---------- - ys_pad : paddle.Tensor - batch of padded target sequences (B, Lmax) - ignore_id : int - index of padding - dtype : torch.dtype - result dtype - Return - ---------- - paddle.Tensor - (B, Lmax, Lmax) + Args: + ys_pad (Tensor): batch of padded target sequences (B, Lmax) + ignore_id (int): index of padding + dtype (paddle.dtype): result dtype + Return: + Tensor: (B, Lmax, Lmax) """ ys_mask = ys_in_pad != ignore_id m = subsequent_mask(ys_mask.shape[-1]).unsqueeze(0) diff --git a/paddlespeech/t2s/modules/transformer/multi_layer_conv.py b/paddlespeech/t2s/modules/transformer/multi_layer_conv.py index df8929e3..d3285b65 100644 --- a/paddlespeech/t2s/modules/transformer/multi_layer_conv.py +++ b/paddlespeech/t2s/modules/transformer/multi_layer_conv.py @@ -31,16 +31,11 @@ class MultiLayeredConv1d(nn.Layer): def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate): """Initialize MultiLayeredConv1d module. 
- Parameters - ---------- - in_chans : int - Number of input channels. - hidden_chans : int - Number of hidden channels. - kernel_size : int - Kernel size of conv1d. - dropout_rate : float - Dropout rate. + Args: + in_chans (int): Number of input channels. + hidden_chans (int): Number of hidden channels. + kernel_size (int): Kernel size of conv1d. + dropout_rate (float): Dropout rate. """ super().__init__() @@ -62,15 +57,11 @@ class MultiLayeredConv1d(nn.Layer): def forward(self, x): """Calculate forward propagation. - Parameters - ---------- - x : paddle.Tensor - Batch of input tensors (B, T, in_chans). + Args: + x (Tensor): Batch of input tensors (B, T, in_chans). - Returns - ---------- - paddle.Tensor - Batch of output tensors (B, T, in_chans). + Returns: + Tensor: Batch of output tensors (B, T, in_chans). """ x = self.relu(self.w_1(x.transpose([0, 2, 1]))).transpose([0, 2, 1]) return self.w_2(self.dropout(x).transpose([0, 2, 1])).transpose( @@ -87,16 +78,11 @@ class Conv1dLinear(nn.Layer): def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate): """Initialize Conv1dLinear module. - Parameters - ---------- - in_chans : int - Number of input channels. - hidden_chans : int - Number of hidden channels. - kernel_size : int - Kernel size of conv1d. - dropout_rate : float - Dropout rate. + Args: + in_chans (int): Number of input channels. + hidden_chans (int): Number of hidden channels. + kernel_size (int): Kernel size of conv1d. + dropout_rate (float): Dropout rate. """ super().__init__() self.w_1 = nn.Conv1D( @@ -112,15 +98,11 @@ class Conv1dLinear(nn.Layer): def forward(self, x): """Calculate forward propagation. - Parameters - ---------- - x : paddle.Tensor - Batch of input tensors (B, T, in_chans). + Args: + x (Tensor): Batch of input tensors (B, T, in_chans). - Returns - ---------- - paddle.Tensor - Batch of output tensors (B, T, in_chans). + Returns: + Tensor: Batch of output tensors (B, T, in_chans). 
""" x = self.relu(self.w_1(x.transpose([0, 2, 1]))).transpose([0, 2, 1]) diff --git a/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py b/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py index 28ed1c31..92af6851 100644 --- a/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py +++ b/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py @@ -20,14 +20,10 @@ from paddle import nn class PositionwiseFeedForward(nn.Layer): """Positionwise feed forward layer. - Parameters - ---------- - idim : int - Input dimenstion. - hidden_units : int - The number of hidden units. - dropout_rate : float - Dropout rate. + Args: + idim (int): Input dimenstion. + hidden_units (int): The number of hidden units. + dropout_rate (float): Dropout rate. """ def __init__(self, diff --git a/paddlespeech/t2s/modules/transformer/repeat.py b/paddlespeech/t2s/modules/transformer/repeat.py index 0325a638..2073a78b 100644 --- a/paddlespeech/t2s/modules/transformer/repeat.py +++ b/paddlespeech/t2s/modules/transformer/repeat.py @@ -29,16 +29,11 @@ class MultiSequential(paddle.nn.Sequential): def repeat(N, fn): """Repeat module N times. - Parameters - ---------- - N : int - Number of repeat time. - fn : Callable - Function to generate module. + Args: + N (int): Number of repeat time. + fn (Callable): Function to generate module. - Returns - ---------- - MultiSequential - Repeated model instance. + Returns: + MultiSequential: Repeated model instance. 
""" - return MultiSequential(* [fn(n) for n in range(N)]) + return MultiSequential(*[fn(n) for n in range(N)]) diff --git a/paddlespeech/t2s/modules/transformer/subsampling.py b/paddlespeech/t2s/modules/transformer/subsampling.py index cf0fca8a..07439705 100644 --- a/paddlespeech/t2s/modules/transformer/subsampling.py +++ b/paddlespeech/t2s/modules/transformer/subsampling.py @@ -21,16 +21,12 @@ from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding class Conv2dSubsampling(nn.Layer): """Convolutional 2D subsampling (to 1/4 length). - Parameters - ---------- - idim : int - Input dimension. - odim : int - Output dimension. - dropout_rate : float - Dropout rate. - pos_enc : nn.Layer - Custom position encoding layer. + + Args: + idim (int): Input dimension. + odim (int): Output dimension. + dropout_rate (float): Dropout rate. + pos_enc (nn.Layer): Custom position encoding layer. """ def __init__(self, idim, odim, dropout_rate, pos_enc=None): @@ -48,20 +44,12 @@ class Conv2dSubsampling(nn.Layer): def forward(self, x, x_mask): """Subsample x. - Parameters - ---------- - x : paddle.Tensor - Input tensor (#batch, time, idim). - x_mask : paddle.Tensor - Input mask (#batch, 1, time). - Returns - ---------- - paddle.Tensor - Subsampled tensor (#batch, time', odim), - where time' = time // 4. - paddle.Tensor - Subsampled mask (#batch, 1, time'), - where time' = time // 4. + Args: + x (Tensor): Input tensor (#batch, time, idim). + x_mask (Tensor): Input mask (#batch, 1, time). + Returns: + Tensor: Subsampled tensor (#batch, time', odim), where time' = time // 4. + Tensor: Subsampled mask (#batch, 1, time'), where time' = time // 4. 
""" # (b, c, t, f) x = x.unsqueeze(1) diff --git a/paddlespeech/t2s/modules/upsample.py b/paddlespeech/t2s/modules/upsample.py index 82e30414..65e78a89 100644 --- a/paddlespeech/t2s/modules/upsample.py +++ b/paddlespeech/t2s/modules/upsample.py @@ -27,17 +27,12 @@ class Stretch2D(nn.Layer): def __init__(self, w_scale: int, h_scale: int, mode: str="nearest"): """Strech an image (or image-like object) with some interpolation. - Parameters - ---------- - w_scale : int - Scalar of width. - h_scale : int - Scalar of the height. - mode : str, optional - Interpolation mode, modes suppored are "nearest", "bilinear", - "trilinear", "bicubic", "linear" and "area",by default "nearest" - - For more details about interpolation, see + Args: + w_scale (int): Scalar of width. + h_scale (int): Scalar of the height. + mode (str, optional): Interpolation mode, modes suppored are "nearest", "bilinear", + "trilinear", "bicubic", "linear" and "area",by default "nearest" + For more details about interpolation, see `paddle.nn.functional.interpolate `_. """ super().__init__() @@ -47,16 +42,14 @@ class Stretch2D(nn.Layer): def forward(self, x): """ - Parameters - ---------- - x : Tensor - Shape (N, C, H, W) - - Returns - ------- - Tensor - Shape (N, C, H', W'), where ``H'=h_scale * H``, ``W'=w_scale * W``. - The stretched image. + + Args: + x (Tensor): Shape (N, C, H, W) + + Returns: + Tensor: The stretched image. + Shape (N, C, H', W'), where ``H'=h_scale * H``, ``W'=w_scale * W``. + """ out = F.interpolate( x, scale_factor=(self.h_scale, self.w_scale), mode=self.mode) @@ -67,26 +60,16 @@ class UpsampleNet(nn.Layer): """A Layer to upsample spectrogram by applying consecutive stretch and convolutions. - Parameters - ---------- - upsample_scales : List[int] - Upsampling factors for each strech. 
- nonlinear_activation : Optional[str], optional - Activation after each convolution, by default None - nonlinear_activation_params : Dict[str, Any], optional - Parameters passed to construct the activation, by default {} - interpolate_mode : str, optional - Interpolation mode of the strech, by default "nearest" - freq_axis_kernel_size : int, optional - Convolution kernel size along the frequency axis, by default 1 - use_causal_conv : bool, optional - Whether to use causal padding before convolution, by default False - - If True, Causal padding is used along the time axis, i.e. padding - amount is ``receptive field - 1`` and 0 for before and after, - respectively. - - If False, "same" padding is used along the time axis. + Args: + upsample_scales (List[int]): Upsampling factors for each strech. + nonlinear_activation (Optional[str], optional): Activation after each convolution, by default None + nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to construct the activation, by default {} + interpolate_mode (str, optional): Interpolation mode of the strech, by default "nearest" + freq_axis_kernel_size (int, optional): Convolution kernel size along the frequency axis, by default 1 + use_causal_conv (bool, optional): Whether to use causal padding before convolution, by default False + If True, Causal padding is used along the time axis, + i.e. padding amount is ``receptive field - 1`` and 0 for before and after, respectively. + If False, "same" padding is used along the time axis. """ def __init__(self, @@ -122,16 +105,12 @@ class UpsampleNet(nn.Layer): def forward(self, c): """ - Parameters - ---------- - c : Tensor - Shape (N, F, T), spectrogram - - Returns - ------- - Tensor - Shape (N, F, T'), where ``T' = upsample_factor * T``, upsampled - spectrogram + Args: + c (Tensor): spectrogram. Shape (N, F, T) + + Returns: + Tensor: upsampled spectrogram. 
+ Shape (N, F, T'), where ``T' = upsample_factor * T``, """ c = c.unsqueeze(1) for f in self.up_layers: @@ -145,35 +124,22 @@ class UpsampleNet(nn.Layer): class ConvInUpsampleNet(nn.Layer): """A Layer to upsample spectrogram composed of a convolution and an UpsampleNet. - - Parameters - ---------- - upsample_scales : List[int] - Upsampling factors for each strech. - nonlinear_activation : Optional[str], optional - Activation after each convolution, by default None - nonlinear_activation_params : Dict[str, Any], optional - Parameters passed to construct the activation, by default {} - interpolate_mode : str, optional - Interpolation mode of the strech, by default "nearest" - freq_axis_kernel_size : int, optional - Convolution kernel size along the frequency axis, by default 1 - aux_channels : int, optional - Feature size of the input, by default 80 - aux_context_window : int, optional - Context window of the first 1D convolution applied to the input. It - related to the kernel size of the convolution, by default 0 - - If use causal convolution, the kernel size is ``window + 1``, else - the kernel size is ``2 * window + 1``. - use_causal_conv : bool, optional - Whether to use causal padding before convolution, by default False - - If True, Causal padding is used along the time axis, i.e. padding - amount is ``receptive field - 1`` and 0 for before and after, - respectively. - - If False, "same" padding is used along the time axis. + + Args: + upsample_scales (List[int]): Upsampling factors for each strech. 
+ nonlinear_activation (Optional[str], optional): Activation after each convolution, by default None + nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to construct the activation, by default {} + interpolate_mode (str, optional): Interpolation mode of the strech, by default "nearest" + freq_axis_kernel_size (int, optional): Convolution kernel size along the frequency axis, by default 1 + aux_channels (int, optional): Feature size of the input, by default 80 + aux_context_window (int, optional): Context window of the first 1D convolution applied to the input. It + related to the kernel size of the convolution, by default 0 + If use causal convolution, the kernel size is ``window + 1``, + else the kernel size is ``2 * window + 1``. + use_causal_conv (bool, optional): Whether to use causal padding before convolution, by default False + If True, Causal padding is used along the time axis, i.e. padding + amount is ``receptive field - 1`` and 0 for before and after, respectively. + If False, "same" padding is used along the time axis. """ def __init__(self, @@ -204,16 +170,11 @@ class ConvInUpsampleNet(nn.Layer): def forward(self, c): """ - Parameters - ---------- - c : Tensor - Shape (N, F, T), spectrogram - - Returns - ------- - Tensors - Shape (N, F, T'), where ``T' = upsample_factor * T``, upsampled - spectrogram + Args: + c (Tensor): spectrogram. Shape (N, F, T) + + Returns: + Tensors: upsampled spectrogram. Shape (N, F, T'), where ``T' = upsample_factor * T``, """ c_ = self.conv_in(c) c = c_[:, :, :-self.aux_context_window] if self.use_causal_conv else c_ diff --git a/paddlespeech/t2s/training/experiment.py b/paddlespeech/t2s/training/experiment.py index de36db24..05a363ff 100644 --- a/paddlespeech/t2s/training/experiment.py +++ b/paddlespeech/t2s/training/experiment.py @@ -57,35 +57,30 @@ class ExperimentBase(object): Feel free to add/overwrite other methods and standalone functions if you need. 
- Parameters - ---------- - config: yacs.config.CfgNode - The configuration used for the experiment. - - args: argparse.Namespace - The parsed command line arguments. - - Examples - -------- - >>> def main_sp(config, args): - >>> exp = Experiment(config, args) - >>> exp.setup() - >>> exe.resume_or_load() - >>> exp.run() - >>> - >>> config = get_cfg_defaults() - >>> parser = default_argument_parser() - >>> args = parser.parse_args() - >>> if args.config: - >>> config.merge_from_file(args.config) - >>> if args.opts: - >>> config.merge_from_list(args.opts) - >>> config.freeze() - >>> - >>> if args.ngpu > 1: - >>> dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) - >>> else: - >>> main_sp(config, args) + Args: + config (yacs.config.CfgNode): The configuration used for the experiment. + args (argparse.Namespace): The parsed command line arguments. + + Examples: + >>> def main_sp(config, args): + >>> exp = Experiment(config, args) + >>> exp.setup() + >>> exe.resume_or_load() + >>> exp.run() + >>> + >>> config = get_cfg_defaults() + >>> parser = default_argument_parser() + >>> args = parser.parse_args() + >>> if args.config: + >>> config.merge_from_file(args.config) + >>> if args.opts: + >>> config.merge_from_list(args.opts) + >>> config.freeze() + >>> + >>> if args.ngpu > 1: + >>> dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) + >>> else: + >>> main_sp(config, args) """ def __init__(self, config, args): diff --git a/paddlespeech/t2s/training/extensions/snapshot.py b/paddlespeech/t2s/training/extensions/snapshot.py index 3a86556b..5f8d3c45 100644 --- a/paddlespeech/t2s/training/extensions/snapshot.py +++ b/paddlespeech/t2s/training/extensions/snapshot.py @@ -43,10 +43,8 @@ class Snapshot(extension.Extension): parameters and optimizer states. If the updater inside the trainer subclasses StandardUpdater, everything is good to go. - Parameters - ---------- - checkpoint_dir : Union[str, Path] - The directory to save checkpoints into. 
+ Args: + checkpoint_dir (Union[str, Path]): The directory to save checkpoints into. """ trigger = (1, 'epoch') diff --git a/paddlespeech/t2s/training/optimizer.py b/paddlespeech/t2s/training/optimizer.py index 907e3daf..64274d53 100644 --- a/paddlespeech/t2s/training/optimizer.py +++ b/paddlespeech/t2s/training/optimizer.py @@ -26,10 +26,13 @@ optim_classes = dict( sgd=paddle.optimizer.SGD, ) -def build_optimizers(model: nn.Layer, - optim='adadelta', - max_grad_norm=None, - learning_rate=0.01) -> paddle.optimizer: +def build_optimizers( + model: nn.Layer, + optim='adadelta', + max_grad_norm=None, + learning_rate=0.01, + weight_decay=None, + epsilon=1.0e-6, ) -> paddle.optimizer: optim_class = optim_classes.get(optim) if optim_class is None: raise ValueError(f"must be one of {list(optim_classes)}: {optim}") @@ -37,10 +40,13 @@ def build_optimizers(model: nn.Layer, grad_clip = None if max_grad_norm: grad_clip = paddle.nn.ClipGradByGlobalNorm(max_grad_norm) - optim = optim_class( - parameters=model.parameters(), - learning_rate=learning_rate, - grad_clip=grad_clip) + optim_dict = {} + optim_dict['parameters'] = model.parameters() + optim_dict['learning_rate'] = learning_rate + optim_dict['grad_clip'] = grad_clip + optim_dict['weight_decay'] = weight_decay + if optim not in {'momentum', 'sgd'}: + optim_dict['epsilon'] = epsilon + optimizers = optim_class(**optim_dict) - optimizers = optim return optimizers diff --git a/paddlespeech/t2s/utils/__init__.py b/paddlespeech/t2s/utils/__init__.py index ce3a4ef6..520c81a2 100644 --- a/paddlespeech/t2s/utils/__init__.py +++ b/paddlespeech/t2s/utils/__init__.py @@ -16,3 +16,7 @@ from . import display from . import layer_tools from . import mp_tools from . 
import scheduler + + +def str2bool(str): + return True if str.lower() == 'true' else False diff --git a/paddlespeech/t2s/utils/error_rate.py b/paddlespeech/t2s/utils/error_rate.py index 7a9fe5ad..41b13b75 100644 --- a/paddlespeech/t2s/utils/error_rate.py +++ b/paddlespeech/t2s/utils/error_rate.py @@ -70,21 +70,14 @@ def word_errors(reference, hypothesis, ignore_case=False, delimiter=' '): """Compute the levenshtein distance between reference sequence and hypothesis sequence in word-level. - Parameters - ---------- - reference : str - The reference sentence. - hypothesis : str - The hypothesis sentence. - ignore_case : bool - Whether case-sensitive or not. - delimiter : char(str) - Delimiter of input sentences. - - Returns - ---------- - list - Levenshtein distance and word number of reference sentence. + Args: + reference (str): The reference sentence. + hypothesis (str): The hypothesis sentence. + ignore_case (bool): Whether case-sensitive or not. + delimiter (char(str)): Delimiter of input sentences. + + Returns: + list: Levenshtein distance and word number of reference sentence. """ if ignore_case: reference = reference.lower() @@ -101,21 +94,14 @@ def char_errors(reference, hypothesis, ignore_case=False, remove_space=False): """Compute the levenshtein distance between reference sequence and hypothesis sequence in char-level. - Parameters - ---------- - reference: str - The reference sentence. - hypothesis: str - The hypothesis sentence. - ignore_case: bool - Whether case-sensitive or not. - remove_space: bool - Whether remove internal space characters - - Returns - ---------- - list - Levenshtein distance and length of reference sentence. + Args: + reference (str): The reference sentence. + hypothesis (str): The hypothesis sentence. + ignore_case (bool): Whether case-sensitive or not. + remove_space (bool): Whether remove internal space characters + + Returns: + list: Levenshtein distance and length of reference sentence. 
""" if ignore_case: reference = reference.lower() @@ -146,27 +132,17 @@ def wer(reference, hypothesis, ignore_case=False, delimiter=' '): We can use levenshtein distance to calculate WER. Please draw an attention that empty items will be removed when splitting sentences by delimiter. - Parameters - ---------- - reference: str - The reference sentence. - - hypothesis: str - The hypothesis sentence. - ignore_case: bool - Whether case-sensitive or not. - delimiter: char - Delimiter of input sentences. - - Returns - ---------- - float - Word error rate. - - Raises - ---------- - ValueError - If word number of reference is zero. + Args: + reference (str): The reference sentence. + hypothesis (str): The hypothesis sentence. + ignore_case (bool): Whether case-sensitive or not. + delimiter (char): Delimiter of input sentences. + + Returns: + float: Word error rate. + + Raises: + ValueError: If word number of reference is zero. """ edit_distance, ref_len = word_errors(reference, hypothesis, ignore_case, delimiter) @@ -194,26 +170,17 @@ def cer(reference, hypothesis, ignore_case=False, remove_space=False): space characters will be truncated and multiple consecutive space characters in a sentence will be replaced by one space character. - Parameters - ---------- - reference: str - The reference sentence. - hypothesis: str - The hypothesis sentence. - ignore_case: bool - Whether case-sensitive or not. - remove_space: bool - Whether remove internal space characters - - Returns - ---------- - float - Character error rate. - - Raises - ---------- - ValueError - If the reference length is zero. + Args: + reference (str): The reference sentence. + hypothesis (str): The hypothesis sentence. + ignore_case (bool): Whether case-sensitive or not. + remove_space (bool): Whether remove internal space characters + + Returns: + float: Character error rate. + + Raises: + ValueError: If the reference length is zero. 
""" edit_distance, ref_len = char_errors(reference, hypothesis, ignore_case, remove_space) diff --git a/paddlespeech/t2s/utils/h5_utils.py b/paddlespeech/t2s/utils/h5_utils.py index d0e277db..75c2e448 100644 --- a/paddlespeech/t2s/utils/h5_utils.py +++ b/paddlespeech/t2s/utils/h5_utils.py @@ -23,18 +23,12 @@ import numpy as np def read_hdf5(filename: Union[Path, str], dataset_name: str) -> Any: """Read a dataset from a HDF5 file. + Args: + filename (Union[Path, str]): Path of the HDF5 file. + dataset_name (str): Name of the dataset to read. - Parameters - ---------- - filename : Union[Path, str] - Path of the HDF5 file. - dataset_name : str - Name of the dataset to read. - - Returns - ------- - Any - The retrieved dataset. + Returns: + Any: The retrieved dataset. """ filename = Path(filename) @@ -60,17 +54,11 @@ def write_hdf5(filename: Union[Path, str], write_data: np.ndarray, is_overwrite: bool=True) -> None: """Write dataset to HDF5 file. - - Parameters - ---------- - filename : Union[Path, str] - Path of the HDF5 file. - dataset_name : str - Name of the dataset to write to. - write_data : np.ndarrays - The data to write. - is_overwrite : bool, optional - Whether to overwrite, by default True + Args: + filename (Union[Path, str]): Path of the HDF5 file. + dataset_name (str): Name of the dataset to write to. + write_data (np.ndarrays): The data to write. 
+ is_overwrite (bool, optional): Whether to overwrite, by default True """ # convert to numpy array filename = Path(filename) diff --git a/paddlespeech/text/exps/ernie_linear/train.py b/paddlespeech/text/exps/ernie_linear/train.py index 0d730d66..22c25e17 100644 --- a/paddlespeech/text/exps/ernie_linear/train.py +++ b/paddlespeech/text/exps/ernie_linear/train.py @@ -135,9 +135,8 @@ def train_sp(args, config): if dist.get_rank() == 0: trainer.extend(evaluator, trigger=(1, "epoch")) trainer.extend(VisualDL(output_dir), trigger=(1, "iteration")) - trainer.extend( - Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch')) - # print(trainer.extensions) + trainer.extend( + Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch')) trainer.run() diff --git a/paddlespeech/vector/exps/ge2e/audio_processor.py b/paddlespeech/vector/exps/ge2e/audio_processor.py index 2d6bbe34..1ab0419e 100644 --- a/paddlespeech/vector/exps/ge2e/audio_processor.py +++ b/paddlespeech/vector/exps/ge2e/audio_processor.py @@ -127,7 +127,7 @@ def compute_partial_slices(n_samples: int, partial_utterance_n_frames : int the number of mel spectrogram frames in each partial utterance. - min_pad_coverage : int + min_pad_coverage : int when reaching the last partial utterance, it may or may not have enough frames. If at least of are present, then the last partial utterance will be considered, as if we padded the audio. Otherwise, @@ -137,7 +137,7 @@ def compute_partial_slices(n_samples: int, by how much the partial utterance should overlap. If set to 0, the partial utterances are entirely disjoint. Returns ---------- - the waveform slices and mel spectrogram slices as lists of array slices. + the waveform slices and mel spectrogram slices as lists of array slices. Index respectively the waveform and the mel spectrogram with these slices to obtain the partialutterances. 
""" assert 0 <= overlap < 1 @@ -206,7 +206,8 @@ class SpeakerVerificationPreprocessor(object): # Resample if numpy.array is passed and sr does not match if source_sr is not None and source_sr != self.sampling_rate: - wav = librosa.resample(wav, source_sr, self.sampling_rate) + wav = librosa.resample( + wav, orig_sr=source_sr, target_sr=self.sampling_rate) # loudness normalization wav = normalize_volume( @@ -221,7 +222,7 @@ class SpeakerVerificationPreprocessor(object): def melspectrogram(self, wav): mel = librosa.feature.melspectrogram( - wav, + y=wav, sr=self.sampling_rate, n_fft=self.n_fft, hop_length=self.hop_length, diff --git a/paddlespeech/vector/exps/ge2e/speaker_verification_dataset.py b/paddlespeech/vector/exps/ge2e/speaker_verification_dataset.py index 194eb7f2..ae6f6ad9 100644 --- a/paddlespeech/vector/exps/ge2e/speaker_verification_dataset.py +++ b/paddlespeech/vector/exps/ge2e/speaker_verification_dataset.py @@ -123,9 +123,3 @@ class Collate(object): frame_clips = [self.random_crop(mel) for mel in examples] batced_clips = np.stack(frame_clips) return batced_clips - - -if __name__ == "__main__": - mydataset = MultiSpeakerMelDataset( - Path("/home/chenfeiyu/datasets/SV2TTS/encoder")) - print(mydataset.get_example_by_index(0, 10)) diff --git a/setup.py b/setup.py index cdb899e4..71b6d528 100644 --- a/setup.py +++ b/setup.py @@ -27,6 +27,8 @@ from setuptools.command.install import install HERE = Path(os.path.abspath(os.path.dirname(__file__))) +VERSION = '0.1.1' + requirements = { "install": [ "editdistance", @@ -85,6 +87,24 @@ requirements = { } +def write_version_py(filename='paddlespeech/__init__.py'): + import paddlespeech + if hasattr(paddlespeech, + "__version__") and paddlespeech.__version__ == VERSION: + return + with open(filename, "a") as f: + f.write(f"\n__version__ = '{VERSION}'\n") + + +def remove_version_py(filename='paddlespeech/__init__.py'): + with open(filename, "r") as f: + lines = f.readlines() + with open(filename, "w") as f: + for 
line in lines: + if "__version__" not in line: + f.write(line) + + @contextlib.contextmanager def pushd(new_dir): old_dir = os.getcwd() @@ -172,10 +192,12 @@ class UploadCommand(Command): sys.exit() +write_version_py() + setup_info = dict( # Metadata name='paddlespeech', - version='0.1.1', + version=VERSION, author='PaddlePaddle Speech and Language Team', author_email='paddlesl@baidu.com', url='https://github.com/PaddlePaddle/PaddleSpeech', @@ -238,3 +260,5 @@ setup_info = dict( }) setup(**setup_info) + +remove_version_py() diff --git a/setup_audio.py b/setup_audio.py index 5f014065..21204998 100644 --- a/setup_audio.py +++ b/setup_audio.py @@ -13,14 +13,33 @@ # limitations under the License. import setuptools -import paddleaudio - # set the version here -version = paddleaudio.__version__ +VERSION = '0.1.0' + + +def write_version_py(filename='paddleaudio/__init__.py'): + import paddleaudio + if hasattr(paddleaudio, + "__version__") and paddleaudio.__version__ == VERSION: + return + with open(filename, "a") as f: + f.write(f"\n__version__ = '{VERSION}'\n") + + +def remove_version_py(filename='paddleaudio/__init__.py'): + with open(filename, "r") as f: + lines = f.readlines() + with open(filename, "w") as f: + for line in lines: + if "__version__" not in line: + f.write(line) + + +write_version_py() setuptools.setup( name="paddleaudio", - version=version, + version=VERSION, author="", author_email="", description="PaddleAudio, in development", @@ -41,3 +60,5 @@ setuptools.setup( 'soundfile >= 0.9.0', 'colorlog', ], ) + +remove_version_py() diff --git a/speechx/CMakeLists.txt b/speechx/CMakeLists.txt new file mode 100644 index 00000000..e003136a --- /dev/null +++ b/speechx/CMakeLists.txt @@ -0,0 +1,124 @@ +cmake_minimum_required(VERSION 3.14 FATAL_ERROR) + +project(paddlespeech VERSION 0.1) + +set(CMAKE_VERBOSE_MAKEFILE on) +# set std-14 +set(CMAKE_CXX_STANDARD 14) + +# include file +include(FetchContent) +include(ExternalProject) +# fc_patch dir 
+set(FETCHCONTENT_QUIET off) +get_filename_component(fc_patch "fc_patch" REALPATH BASE_DIR "${CMAKE_SOURCE_DIR}") +set(FETCHCONTENT_BASE_DIR ${fc_patch}) + + +############################################################################### +# Option Configurations +############################################################################### +# option configurations +option(TEST_DEBUG "option for debug" OFF) + + +############################################################################### +# Include third party +############################################################################### +# #example for include third party +# FetchContent_Declare() +# # FetchContent_MakeAvailable was not added until CMake 3.14 +# FetchContent_MakeAvailable() +# include_directories() + +# ABSEIL-CPP +include(FetchContent) +FetchContent_Declare( + absl + GIT_REPOSITORY "https://github.com/abseil/abseil-cpp.git" + GIT_TAG "20210324.1" +) +FetchContent_MakeAvailable(absl) + +# libsndfile +include(FetchContent) +FetchContent_Declare( + libsndfile + GIT_REPOSITORY "https://github.com/libsndfile/libsndfile.git" + GIT_TAG "1.0.31" +) +FetchContent_MakeAvailable(libsndfile) + +# gflags +FetchContent_Declare( + gflags + URL https://github.com/gflags/gflags/archive/v2.2.1.zip + URL_HASH SHA256=4e44b69e709c826734dbbbd5208f61888a2faf63f239d73d8ba0011b2dccc97a +) +FetchContent_MakeAvailable(gflags) +include_directories(${gflags_BINARY_DIR}/include) + +# glog +FetchContent_Declare( + glog + URL https://github.com/google/glog/archive/v0.4.0.zip + URL_HASH SHA256=9e1b54eb2782f53cd8af107ecf08d2ab64b8d0dc2b7f5594472f3bd63ca85cdc +) +FetchContent_MakeAvailable(glog) +include_directories(${glog_BINARY_DIR}) + +# gtest +FetchContent_Declare(googletest + URL https://github.com/google/googletest/archive/release-1.10.0.zip + URL_HASH SHA256=94c634d499558a76fa649edb13721dce6e98fb1e7018dfaeba3cd7a083945e91 +) +FetchContent_MakeAvailable(googletest) + +# openfst +set(openfst_SOURCE_DIR 
${fc_patch}/openfst-src) +set(openfst_BINARY_DIR ${fc_patch}/openfst-build) +set(openfst_PREFIX_DIR ${fc_patch}/openfst-subbuild/openfst-populate-prefix) +ExternalProject_Add(openfst + URL https://github.com/mjansche/openfst/archive/refs/tags/1.7.2.zip + URL_HASH SHA256=ffc56931025579a8af3515741c0f3b0fc3a854c023421472c07ca0c6389c75e6 + SOURCE_DIR ${openfst_SOURCE_DIR} + BINARY_DIR ${openfst_BINARY_DIR} + CONFIGURE_COMMAND ${openfst_SOURCE_DIR}/configure --prefix=${openfst_PREFIX_DIR} + "CPPFLAGS=-I${gflags_BINARY_DIR}/include -I${glog_SOURCE_DIR}/src -I${glog_BINARY_DIR}" + "LDFLAGS=-L${gflags_BINARY_DIR} -L${glog_BINARY_DIR}" + "LIBS=-lgflags_nothreads -lglog -lpthread" + BUILD_COMMAND make -j 4 +) +add_dependencies(openfst gflags glog) +link_directories(${openfst_PREFIX_DIR}/lib) +include_directories(${openfst_PREFIX_DIR}/include) + +add_subdirectory(speechx) + +#openblas +#set(OpenBLAS_INSTALL_PREFIX ${fc_patch}/OpenBLAS) +#set(OpenBLAS_SOURCE_DIR ${fc_patch}/OpenBLAS-src) +#ExternalProject_Add( +# OpenBLAS +# GIT_REPOSITORY https://github.com/xianyi/OpenBLAS +# GIT_TAG v0.3.13 +# GIT_SHALLOW TRUE +# GIT_PROGRESS TRUE +# CONFIGURE_COMMAND "" +# BUILD_IN_SOURCE TRUE +# BUILD_COMMAND make USE_LOCKING=1 USE_THREAD=0 +# INSTALL_COMMAND make PREFIX=${OpenBLAS_INSTALL_PREFIX} install +# UPDATE_DISCONNECTED TRUE +#) + +############################################################################### +# Add local library +############################################################################### +# system lib +#find_package() +# if dir have CmakeLists.txt +#add_subdirectory(speechx) +# if dir do not have CmakeLists.txt +#add_library(lib_name STATIC file.cc) +#target_link_libraries(lib_name item0 item1) +#add_dependencies(lib_name depend-target) \ No newline at end of file diff --git a/speechx/docker/.gitkeep b/speechx/docker/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/speechx/examples/.gitkeep b/speechx/examples/.gitkeep new file mode 100644 
index 00000000..e69de29b diff --git a/speechx/speechx/CMakeLists.txt b/speechx/speechx/CMakeLists.txt new file mode 100644 index 00000000..71c7eb7c --- /dev/null +++ b/speechx/speechx/CMakeLists.txt @@ -0,0 +1,14 @@ +cmake_minimum_required(VERSION 3.14 FATAL_ERROR) + +project(speechx LANGUAGES CXX) + +link_directories(${CMAKE_CURRENT_SOURCE_DIR}/third_party/openblas) + +include_directories( +${CMAKE_CURRENT_SOURCE_DIR} +${CMAKE_CURRENT_SOURCE_DIR}/kaldi +) +add_subdirectory(kaldi) + +add_executable(mfcc-test codelab/feat_test/feature-mfcc-test.cc) +target_link_libraries(mfcc-test kaldi-mfcc) diff --git a/speechx/speechx/base/basic_types.h b/speechx/speechx/base/basic_types.h new file mode 100644 index 00000000..1966c021 --- /dev/null +++ b/speechx/speechx/base/basic_types.h @@ -0,0 +1,60 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "kaldi/base/kaldi-types.h" + +#include + +typedef float BaseFloat; +typedef double double64; + +typedef signed char int8; +typedef short int16; +typedef int int32; + +#if defined(__LP64__) && !defined(OS_MACOSX) && !defined(OS_OPENBSD) +typedef long int64; +#else +typedef long long int64; +#endif + +typedef unsigned char uint8; +typedef unsigned short uint16; +typedef unsigned int uint32; + +if defined(__LP64__) && !defined(OS_MACOSX) && !defined(OS_OPENBSD) +typedef unsigned long uint64; +#else +typedef unsigned long long uint64; +#endif + +typedef signed int char32; + +const uint8 kuint8max = (( uint8) 0xFF); +const uint16 kuint16max = ((uint16) 0xFFFF); +const uint32 kuint32max = ((uint32) 0xFFFFFFFF); +const uint64 kuint64max = ((uint64) (0xFFFFFFFFFFFFFFFFLL)); +const int8 kint8min = (( int8) 0x80); +const int8 kint8max = (( int8) 0x7F); +const int16 kint16min = (( int16) 0x8000); +const int16 kint16max = (( int16) 0x7FFF); +const int32 kint32min = (( int32) 0x80000000); +const int32 kint32max = (( int32) 0x7FFFFFFF); +const int64 kint64min = (( int64) (0x8000000000000000LL)); +const int64 kint64max = (( int64) (0x7FFFFFFFFFFFFFFFLL)); + +const BaseFloat kBaseFloatMax = std::numeric_limits::max(); +const BaseFloat kBaseFloatMin = std::numeric_limits::min(); diff --git a/speechx/speechx/base/macros.h b/speechx/speechx/base/macros.h new file mode 100644 index 00000000..c8d254d6 --- /dev/null +++ b/speechx/speechx/base/macros.h @@ -0,0 +1,23 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace ppspeech { + +#define DISALLOW_COPY_AND_ASSIGN(TypeName) \ + TypeName(const TypeName&) = delete; \ + void operator=(const TypeName&) = delete + +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/codelab/README.md b/speechx/speechx/codelab/README.md new file mode 100644 index 00000000..95c95db1 --- /dev/null +++ b/speechx/speechx/codelab/README.md @@ -0,0 +1,4 @@ +# codelab + +This directory is here for testing some functions temporarily. + diff --git a/speechx/speechx/codelab/feat_test/feature-mfcc-test.cc b/speechx/speechx/codelab/feat_test/feature-mfcc-test.cc new file mode 100644 index 00000000..c4367139 --- /dev/null +++ b/speechx/speechx/codelab/feat_test/feature-mfcc-test.cc @@ -0,0 +1,686 @@ +// feat/feature-mfcc-test.cc + +// Copyright 2009-2011 Karel Vesely; Petr Motlicek + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ + +#include + +#include "feat/feature-mfcc.h" +#include "base/kaldi-math.h" +#include "matrix/kaldi-matrix-inl.h" +#include "feat/wave-reader.h" + +using namespace kaldi; + + + +static void UnitTestReadWave() { + + std::cout << "=== UnitTestReadWave() ===\n"; + + Vector v, v2; + + std::cout << "<<<=== Reading waveform\n"; + + { + std::ifstream is("test_data/test.wav", std::ios_base::binary); + WaveData wave; + wave.Read(is); + const Matrix data(wave.Data()); + KALDI_ASSERT(data.NumRows() == 1); + v.Resize(data.NumCols()); + v.CopyFromVec(data.Row(0)); + } + + std::cout << "<<<=== Reading Vector waveform, prepared by matlab\n"; + std::ifstream input( + "test_data/test_matlab.ascii" + ); + KALDI_ASSERT(input.good()); + v2.Read(input, false); + input.close(); + + std::cout << "<<<=== Comparing freshly read waveform to 'libsndfile' waveform\n"; + KALDI_ASSERT(v.Dim() == v2.Dim()); + for (int32 i = 0; i < v.Dim(); i++) { + KALDI_ASSERT(v(i) == v2(i)); + } + std::cout << "<<<=== Comparing done\n"; + + // std::cout << "== The Waveform Samples == \n"; + // std::cout << v; + + std::cout << "Test passed :)\n\n"; + +} + + + +/** + */ +static void UnitTestSimple() { + std::cout << "=== UnitTestSimple() ===\n"; + + Vector v(100000); + Matrix m; + + // init with noise + for (int32 i = 0; i < v.Dim(); i++) { + v(i) = (abs( i * 433024253 ) % 65535) - (65535 / 2); + } + + std::cout << "<<<=== Just make sure it runs... Nothing is compared\n"; + // the parametrization object + MfccOptions op; + // trying to have same opts as baseline. + op.frame_opts.dither = 0.0; + op.frame_opts.preemph_coeff = 0.0; + op.frame_opts.window_type = "rectangular"; + op.frame_opts.remove_dc_offset = false; + op.frame_opts.round_to_power_of_two = true; + op.mel_opts.low_freq = 0.0; + op.mel_opts.htk_mode = true; + op.htk_compat = true; + + Mfcc mfcc(op); + // use default parameters + + // compute mfccs. 
+ mfcc.Compute(v, 1.0, &m); + + // possibly dump + // std::cout << "== Output features == \n" << m; + std::cout << "Test passed :)\n\n"; +} + + +static void UnitTestHTKCompare1() { + std::cout << "=== UnitTestHTKCompare1() ===\n"; + + std::ifstream is("test_data/test.wav", std::ios_base::binary); + WaveData wave; + wave.Read(is); + KALDI_ASSERT(wave.Data().NumRows() == 1); + SubVector waveform(wave.Data(), 0); + + // read the HTK features + Matrix htk_features; + { + std::ifstream is("test_data/test.wav.fea_htk.1", + std::ios::in | std::ios_base::binary); + bool ans = ReadHtk(is, &htk_features, 0); + KALDI_ASSERT(ans); + } + + // use mfcc with default configuration... + MfccOptions op; + op.frame_opts.dither = 0.0; + op.frame_opts.preemph_coeff = 0.0; + op.frame_opts.window_type = "hamming"; + op.frame_opts.remove_dc_offset = false; + op.frame_opts.round_to_power_of_two = true; + op.mel_opts.low_freq = 0.0; + op.mel_opts.htk_mode = true; + op.htk_compat = true; + op.use_energy = false; // C0 not energy. + + Mfcc mfcc(op); + + // calculate kaldi features + Matrix kaldi_raw_features; + mfcc.Compute(waveform, 1.0, &kaldi_raw_features); + + DeltaFeaturesOptions delta_opts; + Matrix kaldi_features; + ComputeDeltas(delta_opts, + kaldi_raw_features, + &kaldi_features); + + // compare the results + bool passed = true; + int32 i_old = -1; + KALDI_ASSERT(kaldi_features.NumRows() == htk_features.NumRows()); + KALDI_ASSERT(kaldi_features.NumCols() == htk_features.NumCols()); + // Ignore ends-- we make slightly different choices than + // HTK about how to treat the deltas at the ends. + for (int32 i = 10; i+10 < kaldi_features.NumRows(); i++) { + for (int32 j = 0; j < kaldi_features.NumCols(); j++) { + BaseFloat a = kaldi_features(i, j), b = htk_features(i, j); + if ((std::abs(b - a)) > 1.0) { //<< TOLERANCE TO DIFFERENCES!!!!! 
+ // print the non-matching data only once per-line + if (i_old != i) { + std::cout << "\n\n\n[HTK-row: " << i << "] " << htk_features.Row(i) << "\n"; + std::cout << "[Kaldi-row: " << i << "] " << kaldi_features.Row(i) << "\n\n\n"; + i_old = i; + } + // print indices of non-matching cells + std::cout << "[" << i << ", " << j << "]"; + passed = false; + }}} + if (!passed) KALDI_ERR << "Test failed"; + + // write the htk features for later inspection + HtkHeader header = { + kaldi_features.NumRows(), + 100000, // 10ms + static_cast(sizeof(float)*kaldi_features.NumCols()), + 021406 // MFCC_D_A_0 + }; + { + std::ofstream os("tmp.test.wav.fea_kaldi.1", + std::ios::out|std::ios::binary); + WriteHtk(os, kaldi_features, header); + } + + std::cout << "Test passed :)\n\n"; + + unlink("tmp.test.wav.fea_kaldi.1"); +} + + +static void UnitTestHTKCompare2() { + std::cout << "=== UnitTestHTKCompare2() ===\n"; + + std::ifstream is("test_data/test.wav", std::ios_base::binary); + WaveData wave; + wave.Read(is); + KALDI_ASSERT(wave.Data().NumRows() == 1); + SubVector waveform(wave.Data(), 0); + + // read the HTK features + Matrix htk_features; + { + std::ifstream is("test_data/test.wav.fea_htk.2", + std::ios::in | std::ios_base::binary); + bool ans = ReadHtk(is, &htk_features, 0); + KALDI_ASSERT(ans); + } + + // use mfcc with default configuration... + MfccOptions op; + op.frame_opts.dither = 0.0; + op.frame_opts.preemph_coeff = 0.0; + op.frame_opts.window_type = "hamming"; + op.frame_opts.remove_dc_offset = false; + op.frame_opts.round_to_power_of_two = true; + op.mel_opts.low_freq = 0.0; + op.mel_opts.htk_mode = true; + op.htk_compat = true; + op.use_energy = true; // Use energy. 
+ + Mfcc mfcc(op); + + // calculate kaldi features + Matrix kaldi_raw_features; + mfcc.Compute(waveform, 1.0, &kaldi_raw_features); + + DeltaFeaturesOptions delta_opts; + Matrix kaldi_features; + ComputeDeltas(delta_opts, + kaldi_raw_features, + &kaldi_features); + + // compare the results + bool passed = true; + int32 i_old = -1; + KALDI_ASSERT(kaldi_features.NumRows() == htk_features.NumRows()); + KALDI_ASSERT(kaldi_features.NumCols() == htk_features.NumCols()); + // Ignore ends-- we make slightly different choices than + // HTK about how to treat the deltas at the ends. + for (int32 i = 10; i+10 < kaldi_features.NumRows(); i++) { + for (int32 j = 0; j < kaldi_features.NumCols(); j++) { + BaseFloat a = kaldi_features(i, j), b = htk_features(i, j); + if ((std::abs(b - a)) > 1.0) { //<< TOLERANCE TO DIFFERENCES!!!!! + // print the non-matching data only once per-line + if (i_old != i) { + std::cout << "\n\n\n[HTK-row: " << i << "] " << htk_features.Row(i) << "\n"; + std::cout << "[Kaldi-row: " << i << "] " << kaldi_features.Row(i) << "\n\n\n"; + i_old = i; + } + // print indices of non-matching cells + std::cout << "[" << i << ", " << j << "]"; + passed = false; + }}} + if (!passed) KALDI_ERR << "Test failed"; + + // write the htk features for later inspection + HtkHeader header = { + kaldi_features.NumRows(), + 100000, // 10ms + static_cast(sizeof(float)*kaldi_features.NumCols()), + 021406 // MFCC_D_A_0 + }; + { + std::ofstream os("tmp.test.wav.fea_kaldi.2", + std::ios::out|std::ios::binary); + WriteHtk(os, kaldi_features, header); + } + + std::cout << "Test passed :)\n\n"; + + unlink("tmp.test.wav.fea_kaldi.2"); +} + + +static void UnitTestHTKCompare3() { + std::cout << "=== UnitTestHTKCompare3() ===\n"; + + std::ifstream is("test_data/test.wav", std::ios_base::binary); + WaveData wave; + wave.Read(is); + KALDI_ASSERT(wave.Data().NumRows() == 1); + SubVector waveform(wave.Data(), 0); + + // read the HTK features + Matrix htk_features; + { + std::ifstream 
is("test_data/test.wav.fea_htk.3", + std::ios::in | std::ios_base::binary); + bool ans = ReadHtk(is, &htk_features, 0); + KALDI_ASSERT(ans); + } + + // use mfcc with default configuration... + MfccOptions op; + op.frame_opts.dither = 0.0; + op.frame_opts.preemph_coeff = 0.0; + op.frame_opts.window_type = "hamming"; + op.frame_opts.remove_dc_offset = false; + op.frame_opts.round_to_power_of_two = true; + op.htk_compat = true; + op.use_energy = true; // Use energy. + op.mel_opts.low_freq = 20.0; + //op.mel_opts.debug_mel = true; + op.mel_opts.htk_mode = true; + + Mfcc mfcc(op); + + // calculate kaldi features + Matrix kaldi_raw_features; + mfcc.Compute(waveform, 1.0, &kaldi_raw_features); + + DeltaFeaturesOptions delta_opts; + Matrix kaldi_features; + ComputeDeltas(delta_opts, + kaldi_raw_features, + &kaldi_features); + + // compare the results + bool passed = true; + int32 i_old = -1; + KALDI_ASSERT(kaldi_features.NumRows() == htk_features.NumRows()); + KALDI_ASSERT(kaldi_features.NumCols() == htk_features.NumCols()); + // Ignore ends-- we make slightly different choices than + // HTK about how to treat the deltas at the ends. + for (int32 i = 10; i+10 < kaldi_features.NumRows(); i++) { + for (int32 j = 0; j < kaldi_features.NumCols(); j++) { + BaseFloat a = kaldi_features(i, j), b = htk_features(i, j); + if ((std::abs(b - a)) > 1.0) { //<< TOLERANCE TO DIFFERENCES!!!!! 
+ // print the non-matching data only once per-line + if (static_cast(i_old) != i) { + std::cout << "\n\n\n[HTK-row: " << i << "] " << htk_features.Row(i) << "\n"; + std::cout << "[Kaldi-row: " << i << "] " << kaldi_features.Row(i) << "\n\n\n"; + i_old = i; + } + // print indices of non-matching cells + std::cout << "[" << i << ", " << j << "]"; + passed = false; + }}} + if (!passed) KALDI_ERR << "Test failed"; + + // write the htk features for later inspection + HtkHeader header = { + kaldi_features.NumRows(), + 100000, // 10ms + static_cast(sizeof(float)*kaldi_features.NumCols()), + 021406 // MFCC_D_A_0 + }; + { + std::ofstream os("tmp.test.wav.fea_kaldi.3", + std::ios::out|std::ios::binary); + WriteHtk(os, kaldi_features, header); + } + + std::cout << "Test passed :)\n\n"; + + unlink("tmp.test.wav.fea_kaldi.3"); +} + + +static void UnitTestHTKCompare4() { + std::cout << "=== UnitTestHTKCompare4() ===\n"; + + std::ifstream is("test_data/test.wav", std::ios_base::binary); + WaveData wave; + wave.Read(is); + KALDI_ASSERT(wave.Data().NumRows() == 1); + SubVector waveform(wave.Data(), 0); + + // read the HTK features + Matrix htk_features; + { + std::ifstream is("test_data/test.wav.fea_htk.4", + std::ios::in | std::ios_base::binary); + bool ans = ReadHtk(is, &htk_features, 0); + KALDI_ASSERT(ans); + } + + // use mfcc with default configuration... + MfccOptions op; + op.frame_opts.dither = 0.0; + op.frame_opts.window_type = "hamming"; + op.frame_opts.remove_dc_offset = false; + op.frame_opts.round_to_power_of_two = true; + op.mel_opts.low_freq = 0.0; + op.htk_compat = true; + op.use_energy = true; // Use energy. 
+ op.mel_opts.htk_mode = true; + + Mfcc mfcc(op); + + // calculate kaldi features + Matrix kaldi_raw_features; + mfcc.Compute(waveform, 1.0, &kaldi_raw_features); + + DeltaFeaturesOptions delta_opts; + Matrix kaldi_features; + ComputeDeltas(delta_opts, + kaldi_raw_features, + &kaldi_features); + + // compare the results + bool passed = true; + int32 i_old = -1; + KALDI_ASSERT(kaldi_features.NumRows() == htk_features.NumRows()); + KALDI_ASSERT(kaldi_features.NumCols() == htk_features.NumCols()); + // Ignore ends-- we make slightly different choices than + // HTK about how to treat the deltas at the ends. + for (int32 i = 10; i+10 < kaldi_features.NumRows(); i++) { + for (int32 j = 0; j < kaldi_features.NumCols(); j++) { + BaseFloat a = kaldi_features(i, j), b = htk_features(i, j); + if ((std::abs(b - a)) > 1.0) { //<< TOLERANCE TO DIFFERENCES!!!!! + // print the non-matching data only once per-line + if (static_cast(i_old) != i) { + std::cout << "\n\n\n[HTK-row: " << i << "] " << htk_features.Row(i) << "\n"; + std::cout << "[Kaldi-row: " << i << "] " << kaldi_features.Row(i) << "\n\n\n"; + i_old = i; + } + // print indices of non-matching cells + std::cout << "[" << i << ", " << j << "]"; + passed = false; + }}} + if (!passed) KALDI_ERR << "Test failed"; + + // write the htk features for later inspection + HtkHeader header = { + kaldi_features.NumRows(), + 100000, // 10ms + static_cast(sizeof(float)*kaldi_features.NumCols()), + 021406 // MFCC_D_A_0 + }; + { + std::ofstream os("tmp.test.wav.fea_kaldi.4", + std::ios::out|std::ios::binary); + WriteHtk(os, kaldi_features, header); + } + + std::cout << "Test passed :)\n\n"; + + unlink("tmp.test.wav.fea_kaldi.4"); +} + + +static void UnitTestHTKCompare5() { + std::cout << "=== UnitTestHTKCompare5() ===\n"; + + std::ifstream is("test_data/test.wav", std::ios_base::binary); + WaveData wave; + wave.Read(is); + KALDI_ASSERT(wave.Data().NumRows() == 1); + SubVector waveform(wave.Data(), 0); + + // read the HTK features + 
Matrix htk_features; + { + std::ifstream is("test_data/test.wav.fea_htk.5", + std::ios::in | std::ios_base::binary); + bool ans = ReadHtk(is, &htk_features, 0); + KALDI_ASSERT(ans); + } + + // use mfcc with default configuration... + MfccOptions op; + op.frame_opts.dither = 0.0; + op.frame_opts.window_type = "hamming"; + op.frame_opts.remove_dc_offset = false; + op.frame_opts.round_to_power_of_two = true; + op.htk_compat = true; + op.use_energy = true; // Use energy. + op.mel_opts.low_freq = 0.0; + op.mel_opts.vtln_low = 100.0; + op.mel_opts.vtln_high = 7500.0; + op.mel_opts.htk_mode = true; + + BaseFloat vtln_warp = 1.1; // our approach identical to htk for warp factor >1, + // differs slightly for higher mel bins if warp_factor <0.9 + + Mfcc mfcc(op); + + // calculate kaldi features + Matrix kaldi_raw_features; + mfcc.Compute(waveform, vtln_warp, &kaldi_raw_features); + + DeltaFeaturesOptions delta_opts; + Matrix kaldi_features; + ComputeDeltas(delta_opts, + kaldi_raw_features, + &kaldi_features); + + // compare the results + bool passed = true; + int32 i_old = -1; + KALDI_ASSERT(kaldi_features.NumRows() == htk_features.NumRows()); + KALDI_ASSERT(kaldi_features.NumCols() == htk_features.NumCols()); + // Ignore ends-- we make slightly different choices than + // HTK about how to treat the deltas at the ends. + for (int32 i = 10; i+10 < kaldi_features.NumRows(); i++) { + for (int32 j = 0; j < kaldi_features.NumCols(); j++) { + BaseFloat a = kaldi_features(i, j), b = htk_features(i, j); + if ((std::abs(b - a)) > 1.0) { //<< TOLERANCE TO DIFFERENCES!!!!! 
+ // print the non-matching data only once per-line + if (static_cast(i_old) != i) { + std::cout << "\n\n\n[HTK-row: " << i << "] " << htk_features.Row(i) << "\n"; + std::cout << "[Kaldi-row: " << i << "] " << kaldi_features.Row(i) << "\n\n\n"; + i_old = i; + } + // print indices of non-matching cells + std::cout << "[" << i << ", " << j << "]"; + passed = false; + }}} + if (!passed) KALDI_ERR << "Test failed"; + + // write the htk features for later inspection + HtkHeader header = { + kaldi_features.NumRows(), + 100000, // 10ms + static_cast(sizeof(float)*kaldi_features.NumCols()), + 021406 // MFCC_D_A_0 + }; + { + std::ofstream os("tmp.test.wav.fea_kaldi.5", + std::ios::out|std::ios::binary); + WriteHtk(os, kaldi_features, header); + } + + std::cout << "Test passed :)\n\n"; + + unlink("tmp.test.wav.fea_kaldi.5"); +} + +static void UnitTestHTKCompare6() { + std::cout << "=== UnitTestHTKCompare6() ===\n"; + + + std::ifstream is("test_data/test.wav", std::ios_base::binary); + WaveData wave; + wave.Read(is); + KALDI_ASSERT(wave.Data().NumRows() == 1); + SubVector waveform(wave.Data(), 0); + + // read the HTK features + Matrix htk_features; + { + std::ifstream is("test_data/test.wav.fea_htk.6", + std::ios::in | std::ios_base::binary); + bool ans = ReadHtk(is, &htk_features, 0); + KALDI_ASSERT(ans); + } + + // use mfcc with default configuration... + MfccOptions op; + op.frame_opts.dither = 0.0; + op.frame_opts.preemph_coeff = 0.97; + op.frame_opts.window_type = "hamming"; + op.frame_opts.remove_dc_offset = false; + op.frame_opts.round_to_power_of_two = true; + op.mel_opts.num_bins = 24; + op.mel_opts.low_freq = 125.0; + op.mel_opts.high_freq = 7800.0; + op.htk_compat = true; + op.use_energy = false; // C0 not energy. 
+ + Mfcc mfcc(op); + + // calculate kaldi features + Matrix kaldi_raw_features; + mfcc.Compute(waveform, 1.0, &kaldi_raw_features); + + DeltaFeaturesOptions delta_opts; + Matrix kaldi_features; + ComputeDeltas(delta_opts, + kaldi_raw_features, + &kaldi_features); + + // compare the results + bool passed = true; + int32 i_old = -1; + KALDI_ASSERT(kaldi_features.NumRows() == htk_features.NumRows()); + KALDI_ASSERT(kaldi_features.NumCols() == htk_features.NumCols()); + // Ignore ends-- we make slightly different choices than + // HTK about how to treat the deltas at the ends. + for (int32 i = 10; i+10 < kaldi_features.NumRows(); i++) { + for (int32 j = 0; j < kaldi_features.NumCols(); j++) { + BaseFloat a = kaldi_features(i, j), b = htk_features(i, j); + if ((std::abs(b - a)) > 1.0) { //<< TOLERANCE TO DIFFERENCES!!!!! + // print the non-matching data only once per-line + if (static_cast(i_old) != i) { + std::cout << "\n\n\n[HTK-row: " << i << "] " << htk_features.Row(i) << "\n"; + std::cout << "[Kaldi-row: " << i << "] " << kaldi_features.Row(i) << "\n\n\n"; + i_old = i; + } + // print indices of non-matching cells + std::cout << "[" << i << ", " << j << "]"; + passed = false; + }}} + if (!passed) KALDI_ERR << "Test failed"; + + // write the htk features for later inspection + HtkHeader header = { + kaldi_features.NumRows(), + 100000, // 10ms + static_cast(sizeof(float)*kaldi_features.NumCols()), + 021406 // MFCC_D_A_0 + }; + { + std::ofstream os("tmp.test.wav.fea_kaldi.6", + std::ios::out|std::ios::binary); + WriteHtk(os, kaldi_features, header); + } + + std::cout << "Test passed :)\n\n"; + + unlink("tmp.test.wav.fea_kaldi.6"); +} + +void UnitTestVtln() { + // Test the function VtlnWarpFreq. 
+ BaseFloat low_freq = 10, high_freq = 7800, + vtln_low_cutoff = 20, vtln_high_cutoff = 7400; + + for (size_t i = 0; i < 100; i++) { + BaseFloat freq = 5000, warp_factor = 0.9 + RandUniform() * 0.2; + AssertEqual(MelBanks::VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff, + low_freq, high_freq, warp_factor, + freq), + freq / warp_factor); + + AssertEqual(MelBanks::VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff, + low_freq, high_freq, warp_factor, + low_freq), + low_freq); + AssertEqual(MelBanks::VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff, + low_freq, high_freq, warp_factor, + high_freq), + high_freq); + BaseFloat freq2 = low_freq + (high_freq-low_freq) * RandUniform(), + freq3 = freq2 + (high_freq-freq2) * RandUniform(); // freq3>=freq2 + BaseFloat w2 = MelBanks::VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff, + low_freq, high_freq, warp_factor, + freq2); + BaseFloat w3 = MelBanks::VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff, + low_freq, high_freq, warp_factor, + freq3); + KALDI_ASSERT(w3 >= w2); // increasing function. + BaseFloat w3dash = MelBanks::VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff, + low_freq, high_freq, 1.0, + freq3); + AssertEqual(w3dash, freq3); + } +} + +static void UnitTestFeat() { + UnitTestVtln(); + UnitTestReadWave(); + UnitTestSimple(); + UnitTestHTKCompare1(); + UnitTestHTKCompare2(); + // commenting out this one as it doesn't compare right now I normalized + // the way the FFT bins are treated (removed offset of 0.5)... this seems + // to relate to the way frequency zero behaves. 
+ UnitTestHTKCompare3(); + UnitTestHTKCompare4(); + UnitTestHTKCompare5(); + UnitTestHTKCompare6(); + std::cout << "Tests succeeded.\n"; +} + + + +int main() { + try { + for (int i = 0; i < 5; i++) + UnitTestFeat(); + std::cout << "Tests succeeded.\n"; + return 0; + } catch (const std::exception &e) { + std::cerr << e.what(); + return 1; + } +} + + diff --git a/speechx/speechx/common/CMakeLists.txt b/speechx/speechx/common/CMakeLists.txt new file mode 100644 index 00000000..e69de29b diff --git a/speechx/speechx/decoder/CMakeLists.txt b/speechx/speechx/decoder/CMakeLists.txt new file mode 100644 index 00000000..259261bd --- /dev/null +++ b/speechx/speechx/decoder/CMakeLists.txt @@ -0,0 +1,2 @@ +aux_source_directory(. DIR_LIB_SRCS) +add_library(decoder STATIC ${DIR_LIB_SRCS}) diff --git a/speechx/speechx/frontend/CMakeLists.txt b/speechx/speechx/frontend/CMakeLists.txt new file mode 100644 index 00000000..e69de29b diff --git a/speechx/speechx/frontend/audio/CMakeLists.txt b/speechx/speechx/frontend/audio/CMakeLists.txt new file mode 100644 index 00000000..e69de29b diff --git a/speechx/speechx/frontend/text/CMakeLists.txt b/speechx/speechx/frontend/text/CMakeLists.txt new file mode 100644 index 00000000..e69de29b diff --git a/speechx/speechx/kaldi/.gitkeep b/speechx/speechx/kaldi/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/speechx/speechx/kaldi/CMakeLists.txt b/speechx/speechx/kaldi/CMakeLists.txt new file mode 100644 index 00000000..414a6fa0 --- /dev/null +++ b/speechx/speechx/kaldi/CMakeLists.txt @@ -0,0 +1,6 @@ +project(kaldi) + +add_subdirectory(base) +add_subdirectory(util) +add_subdirectory(feat) +add_subdirectory(matrix) diff --git a/speechx/speechx/kaldi/base/CMakeLists.txt b/speechx/speechx/kaldi/base/CMakeLists.txt new file mode 100644 index 00000000..f738bf2d --- /dev/null +++ b/speechx/speechx/kaldi/base/CMakeLists.txt @@ -0,0 +1,7 @@ + +add_library(kaldi-base + io-funcs.cc + kaldi-error.cc + kaldi-math.cc + kaldi-utils.cc + 
timer.cc) \ No newline at end of file diff --git a/speechx/speechx/kaldi/base/io-funcs-inl.h b/speechx/speechx/kaldi/base/io-funcs-inl.h new file mode 100644 index 00000000..b703ef5a --- /dev/null +++ b/speechx/speechx/kaldi/base/io-funcs-inl.h @@ -0,0 +1,327 @@ +// base/io-funcs-inl.h + +// Copyright 2009-2011 Microsoft Corporation; Saarland University; +// Jan Silovsky; Yanmin Qian; +// Johns Hopkins University (Author: Daniel Povey) +// 2016 Xiaohui Zhang + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_BASE_IO_FUNCS_INL_H_ +#define KALDI_BASE_IO_FUNCS_INL_H_ 1 + +// Do not include this file directly. It is included by base/io-funcs.h + +#include +#include + +namespace kaldi { + +// Template that covers integers. +template void WriteBasicType(std::ostream &os, + bool binary, T t) { + // Compile time assertion that this is not called with a wrong type. + KALDI_ASSERT_IS_INTEGER_TYPE(T); + if (binary) { + char len_c = (std::numeric_limits::is_signed ? 1 : -1) + * static_cast(sizeof(t)); + os.put(len_c); + os.write(reinterpret_cast(&t), sizeof(t)); + } else { + if (sizeof(t) == 1) + os << static_cast(t) << " "; + else + os << t << " "; + } + if (os.fail()) { + KALDI_ERR << "Write failure in WriteBasicType."; + } +} + +// Template that covers integers. 
+template inline void ReadBasicType(std::istream &is, + bool binary, T *t) { + KALDI_PARANOID_ASSERT(t != NULL); + // Compile time assertion that this is not called with a wrong type. + KALDI_ASSERT_IS_INTEGER_TYPE(T); + if (binary) { + int len_c_in = is.get(); + if (len_c_in == -1) + KALDI_ERR << "ReadBasicType: encountered end of stream."; + char len_c = static_cast(len_c_in), len_c_expected + = (std::numeric_limits::is_signed ? 1 : -1) + * static_cast(sizeof(*t)); + if (len_c != len_c_expected) { + KALDI_ERR << "ReadBasicType: did not get expected integer type, " + << static_cast(len_c) + << " vs. " << static_cast(len_c_expected) + << ". You can change this code to successfully" + << " read it later, if needed."; + // insert code here to read "wrong" type. Might have a switch statement. + } + is.read(reinterpret_cast(t), sizeof(*t)); + } else { + if (sizeof(*t) == 1) { + int16 i; + is >> i; + *t = i; + } else { + is >> *t; + } + } + if (is.fail()) { + KALDI_ERR << "Read failure in ReadBasicType, file position is " + << is.tellg() << ", next char is " << is.peek(); + } +} + +// Template that covers integers. +template +inline void WriteIntegerPairVector(std::ostream &os, bool binary, + const std::vector > &v) { + // Compile time assertion that this is not called with a wrong type. + KALDI_ASSERT_IS_INTEGER_TYPE(T); + if (binary) { + char sz = sizeof(T); // this is currently just a check. + os.write(&sz, 1); + int32 vecsz = static_cast(v.size()); + KALDI_ASSERT((size_t)vecsz == v.size()); + os.write(reinterpret_cast(&vecsz), sizeof(vecsz)); + if (vecsz != 0) { + os.write(reinterpret_cast(&(v[0])), sizeof(T) * vecsz * 2); + } + } else { + // focus here is on prettiness of text form rather than + // efficiency of reading-in. + // reading-in is dominated by low-level operations anyway: + // for efficiency use binary. 
+ os << "[ "; + typename std::vector >::const_iterator iter = v.begin(), + end = v.end(); + for (; iter != end; ++iter) { + if (sizeof(T) == 1) + os << static_cast(iter->first) << ',' + << static_cast(iter->second) << ' '; + else + os << iter->first << ',' + << iter->second << ' '; + } + os << "]\n"; + } + if (os.fail()) { + KALDI_ERR << "Write failure in WriteIntegerPairVector."; + } +} + +// Template that covers integers. +template +inline void ReadIntegerPairVector(std::istream &is, bool binary, + std::vector > *v) { + KALDI_ASSERT_IS_INTEGER_TYPE(T); + KALDI_ASSERT(v != NULL); + if (binary) { + int sz = is.peek(); + if (sz == sizeof(T)) { + is.get(); + } else { // this is currently just a check. + KALDI_ERR << "ReadIntegerPairVector: expected to see type of size " + << sizeof(T) << ", saw instead " << sz << ", at file position " + << is.tellg(); + } + int32 vecsz; + is.read(reinterpret_cast(&vecsz), sizeof(vecsz)); + if (is.fail() || vecsz < 0) goto bad; + v->resize(vecsz); + if (vecsz > 0) { + is.read(reinterpret_cast(&((*v)[0])), sizeof(T)*vecsz*2); + } + } else { + std::vector > tmp_v; // use temporary so v doesn't use extra memory + // due to resizing. + is >> std::ws; + if (is.peek() != static_cast('[')) { + KALDI_ERR << "ReadIntegerPairVector: expected to see [, saw " + << is.peek() << ", at file position " << is.tellg(); + } + is.get(); // consume the '['. + is >> std::ws; // consume whitespace. + while (is.peek() != static_cast(']')) { + if (sizeof(T) == 1) { // read/write chars as numbers. + int16 next_t1, next_t2; + is >> next_t1; + if (is.fail()) goto bad; + if (is.peek() != static_cast(',')) + KALDI_ERR << "ReadIntegerPairVector: expected to see ',', saw " + << is.peek() << ", at file position " << is.tellg(); + is.get(); // consume the ','. 
+ is >> next_t2 >> std::ws; + if (is.fail()) goto bad; + else + tmp_v.push_back(std::make_pair((T)next_t1, (T)next_t2)); + } else { + T next_t1, next_t2; + is >> next_t1; + if (is.fail()) goto bad; + if (is.peek() != static_cast(',')) + KALDI_ERR << "ReadIntegerPairVector: expected to see ',', saw " + << is.peek() << ", at file position " << is.tellg(); + is.get(); // consume the ','. + is >> next_t2 >> std::ws; + if (is.fail()) goto bad; + else + tmp_v.push_back(std::pair(next_t1, next_t2)); + } + } + is.get(); // get the final ']'. + *v = tmp_v; // could use std::swap to use less temporary memory, but this + // uses less permanent memory. + } + if (!is.fail()) return; + bad: + KALDI_ERR << "ReadIntegerPairVector: read failure at file position " + << is.tellg(); +} + +template inline void WriteIntegerVector(std::ostream &os, bool binary, + const std::vector &v) { + // Compile time assertion that this is not called with a wrong type. + KALDI_ASSERT_IS_INTEGER_TYPE(T); + if (binary) { + char sz = sizeof(T); // this is currently just a check. + os.write(&sz, 1); + int32 vecsz = static_cast(v.size()); + KALDI_ASSERT((size_t)vecsz == v.size()); + os.write(reinterpret_cast(&vecsz), sizeof(vecsz)); + if (vecsz != 0) { + os.write(reinterpret_cast(&(v[0])), sizeof(T)*vecsz); + } + } else { + // focus here is on prettiness of text form rather than + // efficiency of reading-in. + // reading-in is dominated by low-level operations anyway: + // for efficiency use binary. 
+ os << "[ "; + typename std::vector::const_iterator iter = v.begin(), end = v.end(); + for (; iter != end; ++iter) { + if (sizeof(T) == 1) + os << static_cast(*iter) << " "; + else + os << *iter << " "; + } + os << "]\n"; + } + if (os.fail()) { + KALDI_ERR << "Write failure in WriteIntegerVector."; + } +} + + +template inline void ReadIntegerVector(std::istream &is, + bool binary, + std::vector *v) { + KALDI_ASSERT_IS_INTEGER_TYPE(T); + KALDI_ASSERT(v != NULL); + if (binary) { + int sz = is.peek(); + if (sz == sizeof(T)) { + is.get(); + } else { // this is currently just a check. + KALDI_ERR << "ReadIntegerVector: expected to see type of size " + << sizeof(T) << ", saw instead " << sz << ", at file position " + << is.tellg(); + } + int32 vecsz; + is.read(reinterpret_cast(&vecsz), sizeof(vecsz)); + if (is.fail() || vecsz < 0) goto bad; + v->resize(vecsz); + if (vecsz > 0) { + is.read(reinterpret_cast(&((*v)[0])), sizeof(T)*vecsz); + } + } else { + std::vector tmp_v; // use temporary so v doesn't use extra memory + // due to resizing. + is >> std::ws; + if (is.peek() != static_cast('[')) { + KALDI_ERR << "ReadIntegerVector: expected to see [, saw " + << is.peek() << ", at file position " << is.tellg(); + } + is.get(); // consume the '['. + is >> std::ws; // consume whitespace. + while (is.peek() != static_cast(']')) { + if (sizeof(T) == 1) { // read/write chars as numbers. + int16 next_t; + is >> next_t >> std::ws; + if (is.fail()) goto bad; + else + tmp_v.push_back((T)next_t); + } else { + T next_t; + is >> next_t >> std::ws; + if (is.fail()) goto bad; + else + tmp_v.push_back(next_t); + } + } + is.get(); // get the final ']'. + *v = tmp_v; // could use std::swap to use less temporary memory, but this + // uses less permanent memory. 
+ } + if (!is.fail()) return; + bad: + KALDI_ERR << "ReadIntegerVector: read failure at file position " + << is.tellg(); +} + + +// Initialize an opened stream for writing by writing an optional binary +// header and modifying the floating-point precision. +inline void InitKaldiOutputStream(std::ostream &os, bool binary) { + // This does not throw exceptions (does not check for errors). + if (binary) { + os.put('\0'); + os.put('B'); + } + // Note, in non-binary mode we may at some point want to mess with + // the precision a bit. + // 7 is a bit more than the precision of float.. + if (os.precision() < 7) + os.precision(7); +} + +/// Initialize an opened stream for reading by detecting the binary header and +// setting the "binary" value appropriately. +inline bool InitKaldiInputStream(std::istream &is, bool *binary) { + // Sets the 'binary' variable and returns true on success. + // Returns false (leaving *binary unset) in the very unusual situation that + // the stream starts with '\0' but is not then followed by 'B'. + + if (is.peek() == '\0') { // seems to be binary + is.get(); + if (is.peek() != 'B') { + return false; + } + is.get(); + *binary = true; + return true; + } else { + *binary = false; + return true; + } +} + +} // end namespace kaldi. + +#endif // KALDI_BASE_IO_FUNCS_INL_H_ diff --git a/speechx/speechx/kaldi/base/io-funcs.cc b/speechx/speechx/kaldi/base/io-funcs.cc new file mode 100644 index 00000000..150f7409 --- /dev/null +++ b/speechx/speechx/kaldi/base/io-funcs.cc @@ -0,0 +1,218 @@ +// base/io-funcs.cc + +// Copyright 2009-2011 Microsoft Corporation; Saarland University + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "base/io-funcs.h" +#include "base/kaldi-math.h" + +namespace kaldi { + +template<> +void WriteBasicType(std::ostream &os, bool binary, bool b) { + os << (b ? "T":"F"); + if (!binary) os << " "; + if (os.fail()) + KALDI_ERR << "Write failure in WriteBasicType"; +} + +template<> +void ReadBasicType(std::istream &is, bool binary, bool *b) { + KALDI_PARANOID_ASSERT(b != NULL); + if (!binary) is >> std::ws; // eat up whitespace. + char c = is.peek(); + if (c == 'T') { + *b = true; + is.get(); + } else if (c == 'F') { + *b = false; + is.get(); + } else { + KALDI_ERR << "Read failure in ReadBasicType, file position is " + << is.tellg() << ", next char is " << CharToString(c); + } +} + +template<> +void WriteBasicType(std::ostream &os, bool binary, float f) { + if (binary) { + char c = sizeof(f); + os.put(c); + os.write(reinterpret_cast(&f), sizeof(f)); + } else { + os << f << " "; + } +} + +template<> +void WriteBasicType(std::ostream &os, bool binary, double f) { + if (binary) { + char c = sizeof(f); + os.put(c); + os.write(reinterpret_cast(&f), sizeof(f)); + } else { + os << f << " "; + } +} + +template<> +void ReadBasicType(std::istream &is, bool binary, float *f) { + KALDI_PARANOID_ASSERT(f != NULL); + if (binary) { + double d; + int c = is.peek(); + if (c == sizeof(*f)) { + is.get(); + is.read(reinterpret_cast(f), sizeof(*f)); + } else if (c == sizeof(d)) { + ReadBasicType(is, binary, &d); + *f = d; + } else { + KALDI_ERR << "ReadBasicType: expected float, saw " << is.peek() + << 
", at file position " << is.tellg(); + } + } else { + is >> *f; + } + if (is.fail()) { + KALDI_ERR << "ReadBasicType: failed to read, at file position " + << is.tellg(); + } +} + +template<> +void ReadBasicType(std::istream &is, bool binary, double *d) { + KALDI_PARANOID_ASSERT(d != NULL); + if (binary) { + float f; + int c = is.peek(); + if (c == sizeof(*d)) { + is.get(); + is.read(reinterpret_cast(d), sizeof(*d)); + } else if (c == sizeof(f)) { + ReadBasicType(is, binary, &f); + *d = f; + } else { + KALDI_ERR << "ReadBasicType: expected float, saw " << is.peek() + << ", at file position " << is.tellg(); + } + } else { + is >> *d; + } + if (is.fail()) { + KALDI_ERR << "ReadBasicType: failed to read, at file position " + << is.tellg(); + } +} + +void CheckToken(const char *token) { + if (*token == '\0') + KALDI_ERR << "Token is empty (not a valid token)"; + const char *orig_token = token; + while (*token != '\0') { + if (::isspace(*token)) + KALDI_ERR << "Token is not a valid token (contains space): '" + << orig_token << "'"; + token++; + } +} + +void WriteToken(std::ostream &os, bool binary, const char *token) { + // binary mode is ignored; + // we use space as termination character in either case. + KALDI_ASSERT(token != NULL); + CheckToken(token); // make sure it's valid (can be read back) + os << token << " "; + if (os.fail()) { + KALDI_ERR << "Write failure in WriteToken."; + } +} + +int Peek(std::istream &is, bool binary) { + if (!binary) is >> std::ws; // eat up whitespace. + return is.peek(); +} + +void WriteToken(std::ostream &os, bool binary, const std::string & token) { + WriteToken(os, binary, token.c_str()); +} + +void ReadToken(std::istream &is, bool binary, std::string *str) { + KALDI_ASSERT(str != NULL); + if (!binary) is >> std::ws; // consume whitespace. 
+ is >> *str; + if (is.fail()) { + KALDI_ERR << "ReadToken, failed to read token at file position " + << is.tellg(); + } + if (!isspace(is.peek())) { + KALDI_ERR << "ReadToken, expected space after token, saw instead " + << CharToString(static_cast(is.peek())) + << ", at file position " << is.tellg(); + } + is.get(); // consume the space. +} + +int PeekToken(std::istream &is, bool binary) { + if (!binary) is >> std::ws; // consume whitespace. + bool read_bracket; + if (static_cast(is.peek()) == '<') { + read_bracket = true; + is.get(); + } else { + read_bracket = false; + } + int ans = is.peek(); + if (read_bracket) { + if (!is.unget()) { + // Clear the bad bit. This code can be (and is in fact) reached, since the + // C++ standard does not guarantee that a call to unget() must succeed. + is.clear(); + } + } + return ans; +} + + +void ExpectToken(std::istream &is, bool binary, const char *token) { + int pos_at_start = is.tellg(); + KALDI_ASSERT(token != NULL); + CheckToken(token); // make sure it's valid (can be read back) + if (!binary) is >> std::ws; // consume whitespace. + std::string str; + is >> str; + is.get(); // consume the space. + if (is.fail()) { + KALDI_ERR << "Failed to read token [started at file position " + << pos_at_start << "], expected " << token; + } + // The second half of the '&&' expression below is so that if we're expecting + // "", we will accept "Foo>" instead. This is so that the model-reading + // code will tolerate errors in PeekToken where is.unget() failed; search for + // is.clear() in PeekToken() for an explanation. 
+ if (strcmp(str.c_str(), token) != 0 && + !(token[0] == '<' && strcmp(str.c_str(), token + 1) == 0)) { + KALDI_ERR << "Expected token \"" << token << "\", got instead \"" + << str <<"\"."; + } +} + +void ExpectToken(std::istream &is, bool binary, const std::string &token) { + ExpectToken(is, binary, token.c_str()); +} + +} // end namespace kaldi diff --git a/speechx/speechx/kaldi/base/io-funcs.h b/speechx/speechx/kaldi/base/io-funcs.h new file mode 100644 index 00000000..895f661e --- /dev/null +++ b/speechx/speechx/kaldi/base/io-funcs.h @@ -0,0 +1,245 @@ +// base/io-funcs.h + +// Copyright 2009-2011 Microsoft Corporation; Saarland University; +// Jan Silovsky; Yanmin Qian +// 2016 Xiaohui Zhang + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_BASE_IO_FUNCS_H_ +#define KALDI_BASE_IO_FUNCS_H_ + +// This header only contains some relatively low-level I/O functions. +// The full Kaldi I/O declarations are in ../util/kaldi-io.h +// and ../util/kaldi-table.h +// They were put in util/ in order to avoid making the Matrix library +// dependent on them. + +#include +#include +#include + +#include "base/kaldi-common.h" +#include "base/io-funcs-inl.h" + +namespace kaldi { + + + +/* + This comment describes the Kaldi approach to I/O. All objects can be written + and read in two modes: binary and text. 
In addition we want to make the I/O + work if we redefine the typedef "BaseFloat" between floats and doubles. + We also want to have control over whitespace in text mode without affecting + the meaning of the file, for pretty-printing purposes. + + Errors are handled by throwing a KaldiFatalError exception. + + For integer and floating-point types (and boolean values): + + WriteBasicType(std::ostream &, bool binary, const T&); + ReadBasicType(std::istream &, bool binary, T*); + + and we expect these functions to be defined in such a way that they work when + the type T changes between float and double, so you can read float into double + and vice versa. Note that for efficiency and space-saving reasons, the Vector + and Matrix classes do not use these functions [but they preserve the type + interchangeability in their own way] + + For a class (or struct) C: + class C { + .. + Write(std::ostream &, bool binary, [possibly extra optional args for specific classes]) const; + Read(std::istream &, bool binary, [possibly extra optional args for specific classes]); + .. + } + NOTE: The only actual optional args we used are the "add" arguments in + Vector/Matrix classes, which specify whether we should sum the data already + in the class with the data being read. + + For types which are typedef's involving stl classes, I/O is as follows: + typedef std::vector<std::pair<A, B> > MyTypedefName; + + The user should define something like: + + WriteMyTypedefName(std::ostream &, bool binary, const MyTypedefName &t); + ReadMyTypedefName(std::istream &, bool binary, MyTypedefName *t); + + The user would have to write these functions. + + For a type std::vector<T>: + + void WriteIntegerVector(std::ostream &os, bool binary, const std::vector<T> &v); + void ReadIntegerVector(std::istream &is, bool binary, std::vector<T> *v); + + For other types, e.g. vectors of pairs, the user should create a routine of the + type WriteMyTypedefName.
This is to avoid introducing confusing templated functions; + we could easily create templated functions to handle most of these cases but they + would have to share the same name. + + It also often happens that the user needs to write/read special tokens as part + of a file. These might be class headers, or separators/identifiers in the class. + We provide special functions for manipulating these. These special tokens must + be nonempty and must not contain any whitespace. + + void WriteToken(std::ostream &os, bool binary, const char*); + void WriteToken(std::ostream &os, bool binary, const std::string & token); + int Peek(std::istream &is, bool binary); + void ReadToken(std::istream &is, bool binary, std::string *str); + int PeekToken(std::istream &is, bool binary); + + WriteToken writes the token and one space (whether in binary or text mode). + + Peek returns the first character of the next token, by consuming whitespace + (in text mode) and then returning the peek() character. It returns -1 at EOF; + it doesn't throw. It's useful if a class can have various forms based on + typedefs and virtual classes, and wants to know which version to read. + + ReadToken allows the caller to obtain the next token. PeekToken returns the + first character of the next token without consuming the token; if that + character is '<', it returns the character after it and then tries to put + the '<' back. A subsequent call to ReadToken will normally read the same + token again. This is useful when + different object types are written to the same file; using PeekToken one can + decide which of the objects to read. + + There is currently no special functionality for writing/reading strings (where the strings + contain data rather than "special tokens" that are whitespace-free and nonempty). This is + because Kaldi is structured in such a way that strings don't appear, except as OpenFst symbol + table entries (and these have their own format).
+ + + NOTE: you should not call ReadBasicType and WriteBasicType with types, + such as int and size_t, that are machine-dependent -- at least not + if you want your file formats to port between machines. Use int32 and + int64 where necessary. There is no way to detect this using compile-time + assertions because C++ only keeps track of the internal representation of + the type. +*/ + +/// \addtogroup io_funcs_basic +/// @{ + + +/// WriteBasicType is the name of the write function for bool, integer types, +/// and floating-point types. They all throw on error. +template void WriteBasicType(std::ostream &os, bool binary, T t); + +/// ReadBasicType is the name of the read function for bool, integer types, +/// and floating-point types. They all throw on error. +template void ReadBasicType(std::istream &is, bool binary, T *t); + + +// Declare specialization for bool. +template<> +void WriteBasicType(std::ostream &os, bool binary, bool b); + +template <> +void ReadBasicType(std::istream &is, bool binary, bool *b); + +// Declare specializations for float and double. +template<> +void WriteBasicType(std::ostream &os, bool binary, float f); + +template<> +void WriteBasicType(std::ostream &os, bool binary, double f); + +template<> +void ReadBasicType(std::istream &is, bool binary, float *f); + +template<> +void ReadBasicType(std::istream &is, bool binary, double *f); + +// Define ReadBasicType that accepts an "add" parameter to add to +// the destination. Caution: if used in Read functions, be careful +// to initialize the parameters concerned to zero in the default +// constructor. +template +inline void ReadBasicType(std::istream &is, bool binary, T *t, bool add) { + if (!add) { + ReadBasicType(is, binary, t); + } else { + T tmp = T(0); + ReadBasicType(is, binary, &tmp); + *t += tmp; + } +} + +/// Function for writing STL vectors of integer types.
+template inline void WriteIntegerVector(std::ostream &os, bool binary, + const std::vector &v); + +/// Function for reading STL vector of integer types. +template inline void ReadIntegerVector(std::istream &is, bool binary, + std::vector *v); + +/// Function for writing STL vectors of pairs of integer types. +template +inline void WriteIntegerPairVector(std::ostream &os, bool binary, + const std::vector > &v); + +/// Function for reading STL vector of pairs of integer types. +template +inline void ReadIntegerPairVector(std::istream &is, bool binary, + std::vector > *v); + +/// The WriteToken functions are for writing nonempty sequences of non-space +/// characters. They are not for general strings. +void WriteToken(std::ostream &os, bool binary, const char *token); +void WriteToken(std::ostream &os, bool binary, const std::string & token); + +/// Peek consumes whitespace (if binary == false) and then returns the peek() +/// value of the stream. +int Peek(std::istream &is, bool binary); + +/// ReadToken gets the next token and puts it in str (exception on failure). If +/// PeekToken() had been previously called, it is possible that the stream had +/// failed to unget the starting '<' character. In this case ReadToken() returns +/// the token string without the leading '<'. You must be prepared to handle +/// this case. ExpectToken() handles this internally, and is not affected. +void ReadToken(std::istream &is, bool binary, std::string *token); + +/// PeekToken will return the first character of the next token, or -1 if end of +/// file. It's the same as Peek(), except if the first character is '<' it will +/// skip over it and will return the next character. It will attempt to unget +/// the '<' so the stream is where it was before you did PeekToken(), however, +/// this is not guaranteed (see ReadToken()). +int PeekToken(std::istream &is, bool binary); + +/// ExpectToken tries to read in the given token, and throws an exception +/// on failure. 
+void ExpectToken(std::istream &is, bool binary, const char *token); +void ExpectToken(std::istream &is, bool binary, const std::string & token); + +/// ExpectPretty attempts to read the text in "token", but only in non-binary +/// mode. Throws exception on failure. It expects an exact match except that +/// arbitrary whitespace matches arbitrary whitespace. +void ExpectPretty(std::istream &is, bool binary, const char *token); +void ExpectPretty(std::istream &is, bool binary, const std::string & token); + +/// @} end "addtogroup io_funcs_basic" + + +/// InitKaldiOutputStream initializes an opened stream for writing by writing an +/// optional binary header and modifying the floating-point precision; it will +/// typically not be called by users directly. +inline void InitKaldiOutputStream(std::ostream &os, bool binary); + +/// InitKaldiInputStream initializes an opened stream for reading by detecting +/// the binary header and setting the "binary" value appropriately; +/// It will typically not be called by users directly. +inline bool InitKaldiInputStream(std::istream &is, bool *binary); + +} // end namespace kaldi. +#endif // KALDI_BASE_IO_FUNCS_H_ diff --git a/speechx/speechx/kaldi/base/kaldi-common.h b/speechx/speechx/kaldi/base/kaldi-common.h new file mode 100644 index 00000000..264565d1 --- /dev/null +++ b/speechx/speechx/kaldi/base/kaldi-common.h @@ -0,0 +1,41 @@ +// base/kaldi-common.h + +// Copyright 2009-2011 Microsoft Corporation + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_BASE_KALDI_COMMON_H_ +#define KALDI_BASE_KALDI_COMMON_H_ 1 + +#include +#include +#include // C string stuff like strcpy +#include +#include +#include +#include +#include +#include +#include + +#include "base/kaldi-utils.h" +#include "base/kaldi-error.h" +#include "base/kaldi-types.h" +#include "base/io-funcs.h" +#include "base/kaldi-math.h" +#include "base/timer.h" + +#endif // KALDI_BASE_KALDI_COMMON_H_ diff --git a/speechx/speechx/kaldi/base/kaldi-error.cc b/speechx/speechx/kaldi/base/kaldi-error.cc new file mode 100644 index 00000000..2dbc7318 --- /dev/null +++ b/speechx/speechx/kaldi/base/kaldi-error.cc @@ -0,0 +1,245 @@ +// base/kaldi-error.cc + +// Copyright 2019 LAIX (Yi Sun) +// Copyright 2019 SmartAction LLC (kkm) +// Copyright 2016 Brno University of Technology (author: Karel Vesely) +// Copyright 2009-2011 Microsoft Corporation; Lukas Burget; Ondrej Glembek + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. 
+// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifdef HAVE_EXECINFO_H +#include // To get stack trace in error messages. +// If this #include fails there is an error in the Makefile, it does not +// support your platform well. Make sure HAVE_EXECINFO_H is undefined, +// and the code will compile. +#ifdef HAVE_CXXABI_H +#include // For name demangling. +// Useful to decode the stack trace, but only used if we have execinfo.h +#endif // HAVE_CXXABI_H +#endif // HAVE_EXECINFO_H + +#include "base/kaldi-common.h" +#include "base/kaldi-error.h" +#include "base/version.h" + +namespace kaldi { + +/***** GLOBAL VARIABLES FOR LOGGING *****/ + +int32 g_kaldi_verbose_level = 0; +static std::string program_name; +static LogHandler log_handler = NULL; + +void SetProgramName(const char *basename) { + // Using the 'static std::string' for the program name is mostly harmless, + // because (a) Kaldi logging is undefined before main(), and (b) no stdc++ + // string implementation has been found in the wild that would not be just + // an empty string when zero-initialized but not yet constructed. + program_name = basename; +} + +/***** HELPER FUNCTIONS *****/ + +// Trim filename to at most 1 trailing directory long. Given a filename like +// "/a/b/c/d/e/f.cc", return "e/f.cc". Support both '/' and '\' as the path +// separator. +static const char *GetShortFileName(const char *path) { + if (path == nullptr) + return ""; + + const char *prev = path, *last = path; + while ((path = std::strpbrk(path, "\\/")) != nullptr) { + ++path; + prev = last; + last = path; + } + return prev; +} + +/***** STACK TRACE *****/ + +namespace internal { +bool LocateSymbolRange(const std::string &trace_name, size_t *begin, + size_t *end) { + // Find the first '_' with leading ' ' or '('. 
+ *begin = std::string::npos; + for (size_t i = 1; i < trace_name.size(); i++) { + if (trace_name[i] != '_') { + continue; + } + if (trace_name[i - 1] == ' ' || trace_name[i - 1] == '(') { + *begin = i; + break; + } + } + if (*begin == std::string::npos) { + return false; + } + *end = trace_name.find_first_of(" +", *begin); + return *end != std::string::npos; +} +} // namespace internal + +#ifdef HAVE_EXECINFO_H +static std::string Demangle(std::string trace_name) { +#ifndef HAVE_CXXABI_H + return trace_name; +#else // HAVE_CXXABI_H + // Try to demangle the symbol. We are trying to support the following formats + // produced by different platforms: + // + // Linux: + // ./kaldi-error-test(_ZN5kaldi13UnitTestErrorEv+0xb) [0x804965d] + // + // Mac: + // 0 server 0x000000010f67614d _ZNK5kaldi13MessageLogger10LogMessageEv + 813 + // + // We want to extract the name e.g., '_ZN5kaldi13UnitTestErrorEv' and + // demangle it into a readable name like kaldi::UnitTestError. + size_t begin, end; + if (!internal::LocateSymbolRange(trace_name, &begin, &end)) { + return trace_name; + } + std::string symbol = trace_name.substr(begin, end - begin); + int status; + char *demangled_name = abi::__cxa_demangle(symbol.c_str(), 0, 0, &status); + if (status == 0 && demangled_name != nullptr) { + symbol = demangled_name; + free(demangled_name); + } + return trace_name.substr(0, begin) + symbol + + trace_name.substr(end, std::string::npos); +#endif // HAVE_CXXABI_H +} +#endif // HAVE_EXECINFO_H + +static std::string KaldiGetStackTrace() { + std::string ans; +#ifdef HAVE_EXECINFO_H + const size_t KALDI_MAX_TRACE_SIZE = 50; + const size_t KALDI_MAX_TRACE_PRINT = 50; // Must be even. + // Buffer for the trace. + void *trace[KALDI_MAX_TRACE_SIZE]; + // Get the trace. + size_t size = backtrace(trace, KALDI_MAX_TRACE_SIZE); + // Get the trace symbols. + char **trace_symbol = backtrace_symbols(trace, size); + if (trace_symbol == NULL) + return ans; + + // Compose a human-readable backtrace string.
+ ans += "[ Stack-Trace: ]\n"; + if (size <= KALDI_MAX_TRACE_PRINT) { + for (size_t i = 0; i < size; i++) { + ans += Demangle(trace_symbol[i]) + "\n"; + } + } else { // Print out first+last (e.g.) 5. + for (size_t i = 0; i < KALDI_MAX_TRACE_PRINT / 2; i++) { + ans += Demangle(trace_symbol[i]) + "\n"; + } + ans += ".\n.\n.\n"; + for (size_t i = size - KALDI_MAX_TRACE_PRINT / 2; i < size; i++) { + ans += Demangle(trace_symbol[i]) + "\n"; + } + if (size == KALDI_MAX_TRACE_SIZE) + ans += ".\n.\n.\n"; // Stack was too long, probably a bug. + } + + // We must free the array of pointers allocated by backtrace_symbols(), + // but not the strings themselves. + free(trace_symbol); +#endif // HAVE_EXECINFO_H + return ans; +} + +/***** KALDI LOGGING *****/ + +MessageLogger::MessageLogger(LogMessageEnvelope::Severity severity, + const char *func, const char *file, int32 line) { + // Obviously, we assume the strings survive the destruction of this object. + envelope_.severity = severity; + envelope_.func = func; + envelope_.file = GetShortFileName(file); // Points inside 'file'. + envelope_.line = line; +} + +void MessageLogger::LogMessage() const { + // Send to the logging handler if provided. + if (log_handler != NULL) { + log_handler(envelope_, GetMessage().c_str()); + return; + } + + // Otherwise, use the default Kaldi logging. + // Build the log-message header. + std::stringstream full_message; + if (envelope_.severity > LogMessageEnvelope::kInfo) { + full_message << "VLOG[" << envelope_.severity << "] ("; + } else { + switch (envelope_.severity) { + case LogMessageEnvelope::kInfo: + full_message << "LOG ("; + break; + case LogMessageEnvelope::kWarning: + full_message << "WARNING ("; + break; + case LogMessageEnvelope::kAssertFailed: + full_message << "ASSERTION_FAILED ("; + break; + case LogMessageEnvelope::kError: + default: // If not the ERROR, it still an error! + full_message << "ERROR ("; + break; + } + } + // Add other info from the envelope and the message text. 
+ full_message << program_name.c_str() << "[" KALDI_VERSION "]" << ':' + << envelope_.func << "():" << envelope_.file << ':' + << envelope_.line << ") " << GetMessage().c_str(); + + // Add stack trace for errors and assertion failures, if available. + if (envelope_.severity < LogMessageEnvelope::kWarning) { + const std::string &stack_trace = KaldiGetStackTrace(); + if (!stack_trace.empty()) { + full_message << "\n\n" << stack_trace; + } + } + + // Print the complete message to stderr. + full_message << "\n"; + std::cerr << full_message.str(); +} + +/***** KALDI ASSERTS *****/ + +void KaldiAssertFailure_(const char *func, const char *file, int32 line, + const char *cond_str) { + MessageLogger::Log() = + MessageLogger(LogMessageEnvelope::kAssertFailed, func, file, line) + << "Assertion failed: (" << cond_str << ")"; + fflush(NULL); // Flush all pending buffers, abort() may not flush stderr. + std::abort(); +} + +/***** THIRD-PARTY LOG-HANDLER *****/ + +LogHandler SetLogHandler(LogHandler handler) { + LogHandler old_handler = log_handler; + log_handler = handler; + return old_handler; +} + +} // namespace kaldi diff --git a/speechx/speechx/kaldi/base/kaldi-error.h b/speechx/speechx/kaldi/base/kaldi-error.h new file mode 100644 index 00000000..a9904a75 --- /dev/null +++ b/speechx/speechx/kaldi/base/kaldi-error.h @@ -0,0 +1,231 @@ +// base/kaldi-error.h + +// Copyright 2019 LAIX (Yi Sun) +// Copyright 2019 SmartAction LLC (kkm) +// Copyright 2016 Brno University of Technology (author: Karel Vesely) +// Copyright 2009-2011 Microsoft Corporation; Ondrej Glembek; Lukas Burget; +// Saarland University + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_BASE_KALDI_ERROR_H_ +#define KALDI_BASE_KALDI_ERROR_H_ 1 + +#include +#include +#include +#include +#include +#include + +#include "base/kaldi-types.h" +#include "base/kaldi-utils.h" +/* Important that this file does not depend on any other kaldi headers. */ + +#ifdef _MSC_VER +#define __func__ __FUNCTION__ +#endif + +namespace kaldi { + +/// \addtogroup error_group +/// @{ + +/***** PROGRAM NAME AND VERBOSITY LEVEL *****/ + +/// Called by ParseOptions to set base name (no directory) of the executing +/// program. The name is printed in logging code along with every message, +/// because in our scripts, we often mix together the stderr of many programs. +/// This function is very thread-unsafe. +void SetProgramName(const char *basename); + +/// This is set by util/parse-options.{h,cc} if you set --verbose=? option. +/// Do not use directly, prefer {Get,Set}VerboseLevel(). +extern int32 g_kaldi_verbose_level; + +/// Get verbosity level, usually set via command line '--verbose=' switch. +inline int32 GetVerboseLevel() { return g_kaldi_verbose_level; } + +/// This should be rarely used, except by programs using Kaldi as library; +/// command-line programs set the verbose level automatically from ParseOptions. +inline void SetVerboseLevel(int32 i) { g_kaldi_verbose_level = i; } + +/***** KALDI LOGGING *****/ + +/// Log message severity and source location info. +struct LogMessageEnvelope { + /// Message severity. 
In addition to these levels, positive values (1 to 6) + /// specify verbose logging level. Verbose messages are produced only when + /// SetVerboseLevel() has been called to set logging level to at least the + /// corresponding value. + enum Severity { + kAssertFailed = -3, //!< Assertion failure. abort() will be called. + kError = -2, //!< Fatal error. KaldiFatalError will be thrown. + kWarning = -1, //!< Indicates a recoverable but abnormal condition. + kInfo = 0, //!< Informational message. + }; + int severity; //!< A Severity value, or positive verbosity level. + const char *func; //!< Name of the function invoking the logging. + const char *file; //!< Source file name with up to 1 leading directory. + int32 line; // MessageLogger &operator<<(const T &val) { + ss_ << val; + return *this; + } + + // When assigned a MessageLogger, log its contents. + struct Log final { + void operator=(const MessageLogger &logger) { logger.LogMessage(); } + }; + + // When assigned a MessageLogger, log its contents and then throw + // a KaldiFatalError. + struct LogAndThrow final { + [[noreturn]] void operator=(const MessageLogger &logger) { + logger.LogMessage(); + throw KaldiFatalError(logger.GetMessage()); + } + }; + +private: + std::string GetMessage() const { return ss_.str(); } + void LogMessage() const; + + LogMessageEnvelope envelope_; + std::ostringstream ss_; +}; + +// Logging macros. 
+// KALDI_ERR logs the streamed message with severity kError and then throws:
+// the finished MessageLogger is assigned to MessageLogger::LogAndThrow.
+#define KALDI_ERR                                                        \
+  ::kaldi::MessageLogger::LogAndThrow() = ::kaldi::MessageLogger(        \
+      ::kaldi::LogMessageEnvelope::kError, __func__, __FILE__, __LINE__)
+// KALDI_WARN logs the streamed message with severity kWarning.
+#define KALDI_WARN                                                       \
+  ::kaldi::MessageLogger::Log() = ::kaldi::MessageLogger(                \
+      ::kaldi::LogMessageEnvelope::kWarning, __func__, __FILE__, __LINE__)
+// KALDI_LOG logs the streamed message with severity kInfo.
+#define KALDI_LOG                                                        \
+  ::kaldi::MessageLogger::Log() = ::kaldi::MessageLogger(                \
+      ::kaldi::LogMessageEnvelope::kInfo, __func__, __FILE__, __LINE__)
+// KALDI_VLOG(v) logs the streamed message only when v does not exceed the
+// current verbose level (GetVerboseLevel()).
+//
+// It is deliberately a conditional expression and not a bare `if`: an `if`
+// without `else` inside the macro would silently capture the `else` of an
+// enclosing statement at the call site, e.g.
+//
+//    if (condition)
+//      KALDI_VLOG(1) << "message";
+//    else
+//      SomethingElse();   // would bind to the macro's hidden `if`
+//
+// Both branches of the conditional have type void.  In C++ the third operand
+// of ?: is a full assignment-expression, so everything streamed after the
+// macro belongs to the "log it" branch and is evaluated only when the message
+// is actually emitted.
+#define KALDI_VLOG(v)                                                        \
+  ((v) > ::kaldi::GetVerboseLevel())                                         \
+      ? (void)0                                                              \
+      : ::kaldi::MessageLogger::Log() =                                      \
+            ::kaldi::MessageLogger((::kaldi::LogMessageEnvelope::Severity)(v), \
+                                   __func__, __FILE__, __LINE__)
+
+/***** KALDI ASSERTS *****/
+
+/// Reports a failed assertion; declared [[noreturn]], so control never comes
+/// back to the KALDI_ASSERT / KALDI_PARANOID_ASSERT that invoked it.
+/// \param func      name of the function containing the assertion (__func__)
+/// \param file      source file of the assertion (__FILE__)
+/// \param line      source line of the assertion (__LINE__)
+/// \param cond_str  stringified text of the condition that failed
+[[noreturn]] void KaldiAssertFailure_(const char *func, const char *file,
+                                      int32 line, const char *cond_str);
+
+// Note on KALDI_ASSERT and KALDI_PARANOID_ASSERT:
+//
+// A single block {} around if /else does not work, because it causes
+// syntax error (unmatched else block) in the following code:
+//
+// if (condition)
+//   KALDI_ASSERT(condition2);
+// else
+//   SomethingElse();
+//
+// do {} while(0)  -- note there is no semicolon at the end! -- works nicely,
+// and compilers will be able to optimize the loop away (as the condition
+// is always false).
+//
+// Also see KALDI_COMPILE_TIME_ASSERT, defined in base/kaldi-utils.h, and
+// KALDI_ASSERT_IS_INTEGER_TYPE and KALDI_ASSERT_IS_FLOATING_TYPE, also defined
+// there.
+#ifndef NDEBUG
+#define KALDI_ASSERT(cond)                                                   \
+  do {                                                                       \
+    if (cond)                                                                \
+      (void)0;                                                               \
+    else                                                                     \
+      ::kaldi::KaldiAssertFailure_(__func__, __FILE__, __LINE__, #cond);     \
+  } while (0)
+#else
+// With NDEBUG defined the condition is not evaluated at all.
+#define KALDI_ASSERT(cond) (void)0
+#endif
+
+// Some more expensive asserts only checked if this defined.
+#ifdef KALDI_PARANOID +#define KALDI_PARANOID_ASSERT(cond) \ + do { \ + if (cond) \ + (void)0; \ + else \ + ::kaldi::KaldiAssertFailure_(__func__, __FILE__, __LINE__, #cond); \ + } while (0) +#else +#define KALDI_PARANOID_ASSERT(cond) (void)0 +#endif + +/***** THIRD-PARTY LOG-HANDLER *****/ + +/// Type of third-party logging function. +typedef void (*LogHandler)(const LogMessageEnvelope &envelope, + const char *message); + +/// Set logging handler. If called with a non-NULL function pointer, the +/// function pointed by it is called to send messages to a caller-provided log. +/// If called with a NULL pointer, restores default Kaldi error logging to +/// stderr. This function is obviously not thread safe; the log handler must be. +/// Returns a previously set logging handler pointer, or NULL. +LogHandler SetLogHandler(LogHandler); + +/// @} end "addtogroup error_group" + +// Functions within internal is exported for testing only, do not use. +namespace internal { +bool LocateSymbolRange(const std::string &trace_name, size_t *begin, + size_t *end); +} // namespace internal +} // namespace kaldi + +#endif // KALDI_BASE_KALDI_ERROR_H_ diff --git a/speechx/speechx/kaldi/base/kaldi-math.cc b/speechx/speechx/kaldi/base/kaldi-math.cc new file mode 100644 index 00000000..484c80d4 --- /dev/null +++ b/speechx/speechx/kaldi/base/kaldi-math.cc @@ -0,0 +1,162 @@ +// base/kaldi-math.cc + +// Copyright 2009-2011 Microsoft Corporation; Yanmin Qian; +// Saarland University; Jan Silovsky + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/kaldi-math.h"
+#ifndef _MSC_VER
+#include <stdlib.h>
+#include <unistd.h>
+#endif
+#include <string>
+#include <mutex>
+
+namespace kaldi {
+// These routines are tested in matrix/matrix-test.cc
+
+// Returns the smallest power of two that is >= n.  Asserts n > 0.
+// Works by decrementing n, copying its highest set bit into every lower bit
+// position, and adding one back.
+int32 RoundUpToNearestPowerOfTwo(int32 n) {
+  KALDI_ASSERT(n > 0);
+  n--;
+  n |= n >> 1;
+  n |= n >> 2;
+  n |= n >> 4;
+  n |= n >> 8;
+  n |= n >> 16;
+  return n+1;
+}
+
+// Guards the calls to the global rand() made by Rand() when the caller does
+// not supply a RandomState.
+static std::mutex _RandMutex;
+
+// Returns a random integer between 0 and RAND_MAX, inclusive.
+// If 'state' is non-NULL (and rand_r is available) the number is drawn from
+// that caller-owned state via rand_r(); otherwise the global rand() is used,
+// serialized by _RandMutex.
+int Rand(struct RandomState* state) {
+#if !defined(_POSIX_THREAD_SAFE_FUNCTIONS)
+  // On Windows and Cygwin, just call Rand()
+  // (no rand_r here: 'state' is ignored and the call is not mutex-guarded).
+  return rand();
+#else
+  if (state) {
+    return rand_r(&(state->seed));
+  } else {
+    std::lock_guard<std::mutex> lock(_RandMutex);
+    return rand();
+  }
+#endif
+}
+
+RandomState::RandomState() {
+  // we initialize it as Rand() + 27437 instead of just Rand(), because on some
+  // systems, e.g. at the very least Mac OSX Yosemite and later, it seems to be
+  // the case that rand_r when initialized with rand() will give you the exact
+  // same sequence of numbers that rand() will give if you keep calling rand()
+  // after that initial call.  This can cause problems with repeated sequences.
+  // For example if you initialize two RandomState structs one after the other
+  // without calling rand() in between, they would give you the same sequence
+  // offset by one (if we didn't have the "+ 27437" in the code).  27437 is just
+  // a randomly chosen prime number.
+  seed = Rand() + 27437;
+}
+
+// Returns true with probability 'prob'.  Every random draw, including the one
+// made by the recursive call for very small probabilities, uses 'state'.
+bool WithProb(BaseFloat prob, struct RandomState* state) {
+  KALDI_ASSERT(prob >= 0 && prob <= 1.1);  // prob should be <= 1.0,
+  // but we allow slightly larger values that could arise from roundoff in
+  // previous calculations.
+  KALDI_COMPILE_TIME_ASSERT(RAND_MAX > 128 * 128);
+  if (prob == 0) return false;
+  else if (prob == 1.0) return true;
+  else if (prob * RAND_MAX < 128.0) {
+    // prob is very small but nonzero, and the "main algorithm"
+    // wouldn't work that well.  So: with probability 1/128, we
+    // return WithProb (prob * 128), else return false.
+    if (Rand(state) < RAND_MAX / 128) {  // with probability 1/128...
+      // Note: we know that prob * 128.0 < 1.0, because
+      // we asserted RAND_MAX > 128 * 128.
+      // Pass 'state' on: without it the recursion would fall back to the
+      // global, mutex-guarded rand() and ignore the caller's generator.
+      return WithProb(prob * 128.0, state);
+    } else {
+      return false;
+    }
+  } else {
+    return (Rand(state) < ((RAND_MAX + static_cast<BaseFloat>(1.0)) * prob));
+  }
+}
+
+// Returns a random integer in [min_val, max_val], inclusive, drawn via
+// Rand(state).  Asserts max_val >= min_val.
+int32 RandInt(int32 min_val, int32 max_val, struct RandomState* state) {
+  // This is not exact.
+  KALDI_ASSERT(max_val >= min_val);
+  if (max_val == min_val) return min_val;
+
+#ifdef _MSC_VER
+  // RAND_MAX is quite small on Windows -> may need to handle larger numbers.
+  if (RAND_MAX > (max_val-min_val)*8) {
+    // *8 to avoid large inaccuracies in probability, from the modulus...
+    return min_val +
+      ((unsigned int)Rand(state) % (unsigned int)(max_val+1-min_val));
+  } else {
+    if ((unsigned int)(RAND_MAX*RAND_MAX) >
+        (unsigned int)((max_val+1-min_val)*8)) {
+      // *8 to avoid inaccuracies in probability, from the modulus...
+      return min_val + ( (unsigned int)( (Rand(state)+RAND_MAX*Rand(state)))
+                    % (unsigned int)(max_val+1-min_val));
+    } else {
+      KALDI_ERR << "rand_int failed because we do not support such large "
+          "random numbers. (Extend this function).";
+    }
+  }
+#else
+  return min_val +
+      (static_cast<int32>(Rand(state)) % static_cast<int32>(max_val+1-min_val));
+#endif
+}
+
+// Returns poisson-distributed random number.
+// Take care: this takes time proportional
+// to lambda.
Faster algorithms exist but are more complex. +int32 RandPoisson(float lambda, struct RandomState* state) { + // Knuth's algorithm. + KALDI_ASSERT(lambda >= 0); + float L = expf(-lambda), p = 1.0; + int32 k = 0; + do { + k++; + float u = RandUniform(state); + p *= u; + } while (p > L); + return k-1; +} + +void RandGauss2(float *a, float *b, RandomState *state) { + KALDI_ASSERT(a); + KALDI_ASSERT(b); + float u1 = RandUniform(state); + float u2 = RandUniform(state); + u1 = sqrtf(-2.0f * logf(u1)); + u2 = 2.0f * M_PI * u2; + *a = u1 * cosf(u2); + *b = u1 * sinf(u2); +} + +void RandGauss2(double *a, double *b, RandomState *state) { + KALDI_ASSERT(a); + KALDI_ASSERT(b); + float a_float, b_float; + // Just because we're using doubles doesn't mean we need super-high-quality + // random numbers, so we just use the floating-point version internally. + RandGauss2(&a_float, &b_float, state); + *a = a_float; + *b = b_float; +} + + +} // end namespace kaldi diff --git a/speechx/speechx/kaldi/base/kaldi-math.h b/speechx/speechx/kaldi/base/kaldi-math.h new file mode 100644 index 00000000..93c265ee --- /dev/null +++ b/speechx/speechx/kaldi/base/kaldi-math.h @@ -0,0 +1,363 @@ +// base/kaldi-math.h + +// Copyright 2009-2011 Ondrej Glembek; Microsoft Corporation; Yanmin Qian; +// Jan Silovsky; Saarland University +// +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. 
+// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_BASE_KALDI_MATH_H_ +#define KALDI_BASE_KALDI_MATH_H_ 1 + +#ifdef _MSC_VER +#include +#endif + +#include +#include +#include + +#include "base/kaldi-types.h" +#include "base/kaldi-common.h" + + +#ifndef DBL_EPSILON +#define DBL_EPSILON 2.2204460492503131e-16 +#endif +#ifndef FLT_EPSILON +#define FLT_EPSILON 1.19209290e-7f +#endif + +#ifndef M_PI +#define M_PI 3.1415926535897932384626433832795 +#endif + +#ifndef M_SQRT2 +#define M_SQRT2 1.4142135623730950488016887 +#endif + +#ifndef M_2PI +#define M_2PI 6.283185307179586476925286766559005 +#endif + +#ifndef M_SQRT1_2 +#define M_SQRT1_2 0.7071067811865475244008443621048490 +#endif + +#ifndef M_LOG_2PI +#define M_LOG_2PI 1.8378770664093454835606594728112 +#endif + +#ifndef M_LN2 +#define M_LN2 0.693147180559945309417232121458 +#endif + +#ifndef M_LN10 +#define M_LN10 2.302585092994045684017991454684 +#endif + + +#define KALDI_ISNAN std::isnan +#define KALDI_ISINF std::isinf +#define KALDI_ISFINITE(x) std::isfinite(x) + +#if !defined(KALDI_SQR) +# define KALDI_SQR(x) ((x) * (x)) +#endif + +namespace kaldi { + +#if !defined(_MSC_VER) || (_MSC_VER >= 1900) +inline double Exp(double x) { return exp(x); } +#ifndef KALDI_NO_EXPF +inline float Exp(float x) { return expf(x); } +#else +inline float Exp(float x) { return exp(static_cast(x)); } +#endif // KALDI_NO_EXPF +#else +inline double Exp(double x) { return exp(x); } +#if !defined(__INTEL_COMPILER) && _MSC_VER == 1800 && defined(_M_X64) +// Microsoft CL v18.0 buggy 64-bit implementation of +// expf() incorrectly returns -inf for exp(-inf). 
+inline float Exp(float x) { return exp(static_cast(x)); } +#else +inline float Exp(float x) { return expf(x); } +#endif // !defined(__INTEL_COMPILER) && _MSC_VER == 1800 && defined(_M_X64) +#endif // !defined(_MSC_VER) || (_MSC_VER >= 1900) + +inline double Log(double x) { return log(x); } +inline float Log(float x) { return logf(x); } + +#if !defined(_MSC_VER) || (_MSC_VER >= 1700) +inline double Log1p(double x) { return log1p(x); } +inline float Log1p(float x) { return log1pf(x); } +#else +inline double Log1p(double x) { + const double cutoff = 1.0e-08; + if (x < cutoff) + return x - 0.5 * x * x; + else + return Log(1.0 + x); +} + +inline float Log1p(float x) { + const float cutoff = 1.0e-07; + if (x < cutoff) + return x - 0.5 * x * x; + else + return Log(1.0 + x); +} +#endif + +static const double kMinLogDiffDouble = Log(DBL_EPSILON); // negative! +static const float kMinLogDiffFloat = Log(FLT_EPSILON); // negative! + +// -infinity +const float kLogZeroFloat = -std::numeric_limits::infinity(); +const double kLogZeroDouble = -std::numeric_limits::infinity(); +const BaseFloat kLogZeroBaseFloat = -std::numeric_limits::infinity(); + +// Returns a random integer between 0 and RAND_MAX, inclusive +int Rand(struct RandomState* state = NULL); + +// State for thread-safe random number generator +struct RandomState { + RandomState(); + unsigned seed; +}; + +// Returns a random integer between first and last inclusive. +int32 RandInt(int32 first, int32 last, struct RandomState* state = NULL); + +// Returns true with probability "prob", +bool WithProb(BaseFloat prob, struct RandomState* state = NULL); +// with 0 <= prob <= 1 [we check this]. +// Internally calls Rand(). This function is carefully implemented so +// that it should work even if prob is very small. + +/// Returns a random number strictly between 0 and 1. 
+inline float RandUniform(struct RandomState* state = NULL) { + return static_cast((Rand(state) + 1.0) / (RAND_MAX+2.0)); +} + +inline float RandGauss(struct RandomState* state = NULL) { + return static_cast(sqrtf (-2 * Log(RandUniform(state))) + * cosf(2*M_PI*RandUniform(state))); +} + +// Returns poisson-distributed random number. Uses Knuth's algorithm. +// Take care: this takes time proportional +// to lambda. Faster algorithms exist but are more complex. +int32 RandPoisson(float lambda, struct RandomState* state = NULL); + +// Returns a pair of gaussian random numbers. Uses Box-Muller transform +void RandGauss2(float *a, float *b, RandomState *state = NULL); +void RandGauss2(double *a, double *b, RandomState *state = NULL); + +// Also see Vector::RandCategorical(). + +// This is a randomized pruning mechanism that preserves expectations, +// that we typically use to prune posteriors. +template +inline Float RandPrune(Float post, BaseFloat prune_thresh, + struct RandomState* state = NULL) { + KALDI_ASSERT(prune_thresh >= 0.0); + if (post == 0.0 || std::abs(post) >= prune_thresh) + return post; + return (post >= 0 ? 1.0 : -1.0) * + (RandUniform(state) <= fabs(post)/prune_thresh ? prune_thresh : 0.0); +} + +// returns log(exp(x) + exp(y)). +inline double LogAdd(double x, double y) { + double diff; + + if (x < y) { + diff = x - y; + x = y; + } else { + diff = y - x; + } + // diff is negative. x is now the larger one. + + if (diff >= kMinLogDiffDouble) { + double res; + res = x + Log1p(Exp(diff)); + return res; + } else { + return x; // return the larger one. + } +} + + +// returns log(exp(x) + exp(y)). +inline float LogAdd(float x, float y) { + float diff; + + if (x < y) { + diff = x - y; + x = y; + } else { + diff = y - x; + } + // diff is negative. x is now the larger one. + + if (diff >= kMinLogDiffFloat) { + float res; + res = x + Log1p(Exp(diff)); + return res; + } else { + return x; // return the larger one. + } +} + + +// returns log(exp(x) - exp(y)). 
+// Computed as x + log(1 - exp(y - x)).  Requires x >= y:
+//  - x == y returns kLogZeroDouble (-infinity);
+//  - y > x is reported through KALDI_ERR.
+inline double LogSub(double x, double y) {
+  if (y >= x) {  // Only y == x is accepted here; y > x is an error.
+    if (y == x)
+      return kLogZeroDouble;
+    else
+      KALDI_ERR << "Cannot subtract a larger from a smaller number.";
+  }
+
+  double diff = y - x;  // Will be negative.
+  double res = x + Log(1.0 - Exp(diff));
+
+  // res might be NAN if diff ~0.0, and 1.0-exp(diff) == 0 to machine precision
+  if (KALDI_ISNAN(res))
+    return kLogZeroDouble;
+  return res;
+}
+
+
+// returns log(exp(x) - exp(y)).
+// Single-precision counterpart of the double overload: same contract, with
+// kLogZeroFloat as the -infinity result.
+inline float LogSub(float x, float y) {
+  if (y >= x) {  // Only y == x is accepted here; y > x is an error.
+    if (y == x)
+      return kLogZeroFloat;  // was kLogZeroDouble; keep the float constant
+    else
+      KALDI_ERR << "Cannot subtract a larger from a smaller number.";
+  }
+
+  float diff = y - x;  // Will be negative.
+  float res = x + Log(1.0f - Exp(diff));
+
+  // res might be NAN if diff ~0.0, and 1.0-exp(diff) == 0 to machine precision
+  if (KALDI_ISNAN(res))
+    return kLogZeroFloat;
+  return res;
+}
+
+/// return abs(a - b) <= relative_tolerance * (abs(a)+abs(b)).
+/// Exactly equal values (including equal infinities) compare as equal; if the
+/// difference is infinite or NaN the result is false.
+static inline bool ApproxEqual(float a, float b,
+                               float relative_tolerance = 0.001) {
+  // a==b handles infinities.
+  if (a == b) return true;
+  float diff = std::abs(a-b);
+  if (diff == std::numeric_limits<float>::infinity()
+      || diff != diff) return false;  // diff is +inf or nan.
+  return (diff <= relative_tolerance*(std::abs(a)+std::abs(b)));
+}
+
+/// assert abs(a - b) <= relative_tolerance * (abs(a)+abs(b))
+static inline void AssertEqual(float a, float b,
+                               float relative_tolerance = 0.001) {
+  // Delegates to ApproxEqual; like any KALDI_ASSERT, the check is compiled
+  // out when NDEBUG is defined.
+  KALDI_ASSERT(ApproxEqual(a, b, relative_tolerance));
+}
+
+
+// RoundUpToNearestPowerOfTwo does the obvious thing. It crashes if n <= 0.
+int32 RoundUpToNearestPowerOfTwo(int32 n);
+
+/// Returns a / b, rounding towards negative infinity in all cases.
+/// Asserts b != 0.
+static inline int32 DivideRoundingDown(int32 a, int32 b) {
+  KALDI_ASSERT(b != 0);
+  // C++ integer division truncates towards zero.  That differs from rounding
+  // down only when the division is inexact and the operands have opposite
+  // signs, in which case the truncated quotient is one too large.
+  // (Testing the sign of the product a * b, as was done before, overflows
+  // int32 for large operands.)
+  int32 quotient = a / b;
+  if ((a % b != 0) && ((a < 0) != (b < 0)))
+    quotient--;
+  return quotient;
+}
+
+/// Returns the greatest common divisor of m and n as a non-negative value.
+/// If exactly one argument is zero, the absolute value of the other one is
+/// returned; if both are zero the GCD is undefined and KALDI_ERR is raised.
+template<class I> I Gcd(I m, I n) {
+  if (m == 0 || n == 0) {
+    if (m == 0 && n == 0) {  // gcd not defined, as all integers are divisors.
+      KALDI_ERR << "Undefined GCD since m = 0, n = 0.";
+    }
+    return (m == 0 ? (n > 0 ? n : -n) : ( m > 0 ? m : -m));
+    // return absolute value of whichever is nonzero
+  }
+  // could use compile-time assertion
+  // but involves messing with complex template stuff.
+  KALDI_ASSERT(std::numeric_limits<I>::is_integer);
+  // Euclid's algorithm: alternately reduce each value modulo the other until
+  // one of them reaches zero; the other one is then the GCD (up to sign).
+  while (1) {
+    m %= n;
+    if (m == 0) return (n > 0 ? n : -n);
+    n %= m;
+    if (n == 0) return (m > 0 ? m : -m);
+  }
+}
+
+/// Returns the least common multiple of two integers.  Will
+/// crash unless the inputs are positive.
+template<class I> I Lcm(I m, I n) {
+  KALDI_ASSERT(m > 0 && n > 0);
+  I gcd = Gcd(m, n);
+  return gcd * (m/gcd) * (n/gcd);
+}
+
+
+template<class I> void Factorize(I m, std::vector<I> *factors) {
+  // Splits a number into its prime factors, in sorted order from
+  // least to greatest,  with duplication.  A simple trial-division
+  // algorithm, which is mainly intended for use in the
+  // mixed-radix FFT computation (where we assume most factors
+  // are small).
+  KALDI_ASSERT(factors != NULL);
+  KALDI_ASSERT(m >= 1);  // Doesn't work for zero or negative numbers.
+  factors->clear();
+  I small_factors[10] = { 2, 3, 5, 7, 11, 13, 17, 19, 23, 29 };
+
+  // First try small factors.
+  for (I i = 0; i < 10; i++) {
+    if (m == 1) return;  // We're done.
+    while (m % small_factors[i] == 0) {
+      m /= small_factors[i];
+      factors->push_back(small_factors[i]);
+    }
+  }
+  // Next try all odd numbers starting from 31.
+  for (I j = 31;; j += 2) {
+    if (m == 1) return;
+    // Every factor below j has already been divided out, so once j * j > m
+    // the remainder must itself be prime.  (Written as j > m / j so that the
+    // comparison cannot overflow.)  The output is unchanged; this only avoids
+    // counting all the way up to a large prime factor.
+    if (j > m / j) {
+      factors->push_back(m);
+      return;
+    }
+    while (m % j == 0) {
+      m /= j;
+      factors->push_back(j);
+    }
+  }
+}
+
+// Overloads forwarding to the C library hypot / hypotf.
+inline double Hypot(double x, double y) {  return hypot(x, y); }
+inline float Hypot(float x, float y) {  return hypotf(x, y); }
+
+
+
+
+}  // namespace kaldi
+
+
+#endif  // KALDI_BASE_KALDI_MATH_H_
diff --git a/speechx/speechx/kaldi/base/kaldi-types.h b/speechx/speechx/kaldi/base/kaldi-types.h
new file mode 100644
index 00000000..4fa8f224
--- /dev/null
+++ b/speechx/speechx/kaldi/base/kaldi-types.h
@@ -0,0 +1,76 @@
+// base/kaldi-types.h
+
+// Copyright 2009-2011  Microsoft Corporation;  Saarland University;
+//                      Jan Silovsky;  Yanmin Qian
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_BASE_KALDI_TYPES_H_
+#define KALDI_BASE_KALDI_TYPES_H_ 1
+
+namespace kaldi {
+// TYPEDEFS ..................................................................
+#if (KALDI_DOUBLEPRECISION != 0) +typedef double BaseFloat; +#else +typedef float BaseFloat; +#endif +} + +#ifdef _MSC_VER +#include +#define ssize_t SSIZE_T +#endif + +// we can do this a different way if some platform +// we find in the future lacks stdint.h +#include + +// for discussion on what to do if you need compile kaldi +// without OpenFST, see the bottom of this this file + +#include + +namespace kaldi { + using ::int16; + using ::int32; + using ::int64; + using ::uint16; + using ::uint32; + using ::uint64; + typedef float float32; + typedef double double64; +} // end namespace kaldi + +// In a theoretical case you decide compile Kaldi without the OpenFST +// comment the previous namespace statement and uncomment the following +/* +namespace kaldi { + typedef int8_t int8; + typedef int16_t int16; + typedef int32_t int32; + typedef int64_t int64; + + typedef uint8_t uint8; + typedef uint16_t uint16; + typedef uint32_t uint32; + typedef uint64_t uint64; + typedef float float32; + typedef double double64; +} // end namespace kaldi +*/ + +#endif // KALDI_BASE_KALDI_TYPES_H_ diff --git a/speechx/speechx/kaldi/base/kaldi-utils.cc b/speechx/speechx/kaldi/base/kaldi-utils.cc new file mode 100644 index 00000000..432da426b --- /dev/null +++ b/speechx/speechx/kaldi/base/kaldi-utils.cc @@ -0,0 +1,55 @@ +// base/kaldi-utils.cc +// Copyright 2009-2011 Karel Vesely; Yanmin Qian; Microsoft Corporation + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. 
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifdef _WIN32_WINNT_WIN8
+#include <Synchapi.h>
+#elif defined(_WIN32) || defined(_MSC_VER) || defined(MINGW)
+#include <Windows.h>
+#if defined(_MSC_VER) && _MSC_VER < 1900
+#define snprintf _snprintf
+#endif /* _MSC_VER < 1900 */
+#else
+#include <unistd.h>
+#endif
+
+#include <string>
+#include "base/kaldi-common.h"
+
+
+namespace kaldi {
+
+// Renders a character in human-readable form for debugging: printable
+// characters come out quoted ('x'), everything else as "[character N]" where
+// N is the numeric value of the char.
+std::string CharToString(const char &c) {
+  char buf[20];
+  // std::isprint() is only defined for EOF and values representable as
+  // unsigned char; passing a negative plain char is undefined behaviour,
+  // hence the cast.
+  if (std::isprint(static_cast<unsigned char>(c)))
+    snprintf(buf, sizeof(buf), "\'%c\'", c);
+  else
+    snprintf(buf, sizeof(buf), "[character %d]", static_cast<int>(c));
+  return (std::string) buf;
+}
+
+// Sleeps for 'seconds', which may be fractional.  The resolution depends on
+// the platform call used below: milliseconds with ::Sleep (MSVC / MinGW),
+// whole seconds with sleep() (Cygwin; the fraction is truncated), and
+// microseconds with usleep() elsewhere.
+void Sleep(float seconds) {
+#if defined(_MSC_VER) || defined(MINGW)
+  ::Sleep(static_cast<int>(seconds * 1000.0));
+#elif defined(__CYGWIN__)
+  sleep(static_cast<int>(seconds));
+#else
+  usleep(static_cast<int>(seconds * 1000000.0));
+#endif
+}
+
+}  // end namespace kaldi
diff --git a/speechx/speechx/kaldi/base/kaldi-utils.h b/speechx/speechx/kaldi/base/kaldi-utils.h
new file mode 100644
index 00000000..c9d6fd95
--- /dev/null
+++ b/speechx/speechx/kaldi/base/kaldi-utils.h
@@ -0,0 +1,155 @@
+// base/kaldi-utils.h
+
+// Copyright 2009-2011  Ondrej Glembek;  Microsoft Corporation;
+//                      Saarland University;  Karel Vesely;  Yanmin Qian
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+ +#ifndef KALDI_BASE_KALDI_UTILS_H_ +#define KALDI_BASE_KALDI_UTILS_H_ 1 + +#if defined(_MSC_VER) +# define WIN32_LEAN_AND_MEAN +# define NOMINMAX +# include +#endif + +#ifdef _MSC_VER +#include +#define unlink _unlink +#else +#include +#endif + +#include +#include + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4056 4305 4800 4267 4996 4756 4661) +#if _MSC_VER < 1400 +#define __restrict__ +#else +#define __restrict__ __restrict +#endif +#endif + +#if defined(_MSC_VER) +# define KALDI_MEMALIGN(align, size, pp_orig) \ + (*(pp_orig) = _aligned_malloc(size, align)) +# define KALDI_MEMALIGN_FREE(x) _aligned_free(x) +#elif defined(__CYGWIN__) +# define KALDI_MEMALIGN(align, size, pp_orig) \ + (*(pp_orig) = aligned_alloc(align, size)) +# define KALDI_MEMALIGN_FREE(x) free(x) +#else +# define KALDI_MEMALIGN(align, size, pp_orig) \ + (!posix_memalign(pp_orig, align, size) ? *(pp_orig) : NULL) +# define KALDI_MEMALIGN_FREE(x) free(x) +#endif + +#ifdef __ICC +#pragma warning(disable: 383) // ICPC remark we don't want. +#pragma warning(disable: 810) // ICPC remark we don't want. +#pragma warning(disable: 981) // ICPC remark we don't want. +#pragma warning(disable: 1418) // ICPC remark we don't want. +#pragma warning(disable: 444) // ICPC remark we don't want. +#pragma warning(disable: 869) // ICPC remark we don't want. +#pragma warning(disable: 1287) // ICPC remark we don't want. +#pragma warning(disable: 279) // ICPC remark we don't want. +#pragma warning(disable: 981) // ICPC remark we don't want. +#endif + + +namespace kaldi { + + +// CharToString prints the character in a human-readable form, for debugging. +std::string CharToString(const char &c); + + +inline int MachineIsLittleEndian() { + int check = 1; + return (*reinterpret_cast(&check) != 0); +} + +// This function kaldi::Sleep() provides a portable way +// to sleep for a possibly fractional +// number of seconds. On Windows it's only accurate to microseconds. 
+void Sleep(float seconds); +} + +#define KALDI_SWAP8(a) { \ + int t = (reinterpret_cast(&a))[0];\ + (reinterpret_cast(&a))[0]=(reinterpret_cast(&a))[7];\ + (reinterpret_cast(&a))[7]=t;\ + t = (reinterpret_cast(&a))[1];\ + (reinterpret_cast(&a))[1]=(reinterpret_cast(&a))[6];\ + (reinterpret_cast(&a))[6]=t;\ + t = (reinterpret_cast(&a))[2];\ + (reinterpret_cast(&a))[2]=(reinterpret_cast(&a))[5];\ + (reinterpret_cast(&a))[5]=t;\ + t = (reinterpret_cast(&a))[3];\ + (reinterpret_cast(&a))[3]=(reinterpret_cast(&a))[4];\ + (reinterpret_cast(&a))[4]=t;} +#define KALDI_SWAP4(a) { \ + int t = (reinterpret_cast(&a))[0];\ + (reinterpret_cast(&a))[0]=(reinterpret_cast(&a))[3];\ + (reinterpret_cast(&a))[3]=t;\ + t = (reinterpret_cast(&a))[1];\ + (reinterpret_cast(&a))[1]=(reinterpret_cast(&a))[2];\ + (reinterpret_cast(&a))[2]=t;} +#define KALDI_SWAP2(a) { \ + int t = (reinterpret_cast(&a))[0];\ + (reinterpret_cast(&a))[0]=(reinterpret_cast(&a))[1];\ + (reinterpret_cast(&a))[1]=t;} + + +// Makes copy constructor and operator= private. 
+#define KALDI_DISALLOW_COPY_AND_ASSIGN(type) \ + type(const type&); \ + void operator = (const type&) + +template class KaldiCompileTimeAssert { }; +template<> class KaldiCompileTimeAssert { + public: + static inline void Check() { } +}; + +#define KALDI_COMPILE_TIME_ASSERT(b) KaldiCompileTimeAssert<(b)>::Check() + +#define KALDI_ASSERT_IS_INTEGER_TYPE(I) \ + KaldiCompileTimeAssert::is_specialized \ + && std::numeric_limits::is_integer>::Check() + +#define KALDI_ASSERT_IS_FLOATING_TYPE(F) \ + KaldiCompileTimeAssert::is_specialized \ + && !std::numeric_limits::is_integer>::Check() + +#if defined(_MSC_VER) +#define KALDI_STRCASECMP _stricmp +#elif defined(__CYGWIN__) +#include +#define KALDI_STRCASECMP strcasecmp +#else +#define KALDI_STRCASECMP strcasecmp +#endif +#ifdef _MSC_VER +# define KALDI_STRTOLL(cur_cstr, end_cstr) _strtoi64(cur_cstr, end_cstr, 10); +#else +# define KALDI_STRTOLL(cur_cstr, end_cstr) strtoll(cur_cstr, end_cstr, 10); +#endif + +#endif // KALDI_BASE_KALDI_UTILS_H_ diff --git a/speechx/speechx/kaldi/base/timer.cc b/speechx/speechx/kaldi/base/timer.cc new file mode 100644 index 00000000..ce4ef292 --- /dev/null +++ b/speechx/speechx/kaldi/base/timer.cc @@ -0,0 +1,85 @@ +// base/timer.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ +#include "base/timer.h" +#include "base/kaldi-error.h" +#include +#include +#include +#include + +namespace kaldi { + +class ProfileStats { + public: + void AccStats(const char *function_name, double elapsed) { + std::unordered_map::iterator + iter = map_.find(function_name); + if (iter == map_.end()) { + map_[function_name] = ProfileStatsEntry(function_name); + map_[function_name].total_time = elapsed; + } else { + iter->second.total_time += elapsed; + } + } + ~ProfileStats() { + // This map makes sure we agglomerate the time if there were any duplicate + // addresses of strings. + std::unordered_map total_time; + for (auto iter = map_.begin(); iter != map_.end(); iter++) + total_time[iter->second.name] += iter->second.total_time; + + ReverseSecondComparator comp; + std::vector > pairs(total_time.begin(), + total_time.end()); + std::sort(pairs.begin(), pairs.end(), comp); + for (size_t i = 0; i < pairs.size(); i++) { + KALDI_LOG << "Time taken in " << pairs[i].first << " is " + << std::fixed << std::setprecision(2) << pairs[i].second << "s."; + } + } + private: + + struct ProfileStatsEntry { + std::string name; + double total_time; + ProfileStatsEntry() { } + ProfileStatsEntry(const char *name): name(name) { } + }; + + struct ReverseSecondComparator { + bool operator () (const std::pair &a, + const std::pair &b) { + return a.second > b.second; + } + }; + + // Note: this map is keyed on the address of the string, there is no proper + // hash function. The assumption is that the strings are compile-time + // constants. 
+ std::unordered_map map_; +}; + +ProfileStats g_profile_stats; + +Profiler::~Profiler() { + g_profile_stats.AccStats(name_, tim_.Elapsed()); +} + +} // namespace kaldi diff --git a/speechx/speechx/kaldi/base/timer.h b/speechx/speechx/kaldi/base/timer.h new file mode 100644 index 00000000..0e033766 --- /dev/null +++ b/speechx/speechx/kaldi/base/timer.h @@ -0,0 +1,115 @@ +// base/timer.h + +// Copyright 2009-2011 Ondrej Glembek; Microsoft Corporation + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. +#ifndef KALDI_BASE_TIMER_H_ +#define KALDI_BASE_TIMER_H_ + +#include "base/kaldi-utils.h" +#include "base/kaldi-error.h" + + +#if defined(_MSC_VER) || defined(MINGW) + +namespace kaldi { +class Timer { + public: + Timer() { Reset(); } + + // You can initialize with bool to control whether or not you want the time to + // be set when the object is created. + explicit Timer(bool set_timer) { if (set_timer) Reset(); } + + void Reset() { + QueryPerformanceCounter(&time_start_); + } + double Elapsed() const { + LARGE_INTEGER time_end; + LARGE_INTEGER freq; + QueryPerformanceCounter(&time_end); + + if (QueryPerformanceFrequency(&freq) == 0) { + // Hardware does not support this. 
+ return 0.0; + } + return (static_cast(time_end.QuadPart) - + static_cast(time_start_.QuadPart)) / + (static_cast(freq.QuadPart)); + } + private: + LARGE_INTEGER time_start_; +}; + + +#else +#include +#include + +namespace kaldi { +class Timer { + public: + Timer() { Reset(); } + + // You can initialize with bool to control whether or not you want the time to + // be set when the object is created. + explicit Timer(bool set_timer) { if (set_timer) Reset(); } + + void Reset() { gettimeofday(&this->time_start_, &time_zone_); } + + /// Returns time in seconds. + double Elapsed() const { + struct timeval time_end; + struct timezone time_zone; + gettimeofday(&time_end, &time_zone); + double t1, t2; + t1 = static_cast(time_start_.tv_sec) + + static_cast(time_start_.tv_usec)/(1000*1000); + t2 = static_cast(time_end.tv_sec) + + static_cast(time_end.tv_usec)/(1000*1000); + return t2-t1; + } + + private: + struct timeval time_start_; + struct timezone time_zone_; +}; + +#endif + +class Profiler { + public: + // Caution: the 'const char' should always be a string constant; for speed, + // internally the profiling code uses the address of it as a lookup key. + Profiler(const char *function_name): name_(function_name) { } + ~Profiler(); + private: + Timer tim_; + const char *name_; +}; + +// To add timing info for a function, you just put +// KALDI_PROFILE; +// at the beginning of the function. Caution: this doesn't +// include the class name. +#define KALDI_PROFILE Profiler _profiler(__func__) + + + +} // namespace kaldi + + +#endif // KALDI_BASE_TIMER_H_ diff --git a/speechx/speechx/kaldi/base/version.h b/speechx/speechx/kaldi/base/version.h new file mode 100644 index 00000000..a79a5758 --- /dev/null +++ b/speechx/speechx/kaldi/base/version.h @@ -0,0 +1,4 @@ +// This file was automatically created by ./get_version.sh. +// It is only included by ./kaldi-error.cc. 
+#define KALDI_VERSION "5.5.544~2-f21d7" +#define KALDI_GIT_HEAD "f21d7e768635ca98aeeb43f30e2c6a9f14ab8f0f" diff --git a/speechx/speechx/kaldi/feat/CMakeLists.txt b/speechx/speechx/kaldi/feat/CMakeLists.txt new file mode 100644 index 00000000..8b914962 --- /dev/null +++ b/speechx/speechx/kaldi/feat/CMakeLists.txt @@ -0,0 +1,19 @@ +add_library(kaldi-mfcc + feature-mfcc.cc +) +target_link_libraries(kaldi-mfcc PUBLIC kaldi-feat-common) + +add_library(fbank + feature-fbank.cc +) +target_link_libraries(fbank PUBLIC kaldi-feat-common) + +add_library(kaldi-feat-common + wave-reader.cc + signal.cc + feature-functions.cc + feature-window.cc + resample.cc + mel-computations.cc +) +target_link_libraries(kaldi-feat-common PUBLIC kaldi-base kaldi-matrix kaldi-util) \ No newline at end of file diff --git a/speechx/speechx/kaldi/feat/feature-common-inl.h b/speechx/speechx/kaldi/feat/feature-common-inl.h new file mode 100644 index 00000000..26127a4d --- /dev/null +++ b/speechx/speechx/kaldi/feat/feature-common-inl.h @@ -0,0 +1,99 @@ +// feat/feature-common-inl.h + +// Copyright 2016 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_FEAT_FEATURE_COMMON_INL_H_ +#define KALDI_FEAT_FEATURE_COMMON_INL_H_ + +#include "feat/resample.h" +// Do not include this file directly. 
It is included by feat/feature-common.h + +namespace kaldi { + +template +void OfflineFeatureTpl::ComputeFeatures( + const VectorBase &wave, + BaseFloat sample_freq, + BaseFloat vtln_warp, + Matrix *output) { + KALDI_ASSERT(output != NULL); + BaseFloat new_sample_freq = computer_.GetFrameOptions().samp_freq; + if (sample_freq == new_sample_freq) { + Compute(wave, vtln_warp, output); + } else { + if (new_sample_freq < sample_freq && + ! computer_.GetFrameOptions().allow_downsample) + KALDI_ERR << "Waveform and config sample Frequency mismatch: " + << sample_freq << " .vs " << new_sample_freq + << " (use --allow-downsample=true to allow " + << " downsampling the waveform)."; + else if (new_sample_freq > sample_freq && + ! computer_.GetFrameOptions().allow_upsample) + KALDI_ERR << "Waveform and config sample Frequency mismatch: " + << sample_freq << " .vs " << new_sample_freq + << " (use --allow-upsample=true option to allow " + << " upsampling the waveform)."; + // Resample the waveform. + Vector resampled_wave(wave); + ResampleWaveform(sample_freq, wave, + new_sample_freq, &resampled_wave); + Compute(resampled_wave, vtln_warp, output); + } +} + +template +void OfflineFeatureTpl::Compute( + const VectorBase &wave, + BaseFloat vtln_warp, + Matrix *output) { + KALDI_ASSERT(output != NULL); + int32 rows_out = NumFrames(wave.Dim(), computer_.GetFrameOptions()), + cols_out = computer_.Dim(); + if (rows_out == 0) { + output->Resize(0, 0); + return; + } + output->Resize(rows_out, cols_out); + Vector window; // windowed waveform. + bool use_raw_log_energy = computer_.NeedRawLogEnergy(); + for (int32 r = 0; r < rows_out; r++) { // r is frame index. + BaseFloat raw_log_energy = 0.0; + ExtractWindow(0, wave, r, computer_.GetFrameOptions(), + feature_window_function_, &window, + (use_raw_log_energy ? 
&raw_log_energy : NULL)); + + SubVector output_row(*output, r); + computer_.Compute(raw_log_energy, vtln_warp, &window, &output_row); + } +} + +template +void OfflineFeatureTpl::Compute( + const VectorBase &wave, + BaseFloat vtln_warp, + Matrix *output) const { + OfflineFeatureTpl temp(*this); + // call the non-const version of Compute() on a temporary copy of this object. + // This is a workaround for const-ness that may sometimes be useful in + // multi-threaded code, although it's not optimally efficient. + temp.Compute(wave, vtln_warp, output); +} + +} // end namespace kaldi + +#endif diff --git a/speechx/speechx/kaldi/feat/feature-common.h b/speechx/speechx/kaldi/feat/feature-common.h new file mode 100644 index 00000000..3c2fbd37 --- /dev/null +++ b/speechx/speechx/kaldi/feat/feature-common.h @@ -0,0 +1,176 @@ +// feat/feature-common.h + +// Copyright 2016 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABILITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_FEAT_FEATURE_COMMON_H_ +#define KALDI_FEAT_FEATURE_COMMON_H_ + +#include +#include +#include "feat/feature-window.h" + +namespace kaldi { +/// @addtogroup feat FeatureExtraction +/// @{ + + + +/// This class is only added for documentation, it is not intended to ever be +/// used. 
+struct ExampleFeatureComputerOptions { + FrameExtractionOptions frame_opts; + // .. more would go here. +}; + +/// This class is only added for documentation, it is not intended to ever be +/// used. It documents the interface of the *Computer classes which wrap the +/// low-level feature extraction. The template argument F of OfflineFeatureTpl must +/// follow this interface. This interface is intended for features such as +/// MFCCs and PLPs which can be computed frame by frame. +class ExampleFeatureComputer { + public: + typedef ExampleFeatureComputerOptions Options; + + /// Returns a reference to the frame-extraction options class, which + /// will be part of our own options class. + const FrameExtractionOptions &GetFrameOptions() const { + return opts_.frame_opts; + } + + /// Returns the feature dimension + int32 Dim() const; + + /// Returns true if this function may inspect the raw log-energy of the signal + /// (before windowing and pre-emphasis); it's safe to always return true, but + /// setting it to false enables an optimization. + bool NeedRawLogEnergy() const { return true; } + + /// constructor from options class; it should not store a reference or pointer + /// to the options class but should copy it. + explicit ExampleFeatureComputer(const ExampleFeatureComputerOptions &opts): + opts_(opts) { } + + /// Copy constructor; all of these classes must have one. + ExampleFeatureComputer(const ExampleFeatureComputer &other); + + /** + Function that computes one frame of features from + one frame of signal. + + @param [in] signal_raw_log_energy The log-energy of the frame of the signal + prior to windowing and pre-emphasis, or + log(numeric_limits::min()), whichever is greater. Must be + ignored by this function if this class returns false from + this->NeedRawLogEnergy(). + @param [in] vtln_warp The VTLN warping factor that the user wants + to be applied when computing features for this utterance. Will + normally be 1.0, meaning no warping is to be done. 
The value will + be ignored for feature types that don't support VTLN, such as + spectrogram features. + @param [in] signal_frame One frame of the signal, + as extracted using the function ExtractWindow() using the options + returned by this->GetFrameOptions(). The function will use the + vector as a workspace, which is why it's a non-const pointer. + @param [out] feature Pointer to a vector of size this->Dim(), to which + the computed feature will be written. + */ + void Compute(BaseFloat signal_raw_log_energy, + BaseFloat vtln_warp, + VectorBase *signal_frame, + VectorBase *feature); + + private: + // disallow assignment. + ExampleFeatureComputer &operator = (const ExampleFeatureComputer &in); + Options opts_; +}; + + +/// This templated class is intended for offline feature extraction, i.e. where +/// you have access to the entire signal at the start. It exists mainly to be +/// a drop-in replacement for the old (pre-2016) classes Mfcc, Plp and so on, for +/// use in the offline case. In April 2016 we reorganized the online +/// feature-computation code for greater modularity and to have correct support +/// for the snip-edges=false option. +template +class OfflineFeatureTpl { + public: + typedef typename F::Options Options; + + // Note: feature_window_function_ is the windowing function, which is initialized + // using the options class, that we cache at this level. + OfflineFeatureTpl(const Options &opts): + computer_(opts), + feature_window_function_(computer_.GetFrameOptions()) { } + + // Internal (and back-compatibility) interface for computing features, which + // requires that the user has already checked that the sampling frequency + // of the waveform is equal to the sampling frequency specified in + // the frame-extraction options. + void Compute(const VectorBase &wave, + BaseFloat vtln_warp, + Matrix *output); + + // This const version of Compute() is a wrapper that + // calls the non-const version on a temporary object. 
+ // It's less efficient than the non-const version. + void Compute(const VectorBase &wave, + BaseFloat vtln_warp, + Matrix *output) const; + + /** + Computes the features for one file (one sequence of features). + This is the newer interface where you specify the sample frequency + of the input waveform. + @param [in] wave The input waveform + @param [in] sample_freq The sampling frequency with which + 'wave' was sampled. + If sample_freq differs from the frequency + specified in the config, the waveform is resampled when + --allow-downsample / --allow-upsample permits it; otherwise it's an error. + @param [in] vtln_warp The VTLN warping factor (will normally + be 1.0) + @param [out] output The matrix of features, where the row-index + is the frame index. + */ + void ComputeFeatures(const VectorBase &wave, + BaseFloat sample_freq, + BaseFloat vtln_warp, + Matrix *output); + + int32 Dim() const { return computer_.Dim(); } + + // Copy constructor. + OfflineFeatureTpl(const OfflineFeatureTpl &other): + computer_(other.computer_), + feature_window_function_(other.feature_window_function_) { } + private: + // Disallow assignment. + OfflineFeatureTpl &operator =(const OfflineFeatureTpl &other); + + F computer_; + FeatureWindowFunction feature_window_function_; +}; + +/// @} End of "addtogroup feat" +} // namespace kaldi + + +#include "feat/feature-common-inl.h" + +#endif // KALDI_FEAT_FEATURE_COMMON_H_ diff --git a/speechx/speechx/kaldi/feat/feature-fbank.cc b/speechx/speechx/kaldi/feat/feature-fbank.cc new file mode 100644 index 00000000..d9ac03e5 --- /dev/null +++ b/speechx/speechx/kaldi/feat/feature-fbank.cc @@ -0,0 +1,125 @@ +// feat/feature-fbank.cc + +// Copyright 2009-2012 Karel Vesely +// 2016 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#include "feat/feature-fbank.h" + +namespace kaldi { + +FbankComputer::FbankComputer(const FbankOptions &opts): + opts_(opts), srfft_(NULL) { + if (opts.energy_floor > 0.0) + log_energy_floor_ = Log(opts.energy_floor); + + int32 padded_window_size = opts.frame_opts.PaddedWindowSize(); + if ((padded_window_size & (padded_window_size-1)) == 0) // Is a power of two... + srfft_ = new SplitRadixRealFft(padded_window_size); + + // We'll definitely need the filterbanks info for VTLN warping factor 1.0. + // [note: this call caches it.] 
+ GetMelBanks(1.0); +} + +FbankComputer::FbankComputer(const FbankComputer &other): + opts_(other.opts_), log_energy_floor_(other.log_energy_floor_), + mel_banks_(other.mel_banks_), srfft_(NULL) { + for (std::map::iterator iter = mel_banks_.begin(); + iter != mel_banks_.end(); + ++iter) + iter->second = new MelBanks(*(iter->second)); + if (other.srfft_) + srfft_ = new SplitRadixRealFft(*(other.srfft_)); +} + +FbankComputer::~FbankComputer() { + for (std::map::iterator iter = mel_banks_.begin(); + iter != mel_banks_.end(); ++iter) + delete iter->second; + delete srfft_; +} + +const MelBanks* FbankComputer::GetMelBanks(BaseFloat vtln_warp) { + MelBanks *this_mel_banks = NULL; + std::map::iterator iter = mel_banks_.find(vtln_warp); + if (iter == mel_banks_.end()) { + this_mel_banks = new MelBanks(opts_.mel_opts, + opts_.frame_opts, + vtln_warp); + mel_banks_[vtln_warp] = this_mel_banks; + } else { + this_mel_banks = iter->second; + } + return this_mel_banks; +} + +void FbankComputer::Compute(BaseFloat signal_raw_log_energy, + BaseFloat vtln_warp, + VectorBase *signal_frame, + VectorBase *feature) { + + const MelBanks &mel_banks = *(GetMelBanks(vtln_warp)); + + KALDI_ASSERT(signal_frame->Dim() == opts_.frame_opts.PaddedWindowSize() && + feature->Dim() == this->Dim()); + + + // Compute energy after window function (not the raw one). + if (opts_.use_energy && !opts_.raw_energy) + signal_raw_log_energy = Log(std::max(VecVec(*signal_frame, *signal_frame), + std::numeric_limits::epsilon())); + + if (srfft_ != NULL) // Compute FFT using split-radix algorithm. + srfft_->Compute(signal_frame->Data(), true); + else // An alternative algorithm that works for non-powers-of-two. + RealFft(signal_frame, true); + + // Convert the FFT into a power spectrum. + ComputePowerSpectrum(signal_frame); + SubVector power_spectrum(*signal_frame, 0, + signal_frame->Dim() / 2 + 1); + + // Use magnitude instead of power if requested. 
+ if (!opts_.use_power) + power_spectrum.ApplyPow(0.5); + + int32 mel_offset = ((opts_.use_energy && !opts_.htk_compat) ? 1 : 0); + SubVector mel_energies(*feature, + mel_offset, + opts_.mel_opts.num_bins); + + // Sum with mel filterbanks over the power spectrum + mel_banks.Compute(power_spectrum, &mel_energies); + if (opts_.use_log_fbank) { + // Avoid log of zero (which should be prevented anyway by dithering). + mel_energies.ApplyFloor(std::numeric_limits::epsilon()); + mel_energies.ApplyLog(); // take the log. + } + + // Copy energy as first value (or the last, if htk_compat == true). + if (opts_.use_energy) { + if (opts_.energy_floor > 0.0 && signal_raw_log_energy < log_energy_floor_) { + signal_raw_log_energy = log_energy_floor_; + } + int32 energy_index = opts_.htk_compat ? opts_.mel_opts.num_bins : 0; + (*feature)(energy_index) = signal_raw_log_energy; + } +} + +} // namespace kaldi diff --git a/speechx/speechx/kaldi/feat/feature-fbank.h b/speechx/speechx/kaldi/feat/feature-fbank.h new file mode 100644 index 00000000..f57d185a --- /dev/null +++ b/speechx/speechx/kaldi/feat/feature-fbank.h @@ -0,0 +1,149 @@ +// feat/feature-fbank.h + +// Copyright 2009-2012 Karel Vesely +// 2016 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef KALDI_FEAT_FEATURE_FBANK_H_ +#define KALDI_FEAT_FEATURE_FBANK_H_ + +#include +#include + +#include "feat/feature-common.h" +#include "feat/feature-functions.h" +#include "feat/feature-window.h" +#include "feat/mel-computations.h" + +namespace kaldi { +/// @addtogroup feat FeatureExtraction +/// @{ + + +/// FbankOptions contains basic options for computing filterbank features. +/// It only includes things that can be done in a "stateless" way, i.e. +/// it does not include energy max-normalization. +/// It does not include delta computation. +struct FbankOptions { + FrameExtractionOptions frame_opts; + MelBanksOptions mel_opts; + bool use_energy; // append an extra dimension with energy to the filter banks + BaseFloat energy_floor; + bool raw_energy; // If true, compute energy before preemphasis and windowing + bool htk_compat; // If true, put energy last (if using energy) + bool use_log_fbank; // if true (default), produce log-filterbank, else linear + bool use_power; // if true (default), use power in filterbank analysis, else magnitude. + + FbankOptions(): mel_opts(23), + // defaults the #mel-banks to 23 for the FBANK computations. + // this seems to be common for 16khz-sampled data, + // but for 8khz-sampled data, 15 may be better. + use_energy(false), + energy_floor(0.0), + raw_energy(true), + htk_compat(false), + use_log_fbank(true), + use_power(true) {} + + void Register(OptionsItf *opts) { + frame_opts.Register(opts); + mel_opts.Register(opts); + opts->Register("use-energy", &use_energy, + "Add an extra dimension with energy to the FBANK output."); + opts->Register("energy-floor", &energy_floor, + "Floor on energy (absolute, not relative) in FBANK computation. " + "Only makes a difference if --use-energy=true; only necessary if " + "--dither=0.0. 
Suggested values: 0.1 or 1.0"); + opts->Register("raw-energy", &raw_energy, + "If true, compute energy before preemphasis and windowing"); + opts->Register("htk-compat", &htk_compat, "If true, put energy last. " + "Warning: not sufficient to get HTK compatible features (need " + "to change other parameters)."); + opts->Register("use-log-fbank", &use_log_fbank, + "If true, produce log-filterbank, else produce linear."); + opts->Register("use-power", &use_power, + "If true, use power, else use magnitude."); + } +}; + + +/// Class for computing mel-filterbank features; see \ref feat_mfcc for more +/// information. +class FbankComputer { + public: + typedef FbankOptions Options; + + explicit FbankComputer(const FbankOptions &opts); + FbankComputer(const FbankComputer &other); + + int32 Dim() const { + return opts_.mel_opts.num_bins + (opts_.use_energy ? 1 : 0); + } + + bool NeedRawLogEnergy() const { return opts_.use_energy && opts_.raw_energy; } + + const FrameExtractionOptions &GetFrameOptions() const { + return opts_.frame_opts; + } + + /** + Function that computes one frame of features from + one frame of signal. + + @param [in] signal_raw_log_energy The log-energy of the frame of the signal + prior to windowing and pre-emphasis, or + log(numeric_limits::min()), whichever is greater. Must be + ignored by this function if this class returns false from + this->NeedRawLogEnergy(). + @param [in] vtln_warp The VTLN warping factor that the user wants + to be applied when computing features for this utterance. Will + normally be 1.0, meaning no warping is to be done. The value will + be ignored for feature types that don't support VTLN, such as + spectrogram features. + @param [in] signal_frame One frame of the signal, + as extracted using the function ExtractWindow() using the options + returned by this->GetFrameOptions(). The function will use the + vector as a workspace, which is why it's a non-const pointer. 
+ @param [out] feature Pointer to a vector of size this->Dim(), to which + the computed feature will be written. + */ + void Compute(BaseFloat signal_raw_log_energy, + BaseFloat vtln_warp, + VectorBase *signal_frame, + VectorBase *feature); + + ~FbankComputer(); + + private: + const MelBanks *GetMelBanks(BaseFloat vtln_warp); + + + FbankOptions opts_; + BaseFloat log_energy_floor_; + std::map mel_banks_; // BaseFloat is VTLN coefficient. + SplitRadixRealFft *srfft_; + // Disallow assignment. + FbankComputer &operator =(const FbankComputer &other); +}; + +typedef OfflineFeatureTpl Fbank; + +/// @} End of "addtogroup feat" +} // namespace kaldi + + +#endif // KALDI_FEAT_FEATURE_FBANK_H_ diff --git a/speechx/speechx/kaldi/feat/feature-functions.cc b/speechx/speechx/kaldi/feat/feature-functions.cc new file mode 100644 index 00000000..76500ccf --- /dev/null +++ b/speechx/speechx/kaldi/feat/feature-functions.cc @@ -0,0 +1,362 @@ +// feat/feature-functions.cc + +// Copyright 2009-2011 Karel Vesely; Petr Motlicek; Microsoft Corporation +// 2013 Johns Hopkins University (author: Daniel Povey) +// 2014 IMSL, PKU-HKUST (author: Wei Shi) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ + +#include "feat/feature-functions.h" +#include "matrix/matrix-functions.h" + + +namespace kaldi { + +void ComputePowerSpectrum(VectorBase *waveform) { + int32 dim = waveform->Dim(); + + // no, letting it be non-power-of-two for now. + // KALDI_ASSERT(dim > 0 && (dim & (dim-1) == 0)); // make sure a power of two.. actually my FFT code + // does not require this (dan) but this is better in case we use different code [dan]. + + // RealFft(waveform, true); // true == forward (not inverse) FFT; makes no difference here, + // as we just want power spectrum. + + // now we have in waveform, first half of complex spectrum + // it's stored as [real0, realN/2, real1, im1, real2, im2, ...] + int32 half_dim = dim/2; + BaseFloat first_energy = (*waveform)(0) * (*waveform)(0), + last_energy = (*waveform)(1) * (*waveform)(1); // handle this special case + for (int32 i = 1; i < half_dim; i++) { + BaseFloat real = (*waveform)(i*2), im = (*waveform)(i*2 + 1); + (*waveform)(i) = real*real + im*im; + } + (*waveform)(0) = first_energy; + (*waveform)(half_dim) = last_energy; // Will actually never be used, and anyway + // if the signal has been bandlimited sensibly this should be zero. +} + + +DeltaFeatures::DeltaFeatures(const DeltaFeaturesOptions &opts): opts_(opts) { + KALDI_ASSERT(opts.order >= 0 && opts.order < 1000); // just make sure we don't get binary junk. + // opts will normally be 2 or 3. + KALDI_ASSERT(opts.window > 0 && opts.window < 1000); // again, basic sanity check. + // normally the window size will be two. + + scales_.resize(opts.order+1); + scales_[0].Resize(1); + scales_[0](0) = 1.0; // trivial window for 0th order delta [i.e. baseline feats] + + for (int32 i = 1; i <= opts.order; i++) { + Vector &prev_scales = scales_[i-1], + &cur_scales = scales_[i]; + int32 window = opts.window; // this code is designed to still + // work if instead we later make it an array and do opts.window[i-1], + // or something like that. 
"window" is a parameter specifying delta-window + // width which is actually 2*window + 1. + KALDI_ASSERT(window != 0); + int32 prev_offset = (static_cast(prev_scales.Dim()-1))/2, + cur_offset = prev_offset + window; + cur_scales.Resize(prev_scales.Dim() + 2*window); // also zeros it. + + BaseFloat normalizer = 0.0; + for (int32 j = -window; j <= window; j++) { + normalizer += j*j; + for (int32 k = -prev_offset; k <= prev_offset; k++) { + cur_scales(j+k+cur_offset) += + static_cast(j) * prev_scales(k+prev_offset); + } + } + cur_scales.Scale(1.0 / normalizer); + } +} + +void DeltaFeatures::Process(const MatrixBase &input_feats, + int32 frame, + VectorBase *output_frame) const { + KALDI_ASSERT(frame < input_feats.NumRows()); + int32 num_frames = input_feats.NumRows(), + feat_dim = input_feats.NumCols(); + KALDI_ASSERT(static_cast(output_frame->Dim()) == feat_dim * (opts_.order+1)); + output_frame->SetZero(); + for (int32 i = 0; i <= opts_.order; i++) { + const Vector &scales = scales_[i]; + int32 max_offset = (scales.Dim() - 1) / 2; + SubVector output(*output_frame, i*feat_dim, feat_dim); + for (int32 j = -max_offset; j <= max_offset; j++) { + // if asked to read + int32 offset_frame = frame + j; + if (offset_frame < 0) offset_frame = 0; + else if (offset_frame >= num_frames) + offset_frame = num_frames - 1; + BaseFloat scale = scales(j + max_offset); + if (scale != 0.0) + output.AddVec(scale, input_feats.Row(offset_frame)); + } + } +} + +ShiftedDeltaFeatures::ShiftedDeltaFeatures( + const ShiftedDeltaFeaturesOptions &opts): opts_(opts) { + KALDI_ASSERT(opts.window > 0 && opts.window < 1000); + + // Default window is 1. + int32 window = opts.window; + KALDI_ASSERT(window != 0); + scales_.Resize(1 + 2*window); // also zeros it. 
+ BaseFloat normalizer = 0.0; + for (int32 j = -window; j <= window; j++) { + normalizer += j*j; + scales_(j + window) += static_cast(j); + } + scales_.Scale(1.0 / normalizer); +} + +void ShiftedDeltaFeatures::Process(const MatrixBase &input_feats, + int32 frame, + SubVector *output_frame) const { + KALDI_ASSERT(frame < input_feats.NumRows()); + int32 num_frames = input_feats.NumRows(), + feat_dim = input_feats.NumCols(); + KALDI_ASSERT(static_cast(output_frame->Dim()) + == feat_dim * (opts_.num_blocks + 1)); + output_frame->SetZero(); + + // The original features + SubVector output(*output_frame, 0, feat_dim); + output.AddVec(1.0, input_feats.Row(frame)); + + // Concatenate the delta-blocks. Each block is block_shift + // (usually 3) frames apart. + for (int32 i = 0; i < opts_.num_blocks; i++) { + int32 max_offset = (scales_.Dim() - 1) / 2; + SubVector output(*output_frame, (i + 1) * feat_dim, feat_dim); + for (int32 j = -max_offset; j <= max_offset; j++) { + int32 offset_frame = frame + j + i * opts_.block_shift; + if (offset_frame < 0) offset_frame = 0; + else if (offset_frame >= num_frames) + offset_frame = num_frames - 1; + BaseFloat scale = scales_(j + max_offset); + if (scale != 0.0) + output.AddVec(scale, input_feats.Row(offset_frame)); + } + } +} + +void ComputeDeltas(const DeltaFeaturesOptions &delta_opts, + const MatrixBase &input_features, + Matrix *output_features) { + output_features->Resize(input_features.NumRows(), + input_features.NumCols() + *(delta_opts.order + 1)); + DeltaFeatures delta(delta_opts); + for (int32 r = 0; r < static_cast(input_features.NumRows()); r++) { + SubVector row(*output_features, r); + delta.Process(input_features, r, &row); + } +} + +void ComputeShiftedDeltas(const ShiftedDeltaFeaturesOptions &delta_opts, + const MatrixBase &input_features, + Matrix *output_features) { + output_features->Resize(input_features.NumRows(), + input_features.NumCols() + * (delta_opts.num_blocks + 1)); + ShiftedDeltaFeatures delta(delta_opts); + 
+ for (int32 r = 0; r < static_cast(input_features.NumRows()); r++) { + SubVector row(*output_features, r); + delta.Process(input_features, r, &row); + } +} + + +void InitIdftBases(int32 n_bases, int32 dimension, Matrix *mat_out) { + BaseFloat angle = M_PI / static_cast(dimension - 1); + BaseFloat scale = 1.0f / (2.0 * static_cast(dimension - 1)); + mat_out->Resize(n_bases, dimension); + for (int32 i = 0; i < n_bases; i++) { + (*mat_out)(i, 0) = 1.0 * scale; + BaseFloat i_fl = static_cast(i); + for (int32 j = 1; j < dimension - 1; j++) { + BaseFloat j_fl = static_cast(j); + (*mat_out)(i, j) = 2.0 * scale * cos(angle * i_fl * j_fl); + } + + (*mat_out)(i, dimension -1) + = scale * cos(angle * i_fl * static_cast(dimension-1)); + } +} + +void SpliceFrames(const MatrixBase &input_features, + int32 left_context, + int32 right_context, + Matrix *output_features) { + int32 T = input_features.NumRows(), D = input_features.NumCols(); + if (T == 0 || D == 0) + KALDI_ERR << "SpliceFrames: empty input"; + KALDI_ASSERT(left_context >= 0 && right_context >= 0); + int32 N = 1 + left_context + right_context; + output_features->Resize(T, D*N); + for (int32 t = 0; t < T; t++) { + SubVector dst_row(*output_features, t); + for (int32 j = 0; j < N; j++) { + int32 t2 = t + j - left_context; + if (t2 < 0) t2 = 0; + if (t2 >= T) t2 = T-1; + SubVector dst(dst_row, j*D, D), + src(input_features, t2); + dst.CopyFromVec(src); + } + } +} + +void ReverseFrames(const MatrixBase &input_features, + Matrix *output_features) { + int32 T = input_features.NumRows(), D = input_features.NumCols(); + if (T == 0 || D == 0) + KALDI_ERR << "ReverseFrames: empty input"; + output_features->Resize(T, D); + for (int32 t = 0; t < T; t++) { + SubVector dst_row(*output_features, t); + SubVector src_row(input_features, T-1-t); + dst_row.CopyFromVec(src_row); + } +} + + +void SlidingWindowCmnOptions::Check() const { + KALDI_ASSERT(cmn_window > 0); + if (center) + KALDI_ASSERT(min_window > 0 && min_window <= 
cmn_window); + // else ignored so value doesn't matter. +} + +// Internal version of SlidingWindowCmn with double-precision arguments. +void SlidingWindowCmnInternal(const SlidingWindowCmnOptions &opts, + const MatrixBase &input, + MatrixBase *output) { + opts.Check(); + int32 num_frames = input.NumRows(), dim = input.NumCols(), + last_window_start = -1, last_window_end = -1, + warning_count = 0; + Vector cur_sum(dim), cur_sumsq(dim); + + for (int32 t = 0; t < num_frames; t++) { + int32 window_start, window_end; // note: window_end will be one + // past the end of the window we use for normalization. + if (opts.center) { + window_start = t - (opts.cmn_window / 2); + window_end = window_start + opts.cmn_window; + } else { + window_start = t - opts.cmn_window; + window_end = t + 1; + } + if (window_start < 0) { // shift window right if starts <0. + window_end -= window_start; + window_start = 0; // or: window_start -= window_start + } + if (!opts.center) { + if (window_end > t) + window_end = std::max(t + 1, opts.min_window); + } + if (window_end > num_frames) { + window_start -= (window_end - num_frames); + window_end = num_frames; + if (window_start < 0) window_start = 0; + } + if (last_window_start == -1) { + SubMatrix input_part(input, + window_start, window_end - window_start, + 0, dim); + cur_sum.AddRowSumMat(1.0, input_part , 0.0); + if (opts.normalize_variance) + cur_sumsq.AddDiagMat2(1.0, input_part, kTrans, 0.0); + } else { + if (window_start > last_window_start) { + KALDI_ASSERT(window_start == last_window_start + 1); + SubVector frame_to_remove(input, last_window_start); + cur_sum.AddVec(-1.0, frame_to_remove); + if (opts.normalize_variance) + cur_sumsq.AddVec2(-1.0, frame_to_remove); + } + if (window_end > last_window_end) { + KALDI_ASSERT(window_end == last_window_end + 1); + SubVector frame_to_add(input, last_window_end); + cur_sum.AddVec(1.0, frame_to_add); + if (opts.normalize_variance) + cur_sumsq.AddVec2(1.0, frame_to_add); + } + } + int32 
window_frames = window_end - window_start; + last_window_start = window_start; + last_window_end = window_end; + + KALDI_ASSERT(window_frames > 0); + SubVector input_frame(input, t), + output_frame(*output, t); + output_frame.CopyFromVec(input_frame); + output_frame.AddVec(-1.0 / window_frames, cur_sum); + + if (opts.normalize_variance) { + if (window_frames == 1) { + output_frame.Set(0.0); + } else { + Vector variance(cur_sumsq); + variance.Scale(1.0 / window_frames); + variance.AddVec2(-1.0 / (window_frames * window_frames), cur_sum); + // now "variance" is the variance of the features in the window, + // around their own mean. + int32 num_floored; + variance.ApplyFloor(1.0e-10, &num_floored); + if (num_floored > 0 && num_frames > 1) { + if (opts.max_warnings == warning_count) { + KALDI_WARN << "Suppressing the remaining variance flooring " + << "warnings. Run program with --max-warnings=-1 to " + << "see all warnings."; + } + // If opts.max_warnings is a negative number, we won't restrict the + // number of times that the warning is printed out. + else if (opts.max_warnings < 0 + || opts.max_warnings > warning_count) { + KALDI_WARN << "Flooring when normalizing variance, floored " + << num_floored << " elements; num-frames was " + << window_frames; + } + warning_count++; + } + variance.ApplyPow(-0.5); // get inverse standard deviation. 
+ output_frame.MulElements(variance); + } + } + } +} + + +void SlidingWindowCmn(const SlidingWindowCmnOptions &opts, + const MatrixBase &input, + MatrixBase *output) { + KALDI_ASSERT(SameDim(input, *output) && input.NumRows() > 0); + Matrix input_dbl(input), output_dbl(input.NumRows(), input.NumCols()); + // call double-precision version + SlidingWindowCmnInternal(opts, input_dbl, &output_dbl); + output->CopyFromMat(output_dbl); +} + + + +} // namespace kaldi diff --git a/speechx/speechx/kaldi/feat/feature-functions.h b/speechx/speechx/kaldi/feat/feature-functions.h new file mode 100644 index 00000000..52454f30 --- /dev/null +++ b/speechx/speechx/kaldi/feat/feature-functions.h @@ -0,0 +1,204 @@ +// feat/feature-functions.h + +// Copyright 2009-2011 Karel Vesely; Petr Motlicek; Microsoft Corporation +// 2014 IMSL, PKU-HKUST (author: Wei Shi) +// 2016 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ + +#ifndef KALDI_FEAT_FEATURE_FUNCTIONS_H_ +#define KALDI_FEAT_FEATURE_FUNCTIONS_H_ + +#include +#include + +#include "matrix/matrix-lib.h" +#include "util/common-utils.h" +#include "base/kaldi-error.h" + +namespace kaldi { +/// @addtogroup feat FeatureExtraction +/// @{ + + +// ComputePowerSpectrum converts a complex FFT (as produced by the FFT +// functions in matrix/matrix-functions.h), and converts it into +// a power spectrum. If the complex FFT is a vector of size n (representing +// half the complex FFT of a real signal of size n, as described there), +// this function computes in the first (n/2) + 1 elements of it, the +// energies of the fft bins from zero to the Nyquist frequency. Contents of the +// remaining (n/2) - 1 elements are undefined at output. +void ComputePowerSpectrum(VectorBase *complex_fft); + + +struct DeltaFeaturesOptions { + int32 order; + int32 window; // e.g. 2; controls window size (window size is 2*window + 1) + // the behavior at the edges is to replicate the first or last frame. + // this is not configurable. + + DeltaFeaturesOptions(int32 order = 2, int32 window = 2): + order(order), window(window) { } + void Register(OptionsItf *opts) { + opts->Register("delta-order", &order, "Order of delta computation"); + opts->Register("delta-window", &window, + "Parameter controlling window for delta computation (actual window" + " size for each delta order is 1 + 2*delta-window-size)"); + } +}; + +class DeltaFeatures { + public: + // This class provides a low-level function to compute delta features. + // The function takes as input a matrix of features and a frame index + // that it should compute the deltas on. It puts its output in an object + // of type VectorBase, of size (original-feature-dimension) * (opts.order+1). 
+ // This is not the most efficient way to do the computation, but it's + // state-free and thus easier to understand + + explicit DeltaFeatures(const DeltaFeaturesOptions &opts); + + void Process(const MatrixBase &input_feats, + int32 frame, + VectorBase *output_frame) const; + private: + DeltaFeaturesOptions opts_; + std::vector > scales_; // a scaling window for each + // of the orders, including zero: multiply the features for each + // dimension by this window. +}; + +struct ShiftedDeltaFeaturesOptions { + int32 window, // The time delay and advance + num_blocks, + block_shift; // Distance between consecutive blocks + + ShiftedDeltaFeaturesOptions(): + window(1), num_blocks(7), block_shift(3) { } + void Register(OptionsItf *opts) { + opts->Register("delta-window", &window, "Size of delta advance and delay."); + opts->Register("num-blocks", &num_blocks, "Number of delta blocks in advance" + " of each frame to be concatenated"); + opts->Register("block-shift", &block_shift, "Distance between each block"); + } +}; + +class ShiftedDeltaFeatures { + public: + // This class provides a low-level function to compute shifted + // delta cepstra (SDC). + // The function takes as input a matrix of features and a frame index + // that it should compute the deltas on. It puts its output in an object + // of type VectorBase, of size original-feature-dimension + (1 * num_blocks). + + explicit ShiftedDeltaFeatures(const ShiftedDeltaFeaturesOptions &opts); + + void Process(const MatrixBase &input_feats, + int32 frame, + SubVector *output_frame) const; + private: + ShiftedDeltaFeaturesOptions opts_; + Vector scales_; // a scaling window for each + +}; + +// ComputeDeltas is a convenience function that computes deltas on a feature +// file. If you want to deal with features coming in bit by bit you would have +// to use the DeltaFeatures class directly, and do the computation frame by +// frame. 
Later we will have to come up with a nice mechanism to do this for +// features coming in. +void ComputeDeltas(const DeltaFeaturesOptions &delta_opts, + const MatrixBase &input_features, + Matrix *output_features); + +// ComputeShiftedDeltas computes deltas from a feature file by applying +// ShiftedDeltaFeatures over the frames. This function is provided for +// convenience, however, ShiftedDeltaFeatures can be used directly. +void ComputeShiftedDeltas(const ShiftedDeltaFeaturesOptions &delta_opts, + const MatrixBase &input_features, + Matrix *output_features); + +// SpliceFrames will normally be used together with LDA. +// It splices frames together to make a window. At the +// start and end of an utterance, it duplicates the first +// and last frames. +// Will throw if input features are empty. +// left_context and right_context must be nonnegative. +// these both represent a number of frames (e.g. 4, 4 is +// a good choice). +void SpliceFrames(const MatrixBase &input_features, + int32 left_context, + int32 right_context, + Matrix *output_features); + +// ReverseFrames reverses the frames in time (used for backwards decoding) +void ReverseFrames(const MatrixBase &input_features, + Matrix *output_features); + + +void InitIdftBases(int32 n_bases, int32 dimension, Matrix *mat_out); + + +// This is used for speaker-id. Also see OnlineCmnOptions in ../online2/, which +// is online CMN with no latency, for online speech recognition. 
+struct SlidingWindowCmnOptions { + int32 cmn_window; + int32 min_window; + int32 max_warnings; + bool normalize_variance; + bool center; + + SlidingWindowCmnOptions(): + cmn_window(600), + min_window(100), + max_warnings(5), + normalize_variance(false), + center(false) { } + + void Register(OptionsItf *opts) { + opts->Register("cmn-window", &cmn_window, "Window in frames for running " + "average CMN computation"); + opts->Register("min-cmn-window", &min_window, "Minimum CMN window " + "used at start of decoding (adds latency only at start). " + "Only applicable if center == false, ignored if center==true"); + opts->Register("max-warnings", &max_warnings, "Maximum warnings to report " + "per utterance. 0 to disable, -1 to show all."); + opts->Register("norm-vars", &normalize_variance, "If true, normalize " + "variance to one."); // naming this as in apply-cmvn.cc + opts->Register("center", &center, "If true, use a window centered on the " + "current frame (to the extent possible, modulo end effects). " + "If false, window is to the left."); + } + void Check() const; +}; + + +/// Applies sliding-window cepstral mean and/or variance normalization. See the +/// strings registering the options in the options class for information on how +/// this works and what the options are. input and output must have the same +/// dimension. 
+void SlidingWindowCmn(const SlidingWindowCmnOptions &opts, + const MatrixBase &input, + MatrixBase *output); + + +/// @} End of "addtogroup feat" +} // namespace kaldi + + + +#endif // KALDI_FEAT_FEATURE_FUNCTIONS_H_ diff --git a/speechx/speechx/kaldi/feat/feature-mfcc.cc b/speechx/speechx/kaldi/feat/feature-mfcc.cc new file mode 100644 index 00000000..73ab4b31 --- /dev/null +++ b/speechx/speechx/kaldi/feat/feature-mfcc.cc @@ -0,0 +1,157 @@ +// feat/feature-mfcc.cc + +// Copyright 2009-2011 Karel Vesely; Petr Motlicek +// 2016 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#include "feat/feature-mfcc.h" + + +namespace kaldi { + + +void MfccComputer::Compute(BaseFloat signal_raw_log_energy, + BaseFloat vtln_warp, + VectorBase *signal_frame, + VectorBase *feature) { + KALDI_ASSERT(signal_frame->Dim() == opts_.frame_opts.PaddedWindowSize() && + feature->Dim() == this->Dim()); + + const MelBanks &mel_banks = *(GetMelBanks(vtln_warp)); + + if (opts_.use_energy && !opts_.raw_energy) + signal_raw_log_energy = Log(std::max(VecVec(*signal_frame, *signal_frame), + std::numeric_limits::epsilon())); + + if (srfft_ != NULL) // Compute FFT using the split-radix algorithm. + srfft_->Compute(signal_frame->Data(), true); + else // An alternative algorithm that works for non-powers-of-two. 
+ RealFft(signal_frame, true); + + // Convert the FFT into a power spectrum. + ComputePowerSpectrum(signal_frame); + SubVector power_spectrum(*signal_frame, 0, + signal_frame->Dim() / 2 + 1); + + mel_banks.Compute(power_spectrum, &mel_energies_); + + // avoid log of zero (which should be prevented anyway by dithering). + mel_energies_.ApplyFloor(std::numeric_limits::epsilon()); + mel_energies_.ApplyLog(); // take the log. + + feature->SetZero(); // in case there were NaNs. + // feature = dct_matrix_ * mel_energies [which now have log] + feature->AddMatVec(1.0, dct_matrix_, kNoTrans, mel_energies_, 0.0); + + if (opts_.cepstral_lifter != 0.0) + feature->MulElements(lifter_coeffs_); + + if (opts_.use_energy) { + if (opts_.energy_floor > 0.0 && signal_raw_log_energy < log_energy_floor_) + signal_raw_log_energy = log_energy_floor_; + (*feature)(0) = signal_raw_log_energy; + } + + if (opts_.htk_compat) { + BaseFloat energy = (*feature)(0); + for (int32 i = 0; i < opts_.num_ceps - 1; i++) + (*feature)(i) = (*feature)(i+1); + if (!opts_.use_energy) + energy *= M_SQRT2; // scale on C0 (actually removing a scale + // we previously added that's part of one common definition of + // the cosine transform.) + (*feature)(opts_.num_ceps - 1) = energy; + } +} + +MfccComputer::MfccComputer(const MfccOptions &opts): + opts_(opts), srfft_(NULL), + mel_energies_(opts.mel_opts.num_bins) { + + int32 num_bins = opts.mel_opts.num_bins; + if (opts.num_ceps > num_bins) + KALDI_ERR << "num-ceps cannot be larger than num-mel-bins." + << " It should be smaller or equal. You provided num-ceps: " + << opts.num_ceps << " and num-mel-bins: " + << num_bins; + + Matrix dct_matrix(num_bins, num_bins); + ComputeDctMatrix(&dct_matrix); + // Note that we include zeroth dct in either case. If using the + // energy we replace this with the energy. This means a different + // ordering of features than HTK. 
+ SubMatrix dct_rows(dct_matrix, 0, opts.num_ceps, 0, num_bins); + dct_matrix_.Resize(opts.num_ceps, num_bins); + dct_matrix_.CopyFromMat(dct_rows); // subset of rows. + if (opts.cepstral_lifter != 0.0) { + lifter_coeffs_.Resize(opts.num_ceps); + ComputeLifterCoeffs(opts.cepstral_lifter, &lifter_coeffs_); + } + if (opts.energy_floor > 0.0) + log_energy_floor_ = Log(opts.energy_floor); + + int32 padded_window_size = opts.frame_opts.PaddedWindowSize(); + if ((padded_window_size & (padded_window_size-1)) == 0) // Is a power of two... + srfft_ = new SplitRadixRealFft(padded_window_size); + + // We'll definitely need the filterbanks info for VTLN warping factor 1.0. + // [note: this call caches it.] + GetMelBanks(1.0); +} + +MfccComputer::MfccComputer(const MfccComputer &other): + opts_(other.opts_), lifter_coeffs_(other.lifter_coeffs_), + dct_matrix_(other.dct_matrix_), + log_energy_floor_(other.log_energy_floor_), + mel_banks_(other.mel_banks_), + srfft_(NULL), + mel_energies_(other.mel_energies_.Dim(), kUndefined) { + for (std::map::iterator iter = mel_banks_.begin(); + iter != mel_banks_.end(); ++iter) + iter->second = new MelBanks(*(iter->second)); + if (other.srfft_ != NULL) + srfft_ = new SplitRadixRealFft(*(other.srfft_)); +} + + + +MfccComputer::~MfccComputer() { + for (std::map::iterator iter = mel_banks_.begin(); + iter != mel_banks_.end(); + ++iter) + delete iter->second; + delete srfft_; +} + +const MelBanks *MfccComputer::GetMelBanks(BaseFloat vtln_warp) { + MelBanks *this_mel_banks = NULL; + std::map::iterator iter = mel_banks_.find(vtln_warp); + if (iter == mel_banks_.end()) { + this_mel_banks = new MelBanks(opts_.mel_opts, + opts_.frame_opts, + vtln_warp); + mel_banks_[vtln_warp] = this_mel_banks; + } else { + this_mel_banks = iter->second; + } + return this_mel_banks; +} + + + +} // namespace kaldi diff --git a/speechx/speechx/kaldi/feat/feature-mfcc.h b/speechx/speechx/kaldi/feat/feature-mfcc.h new file mode 100644 index 00000000..dbfb9d60 --- 
/dev/null +++ b/speechx/speechx/kaldi/feat/feature-mfcc.h @@ -0,0 +1,154 @@ +// feat/feature-mfcc.h + +// Copyright 2009-2011 Karel Vesely; Petr Motlicek; Saarland University +// 2014-2016 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_FEAT_FEATURE_MFCC_H_ +#define KALDI_FEAT_FEATURE_MFCC_H_ + +#include +#include + +#include "feat/feature-common.h" +#include "feat/feature-functions.h" +#include "feat/feature-window.h" +#include "feat/mel-computations.h" + +namespace kaldi { +/// @addtogroup feat FeatureExtraction +/// @{ + + +/// MfccOptions contains basic options for computing MFCC features. +struct MfccOptions { + FrameExtractionOptions frame_opts; + MelBanksOptions mel_opts; + int32 num_ceps; // e.g. 13: num cepstral coeffs, counting zero. + bool use_energy; // use energy; else C0 + BaseFloat energy_floor; // 0 by default; set to a value like 1.0 or 0.1 if + // you disable dithering. + bool raw_energy; // If true, compute energy before preemphasis and windowing + BaseFloat cepstral_lifter; // Scaling factor on cepstra for HTK compatibility. + // if 0.0, no liftering is done. + bool htk_compat; // if true, put energy/C0 last and introduce a factor of + // sqrt(2) on C0 to be the same as HTK. 
+ + MfccOptions() : mel_opts(23), + // defaults the #mel-banks to 23 for the MFCC computations. + // this seems to be common for 16khz-sampled data, + // but for 8khz-sampled data, 15 may be better. + num_ceps(13), + use_energy(true), + energy_floor(0.0), + raw_energy(true), + cepstral_lifter(22.0), + htk_compat(false) {} + + void Register(OptionsItf *opts) { + frame_opts.Register(opts); + mel_opts.Register(opts); + opts->Register("num-ceps", &num_ceps, + "Number of cepstra in MFCC computation (including C0)"); + opts->Register("use-energy", &use_energy, + "Use energy (not C0) in MFCC computation"); + opts->Register("energy-floor", &energy_floor, + "Floor on energy (absolute, not relative) in MFCC computation. " + "Only makes a difference if --use-energy=true; only necessary if " + "--dither=0.0. Suggested values: 0.1 or 1.0"); + opts->Register("raw-energy", &raw_energy, + "If true, compute energy before preemphasis and windowing"); + opts->Register("cepstral-lifter", &cepstral_lifter, + "Constant that controls scaling of MFCCs"); + opts->Register("htk-compat", &htk_compat, + "If true, put energy or C0 last and use a factor of sqrt(2) on " + "C0. Warning: not sufficient to get HTK compatible features " + "(need to change other parameters)."); + } +}; + + + +// This is the new-style interface to the MFCC computation. +class MfccComputer { + public: + typedef MfccOptions Options; + explicit MfccComputer(const MfccOptions &opts); + MfccComputer(const MfccComputer &other); + + const FrameExtractionOptions &GetFrameOptions() const { + return opts_.frame_opts; + } + + int32 Dim() const { return opts_.num_ceps; } + + bool NeedRawLogEnergy() const { return opts_.use_energy && opts_.raw_energy; } + + /** + Function that computes one frame of features from + one frame of signal. + + @param [in] signal_raw_log_energy The log-energy of the frame of the signal + prior to windowing and pre-emphasis, or + log(numeric_limits::min()), whichever is greater. 
Must be + ignored by this function if this class returns false from + this->NeedRawLogEnergy(). + @param [in] vtln_warp The VTLN warping factor that the user wants + to be applied when computing features for this utterance. Will + normally be 1.0, meaning no warping is to be done. The value will + be ignored for feature types that don't support VTLN, such as + spectrogram features. + @param [in] signal_frame One frame of the signal, + as extracted using the function ExtractWindow() using the options + returned by this->GetFrameOptions(). The function will use the + vector as a workspace, which is why it's a non-const pointer. + @param [out] feature Pointer to a vector of size this->Dim(), to which + the computed feature will be written. + */ + void Compute(BaseFloat signal_raw_log_energy, + BaseFloat vtln_warp, + VectorBase *signal_frame, + VectorBase *feature); + + ~MfccComputer(); + private: + // disallow assignment. + MfccComputer &operator = (const MfccComputer &in); + + protected: + const MelBanks *GetMelBanks(BaseFloat vtln_warp); + + MfccOptions opts_; + Vector lifter_coeffs_; + Matrix dct_matrix_; // matrix we left-multiply by to perform DCT. + BaseFloat log_energy_floor_; + std::map mel_banks_; // BaseFloat is VTLN coefficient. + SplitRadixRealFft *srfft_; + + // note: mel_energies_ is specific to the frame we're processing, it's + // just a temporary workspace. 
+ Vector mel_energies_; +}; + +typedef OfflineFeatureTpl Mfcc; + + +/// @} End of "addtogroup feat" +} // namespace kaldi + + +#endif // KALDI_FEAT_FEATURE_MFCC_H_ diff --git a/speechx/speechx/kaldi/feat/feature-plp.cc b/speechx/speechx/kaldi/feat/feature-plp.cc new file mode 100644 index 00000000..e0c270c7 --- /dev/null +++ b/speechx/speechx/kaldi/feat/feature-plp.cc @@ -0,0 +1,191 @@ +// feat/feature-plp.cc + +// Copyright 2009-2011 Petr Motlicek; Karel Vesely +// 2016 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ + +#include "feat/feature-plp.h" + +namespace kaldi { + +PlpComputer::PlpComputer(const PlpOptions &opts): + opts_(opts), srfft_(NULL), + mel_energies_duplicated_(opts_.mel_opts.num_bins + 2, kUndefined), + autocorr_coeffs_(opts_.lpc_order + 1, kUndefined), + lpc_coeffs_(opts_.lpc_order, kUndefined), + raw_cepstrum_(opts_.lpc_order, kUndefined) { + + if (opts.cepstral_lifter != 0.0) { + lifter_coeffs_.Resize(opts.num_ceps); + ComputeLifterCoeffs(opts.cepstral_lifter, &lifter_coeffs_); + } + InitIdftBases(opts_.lpc_order + 1, opts_.mel_opts.num_bins + 2, + &idft_bases_); + + if (opts.energy_floor > 0.0) + log_energy_floor_ = Log(opts.energy_floor); + + int32 padded_window_size = opts.frame_opts.PaddedWindowSize(); + if ((padded_window_size & (padded_window_size-1)) == 0) // Is a power of two... + srfft_ = new SplitRadixRealFft(padded_window_size); + + // We'll definitely need the filterbanks info for VTLN warping factor 1.0. + // [note: this call caches it.] + GetMelBanks(1.0); +} + +PlpComputer::PlpComputer(const PlpComputer &other): + opts_(other.opts_), lifter_coeffs_(other.lifter_coeffs_), + idft_bases_(other.idft_bases_), log_energy_floor_(other.log_energy_floor_), + mel_banks_(other.mel_banks_), equal_loudness_(other.equal_loudness_), + srfft_(NULL), + mel_energies_duplicated_(opts_.mel_opts.num_bins + 2, kUndefined), + autocorr_coeffs_(opts_.lpc_order + 1, kUndefined), + lpc_coeffs_(opts_.lpc_order, kUndefined), + raw_cepstrum_(opts_.lpc_order, kUndefined) { + for (std::map::iterator iter = mel_banks_.begin(); + iter != mel_banks_.end(); ++iter) + iter->second = new MelBanks(*(iter->second)); + for (std::map*>::iterator + iter = equal_loudness_.begin(); + iter != equal_loudness_.end(); ++iter) + iter->second = new Vector(*(iter->second)); + if (other.srfft_ != NULL) + srfft_ = new SplitRadixRealFft(*(other.srfft_)); +} + +PlpComputer::~PlpComputer() { + for (std::map::iterator iter = mel_banks_.begin(); + iter != mel_banks_.end(); ++iter) + delete 
iter->second; + for (std::map* >::iterator + iter = equal_loudness_.begin(); + iter != equal_loudness_.end(); ++iter) + delete iter->second; + delete srfft_; +} + +const MelBanks *PlpComputer::GetMelBanks(BaseFloat vtln_warp) { + MelBanks *this_mel_banks = NULL; + std::map::iterator iter = mel_banks_.find(vtln_warp); + if (iter == mel_banks_.end()) { + this_mel_banks = new MelBanks(opts_.mel_opts, + opts_.frame_opts, + vtln_warp); + mel_banks_[vtln_warp] = this_mel_banks; + } else { + this_mel_banks = iter->second; + } + return this_mel_banks; +} + +const Vector *PlpComputer::GetEqualLoudness(BaseFloat vtln_warp) { + const MelBanks *this_mel_banks = GetMelBanks(vtln_warp); + Vector *ans = NULL; + std::map*>::iterator iter + = equal_loudness_.find(vtln_warp); + if (iter == equal_loudness_.end()) { + ans = new Vector; + GetEqualLoudnessVector(*this_mel_banks, ans); + equal_loudness_[vtln_warp] = ans; + } else { + ans = iter->second; + } + return ans; +} + +void PlpComputer::Compute(BaseFloat signal_raw_log_energy, + BaseFloat vtln_warp, + VectorBase *signal_frame, + VectorBase *feature) { + KALDI_ASSERT(signal_frame->Dim() == opts_.frame_opts.PaddedWindowSize() && + feature->Dim() == this->Dim()); + + const MelBanks &mel_banks = *GetMelBanks(vtln_warp); + const Vector &equal_loudness = *GetEqualLoudness(vtln_warp); + + + KALDI_ASSERT(opts_.num_ceps <= opts_.lpc_order+1); // our num-ceps includes C0. + + + if (opts_.use_energy && !opts_.raw_energy) + signal_raw_log_energy = Log(std::max(VecVec(*signal_frame, *signal_frame), + std::numeric_limits::min())); + + if (srfft_ != NULL) // Compute FFT using split-radix algorithm. + srfft_->Compute(signal_frame->Data(), true); + else // An alternative algorithm that works for non-powers-of-two. + RealFft(signal_frame, true); + + // Convert the FFT into a power spectrum. + ComputePowerSpectrum(signal_frame); // elements 0 ... 
signal_frame->Dim()/2 + + SubVector power_spectrum(*signal_frame, + 0, signal_frame->Dim() / 2 + 1); + + int32 num_mel_bins = opts_.mel_opts.num_bins; + + SubVector mel_energies(mel_energies_duplicated_, 1, num_mel_bins); + + mel_banks.Compute(power_spectrum, &mel_energies); + + mel_energies.MulElements(equal_loudness); + + mel_energies.ApplyPow(opts_.compress_factor); + + // duplicate first and last elements + mel_energies_duplicated_(0) = mel_energies_duplicated_(1); + mel_energies_duplicated_(num_mel_bins + 1) = + mel_energies_duplicated_(num_mel_bins); + + autocorr_coeffs_.SetZero(); // In case of NaNs or infs + autocorr_coeffs_.AddMatVec(1.0, idft_bases_, kNoTrans, + mel_energies_duplicated_, 0.0); + + BaseFloat residual_log_energy = ComputeLpc(autocorr_coeffs_, &lpc_coeffs_); + + residual_log_energy = std::max(residual_log_energy, + std::numeric_limits::min()); + + Lpc2Cepstrum(opts_.lpc_order, lpc_coeffs_.Data(), raw_cepstrum_.Data()); + feature->Range(1, opts_.num_ceps - 1).CopyFromVec( + raw_cepstrum_.Range(0, opts_.num_ceps - 1)); + (*feature)(0) = residual_log_energy; + + if (opts_.cepstral_lifter != 0.0) + feature->MulElements(lifter_coeffs_); + + if (opts_.cepstral_scale != 1.0) + feature->Scale(opts_.cepstral_scale); + + if (opts_.use_energy) { + if (opts_.energy_floor > 0.0 && signal_raw_log_energy < log_energy_floor_) + signal_raw_log_energy = log_energy_floor_; + (*feature)(0) = signal_raw_log_energy; + } + + if (opts_.htk_compat) { // reorder the features. 
+ BaseFloat log_energy = (*feature)(0); + for (int32 i = 0; i < opts_.num_ceps-1; i++) + (*feature)(i) = (*feature)(i+1); + (*feature)(opts_.num_ceps-1) = log_energy; + } +} + + +} // namespace kaldi diff --git a/speechx/speechx/kaldi/feat/feature-plp.h b/speechx/speechx/kaldi/feat/feature-plp.h new file mode 100644 index 00000000..4f156ca1 --- /dev/null +++ b/speechx/speechx/kaldi/feat/feature-plp.h @@ -0,0 +1,176 @@ +// feat/feature-plp.h + +// Copyright 2009-2011 Petr Motlicek; Karel Vesely + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_FEAT_FEATURE_PLP_H_ +#define KALDI_FEAT_FEATURE_PLP_H_ + +#include +#include + +#include "feat/feature-common.h" +#include "feat/feature-functions.h" +#include "feat/feature-window.h" +#include "feat/mel-computations.h" +#include "itf/options-itf.h" + +namespace kaldi { +/// @addtogroup feat FeatureExtraction +/// @{ + + + +/// PlpOptions contains basic options for computing PLP features. +/// It only includes things that can be done in a "stateless" way, i.e. +/// it does not include energy max-normalization. +/// It does not include delta computation. 
+struct PlpOptions { + FrameExtractionOptions frame_opts; + MelBanksOptions mel_opts; + int32 lpc_order; + int32 num_ceps; // num cepstra including zero + bool use_energy; // use energy; else C0 + BaseFloat energy_floor; + bool raw_energy; // If true, compute energy before preemphasis and windowing + BaseFloat compress_factor; + int32 cepstral_lifter; + BaseFloat cepstral_scale; + + bool htk_compat; // if true, put energy/C0 last. Unlike the MFCC code, + // no sqrt(2) factor is applied to C0 here. + + PlpOptions() : mel_opts(23), + // default number of mel-banks for the PLP computation; this + // seems to be common for 16kHz-sampled data. For 8kHz-sampled + // data, 15 may be better. + lpc_order(12), + num_ceps(13), + use_energy(true), + energy_floor(0.0), + raw_energy(true), + compress_factor(0.33333), + cepstral_lifter(22), + cepstral_scale(1.0), + htk_compat(false) {} + + void Register(OptionsItf *opts) { + frame_opts.Register(opts); + mel_opts.Register(opts); + opts->Register("lpc-order", &lpc_order, + "Order of LPC analysis in PLP computation"); + opts->Register("num-ceps", &num_ceps, + "Number of cepstra in PLP computation (including C0)"); + opts->Register("use-energy", &use_energy, + "Use energy (not C0) for zeroth PLP feature"); + opts->Register("energy-floor", &energy_floor, + "Floor on energy (absolute, not relative) in PLP computation. " + "Only makes a difference if --use-energy=true; only necessary if " + "--dither=0.0. Suggested values: 0.1 or 1.0"); + opts->Register("raw-energy", &raw_energy, + "If true, compute energy before preemphasis and windowing"); + opts->Register("compress-factor", &compress_factor, + "Compression factor in PLP computation"); + opts->Register("cepstral-lifter", &cepstral_lifter, + "Constant that controls scaling of PLPs"); + opts->Register("cepstral-scale", &cepstral_scale, + "Scaling constant in PLP computation"); + opts->Register("htk-compat", &htk_compat, + "If true, put energy or C0 last. 
Warning: not sufficient " + "to get HTK compatible features (need to change other " + "parameters)."); + } +}; + + +/// This is the new-style interface to the PLP computation. +class PlpComputer { + public: + typedef PlpOptions Options; + explicit PlpComputer(const PlpOptions &opts); + PlpComputer(const PlpComputer &other); + + const FrameExtractionOptions &GetFrameOptions() const { + return opts_.frame_opts; + } + + int32 Dim() const { return opts_.num_ceps; } + + bool NeedRawLogEnergy() const { return opts_.use_energy && opts_.raw_energy; } + + /** + Function that computes one frame of features from + one frame of signal. + + @param [in] signal_raw_log_energy The log-energy of the frame of the signal + prior to windowing and pre-emphasis, or + log(numeric_limits::min()), whichever is greater. Must be + ignored by this function if this class returns false from + this->NeedRawLogEnergy(). + @param [in] vtln_warp The VTLN warping factor that the user wants + to be applied when computing features for this utterance. Will + normally be 1.0, meaning no warping is to be done. The value will + be ignored for feature types that don't support VTLN, such as + spectrogram features. + @param [in] signal_frame One frame of the signal, + as extracted using the function ExtractWindow() using the options + returned by this->GetFrameOptions(). The function will use the + vector as a workspace, which is why it's a non-const pointer. + @param [out] feature Pointer to a vector of size this->Dim(), to which + the computed feature will be written. + */ + void Compute(BaseFloat signal_raw_log_energy, + BaseFloat vtln_warp, + VectorBase *signal_frame, + VectorBase *feature); + + ~PlpComputer(); + private: + + const MelBanks *GetMelBanks(BaseFloat vtln_warp); + + const Vector *GetEqualLoudness(BaseFloat vtln_warp); + + PlpOptions opts_; + Vector lifter_coeffs_; + Matrix idft_bases_; + BaseFloat log_energy_floor_; + std::map mel_banks_; // BaseFloat is VTLN coefficient. 
+ std::map* > equal_loudness_; + SplitRadixRealFft *srfft_; + + // temporary vector used inside Compute; size is opts_.mel_opts.num_bins + 2 + Vector mel_energies_duplicated_; + // temporary vector used inside Compute; size is opts_.lpc_order + 1 + Vector autocorr_coeffs_; + // temporary vector used inside Compute; size is opts_.lpc_order + Vector lpc_coeffs_; + // temporary vector used inside Compute; size is opts_.lpc_order + Vector raw_cepstrum_; + + // Disallow assignment. + PlpComputer &operator =(const PlpComputer &other); +}; + +typedef OfflineFeatureTpl Plp; + +/// @} End of "addtogroup feat" + +} // namespace kaldi + + +#endif // KALDI_FEAT_FEATURE_PLP_H_ diff --git a/speechx/speechx/kaldi/feat/feature-spectrogram.cc b/speechx/speechx/kaldi/feat/feature-spectrogram.cc new file mode 100644 index 00000000..7eee2643 --- /dev/null +++ b/speechx/speechx/kaldi/feat/feature-spectrogram.cc @@ -0,0 +1,82 @@ +// feat/feature-spectrogram.cc + +// Copyright 2009-2012 Karel Vesely +// Copyright 2012 Navdeep Jaitly + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ + +#include "feat/feature-spectrogram.h" + + +namespace kaldi { + +SpectrogramComputer::SpectrogramComputer(const SpectrogramOptions &opts) + : opts_(opts), srfft_(NULL) { + if (opts.energy_floor > 0.0) + log_energy_floor_ = Log(opts.energy_floor); + + int32 padded_window_size = opts.frame_opts.PaddedWindowSize(); + if ((padded_window_size & (padded_window_size-1)) == 0) // Is a power of two + srfft_ = new SplitRadixRealFft(padded_window_size); +} + +SpectrogramComputer::SpectrogramComputer(const SpectrogramComputer &other): + opts_(other.opts_), log_energy_floor_(other.log_energy_floor_), srfft_(NULL) { + if (other.srfft_ != NULL) + srfft_ = new SplitRadixRealFft(*other.srfft_); +} + +SpectrogramComputer::~SpectrogramComputer() { + delete srfft_; +} + +void SpectrogramComputer::Compute(BaseFloat signal_raw_log_energy, + BaseFloat vtln_warp, + VectorBase *signal_frame, + VectorBase *feature) { + KALDI_ASSERT(signal_frame->Dim() == opts_.frame_opts.PaddedWindowSize() && + feature->Dim() == this->Dim()); + + + // Compute energy after window function (not the raw one) + if (!opts_.raw_energy) + signal_raw_log_energy = Log(std::max(VecVec(*signal_frame, *signal_frame), + std::numeric_limits::epsilon())); + + if (srfft_ != NULL) // Compute FFT using split-radix algorithm. + srfft_->Compute(signal_frame->Data(), true); + else // An alternative algorithm that works for non-powers-of-two + RealFft(signal_frame, true); + + // Convert the FFT into a power spectrum. + ComputePowerSpectrum(signal_frame); + SubVector power_spectrum(*signal_frame, + 0, signal_frame->Dim() / 2 + 1); + + power_spectrum.ApplyFloor(std::numeric_limits::epsilon()); + power_spectrum.ApplyLog(); + + feature->CopyFromVec(power_spectrum); + + if (opts_.energy_floor > 0.0 && signal_raw_log_energy < log_energy_floor_) + signal_raw_log_energy = log_energy_floor_; + // The zeroth spectrogram component is always set to the signal energy, + // instead of the square of the constant component of the signal. 
+ (*feature)(0) = signal_raw_log_energy; +} + +} // namespace kaldi diff --git a/speechx/speechx/kaldi/feat/feature-spectrogram.h b/speechx/speechx/kaldi/feat/feature-spectrogram.h new file mode 100644 index 00000000..132a6875 --- /dev/null +++ b/speechx/speechx/kaldi/feat/feature-spectrogram.h @@ -0,0 +1,117 @@ +// feat/feature-spectrogram.h + +// Copyright 2009-2012 Karel Vesely +// Copyright 2012 Navdeep Jaitly + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_FEAT_FEATURE_SPECTROGRAM_H_ +#define KALDI_FEAT_FEATURE_SPECTROGRAM_H_ + + +#include + +#include "feat/feature-common.h" +#include "feat/feature-functions.h" +#include "feat/feature-window.h" + +namespace kaldi { +/// @addtogroup feat FeatureExtraction +/// @{ + + +/// SpectrogramOptions contains basic options for computing spectrogram +/// features. +struct SpectrogramOptions { + FrameExtractionOptions frame_opts; + BaseFloat energy_floor; + bool raw_energy; // If true, compute energy before preemphasis and windowing + + SpectrogramOptions() : + energy_floor(0.0), + raw_energy(true) {} + + void Register(OptionsItf *opts) { + frame_opts.Register(opts); + opts->Register("energy-floor", &energy_floor, + "Floor on energy (absolute, not relative) in Spectrogram " + "computation. 
Caution: this floor is applied to the zeroth " + "component, representing the total signal energy. The " + "floor on the individual spectrogram elements is fixed at " + "std::numeric_limits::epsilon()."); + opts->Register("raw-energy", &raw_energy, + "If true, compute energy before preemphasis and windowing"); + } +}; + +/// Class for computing spectrogram features. +class SpectrogramComputer { + public: + typedef SpectrogramOptions Options; + explicit SpectrogramComputer(const SpectrogramOptions &opts); + SpectrogramComputer(const SpectrogramComputer &other); + + const FrameExtractionOptions& GetFrameOptions() const { + return opts_.frame_opts; + } + + int32 Dim() const { return opts_.frame_opts.PaddedWindowSize() / 2 + 1; } + + bool NeedRawLogEnergy() const { return opts_.raw_energy; } + + + /** + Function that computes one frame of spectrogram features from + one frame of signal. + + @param [in] signal_raw_log_energy The log-energy of the frame of the signal + prior to windowing and pre-emphasis, or + log(numeric_limits::min()), whichever is greater. Must be + ignored by this function if this class returns false from + this->NeedsRawLogEnergy(). + @param [in] vtln_warp This is ignored by this function, it's only + needed for interface compatibility. + @param [in] signal_frame One frame of the signal, + as extracted using the function ExtractWindow() using the options + returned by this->GetFrameOptions(). The function will use the + vector as a workspace, which is why it's a non-const pointer. + @param [out] feature Pointer to a vector of size this->Dim(), to which + the computed feature will be written. + */ + void Compute(BaseFloat signal_raw_log_energy, + BaseFloat vtln_warp, + VectorBase *signal_frame, + VectorBase *feature); + + ~SpectrogramComputer(); + + private: + SpectrogramOptions opts_; + BaseFloat log_energy_floor_; + SplitRadixRealFft *srfft_; + + // Disallow assignment. 
+ SpectrogramComputer &operator=(const SpectrogramComputer &other); +}; + +typedef OfflineFeatureTpl Spectrogram; + + +/// @} End of "addtogroup feat" +} // namespace kaldi + + +#endif // KALDI_FEAT_FEATURE_SPECTROGRAM_H_ diff --git a/speechx/speechx/kaldi/feat/feature-window.cc b/speechx/speechx/kaldi/feat/feature-window.cc new file mode 100644 index 00000000..c5d4cc29 --- /dev/null +++ b/speechx/speechx/kaldi/feat/feature-window.cc @@ -0,0 +1,222 @@ +// feat/feature-window.cc + +// Copyright 2009-2011 Karel Vesely; Petr Motlicek; Microsoft Corporation +// 2013-2016 Johns Hopkins University (author: Daniel Povey) +// 2014 IMSL, PKU-HKUST (author: Wei Shi) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+
+
+#include "feat/feature-window.h"
+#include "matrix/matrix-functions.h"
+
+
+namespace kaldi {
+
+
+// Returns the index of the first sample of frame 'frame'. With snip_edges the
+// frame starts at frame * shift; otherwise the frame is centred on the middle
+// of its shift interval, so the result can be negative.
+int64 FirstSampleOfFrame(int32 frame,
+                         const FrameExtractionOptions &opts) {
+  int64 frame_shift = opts.WindowShift();
+  if (opts.snip_edges) {
+    return frame * frame_shift;
+  } else {
+    int64 midpoint_of_frame = frame_shift * frame + frame_shift / 2,
+        beginning_of_frame = midpoint_of_frame - opts.WindowSize() / 2;
+    return beginning_of_frame;
+  }
+}
+
+// Returns how many frames can be extracted from 'num_samples' samples.
+// 'flush' only affects the result when opts.snip_edges is false.
+int32 NumFrames(int64 num_samples,
+                const FrameExtractionOptions &opts,
+                bool flush) {
+  int64 frame_shift = opts.WindowShift();
+  int64 frame_length = opts.WindowSize();
+  if (opts.snip_edges) {
+    // with --snip-edges=true (the default), we use a HTK-like approach to
+    // determining the number of frames-- all frames have to fit completely into
+    // the waveform, and the first frame begins at sample zero.
+    if (num_samples < frame_length)
+      return 0;
+    else
+      return (1 + ((num_samples - frame_length) / frame_shift));
+    // You can understand the expression above as follows: 'num_samples -
+    // frame_length' is how much room we have to shift the frame within the
+    // waveform; 'frame_shift' is how much we shift it each time; and the ratio
+    // is how many times we can shift it (integer arithmetic rounds down).
+  } else {
+    // if --snip-edges=false, the number of frames is determined by rounding the
+    // (file-length / frame-shift) to the nearest integer. The point of this
+    // formula is to make the number of frames an obvious and predictable
+    // function of the frame shift and signal length, which makes many
+    // segmentation-related questions simpler.
+    //
+    // Because integer division in C++ rounds toward zero, we add (half the
+    // frame-shift minus epsilon) before dividing, to have the effect of
+    // rounding towards the closest integer.
+    int32 num_frames = (num_samples + (frame_shift / 2)) / frame_shift;
+
+    if (flush)
+      return num_frames;
+
+    // note: 'end' always means the last plus one, i.e. one past the last.
+    int64 end_sample_of_last_frame = FirstSampleOfFrame(num_frames - 1, opts)
+        + frame_length;
+
+    // the following code is optimized more for clarity than efficiency.
+    // If flush == false, we can't output frames that extend past the end
+    // of the signal.
+    while (num_frames > 0 && end_sample_of_last_frame > num_samples) {
+      num_frames--;
+      end_sample_of_last_frame -= frame_shift;
+    }
+    return num_frames;
+  }
+}
+
+
+// Adds RandGauss() noise scaled by 'dither_value' to every sample, in place.
+// A dither_value of exactly 0.0 leaves the waveform untouched.
+void Dither(VectorBase<BaseFloat> *waveform, BaseFloat dither_value) {
+  if (dither_value == 0.0)
+    return;
+  int32 dim = waveform->Dim();
+  BaseFloat *data = waveform->Data();
+  RandomState rstate;
+  for (int32 i = 0; i < dim; i++)
+    data[i] += RandGauss(&rstate) * dither_value;
+}
+
+
+// Applies first-order pre-emphasis in place. The loop runs from the last
+// sample backwards so that each update reads the still-unmodified previous
+// sample; sample 0 uses itself as its predecessor.
+void Preemphasize(VectorBase<BaseFloat> *waveform, BaseFloat preemph_coeff) {
+  if (preemph_coeff == 0.0) return;
+  KALDI_ASSERT(preemph_coeff >= 0.0 && preemph_coeff <= 1.0);
+  for (int32 i = waveform->Dim()-1; i > 0; i--)
+    (*waveform)(i) -= preemph_coeff * (*waveform)(i-1);
+  (*waveform)(0) -= preemph_coeff * (*waveform)(0);
+}
+
+// Precomputes the window coefficients for opts.window_type over
+// opts.WindowSize() samples; an unrecognized window type is an error.
+FeatureWindowFunction::FeatureWindowFunction(const FrameExtractionOptions &opts) {
+  int32 frame_length = opts.WindowSize();
+  KALDI_ASSERT(frame_length > 0);
+  window.Resize(frame_length);
+  double a = M_2PI / (frame_length-1);
+  for (int32 i = 0; i < frame_length; i++) {
+    double i_fl = static_cast<double>(i);
+    if (opts.window_type == "hanning") {
+      window(i) = 0.5 - 0.5*cos(a * i_fl);
+    } else if (opts.window_type == "hamming") {
+      window(i) = 0.54 - 0.46*cos(a * i_fl);
+    } else if (opts.window_type == "povey") {  // like hamming but goes to zero at edges.
+      window(i) = pow(0.5 - 0.5*cos(a * i_fl), 0.85);
+    } else if (opts.window_type == "rectangular") {
+      window(i) = 1.0;
+    } else if (opts.window_type == "blackman") {
+      window(i) = opts.blackman_coeff - 0.5*cos(a * i_fl) +
+        (0.5 - opts.blackman_coeff) * cos(2 * a * i_fl);
+    } else {
+      KALDI_ERR << "Invalid window type " << opts.window_type;
+    }
+  }
+}
+
+// Applies, in order: dithering, DC-offset removal, optional log-energy
+// measurement, pre-emphasis, and multiplication by the window function.
+// 'window' must have exactly opts.WindowSize() elements.
+void ProcessWindow(const FrameExtractionOptions &opts,
+                   const FeatureWindowFunction &window_function,
+                   VectorBase<BaseFloat> *window,
+                   BaseFloat *log_energy_pre_window) {
+  int32 frame_length = opts.WindowSize();
+  KALDI_ASSERT(window->Dim() == frame_length);
+
+  if (opts.dither != 0.0)
+    Dither(window, opts.dither);
+
+  if (opts.remove_dc_offset)
+    window->Add(-window->Sum() / frame_length);
+
+  if (log_energy_pre_window != NULL) {
+    BaseFloat energy = std::max<BaseFloat>(VecVec(*window, *window),
+                                std::numeric_limits<float>::epsilon());
+    *log_energy_pre_window = Log(energy);
+  }
+
+  if (opts.preemph_coeff != 0.0)
+    Preemphasize(window, opts.preemph_coeff);
+
+  window->MulElements(window_function.window);
+}
+
+
+// ExtractWindow extracts a windowed frame of waveform with a power-of-two,
+// padded size. It does mean subtraction, pre-emphasis and dithering as
+// requested.
+void ExtractWindow(int64 sample_offset, + const VectorBase &wave, + int32 f, // with 0 <= f < NumFrames(feats, opts) + const FrameExtractionOptions &opts, + const FeatureWindowFunction &window_function, + Vector *window, + BaseFloat *log_energy_pre_window) { + KALDI_ASSERT(sample_offset >= 0 && wave.Dim() != 0); + int32 frame_length = opts.WindowSize(), + frame_length_padded = opts.PaddedWindowSize(); + int64 num_samples = sample_offset + wave.Dim(), + start_sample = FirstSampleOfFrame(f, opts), + end_sample = start_sample + frame_length; + + if (opts.snip_edges) { + KALDI_ASSERT(start_sample >= sample_offset && + end_sample <= num_samples); + } else { + KALDI_ASSERT(sample_offset == 0 || start_sample >= sample_offset); + } + + if (window->Dim() != frame_length_padded) + window->Resize(frame_length_padded, kUndefined); + + // wave_start and wave_end are start and end indexes into 'wave', for the + // piece of wave that we're trying to extract. + int32 wave_start = int32(start_sample - sample_offset), + wave_end = wave_start + frame_length; + if (wave_start >= 0 && wave_end <= wave.Dim()) { + // the normal case-- no edge effects to consider. + window->Range(0, frame_length).CopyFromVec( + wave.Range(wave_start, frame_length)); + } else { + // Deal with any end effects by reflection, if needed. This code will only + // be reached for about two frames per utterance, so we don't concern + // ourselves excessively with efficiency. + int32 wave_dim = wave.Dim(); + for (int32 s = 0; s < frame_length; s++) { + int32 s_in_wave = s + wave_start; + while (s_in_wave < 0 || s_in_wave >= wave_dim) { + // reflect around the beginning or end of the wave. + // e.g. -1 -> 0, -2 -> 1. + // dim -> dim - 1, dim + 1 -> dim - 2. + // the code supports repeated reflections, although this + // would only be needed in pathological cases. 
+ if (s_in_wave < 0) s_in_wave = - s_in_wave - 1; + else s_in_wave = 2 * wave_dim - 1 - s_in_wave; + } + (*window)(s) = wave(s_in_wave); + } + } + + if (frame_length_padded > frame_length) + window->Range(frame_length, frame_length_padded - frame_length).SetZero(); + + SubVector frame(*window, 0, frame_length); + + ProcessWindow(opts, window_function, &frame, log_energy_pre_window); +} + +} // namespace kaldi diff --git a/speechx/speechx/kaldi/feat/feature-window.h b/speechx/speechx/kaldi/feat/feature-window.h new file mode 100644 index 00000000..a7abba50 --- /dev/null +++ b/speechx/speechx/kaldi/feat/feature-window.h @@ -0,0 +1,223 @@ +// feat/feature-window.h + +// Copyright 2009-2011 Karel Vesely; Petr Motlicek; Saarland University +// 2014-2016 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_FEAT_FEATURE_WINDOW_H_ +#define KALDI_FEAT_FEATURE_WINDOW_H_ + +#include +#include + +#include "matrix/matrix-lib.h" +#include "util/common-utils.h" +#include "base/kaldi-error.h" + +namespace kaldi { +/// @addtogroup feat FeatureExtraction +/// @{ + +struct FrameExtractionOptions { + BaseFloat samp_freq; + BaseFloat frame_shift_ms; // in milliseconds. + BaseFloat frame_length_ms; // in milliseconds. + BaseFloat dither; // Amount of dithering, 0.0 means no dither. 
+ BaseFloat preemph_coeff; // Preemphasis coefficient. + bool remove_dc_offset; // Subtract mean of wave before FFT. + std::string window_type; // e.g. Hamming window + // May be "hamming", "rectangular", "povey", "hanning", "blackman" + // "povey" is a window I made to be similar to Hamming but to go to zero at the + // edges, it's pow((0.5 - 0.5*cos(n/N*2*pi)), 0.85) + // I just don't think the Hamming window makes sense as a windowing function. + bool round_to_power_of_two; + BaseFloat blackman_coeff; + bool snip_edges; + bool allow_downsample; + bool allow_upsample; + int max_feature_vectors; + FrameExtractionOptions(): + samp_freq(16000), + frame_shift_ms(10.0), + frame_length_ms(25.0), + dither(1.0), + preemph_coeff(0.97), + remove_dc_offset(true), + window_type("povey"), + round_to_power_of_two(true), + blackman_coeff(0.42), + snip_edges(true), + allow_downsample(false), + allow_upsample(false), + max_feature_vectors(-1) + { } + + void Register(OptionsItf *opts) { + opts->Register("sample-frequency", &samp_freq, + "Waveform data sample frequency (must match the waveform file, " + "if specified there)"); + opts->Register("frame-length", &frame_length_ms, "Frame length in milliseconds"); + opts->Register("frame-shift", &frame_shift_ms, "Frame shift in milliseconds"); + opts->Register("preemphasis-coefficient", &preemph_coeff, + "Coefficient for use in signal preemphasis"); + opts->Register("remove-dc-offset", &remove_dc_offset, + "Subtract mean from waveform on each frame"); + opts->Register("dither", &dither, "Dithering constant (0.0 means no dither). " + "If you turn this off, you should set the --energy-floor " + "option, e.g. 
to 1.0 or 0.1"); + opts->Register("window-type", &window_type, "Type of window " + "(\"hamming\"|\"hanning\"|\"povey\"|\"rectangular\"" + "|\"blackmann\")"); + opts->Register("blackman-coeff", &blackman_coeff, + "Constant coefficient for generalized Blackman window."); + opts->Register("round-to-power-of-two", &round_to_power_of_two, + "If true, round window size to power of two by zero-padding " + "input to FFT."); + opts->Register("snip-edges", &snip_edges, + "If true, end effects will be handled by outputting only frames that " + "completely fit in the file, and the number of frames depends on the " + "frame-length. If false, the number of frames depends only on the " + "frame-shift, and we reflect the data at the ends."); + opts->Register("allow-downsample", &allow_downsample, + "If true, allow the input waveform to have a higher frequency than " + "the specified --sample-frequency (and we'll downsample)."); + opts->Register("max-feature-vectors", &max_feature_vectors, + "Memory optimization. If larger than 0, periodically remove feature " + "vectors so that only this number of the latest feature vectors is " + "retained."); + opts->Register("allow-upsample", &allow_upsample, + "If true, allow the input waveform to have a lower frequency than " + "the specified --sample-frequency (and we'll upsample)."); + } + int32 WindowShift() const { + return static_cast(samp_freq * 0.001 * frame_shift_ms); + } + int32 WindowSize() const { + return static_cast(samp_freq * 0.001 * frame_length_ms); + } + int32 PaddedWindowSize() const { + return (round_to_power_of_two ? 
RoundUpToNearestPowerOfTwo(WindowSize()) : + WindowSize()); + } +}; + + +struct FeatureWindowFunction { + FeatureWindowFunction() {} + explicit FeatureWindowFunction(const FrameExtractionOptions &opts); + FeatureWindowFunction(const FeatureWindowFunction &other): + window(other.window) { } + Vector window; +}; + + +/** + This function returns the number of frames that we can extract from a wave + file with the given number of samples in it (assumed to have the same + sampling rate as specified in 'opts'). + + @param [in] num_samples The number of samples in the wave file. + @param [in] opts The frame-extraction options class + + @param [in] flush True if we are asserting that this number of samples is + 'all there is', false if we expecting more data to possibly come + in. This only makes a difference to the answer if opts.snips_edges + == false. For offline feature extraction you always want flush == + true. In an online-decoding context, once you know (or decide) that + no more data is coming in, you'd call it with flush == true at the + end to flush out any remaining data. +*/ +int32 NumFrames(int64 num_samples, + const FrameExtractionOptions &opts, + bool flush = true); + +/* + This function returns the index of the first sample of the frame indexed + 'frame'. If snip-edges=true, it just returns frame * opts.WindowShift(); if + snip-edges=false, the formula is a little more complicated and the result may + be negative. +*/ +int64 FirstSampleOfFrame(int32 frame, + const FrameExtractionOptions &opts); + + + +void Dither(VectorBase *waveform, BaseFloat dither_value); + +void Preemphasize(VectorBase *waveform, BaseFloat preemph_coeff); + +/** + This function does all the windowing steps after actually + extracting the windowed signal: depending on the + configuration, it does dithering, dc offset removal, + preemphasis, and multiplication by the windowing function. 
+ @param [in] opts The options class to be used + @param [in] window_function The windowing function-- should have + been initialized using 'opts'. + @param [in,out] window A vector of size opts.WindowSize(). Note: + it will typically be a sub-vector of a larger vector of size + opts.PaddedWindowSize(), with the remaining samples zero, + as the FFT code is more efficient if it operates on data with + power-of-two size. + @param [out] log_energy_pre_window If non-NULL, then after dithering and + DC offset removal, this function will write to this pointer the log of + the total energy (i.e. sum-squared) of the frame. + */ +void ProcessWindow(const FrameExtractionOptions &opts, + const FeatureWindowFunction &window_function, + VectorBase *window, + BaseFloat *log_energy_pre_window = NULL); + + +/* + ExtractWindow() extracts a windowed frame of waveform (possibly with a + power-of-two, padded size, depending on the config), including all the + proessing done by ProcessWindow(). + + @param [in] sample_offset If 'wave' is not the entire waveform, but + part of it to the left has been discarded, then the + number of samples prior to 'wave' that we have + already discarded. Set this to zero if you are + processing the entire waveform in one piece, or + if you get 'no matching function' compilation + errors when updating the code. + @param [in] wave The waveform + @param [in] f The frame index to be extracted, with + 0 <= f < NumFrames(sample_offset + wave.Dim(), opts, true) + @param [in] opts The options class to be used + @param [in] window_function The windowing function, as derived from the + options class. + @param [out] window The windowed, possibly-padded waveform to be + extracted. Will be resized as needed. + @param [out] log_energy_pre_window If non-NULL, the log-energy of + the signal prior to pre-emphasis and multiplying by + the windowing function will be written to here. 
+*/ +void ExtractWindow(int64 sample_offset, + const VectorBase &wave, + int32 f, + const FrameExtractionOptions &opts, + const FeatureWindowFunction &window_function, + Vector *window, + BaseFloat *log_energy_pre_window = NULL); + + +/// @} End of "addtogroup feat" +} // namespace kaldi + + +#endif // KALDI_FEAT_FEATURE_WINDOW_H_ diff --git a/speechx/speechx/kaldi/feat/mel-computations.cc b/speechx/speechx/kaldi/feat/mel-computations.cc new file mode 100644 index 00000000..bb5e9f9a --- /dev/null +++ b/speechx/speechx/kaldi/feat/mel-computations.cc @@ -0,0 +1,340 @@ +// feat/mel-computations.cc + +// Copyright 2009-2011 Phonexia s.r.o.; Karel Vesely; Microsoft Corporation + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include +#include + +#include "feat/feature-functions.h" +#include "feat/feature-window.h" +#include "feat/mel-computations.h" + +namespace kaldi { + + +MelBanks::MelBanks(const MelBanksOptions &opts, + const FrameExtractionOptions &frame_opts, + BaseFloat vtln_warp_factor): + htk_mode_(opts.htk_mode) { + int32 num_bins = opts.num_bins; + if (num_bins < 3) KALDI_ERR << "Must have at least 3 mel bins"; + BaseFloat sample_freq = frame_opts.samp_freq; + int32 window_length_padded = frame_opts.PaddedWindowSize(); + KALDI_ASSERT(window_length_padded % 2 == 0); + int32 num_fft_bins = window_length_padded / 2; + BaseFloat nyquist = 0.5 * sample_freq; + + BaseFloat low_freq = opts.low_freq, high_freq; + if (opts.high_freq > 0.0) + high_freq = opts.high_freq; + else + high_freq = nyquist + opts.high_freq; + + if (low_freq < 0.0 || low_freq >= nyquist + || high_freq <= 0.0 || high_freq > nyquist + || high_freq <= low_freq) + KALDI_ERR << "Bad values in options: low-freq " << low_freq + << " and high-freq " << high_freq << " vs. nyquist " + << nyquist; + + BaseFloat fft_bin_width = sample_freq / window_length_padded; + // fft-bin width [think of it as Nyquist-freq / half-window-length] + + BaseFloat mel_low_freq = MelScale(low_freq); + BaseFloat mel_high_freq = MelScale(high_freq); + + debug_ = opts.debug_mel; + + // divide by num_bins+1 in next line because of end-effects where the bins + // spread out to the sides. 
+ BaseFloat mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins+1); + + BaseFloat vtln_low = opts.vtln_low, + vtln_high = opts.vtln_high; + if (vtln_high < 0.0) { + vtln_high += nyquist; + } + + if (vtln_warp_factor != 1.0 && + (vtln_low < 0.0 || vtln_low <= low_freq + || vtln_low >= high_freq + || vtln_high <= 0.0 || vtln_high >= high_freq + || vtln_high <= vtln_low)) + KALDI_ERR << "Bad values in options: vtln-low " << vtln_low + << " and vtln-high " << vtln_high << ", versus " + << "low-freq " << low_freq << " and high-freq " + << high_freq; + + bins_.resize(num_bins); + center_freqs_.Resize(num_bins); + + for (int32 bin = 0; bin < num_bins; bin++) { + BaseFloat left_mel = mel_low_freq + bin * mel_freq_delta, + center_mel = mel_low_freq + (bin + 1) * mel_freq_delta, + right_mel = mel_low_freq + (bin + 2) * mel_freq_delta; + + if (vtln_warp_factor != 1.0) { + left_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq, + vtln_warp_factor, left_mel); + center_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq, + vtln_warp_factor, center_mel); + right_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq, + vtln_warp_factor, right_mel); + } + center_freqs_(bin) = InverseMelScale(center_mel); + // this_bin will be a vector of coefficients that is only + // nonzero where this mel bin is active. + Vector this_bin(num_fft_bins); + int32 first_index = -1, last_index = -1; + for (int32 i = 0; i < num_fft_bins; i++) { + BaseFloat freq = (fft_bin_width * i); // Center frequency of this fft + // bin. 
+ BaseFloat mel = MelScale(freq); + if (mel > left_mel && mel < right_mel) { + BaseFloat weight; + if (mel <= center_mel) + weight = (mel - left_mel) / (center_mel - left_mel); + else + weight = (right_mel-mel) / (right_mel-center_mel); + this_bin(i) = weight; + if (first_index == -1) + first_index = i; + last_index = i; + } + } + KALDI_ASSERT(first_index != -1 && last_index >= first_index + && "You may have set --num-mel-bins too large."); + + bins_[bin].first = first_index; + int32 size = last_index + 1 - first_index; + bins_[bin].second.Resize(size); + bins_[bin].second.CopyFromVec(this_bin.Range(first_index, size)); + + // Replicate a bug in HTK, for testing purposes. + if (opts.htk_mode && bin == 0 && mel_low_freq != 0.0) + bins_[bin].second(0) = 0.0; + + } + if (debug_) { + for (size_t i = 0; i < bins_.size(); i++) { + KALDI_LOG << "bin " << i << ", offset = " << bins_[i].first + << ", vec = " << bins_[i].second; + } + } +} + +MelBanks::MelBanks(const MelBanks &other): + center_freqs_(other.center_freqs_), + bins_(other.bins_), + debug_(other.debug_), + htk_mode_(other.htk_mode_) { } + +BaseFloat MelBanks::VtlnWarpFreq(BaseFloat vtln_low_cutoff, // upper+lower frequency cutoffs for VTLN. + BaseFloat vtln_high_cutoff, + BaseFloat low_freq, // upper+lower frequency cutoffs in mel computation + BaseFloat high_freq, + BaseFloat vtln_warp_factor, + BaseFloat freq) { + /// This computes a VTLN warping function that is not the same as HTK's one, + /// but has similar inputs (this function has the advantage of never producing + /// empty bins). + + /// This function computes a warp function F(freq), defined between low_freq and + /// high_freq inclusive, with the following properties: + /// F(low_freq) == low_freq + /// F(high_freq) == high_freq + /// The function is continuous and piecewise linear with two inflection + /// points. + /// The lower inflection point (measured in terms of the unwarped + /// frequency) is at frequency l, determined as described below. 
+ /// The higher inflection point is at a frequency h, determined as + /// described below. + /// If l <= f <= h, then F(f) = f/vtln_warp_factor. + /// If the higher inflection point (measured in terms of the unwarped + /// frequency) is at h, then max(h, F(h)) == vtln_high_cutoff. + /// Since (by the last point) F(h) == h/vtln_warp_factor, then + /// max(h, h/vtln_warp_factor) == vtln_high_cutoff, so + /// h = vtln_high_cutoff / max(1, 1/vtln_warp_factor). + /// = vtln_high_cutoff * min(1, vtln_warp_factor). + /// If the lower inflection point (measured in terms of the unwarped + /// frequency) is at l, then min(l, F(l)) == vtln_low_cutoff + /// This implies that l = vtln_low_cutoff / min(1, 1/vtln_warp_factor) + /// = vtln_low_cutoff * max(1, vtln_warp_factor) + + + if (freq < low_freq || freq > high_freq) return freq; // in case this gets called + // for out-of-range frequencies, just return the freq. + + KALDI_ASSERT(vtln_low_cutoff > low_freq && + "be sure to set the --vtln-low option higher than --low-freq"); + KALDI_ASSERT(vtln_high_cutoff < high_freq && + "be sure to set the --vtln-high option lower than --high-freq [or negative]"); + BaseFloat one = 1.0; + BaseFloat l = vtln_low_cutoff * std::max(one, vtln_warp_factor); + BaseFloat h = vtln_high_cutoff * std::min(one, vtln_warp_factor); + BaseFloat scale = 1.0 / vtln_warp_factor; + BaseFloat Fl = scale * l; // F(l); + BaseFloat Fh = scale * h; // F(h); + KALDI_ASSERT(l > low_freq && h < high_freq); + // slope of left part of the 3-piece linear function + BaseFloat scale_left = (Fl - low_freq) / (l - low_freq); + // [slope of center part is just "scale"] + + // slope of right part of the 3-piece linear function + BaseFloat scale_right = (high_freq - Fh) / (high_freq - h); + + if (freq < l) { + return low_freq + scale_left * (freq - low_freq); + } else if (freq < h) { + return scale * freq; + } else { // freq >= h + return high_freq + scale_right * (freq - high_freq); + } +} + +BaseFloat 
MelBanks::VtlnWarpMelFreq(BaseFloat vtln_low_cutoff, // upper+lower frequency cutoffs for VTLN. + BaseFloat vtln_high_cutoff, + BaseFloat low_freq, // upper+lower frequency cutoffs in mel computation + BaseFloat high_freq, + BaseFloat vtln_warp_factor, + BaseFloat mel_freq) { + return MelScale(VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff, + low_freq, high_freq, + vtln_warp_factor, InverseMelScale(mel_freq))); +} + + +// "power_spectrum" contains fft energies. +void MelBanks::Compute(const VectorBase &power_spectrum, + VectorBase *mel_energies_out) const { + int32 num_bins = bins_.size(); + KALDI_ASSERT(mel_energies_out->Dim() == num_bins); + + for (int32 i = 0; i < num_bins; i++) { + int32 offset = bins_[i].first; + const Vector &v(bins_[i].second); + BaseFloat energy = VecVec(v, power_spectrum.Range(offset, v.Dim())); + // HTK-like flooring- for testing purposes (we prefer dither) + if (htk_mode_ && energy < 1.0) energy = 1.0; + (*mel_energies_out)(i) = energy; + + // The following assert was added due to a problem with OpenBlas that + // we had at one point (it was a bug in that library). Just to detect + // it early. + KALDI_ASSERT(!KALDI_ISNAN((*mel_energies_out)(i))); + } + + if (debug_) { + fprintf(stderr, "MEL BANKS:\n"); + for (int32 i = 0; i < num_bins; i++) + fprintf(stderr, " %f", (*mel_energies_out)(i)); + fprintf(stderr, "\n"); + } +} + +void ComputeLifterCoeffs(BaseFloat Q, VectorBase *coeffs) { + // Compute liftering coefficients (scaling on cepstral coeffs) + // coeffs are numbered slightly differently from HTK: the zeroth + // index is C0, which is not affected. 
+ for (int32 i = 0; i < coeffs->Dim(); i++) + (*coeffs)(i) = 1.0 + 0.5 * Q * sin (M_PI * i / Q); +} + + +// Durbin's recursion - converts autocorrelation coefficients to the LPC +// pTmp - temporal place [n] +// pAC - autocorrelation coefficients [n + 1] +// pLP - linear prediction coefficients [n] (predicted_sn = sum_1^P{a[i-1] * s[n-i]}}) +// F(z) = 1 / (1 - A(z)), 1 is not stored in the demoninator +BaseFloat Durbin(int n, const BaseFloat *pAC, BaseFloat *pLP, BaseFloat *pTmp) { + BaseFloat ki; // reflection coefficient + int i; + int j; + + BaseFloat E = pAC[0]; + + for (i = 0; i < n; i++) { + // next reflection coefficient + ki = pAC[i + 1]; + for (j = 0; j < i; j++) + ki += pLP[j] * pAC[i - j]; + ki = ki / E; + + // new error + BaseFloat c = 1 - ki * ki; + if (c < 1.0e-5) // remove NaNs for constan signal + c = 1.0e-5; + E *= c; + + // new LP coefficients + pTmp[i] = -ki; + for (j = 0; j < i; j++) + pTmp[j] = pLP[j] - ki * pLP[i - j - 1]; + + for (j = 0; j <= i; j++) + pLP[j] = pTmp[j]; + } + + return E; +} + + +void Lpc2Cepstrum(int n, const BaseFloat *pLPC, BaseFloat *pCepst) { + for (int32 i = 0; i < n; i++) { + double sum = 0.0; + int j; + for (j = 0; j < i; j++) { + sum += static_cast(i - j) * pLPC[j] * pCepst[i - j - 1]; + } + pCepst[i] = -pLPC[i] - sum / static_cast(i + 1); + } +} + +void GetEqualLoudnessVector(const MelBanks &mel_banks, + Vector *ans) { + int32 n = mel_banks.NumBins(); + // Central frequency of each mel bin. + const Vector &f0 = mel_banks.GetCenterFreqs(); + ans->Resize(n); + for (int32 i = 0; i < n; i++) { + BaseFloat fsq = f0(i) * f0(i); + BaseFloat fsub = fsq / (fsq + 1.6e5); + (*ans)(i) = fsub * fsub * ((fsq + 1.44e6) / (fsq + 9.61e6)); + } +} + + +// Compute LP coefficients from autocorrelation coefficients. 
+BaseFloat ComputeLpc(const VectorBase &autocorr_in, + Vector *lpc_out) { + int32 n = autocorr_in.Dim() - 1; + KALDI_ASSERT(lpc_out->Dim() == n); + Vector tmp(n); + BaseFloat ans = Durbin(n, autocorr_in.Data(), + lpc_out->Data(), + tmp.Data()); + if (ans <= 0.0) + KALDI_WARN << "Zero energy in LPC computation"; + return -Log(1.0 / ans); // forms the C0 value +} + + +} // namespace kaldi diff --git a/speechx/speechx/kaldi/feat/mel-computations.h b/speechx/speechx/kaldi/feat/mel-computations.h new file mode 100644 index 00000000..0c1d41ca --- /dev/null +++ b/speechx/speechx/kaldi/feat/mel-computations.h @@ -0,0 +1,171 @@ +// feat/mel-computations.h + +// Copyright 2009-2011 Phonexia s.r.o.; Microsoft Corporation +// 2016 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_FEAT_MEL_COMPUTATIONS_H_ +#define KALDI_FEAT_MEL_COMPUTATIONS_H_ + +#include +#include +#include +#include +#include +#include + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "matrix/matrix-lib.h" + + +namespace kaldi { +/// @addtogroup feat FeatureExtraction +/// @{ + +struct FrameExtractionOptions; // defined in feature-window.h + + +struct MelBanksOptions { + int32 num_bins; // e.g. 25; number of triangular bins + BaseFloat low_freq; // e.g. 
20; lower frequency cutoff + BaseFloat high_freq; // an upper frequency cutoff; 0 -> no cutoff, negative + // ->added to the Nyquist frequency to get the cutoff. + BaseFloat vtln_low; // vtln lower cutoff of warping function. + BaseFloat vtln_high; // vtln upper cutoff of warping function: if negative, added + // to the Nyquist frequency to get the cutoff. + bool debug_mel; + // htk_mode is a "hidden" config, it does not show up on command line. + // Enables more exact compatibility with HTK, for testing purposes. Affects + // mel-energy flooring and reproduces a bug in HTK. + bool htk_mode; + explicit MelBanksOptions(int num_bins = 25) + : num_bins(num_bins), low_freq(20), high_freq(0), vtln_low(100), + vtln_high(-500), debug_mel(false), htk_mode(false) {} + + void Register(OptionsItf *opts) { + opts->Register("num-mel-bins", &num_bins, + "Number of triangular mel-frequency bins"); + opts->Register("low-freq", &low_freq, + "Low cutoff frequency for mel bins"); + opts->Register("high-freq", &high_freq, + "High cutoff frequency for mel bins (if <= 0, offset from Nyquist)"); + opts->Register("vtln-low", &vtln_low, + "Low inflection point in piecewise linear VTLN warping function"); + opts->Register("vtln-high", &vtln_high, + "High inflection point in piecewise linear VTLN warping function" + " (if negative, offset from high-mel-freq"); + opts->Register("debug-mel", &debug_mel, + "Print out debugging information for mel bin computation"); + } +}; + + +class MelBanks { + public: + + static inline BaseFloat InverseMelScale(BaseFloat mel_freq) { + return 700.0f * (expf (mel_freq / 1127.0f) - 1.0f); + } + + static inline BaseFloat MelScale(BaseFloat freq) { + return 1127.0f * logf (1.0f + freq / 700.0f); + } + + static BaseFloat VtlnWarpFreq(BaseFloat vtln_low_cutoff, + BaseFloat vtln_high_cutoff, // discontinuities in warp func + BaseFloat low_freq, + BaseFloat high_freq, // upper+lower frequency cutoffs in + // the mel computation + BaseFloat vtln_warp_factor, + 
BaseFloat freq); + + static BaseFloat VtlnWarpMelFreq(BaseFloat vtln_low_cutoff, + BaseFloat vtln_high_cutoff, + BaseFloat low_freq, + BaseFloat high_freq, + BaseFloat vtln_warp_factor, + BaseFloat mel_freq); + + + MelBanks(const MelBanksOptions &opts, + const FrameExtractionOptions &frame_opts, + BaseFloat vtln_warp_factor); + + /// Compute Mel energies (note: not log enerties). + /// At input, "fft_energies" contains the FFT energies (not log). + void Compute(const VectorBase &fft_energies, + VectorBase *mel_energies_out) const; + + int32 NumBins() const { return bins_.size(); } + + // returns vector of central freq of each bin; needed by plp code. + const Vector &GetCenterFreqs() const { return center_freqs_; } + + const std::vector > >& GetBins() const { + return bins_; + } + + // Copy constructor + MelBanks(const MelBanks &other); + private: + // Disallow assignment + MelBanks &operator = (const MelBanks &other); + + // center frequencies of bins, numbered from 0 ... num_bins-1. + // Needed by GetCenterFreqs(). + Vector center_freqs_; + + // the "bins_" vector is a vector, one for each bin, of a pair: + // (the first nonzero fft-bin), (the vector of weights). + std::vector > > bins_; + + bool debug_; + bool htk_mode_; +}; + + +// Compute liftering coefficients (scaling on cepstral coeffs) +// coeffs are numbered slightly differently from HTK: the zeroth +// index is C0, which is not affected. +void ComputeLifterCoeffs(BaseFloat Q, VectorBase *coeffs); + + +// Durbin's recursion - converts autocorrelation coefficients to the LPC +// pTmp - temporal place [n] +// pAC - autocorrelation coefficients [n + 1] +// pLP - linear prediction coefficients [n] (predicted_sn = sum_1^P{a[i-1] * s[n-i]}}) +// F(z) = 1 / (1 - A(z)), 1 is not stored in the denominator +// Returns log energy of residual (I think) +BaseFloat Durbin(int n, const BaseFloat *pAC, BaseFloat *pLP, BaseFloat *pTmp); + +// Compute LP coefficients from autocorrelation coefficients. 
+// Returns log energy of residual (I think) +BaseFloat ComputeLpc(const VectorBase &autocorr_in, + Vector *lpc_out); + +void Lpc2Cepstrum(int n, const BaseFloat *pLPC, BaseFloat *pCepst); + + + +void GetEqualLoudnessVector(const MelBanks &mel_banks, + Vector *ans); + +/// @} End of "addtogroup feat" +} // namespace kaldi + +#endif // KALDI_FEAT_MEL_COMPUTATIONS_H_ diff --git a/speechx/speechx/kaldi/feat/online-feature.cc b/speechx/speechx/kaldi/feat/online-feature.cc new file mode 100644 index 00000000..047909e7 --- /dev/null +++ b/speechx/speechx/kaldi/feat/online-feature.cc @@ -0,0 +1,679 @@ +// feat/online-feature.cc + +// Copyright 2013 Johns Hopkins University (author: Daniel Povey) +// 2014 Yanqing Sun, Junjie Wang, +// Daniel Povey, Korbinian Riedhammer + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "feat/online-feature.h" +#include "transform/cmvn.h" + +namespace kaldi { + +RecyclingVector::RecyclingVector(int items_to_hold): + items_to_hold_(items_to_hold == 0 ? 
-1 : items_to_hold), + first_available_index_(0) { +} + +RecyclingVector::~RecyclingVector() { + for (auto *item : items_) { + delete item; + } +} + +Vector *RecyclingVector::At(int index) const { + if (index < first_available_index_) { + KALDI_ERR << "Attempted to retrieve feature vector that was " + "already removed by the RecyclingVector (index = " + << index << "; " + << "first_available_index = " << first_available_index_ << "; " + << "size = " << Size() << ")"; + } + // 'at' does size checking. + return items_.at(index - first_available_index_); +} + +void RecyclingVector::PushBack(Vector *item) { + if (items_.size() == items_to_hold_) { + delete items_.front(); + items_.pop_front(); + ++first_available_index_; + } + items_.push_back(item); +} + +int RecyclingVector::Size() const { + return first_available_index_ + items_.size(); +} + +template +void OnlineGenericBaseFeature::GetFrame(int32 frame, + VectorBase *feat) { + feat->CopyFromVec(*(features_.At(frame))); +}; + +template +OnlineGenericBaseFeature::OnlineGenericBaseFeature( + const typename C::Options &opts): + computer_(opts), window_function_(computer_.GetFrameOptions()), + features_(opts.frame_opts.max_feature_vectors), + input_finished_(false), waveform_offset_(0) { + // RE the following assert: search for ONLINE_IVECTOR_LIMIT in + // online-ivector-feature.cc. + // Casting to uint32, an unsigned type, means that -1 would be treated + // as `very large`. 
+ KALDI_ASSERT(static_cast(opts.frame_opts.max_feature_vectors) > 200); +} + + +template +void OnlineGenericBaseFeature::MaybeCreateResampler( + BaseFloat sampling_rate) { + BaseFloat expected_sampling_rate = computer_.GetFrameOptions().samp_freq; + + if (resampler_ != nullptr) { + KALDI_ASSERT(resampler_->GetInputSamplingRate() == sampling_rate); + KALDI_ASSERT(resampler_->GetOutputSamplingRate() == expected_sampling_rate); + } else if (((sampling_rate < expected_sampling_rate) && + computer_.GetFrameOptions().allow_downsample) || + ((sampling_rate > expected_sampling_rate) && + computer_.GetFrameOptions().allow_upsample)) { + resampler_.reset(new LinearResample( + sampling_rate, expected_sampling_rate, + std::min(sampling_rate / 2, expected_sampling_rate / 2), 6)); + } else if (sampling_rate != expected_sampling_rate) { + KALDI_ERR << "Sampling frequency mismatch, expected " + << expected_sampling_rate << ", got " << sampling_rate + << "\nPerhaps you want to use the options " + "--allow_{upsample,downsample}"; + } +} + +template +void OnlineGenericBaseFeature::InputFinished() { + if (resampler_ != nullptr) { + // There may be a few samples left once we flush the resampler_ object, telling it + // that the file has finished. This should rarely make any difference. 
+ Vector appended_wave; + Vector resampled_wave; + resampler_->Resample(appended_wave, true, &resampled_wave); + + if (resampled_wave.Dim() != 0) { + appended_wave.Resize(waveform_remainder_.Dim() + + resampled_wave.Dim()); + if (waveform_remainder_.Dim() != 0) + appended_wave.Range(0, waveform_remainder_.Dim()) + .CopyFromVec(waveform_remainder_); + appended_wave.Range(waveform_remainder_.Dim(), resampled_wave.Dim()) + .CopyFromVec(resampled_wave); + waveform_remainder_.Swap(&appended_wave); + } + } + input_finished_ = true; + ComputeFeatures(); +} + +template +void OnlineGenericBaseFeature::AcceptWaveform( + BaseFloat sampling_rate, const VectorBase &original_waveform) { + if (original_waveform.Dim() == 0) + return; // Nothing to do. + if (input_finished_) + KALDI_ERR << "AcceptWaveform called after InputFinished() was called."; + + Vector appended_wave; + Vector resampled_wave; + + const VectorBase *waveform; + + MaybeCreateResampler(sampling_rate); + if (resampler_ == nullptr) { + waveform = &original_waveform; + } else { + resampler_->Resample(original_waveform, false, &resampled_wave); + waveform = &resampled_wave; + } + + appended_wave.Resize(waveform_remainder_.Dim() + waveform->Dim()); + if (waveform_remainder_.Dim() != 0) + appended_wave.Range(0, waveform_remainder_.Dim()) + .CopyFromVec(waveform_remainder_); + appended_wave.Range(waveform_remainder_.Dim(), waveform->Dim()) + .CopyFromVec(*waveform); + waveform_remainder_.Swap(&appended_wave); + ComputeFeatures(); +} + +template +void OnlineGenericBaseFeature::ComputeFeatures() { + const FrameExtractionOptions &frame_opts = computer_.GetFrameOptions(); + int64 num_samples_total = waveform_offset_ + waveform_remainder_.Dim(); + int32 num_frames_old = features_.Size(), + num_frames_new = NumFrames(num_samples_total, frame_opts, + input_finished_); + KALDI_ASSERT(num_frames_new >= num_frames_old); + + Vector window; + bool need_raw_log_energy = computer_.NeedRawLogEnergy(); + for (int32 frame = 
num_frames_old; frame < num_frames_new; frame++) { + BaseFloat raw_log_energy = 0.0; + ExtractWindow(waveform_offset_, waveform_remainder_, frame, + frame_opts, window_function_, &window, + need_raw_log_energy ? &raw_log_energy : NULL); + Vector *this_feature = new Vector(computer_.Dim(), + kUndefined); + // note: this online feature-extraction code does not support VTLN. + BaseFloat vtln_warp = 1.0; + computer_.Compute(raw_log_energy, vtln_warp, &window, this_feature); + features_.PushBack(this_feature); + } + // OK, we will now discard any portion of the signal that will not be + // necessary to compute frames in the future. + int64 first_sample_of_next_frame = FirstSampleOfFrame(num_frames_new, + frame_opts); + int32 samples_to_discard = first_sample_of_next_frame - waveform_offset_; + if (samples_to_discard > 0) { + // discard the leftmost part of the waveform that we no longer need. + int32 new_num_samples = waveform_remainder_.Dim() - samples_to_discard; + if (new_num_samples <= 0) { + // odd, but we'll try to handle it. + waveform_offset_ += waveform_remainder_.Dim(); + waveform_remainder_.Resize(0); + } else { + Vector new_remainder(new_num_samples); + new_remainder.CopyFromVec(waveform_remainder_.Range(samples_to_discard, + new_num_samples)); + waveform_offset_ += samples_to_discard; + waveform_remainder_.Swap(&new_remainder); + } + } +} + +// instantiate the templates defined here for MFCC, PLP and filterbank classes. +template class OnlineGenericBaseFeature; +template class OnlineGenericBaseFeature; +template class OnlineGenericBaseFeature; + +OnlineCmvnState::OnlineCmvnState(const OnlineCmvnState &other): + speaker_cmvn_stats(other.speaker_cmvn_stats), + global_cmvn_stats(other.global_cmvn_stats), + frozen_state(other.frozen_state) { } + +void OnlineCmvnState::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); // magic string. 
+ WriteToken(os, binary, ""); + speaker_cmvn_stats.Write(os, binary); + WriteToken(os, binary, ""); + global_cmvn_stats.Write(os, binary); + WriteToken(os, binary, ""); + frozen_state.Write(os, binary); + WriteToken(os, binary, ""); +} + +void OnlineCmvnState::Read(std::istream &is, bool binary) { + ExpectToken(is, binary, ""); // magic string. + ExpectToken(is, binary, ""); + speaker_cmvn_stats.Read(is, binary); + ExpectToken(is, binary, ""); + global_cmvn_stats.Read(is, binary); + ExpectToken(is, binary, ""); + frozen_state.Read(is, binary); + ExpectToken(is, binary, ""); +} + +OnlineCmvn::OnlineCmvn(const OnlineCmvnOptions &opts, + const OnlineCmvnState &cmvn_state, + OnlineFeatureInterface *src): + opts_(opts), temp_stats_(2, src->Dim() + 1), + temp_feats_(src->Dim()), temp_feats_dbl_(src->Dim()), + src_(src) { + SetState(cmvn_state); + if (!SplitStringToIntegers(opts.skip_dims, ":", false, &skip_dims_)) + KALDI_ERR << "Bad --skip-dims option (should be colon-separated list of " + << "integers)"; +} + +OnlineCmvn::OnlineCmvn(const OnlineCmvnOptions &opts, + OnlineFeatureInterface *src): + opts_(opts), temp_stats_(2, src->Dim() + 1), + temp_feats_(src->Dim()), temp_feats_dbl_(src->Dim()), + src_(src) { + if (!SplitStringToIntegers(opts.skip_dims, ":", false, &skip_dims_)) + KALDI_ERR << "Bad --skip-dims option (should be colon-separated list of " + << "integers)"; +} + + +void OnlineCmvn::GetMostRecentCachedFrame(int32 frame, + int32 *cached_frame, + MatrixBase *stats) { + KALDI_ASSERT(frame >= 0); + InitRingBufferIfNeeded(); + // look for a cached frame on a previous frame as close as possible in time + // to "frame". Return if we get one. + for (int32 t = frame; t >= 0 && t >= frame - opts_.ring_buffer_size; t--) { + if (t % opts_.modulus == 0) { + // if this frame should be cached in cached_stats_modulo_, then + // we'll look there, and we won't go back any further in time. 
+ break; + } + int32 index = t % opts_.ring_buffer_size; + if (cached_stats_ring_[index].first == t) { + *cached_frame = t; + stats->CopyFromMat(cached_stats_ring_[index].second); + return; + } + } + int32 n = frame / opts_.modulus; + if (n >= cached_stats_modulo_.size()) { + if (cached_stats_modulo_.size() == 0) { + *cached_frame = -1; + stats->SetZero(); + return; + } else { + n = static_cast(cached_stats_modulo_.size() - 1); + } + } + *cached_frame = n * opts_.modulus; + KALDI_ASSERT(cached_stats_modulo_[n] != NULL); + stats->CopyFromMat(*(cached_stats_modulo_[n])); +} + +// Initialize ring buffer for caching stats. +void OnlineCmvn::InitRingBufferIfNeeded() { + if (cached_stats_ring_.empty() && opts_.ring_buffer_size > 0) { + Matrix temp(2, this->Dim() + 1); + cached_stats_ring_.resize(opts_.ring_buffer_size, + std::pair >(-1, temp)); + } +} + +void OnlineCmvn::CacheFrame(int32 frame, const MatrixBase &stats) { + KALDI_ASSERT(frame >= 0); + if (frame % opts_.modulus == 0) { // store in cached_stats_modulo_. + int32 n = frame / opts_.modulus; + if (n >= cached_stats_modulo_.size()) { + // The following assert is a limitation on in what order you can call + // CacheFrame. Fortunately the calling code always calls it in sequence, + // which it has to because you need a previous frame to compute the + // current one. + KALDI_ASSERT(n == cached_stats_modulo_.size()); + cached_stats_modulo_.push_back(new Matrix(stats)); + } else { + KALDI_WARN << "Did not expect to reach this part of code."; + // do what seems right, but we shouldn't get here. + cached_stats_modulo_[n]->CopyFromMat(stats); + } + } else { // store in the ring buffer. 
+ InitRingBufferIfNeeded(); + if (!cached_stats_ring_.empty()) { + int32 index = frame % cached_stats_ring_.size(); + cached_stats_ring_[index].first = frame; + cached_stats_ring_[index].second.CopyFromMat(stats); + } + } +} + +OnlineCmvn::~OnlineCmvn() { + for (size_t i = 0; i < cached_stats_modulo_.size(); i++) + delete cached_stats_modulo_[i]; + cached_stats_modulo_.clear(); +} + +void OnlineCmvn::ComputeStatsForFrame(int32 frame, + MatrixBase *stats_out) { + KALDI_ASSERT(frame >= 0 && frame < src_->NumFramesReady()); + + int32 dim = this->Dim(), cur_frame; + GetMostRecentCachedFrame(frame, &cur_frame, stats_out); + + Vector &feats(temp_feats_); + Vector &feats_dbl(temp_feats_dbl_); + while (cur_frame < frame) { + cur_frame++; + src_->GetFrame(cur_frame, &feats); + feats_dbl.CopyFromVec(feats); + stats_out->Row(0).Range(0, dim).AddVec(1.0, feats_dbl); + if (opts_.normalize_variance) + stats_out->Row(1).Range(0, dim).AddVec2(1.0, feats_dbl); + (*stats_out)(0, dim) += 1.0; + // it's a sliding buffer; a frame at the back may be + // leaving the buffer so we have to subtract that. + int32 prev_frame = cur_frame - opts_.cmn_window; + if (prev_frame >= 0) { + // we need to subtract frame prev_f from the stats. + src_->GetFrame(prev_frame, &feats); + feats_dbl.CopyFromVec(feats); + stats_out->Row(0).Range(0, dim).AddVec(-1.0, feats_dbl); + if (opts_.normalize_variance) + stats_out->Row(1).Range(0, dim).AddVec2(-1.0, feats_dbl); + (*stats_out)(0, dim) -= 1.0; + } + CacheFrame(cur_frame, (*stats_out)); + } +} + + +// static +void OnlineCmvn::SmoothOnlineCmvnStats(const MatrixBase &speaker_stats, + const MatrixBase &global_stats, + const OnlineCmvnOptions &opts, + MatrixBase *stats) { + if (speaker_stats.NumRows() == 2 && !opts.normalize_variance) { + // this is just for efficiency: don't operate on the variance if it's not + // needed. 
+ int32 cols = speaker_stats.NumCols(); // dim + 1 + SubMatrix stats_temp(*stats, 0, 1, 0, cols); + SmoothOnlineCmvnStats(speaker_stats.RowRange(0, 1), + global_stats.RowRange(0, 1), + opts, &stats_temp); + return; + } + int32 dim = stats->NumCols() - 1; + double cur_count = (*stats)(0, dim); + // If count exceeded cmn_window it would be an error in how "window_stats" + // was accumulated. + KALDI_ASSERT(cur_count <= 1.001 * opts.cmn_window); + if (cur_count >= opts.cmn_window) + return; + if (speaker_stats.NumRows() != 0) { // if we have speaker stats.. + double count_from_speaker = opts.cmn_window - cur_count, + speaker_count = speaker_stats(0, dim); + if (count_from_speaker > opts.speaker_frames) + count_from_speaker = opts.speaker_frames; + if (count_from_speaker > speaker_count) + count_from_speaker = speaker_count; + if (count_from_speaker > 0.0) + stats->AddMat(count_from_speaker / speaker_count, + speaker_stats); + cur_count = (*stats)(0, dim); + } + if (cur_count >= opts.cmn_window) + return; + if (global_stats.NumRows() != 0) { + double count_from_global = opts.cmn_window - cur_count, + global_count = global_stats(0, dim); + KALDI_ASSERT(global_count > 0.0); + if (count_from_global > opts.global_frames) + count_from_global = opts.global_frames; + if (count_from_global > 0.0) + stats->AddMat(count_from_global / global_count, + global_stats); + } else { + KALDI_ERR << "Global CMN stats are required"; + } +} + +void OnlineCmvn::GetFrame(int32 frame, + VectorBase *feat) { + src_->GetFrame(frame, feat); + KALDI_ASSERT(feat->Dim() == this->Dim()); + int32 dim = feat->Dim(); + Matrix &stats(temp_stats_); + stats.Resize(2, dim + 1, kUndefined); // Will do nothing if size was correct. + if (frozen_state_.NumRows() != 0) { // the CMVN state has been frozen. + stats.CopyFromMat(frozen_state_); + } else { + // first get the raw CMVN stats (this involves caching..) + this->ComputeStatsForFrame(frame, &stats); + // now smooth them. 
+ SmoothOnlineCmvnStats(orig_state_.speaker_cmvn_stats, + orig_state_.global_cmvn_stats, + opts_, + &stats); + } + + if (!skip_dims_.empty()) + FakeStatsForSomeDims(skip_dims_, &stats); + + // call the function ApplyCmvn declared in ../transform/cmvn.h, which + // requires a matrix. + // 1 row; num-cols == dim; stride == dim. + SubMatrix feat_mat(feat->Data(), 1, dim, dim); + // the function ApplyCmvn takes a matrix, so form a one-row matrix to give it. + if (opts_.normalize_mean) + ApplyCmvn(stats, opts_.normalize_variance, &feat_mat); + else + KALDI_ASSERT(!opts_.normalize_variance); +} + +void OnlineCmvn::Freeze(int32 cur_frame) { + int32 dim = this->Dim(); + Matrix stats(2, dim + 1); + // get the raw CMVN stats + this->ComputeStatsForFrame(cur_frame, &stats); + // now smooth them. + SmoothOnlineCmvnStats(orig_state_.speaker_cmvn_stats, + orig_state_.global_cmvn_stats, + opts_, + &stats); + this->frozen_state_ = stats; +} + +void OnlineCmvn::GetState(int32 cur_frame, + OnlineCmvnState *state_out) { + *state_out = this->orig_state_; + { // This block updates state_out->speaker_cmvn_stats + int32 dim = this->Dim(); + if (state_out->speaker_cmvn_stats.NumRows() == 0) + state_out->speaker_cmvn_stats.Resize(2, dim + 1); + Vector feat(dim); + Vector feat_dbl(dim); + for (int32 t = 0; t <= cur_frame; t++) { + src_->GetFrame(t, &feat); + feat_dbl.CopyFromVec(feat); + state_out->speaker_cmvn_stats(0, dim) += 1.0; + state_out->speaker_cmvn_stats.Row(0).Range(0, dim).AddVec(1.0, feat_dbl); + state_out->speaker_cmvn_stats.Row(1).Range(0, dim).AddVec2(1.0, feat_dbl); + } + } + // Store any frozen state (the effect of the user possibly + // having called Freeze(). 
+ state_out->frozen_state = frozen_state_; +} + +void OnlineCmvn::SetState(const OnlineCmvnState &cmvn_state) { + KALDI_ASSERT(cached_stats_modulo_.empty() && + "You cannot call SetState() after processing data."); + orig_state_ = cmvn_state; + frozen_state_ = cmvn_state.frozen_state; +} + +int32 OnlineSpliceFrames::NumFramesReady() const { + int32 num_frames = src_->NumFramesReady(); + if (num_frames > 0 && src_->IsLastFrame(num_frames - 1)) + return num_frames; + else + return std::max(0, num_frames - right_context_); +} + +void OnlineSpliceFrames::GetFrame(int32 frame, VectorBase *feat) { + KALDI_ASSERT(left_context_ >= 0 && right_context_ >= 0); + KALDI_ASSERT(frame >= 0 && frame < NumFramesReady()); + int32 dim_in = src_->Dim(); + KALDI_ASSERT(feat->Dim() == dim_in * (1 + left_context_ + right_context_)); + int32 T = src_->NumFramesReady(); + for (int32 t2 = frame - left_context_; t2 <= frame + right_context_; t2++) { + int32 t2_limited = t2; + if (t2_limited < 0) t2_limited = 0; + if (t2_limited >= T) t2_limited = T - 1; + int32 n = t2 - (frame - left_context_); // 0 for left-most frame, + // increases to the right. + SubVector part(*feat, n * dim_in, dim_in); + src_->GetFrame(t2_limited, &part); + } +} + +OnlineTransform::OnlineTransform(const MatrixBase &transform, + OnlineFeatureInterface *src): + src_(src) { + int32 src_dim = src_->Dim(); + if (transform.NumCols() == src_dim) { // Linear transform + linear_term_ = transform; + offset_.Resize(transform.NumRows()); // Resize() will zero it. 
+ } else if (transform.NumCols() == src_dim + 1) { // Affine transform + linear_term_ = transform.Range(0, transform.NumRows(), 0, src_dim); + offset_.Resize(transform.NumRows()); + offset_.CopyColFromMat(transform, src_dim); + } else { + KALDI_ERR << "Dimension mismatch: source features have dimension " + << src_dim << " and LDA #cols is " << transform.NumCols(); + } +} + +void OnlineTransform::GetFrame(int32 frame, VectorBase *feat) { + Vector input_feat(linear_term_.NumCols()); + src_->GetFrame(frame, &input_feat); + feat->CopyFromVec(offset_); + feat->AddMatVec(1.0, linear_term_, kNoTrans, input_feat, 1.0); +} + +void OnlineTransform::GetFrames( + const std::vector &frames, MatrixBase *feats) { + KALDI_ASSERT(static_cast(frames.size()) == feats->NumRows()); + int32 num_frames = feats->NumRows(), + input_dim = linear_term_.NumCols(); + Matrix input_feats(num_frames, input_dim, kUndefined); + src_->GetFrames(frames, &input_feats); + feats->CopyRowsFromVec(offset_); + feats->AddMatMat(1.0, input_feats, kNoTrans, linear_term_, kTrans, 1.0); +} + + +int32 OnlineDeltaFeature::Dim() const { + int32 src_dim = src_->Dim(); + return src_dim * (1 + opts_.order); +} + +int32 OnlineDeltaFeature::NumFramesReady() const { + int32 num_frames = src_->NumFramesReady(), + context = opts_.order * opts_.window; + // "context" is the number of frames on the left or (more relevant + // here) right which we need in order to produce the output. + if (num_frames > 0 && src_->IsLastFrame(num_frames-1)) + return num_frames; + else + return std::max(0, num_frames - context); +} + +void OnlineDeltaFeature::GetFrame(int32 frame, + VectorBase *feat) { + KALDI_ASSERT(frame >= 0 && frame < NumFramesReady()); + KALDI_ASSERT(feat->Dim() == Dim()); + // We'll produce a temporary matrix containing the features we want to + // compute deltas on, but truncated to the necessary context. 
+ int32 context = opts_.order * opts_.window; + int32 left_frame = frame - context, + right_frame = frame + context, + src_frames_ready = src_->NumFramesReady(); + if (left_frame < 0) left_frame = 0; + if (right_frame >= src_frames_ready) + right_frame = src_frames_ready - 1; + KALDI_ASSERT(right_frame >= left_frame); + int32 temp_num_frames = right_frame + 1 - left_frame, + src_dim = src_->Dim(); + Matrix temp_src(temp_num_frames, src_dim); + for (int32 t = left_frame; t <= right_frame; t++) { + SubVector temp_row(temp_src, t - left_frame); + src_->GetFrame(t, &temp_row); + } + int32 temp_t = frame - left_frame; // temp_t is the offset of frame "frame" + // within temp_src + delta_features_.Process(temp_src, temp_t, feat); +} + + +OnlineDeltaFeature::OnlineDeltaFeature(const DeltaFeaturesOptions &opts, + OnlineFeatureInterface *src): + src_(src), opts_(opts), delta_features_(opts) { } + +void OnlineCacheFeature::GetFrame(int32 frame, VectorBase *feat) { + KALDI_ASSERT(frame >= 0); + if (static_cast(frame) < cache_.size() && cache_[frame] != NULL) { + feat->CopyFromVec(*(cache_[frame])); + } else { + if (static_cast(frame) >= cache_.size()) + cache_.resize(frame + 1, NULL); + int32 dim = this->Dim(); + cache_[frame] = new Vector(dim); + // The following call will crash if frame "frame" is not ready. + src_->GetFrame(frame, cache_[frame]); + feat->CopyFromVec(*(cache_[frame])); + } +} + +void OnlineCacheFeature::GetFrames( + const std::vector &frames, MatrixBase *feats) { + int32 num_frames = frames.size(); + // non_cached_frames will be the subset of 't' values in 'frames' which were + // not previously cached, which we therefore need to get from src_. + std::vector non_cached_frames; + // 'non_cached_indexes' stores the indexes 'i' into 'frames' corresponding to + // the corresponding frames in 'non_cached_frames'. 
+ std::vector non_cached_indexes; + non_cached_frames.reserve(frames.size()); + non_cached_indexes.reserve(frames.size()); + for (int32 i = 0; i < num_frames; i++) { + int32 t = frames[i]; + if (static_cast(t) < cache_.size() && cache_[t] != NULL) { + feats->Row(i).CopyFromVec(*(cache_[t])); + } else { + non_cached_frames.push_back(t); + non_cached_indexes.push_back(i); + } + } + if (non_cached_frames.empty()) + return; + int32 num_non_cached_frames = non_cached_frames.size(), + dim = this->Dim(); + Matrix non_cached_feats(num_non_cached_frames, dim, + kUndefined); + src_->GetFrames(non_cached_frames, &non_cached_feats); + for (int32 i = 0; i < num_non_cached_frames; i++) { + int32 t = non_cached_frames[i]; + if (static_cast(t) < cache_.size() && cache_[t] != NULL) { + // We can reach this point due to repeat indexes in 'non_cached_frames'. + feats->Row(non_cached_indexes[i]).CopyFromVec(*(cache_[t])); + } else { + SubVector this_feat(non_cached_feats, i); + feats->Row(non_cached_indexes[i]).CopyFromVec(this_feat); + if (static_cast(t) >= cache_.size()) + cache_.resize(t + 1, NULL); + cache_[t] = new Vector(this_feat); + } + } +} + + +void OnlineCacheFeature::ClearCache() { + for (size_t i = 0; i < cache_.size(); i++) + delete cache_[i]; + cache_.resize(0); +} + + +void OnlineAppendFeature::GetFrame(int32 frame, VectorBase *feat) { + KALDI_ASSERT(feat->Dim() == Dim()); + + SubVector feat1(*feat, 0, src1_->Dim()); + SubVector feat2(*feat, src1_->Dim(), src2_->Dim()); + src1_->GetFrame(frame, &feat1); + src2_->GetFrame(frame, &feat2); +}; + + +} // namespace kaldi diff --git a/speechx/speechx/kaldi/feat/online-feature.h b/speechx/speechx/kaldi/feat/online-feature.h new file mode 100644 index 00000000..f2ebe45b --- /dev/null +++ b/speechx/speechx/kaldi/feat/online-feature.h @@ -0,0 +1,632 @@ +// feat/online-feature.h + +// Copyright 2013 Johns Hopkins University (author: Daniel Povey) +// 2014 Yanqing Sun, Junjie Wang, +// Daniel Povey, Korbinian Riedhammer + +// See 
../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_FEAT_ONLINE_FEATURE_H_ +#define KALDI_FEAT_ONLINE_FEATURE_H_ + +#include +#include +#include + +#include "matrix/matrix-lib.h" +#include "util/common-utils.h" +#include "base/kaldi-error.h" +#include "feat/feature-functions.h" +#include "feat/feature-mfcc.h" +#include "feat/feature-plp.h" +#include "feat/feature-fbank.h" +#include "itf/online-feature-itf.h" + +namespace kaldi { +/// @addtogroup onlinefeat OnlineFeatureExtraction +/// @{ + + +/// This class serves as a storage for feature vectors with an option to limit +/// the memory usage by removing old elements. The deleted frames indices are +/// "remembered" so that regardless of the MAX_ITEMS setting, the user always +/// provides the indices as if no deletion was being performed. +/// This is useful when processing very long recordings which would otherwise +/// cause the memory to eventually blow up when the features are not being removed. +class RecyclingVector { +public: + /// By default it does not remove any elements. + RecyclingVector(int items_to_hold = -1); + + /// The ownership is being retained by this collection - do not delete the item. + Vector *At(int index) const; + + /// The ownership of the item is passed to this collection - do not delete the item. 
+ void PushBack(Vector *item); + + /// This method returns the size as if no "recycling" had happened, + /// i.e. equivalent to the number of times the PushBack method has been called. + int Size() const; + + ~RecyclingVector(); + +private: + std::deque*> items_; + int items_to_hold_; + int first_available_index_; +}; + + +/// This is a templated class for online feature extraction; +/// it's templated on a class like MfccComputer or PlpComputer +/// that does the basic feature extraction. +template +class OnlineGenericBaseFeature: public OnlineBaseFeature { + public: + // + // First, functions that are present in the interface: + // + virtual int32 Dim() const { return computer_.Dim(); } + + // Note: IsLastFrame() will only ever return true if you have called + // InputFinished() (and this frame is the last frame). + virtual bool IsLastFrame(int32 frame) const { + return input_finished_ && frame == NumFramesReady() - 1; + } + virtual BaseFloat FrameShiftInSeconds() const { + return computer_.GetFrameOptions().frame_shift_ms / 1000.0f; + } + + virtual int32 NumFramesReady() const { return features_.Size(); } + + virtual void GetFrame(int32 frame, VectorBase *feat); + + // Next, functions that are not in the interface. + + + // Constructor from options class + explicit OnlineGenericBaseFeature(const typename C::Options &opts); + + // This would be called from the application, when you get + // more wave data. Note: the sampling_rate is only provided so + // the code can assert that it matches the sampling rate + // expected in the options. + virtual void AcceptWaveform(BaseFloat sampling_rate, + const VectorBase &waveform); + + + // InputFinished() tells the class you won't be providing any + // more waveform. This will help flush out the last frame or two + // of features, in the case where snip-edges == false; it also + // affects the return value of IsLastFrame(). 
+ virtual void InputFinished(); + + private: + // This function computes any additional feature frames that it is possible to + // compute from 'waveform_remainder_', which at this point may contain more + // than just a remainder-sized quantity (because AcceptWaveform() appends to + // waveform_remainder_ before calling this function). It adds these feature + // frames to features_, and shifts off any now-unneeded samples of input from + // waveform_remainder_ while incrementing waveform_offset_ by the same amount. + void ComputeFeatures(); + + void MaybeCreateResampler(BaseFloat sampling_rate); + + C computer_; // class that does the MFCC or PLP or filterbank computation + + // resampler in cases when the input sampling frequency is not equal to + // the expected sampling rate + std::unique_ptr resampler_; + + FeatureWindowFunction window_function_; + + // features_ is the Mfcc or Plp or Fbank features that we have already computed. + + RecyclingVector features_; + + // True if the user has called "InputFinished()" + bool input_finished_; + + // The sampling frequency, extracted from the config. Should + // be identical to the waveform supplied. + BaseFloat sampling_frequency_; + + // waveform_offset_ is the number of samples of waveform that we have + // already discarded, i.e. that were prior to 'waveform_remainder_'. + int64 waveform_offset_; + + // waveform_remainder_ is a short piece of waveform that we may need to keep + // after extracting all the whole frames we can (whatever length of feature + // will be required for the next phase of computation). + Vector waveform_remainder_; +}; + +typedef OnlineGenericBaseFeature OnlineMfcc; +typedef OnlineGenericBaseFeature OnlinePlp; +typedef OnlineGenericBaseFeature OnlineFbank; + + +/// This class takes a Matrix and wraps it as an +/// OnlineFeatureInterface: this can be useful where some earlier stage of +/// feature processing has been done offline but you want to use part of the +/// online pipeline. 
+class OnlineMatrixFeature: public OnlineFeatureInterface { + public: + /// Caution: this class maintains the const reference from the constructor, so + /// don't let it go out of scope while this object exists. + explicit OnlineMatrixFeature(const MatrixBase &mat): mat_(mat) { } + + virtual int32 Dim() const { return mat_.NumCols(); } + + virtual BaseFloat FrameShiftInSeconds() const { + return 0.01f; + } + + virtual int32 NumFramesReady() const { return mat_.NumRows(); } + + virtual void GetFrame(int32 frame, VectorBase *feat) { + feat->CopyFromVec(mat_.Row(frame)); + } + + virtual bool IsLastFrame(int32 frame) const { + return (frame + 1 == mat_.NumRows()); + } + + + private: + const MatrixBase &mat_; +}; + + +// Note the similarity with SlidingWindowCmnOptions, but there +// are also differences. One which doesn't appear in the config +// itself, because it's a difference between the setups, is that +// in OnlineCmn, we carry over data from the previous utterance, +// or, if no previous utterance is available, from global stats, +// or, if previous utterances are available but the total amount +// of data is less than prev_frames, we pad with up to "global_frames" +// frames from the global stats. +struct OnlineCmvnOptions { + int32 cmn_window; + int32 speaker_frames; // must be <= cmn_window + int32 global_frames; // must be <= speaker_frames. + bool normalize_mean; // Must be true if normalize_variance==true. + bool normalize_variance; + + int32 modulus; // not configurable from command line, relates to how the + // class computes the cmvn internally. smaller->more + // time-efficient but less memory-efficient. Must be >= 1. + int32 ring_buffer_size; // not configurable from command line; size of ring + // buffer used for caching CMVN stats. Must be >= + // modulus. + std::string skip_dims; // Colon-separated list of dimensions to skip normalization + // of, e.g. 13:14:15. 
+ + OnlineCmvnOptions(): + cmn_window(600), + speaker_frames(600), + global_frames(200), + normalize_mean(true), + normalize_variance(false), + modulus(20), + ring_buffer_size(20), + skip_dims("") { } + + void Check() const { + KALDI_ASSERT(speaker_frames <= cmn_window && global_frames <= speaker_frames + && modulus > 0); + } + + void Register(ParseOptions *po) { + po->Register("cmn-window", &cmn_window, "Number of frames of sliding " + "context for cepstral mean normalization."); + po->Register("global-frames", &global_frames, "Number of frames of " + "global-average cepstral mean normalization stats to use for " + "first utterance of a speaker"); + po->Register("speaker-frames", &speaker_frames, "Number of frames of " + "previous utterance(s) from this speaker to use in cepstral " + "mean normalization"); + // we name the config string "norm-vars" for compatibility with + // ../featbin/apply-cmvn.cc + po->Register("norm-vars", &normalize_variance, "If true, do " + "cepstral variance normalization in addition to cepstral mean " + "normalization "); + po->Register("norm-means", &normalize_mean, "If true, do mean normalization " + "(note: you cannot normalize the variance but not the mean)"); + po->Register("skip-dims", &skip_dims, "Dimensions to skip normalization of " + "(colon-separated list of integers)");} +}; + + + +/** Struct OnlineCmvnState stores the state of CMVN adaptation between + utterances (but not the state of the computation within an utterance). It + stores the global CMVN stats and the stats of the current speaker (if we + have seen previous utterances for this speaker), and possibly will have a + member "frozen_state": if the user has called the function Freeze() of class + OnlineCmvn, to fix the CMVN so we can estimate fMLLR on top of the fixed + value of cmvn. If nonempty, "frozen_state" will reflect how we were + normalizing the mean and (if applicable) variance at the time when that + function was called. 
+*/ +struct OnlineCmvnState { + // The following is the total CMVN stats for this speaker (up till now), in + // the same format. + Matrix speaker_cmvn_stats; + + // The following is the global CMVN stats, in the usual + // format, of dimension 2 x (dim+1), as [ sum-stats count + // sum-squared-stats 0 ] + Matrix global_cmvn_stats; + + // If nonempty, contains CMVN stats representing the "frozen" state + // of CMVN that reflects how we were normalizing the data when the + // user called the Freeze() function in class OnlineCmvn. + Matrix frozen_state; + + OnlineCmvnState() { } + + explicit OnlineCmvnState(const Matrix &global_stats): + global_cmvn_stats(global_stats) { } + + // Copy constructor + OnlineCmvnState(const OnlineCmvnState &other); + + void Write(std::ostream &os, bool binary) const; + void Read(std::istream &is, bool binary); + + // Use the default assignment operator. +}; + +/** + This class does an online version of the cepstral mean and [optionally] + variance, but note that this is not equivalent to the offline version. This + is necessarily so, as the offline computation involves looking into the + future. If you plan to use features normalized with this type of CMVN then + you need to train in a `matched' way, i.e. with the same type of features. + We normally only do so in the "online" GMM-based decoding, e.g. in + online2bin/online2-wav-gmm-latgen-faster.cc; see also the script + steps/online/prepare_online_decoding.sh and steps/online/decode.sh. + + In the steady state (in the middle of a long utterance), this class + accumulates CMVN statistics from the previous "cmn_window" frames (default 600 + frames, or 6 seconds), and uses these to normalize the mean and possibly + variance of the current frame. + + The config variables "speaker_frames" and "global_frames" relate to what + happens at the beginning of the utterance when we have seen fewer than + "cmn_window" frames of context, and so might not have very good stats to + normalize with. 
Basically, we first augment any existing stats with up + to "speaker_frames" frames of stats from previous utterances of the current + speaker, and if this doesn't take us up to the required "cmn_window" frame + count, we further augment with up to "global_frames" frames of global + stats. The global stats are CMVN stats accumulated from training or testing + data, that give us a reasonable source of mean and variance for "typical" + data. + */ +class OnlineCmvn: public OnlineFeatureInterface { + public: + + // + // First, functions that are present in the interface: + // + virtual int32 Dim() const { return src_->Dim(); } + + virtual bool IsLastFrame(int32 frame) const { + return src_->IsLastFrame(frame); + } + virtual BaseFloat FrameShiftInSeconds() const { + return src_->FrameShiftInSeconds(); + } + + // The online cmvn does not introduce any additional latency. + virtual int32 NumFramesReady() const { return src_->NumFramesReady(); } + + virtual void GetFrame(int32 frame, VectorBase *feat); + + // + // Next, functions that are not in the interface. + // + + /// Initializer that sets the cmvn state. If you don't have previous + /// utterances from the same speaker you are supposed to initialize the CMVN + /// state from some global CMVN stats, which you can get from summing all cmvn + /// stats you have in your training data using "sum-matrix". This just gives + /// it a reasonable starting point at the start of the file. + /// If you do have previous utterances from the same speaker or at least a + /// similar environment, you are supposed to initialize it by calling GetState + /// from the previous utterance + OnlineCmvn(const OnlineCmvnOptions &opts, + const OnlineCmvnState &cmvn_state, + OnlineFeatureInterface *src); + + /// Initializer that does not set the cmvn state: + /// after calling this, you should call SetState(). 
+ OnlineCmvn(const OnlineCmvnOptions &opts, + OnlineFeatureInterface *src); + + // Outputs any state information from this utterance to "cmvn_state". + // The value of "cmvn_state" before the call does not matter: the output + // depends on the value of OnlineCmvnState the class was initialized + // with, the input feature values up to cur_frame, and the effects + // of the user possibly having called Freeze(). + // If cur_frame is -1, it will just output the unmodified original + // state that was supplied to this object. + void GetState(int32 cur_frame, + OnlineCmvnState *cmvn_state); + + // This function can be used to modify the state of the CMVN computation + // from outside, but must only be called before you have processed any data + // (otherwise it will crash). This "state" is really just the information + // that is propagated between utterances, not the state of the computation + // inside an utterance. + void SetState(const OnlineCmvnState &cmvn_state); + + // From this point it will freeze the CMN to what it would have been if + // measured at frame "cur_frame", and it will stop it from changing + // further. This also applies retroactively for this utterance, so if you + // call GetFrame() on previous frames, it will use the CMVN stats + // from cur_frame; and it applies in the future too if you then + // call OutputState() and use this state to initialize the next + // utterance's CMVN object. + void Freeze(int32 cur_frame); + + virtual ~OnlineCmvn(); + private: + + /// Smooth the CMVN stats "stats" (which are stored in the normal format as a + /// 2 x (dim+1) matrix), by possibly adding some stats from "global_stats" + /// and/or "speaker_stats", controlled by the config. The best way to + /// understand the smoothing rule we use is just to look at the code. 
+ static void SmoothOnlineCmvnStats(const MatrixBase &speaker_stats, + const MatrixBase &global_stats, + const OnlineCmvnOptions &opts, + MatrixBase *stats); + + /// Get the most recent cached frame of CMVN stats. [If no frames + /// were cached, sets up empty stats for frame zero and returns that]. + void GetMostRecentCachedFrame(int32 frame, + int32 *cached_frame, + MatrixBase *stats); + + /// Cache this frame of stats. + void CacheFrame(int32 frame, const MatrixBase &stats); + + /// Initialize ring buffer for caching stats. + inline void InitRingBufferIfNeeded(); + + /// Computes the raw CMVN stats for this frame, making use of (and updating if + /// necessary) the cached statistics in raw_stats_. This means the (x, + /// x^2, count) stats for the last up to opts_.cmn_window frames. + void ComputeStatsForFrame(int32 frame, + MatrixBase *stats); + + + OnlineCmvnOptions opts_; + std::vector skip_dims_; // Skip CMVN for these dimensions. Derived from opts_. + OnlineCmvnState orig_state_; // reflects the state before we saw this + // utterance. + Matrix frozen_state_; // If the user called Freeze(), this variable + // will reflect the CMVN state that we froze + // at. + + // The variable below reflects the raw (count, x, x^2) statistics of the + // input, computed every opts_.modulus frames. raw_stats_[n / opts_.modulus] + // contains the (count, x, x^2) statistics for the frames from + // std::max(0, n - opts_.cmn_window) through n. + std::vector*> cached_stats_modulo_; + // the variable below is a ring-buffer of cached stats. the int32 is the + // frame index. + std::vector > > cached_stats_ring_; + + // Some temporary variables used inside functions of this class, which + // put here to avoid reallocation. 
+ Matrix temp_stats_; + Vector temp_feats_; + Vector temp_feats_dbl_; + + OnlineFeatureInterface *src_; // Not owned here +}; + + +struct OnlineSpliceOptions { + int32 left_context; + int32 right_context; + OnlineSpliceOptions(): left_context(4), right_context(4) { } + void Register(ParseOptions *po) { + po->Register("left-context", &left_context, "Left-context for frame " + "splicing prior to LDA"); + po->Register("right-context", &right_context, "Right-context for frame " + "splicing prior to LDA"); + } +}; + +class OnlineSpliceFrames: public OnlineFeatureInterface { + public: + // + // First, functions that are present in the interface: + // + virtual int32 Dim() const { + return src_->Dim() * (1 + left_context_ + right_context_); + } + + virtual bool IsLastFrame(int32 frame) const { + return src_->IsLastFrame(frame); + } + virtual BaseFloat FrameShiftInSeconds() const { + return src_->FrameShiftInSeconds(); + } + + virtual int32 NumFramesReady() const; + + virtual void GetFrame(int32 frame, VectorBase *feat); + + // + // Next, functions that are not in the interface. + // + OnlineSpliceFrames(const OnlineSpliceOptions &opts, + OnlineFeatureInterface *src): + left_context_(opts.left_context), right_context_(opts.right_context), + src_(src) { } + + private: + int32 left_context_; + int32 right_context_; + OnlineFeatureInterface *src_; // Not owned here +}; + +/// This online-feature class implements any affine or linear transform. 
+class OnlineTransform: public OnlineFeatureInterface {
+ public:
+  //
+  // First, functions that are present in the interface:
+  //
+
+  // The output dimension is taken from the offset vector, not from src_.
+  virtual int32 Dim() const { return offset_.Dim(); }
+
+  // Frame bookkeeping is forwarded unchanged to the source feature.
+  virtual bool IsLastFrame(int32 frame) const {
+    return src_->IsLastFrame(frame);
+  }
+  virtual BaseFloat FrameShiftInSeconds() const {
+    return src_->FrameShiftInSeconds();
+  }
+
+  virtual int32 NumFramesReady() const { return src_->NumFramesReady(); }
+
+  virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);
+
+  // Batch variant of GetFrame(); presumably row i of 'feats' corresponds to
+  // frames[i] -- confirm against the definition in online-feature.cc.
+  virtual void GetFrames(const std::vector<int32> &frames,
+                         MatrixBase<BaseFloat> *feats);
+
+  //
+  // Next, functions that are not in the interface.
+  //
+
+  /// The transform can be a linear transform, or an affine transform
+  /// where the last column is the offset.
+  OnlineTransform(const MatrixBase<BaseFloat> &transform,
+                  OnlineFeatureInterface *src);
+
+
+ private:
+  OnlineFeatureInterface *src_;  // Not owned here
+  Matrix<BaseFloat> linear_term_;
+  Vector<BaseFloat> offset_;
+};
+
+/// Online feature built from a source feature, a DeltaFeaturesOptions and a
+/// DeltaFeatures object; Dim(), NumFramesReady() and GetFrame() are defined
+/// out of line (presumably in online-feature.cc).
+class OnlineDeltaFeature: public OnlineFeatureInterface {
+ public:
+  //
+  // First, functions that are present in the interface:
+  //
+  virtual int32 Dim() const;
+
+  virtual bool IsLastFrame(int32 frame) const {
+    return src_->IsLastFrame(frame);
+  }
+  virtual BaseFloat FrameShiftInSeconds() const {
+    return src_->FrameShiftInSeconds();
+  }
+
+  virtual int32 NumFramesReady() const;
+
+  virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);
+
+  //
+  // Next, functions that are not in the interface.
+  //
+  OnlineDeltaFeature(const DeltaFeaturesOptions &opts,
+                     OnlineFeatureInterface *src);
+
+ private:
+  OnlineFeatureInterface *src_;  // Not owned here
+  DeltaFeaturesOptions opts_;
+  DeltaFeatures delta_features_;  // This class contains just a few
+                                  // coefficients.
+};
+
+
+/// This feature type can be used to cache its input, to avoid
+/// repetition of computation in a multi-pass decoding context.
+class OnlineCacheFeature: public OnlineFeatureInterface {
+ public:
+  // Dimension and frame bookkeeping are forwarded unchanged to src_.
+  virtual int32 Dim() const { return src_->Dim(); }
+
+  virtual bool IsLastFrame(int32 frame) const {
+    return src_->IsLastFrame(frame);
+  }
+  virtual BaseFloat FrameShiftInSeconds() const {
+    return src_->FrameShiftInSeconds();
+  }
+
+  virtual int32 NumFramesReady() const { return src_->NumFramesReady(); }
+
+  virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);
+
+  // Batch lookup: row i of 'feats' receives the features for frames[i].
+  // Frames already in the cache are copied from it; the remaining ones are
+  // requested from src_ in a single GetFrames() call and then cached.
+  virtual void GetFrames(const std::vector<int32> &frames,
+                         MatrixBase<BaseFloat> *feats);
+
+  // Frees every cached frame via ClearCache(); src_ is not deleted.
+  virtual ~OnlineCacheFeature() { ClearCache(); }
+
+  // Things that are not in the shared interface:
+
+  void ClearCache();  // this should be called if you change the underlying
+                      // features in some way.
+
+  explicit OnlineCacheFeature(OnlineFeatureInterface *src): src_(src) { }
+ private:
+
+  OnlineFeatureInterface *src_;  // Not owned here
+  // Indexed by frame number. Each entry is either NULL (frame not cached)
+  // or a heap-allocated copy of that frame, released in ClearCache().
+  std::vector<Vector<BaseFloat>* > cache_;
+};
+
+
+
+
+/// This online-feature class implements combination of two feature
+/// streams (such as pitch, plp) into one stream.
+class OnlineAppendFeature: public OnlineFeatureInterface { + public: + virtual int32 Dim() const { return src1_->Dim() + src2_->Dim(); } + + virtual bool IsLastFrame(int32 frame) const { + return (src1_->IsLastFrame(frame) || src2_->IsLastFrame(frame)); + } + // Hopefully sources have the same rate + virtual BaseFloat FrameShiftInSeconds() const { + return src1_->FrameShiftInSeconds(); + } + + virtual int32 NumFramesReady() const { + return std::min(src1_->NumFramesReady(), src2_->NumFramesReady()); + } + + virtual void GetFrame(int32 frame, VectorBase *feat); + + virtual ~OnlineAppendFeature() { } + + OnlineAppendFeature(OnlineFeatureInterface *src1, + OnlineFeatureInterface *src2): src1_(src1), src2_(src2) { } + private: + + OnlineFeatureInterface *src1_; + OnlineFeatureInterface *src2_; +}; + +/// @} End of "addtogroup onlinefeat" +} // namespace kaldi + +#endif // KALDI_FEAT_ONLINE_FEATURE_H_ diff --git a/speechx/speechx/kaldi/feat/pitch-functions.cc b/speechx/speechx/kaldi/feat/pitch-functions.cc new file mode 100644 index 00000000..430e9bdb --- /dev/null +++ b/speechx/speechx/kaldi/feat/pitch-functions.cc @@ -0,0 +1,1667 @@ +// feat/pitch-functions.cc + +// Copyright 2013 Pegah Ghahremani +// 2014 IMSL, PKU-HKUST (author: Wei Shi) +// 2014 Yanqing Sun, Junjie Wang, +// Daniel Povey, Korbinian Riedhammer +// Xin Lei + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. 
+// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "feat/feature-functions.h" +#include "feat/mel-computations.h" +#include "feat/online-feature.h" +#include "feat/pitch-functions.h" +#include "feat/resample.h" +#include "matrix/matrix-functions.h" + +namespace kaldi { + +/** + This function processes the NCCF n to a POV feature f by applying the formula + f = (1.0001 - n)^0.15 - 1.0 + This is a nonlinear function designed to make the output reasonably Gaussian + distributed. Before doing this, the NCCF distribution is in the range [-1, + 1] but has a strong peak just before 1.0, which this function smooths out. +*/ + +BaseFloat NccfToPovFeature(BaseFloat n) { + if (n > 1.0) { + n = 1.0; + } else if (n < -1.0) { + n = -1.0; + } + BaseFloat f = pow((1.0001 - n), 0.15) - 1.0; + KALDI_ASSERT(f - f == 0); // check for NaN,inf. + return f; +} + +/** + This function processes the NCCF n to a reasonably accurate probability + of voicing p by applying the formula: + + n' = fabs(n) + r = -5.2 + 5.4 * exp(7.5 * (n' - 1.0)) + + 4.8 * n' - 2.0 * exp(-10.0 * n') + 4.2 * exp(20.0 * (n' - 1.0)); + p = 1.0 / (1 + exp(-1.0 * r)); + + How did we get this formula? We plotted the empirical log-prob-ratio of voicing + r = log( p[voiced] / p[not-voiced] ) + [on the Keele database where voicing is marked], as a function of the NCCF at + the delay picked by our algorithm. This was done on intervals of the NCCF, so + we had enough statistics to get that ratio. The NCCF covers [-1, 1]; almost + all of the probability mass is on [0, 1] but the empirical POV seems fairly + symmetric with a minimum near zero, so we chose to make it a function of n' = fabs(n). 
+ + Then we manually tuned a function (the one you see above) that approximated + the log-prob-ratio of voicing fairly well as a function of the absolute-value + NCCF n'; however, wasn't a very exact match since we were also trying to make + the transformed NCCF fairly Gaussian distributed, with a view to using it as + a feature-- an idea we later abandoned after a simpler formula worked better. + */ +BaseFloat NccfToPov(BaseFloat n) { + BaseFloat ndash = fabs(n); + if (ndash > 1.0) ndash = 1.0; // just in case it was slightly outside [-1, 1] + + BaseFloat r = -5.2 + 5.4 * Exp(7.5 * (ndash - 1.0)) + 4.8 * ndash - + 2.0 * Exp(-10.0 * ndash) + 4.2 * Exp(20.0 * (ndash - 1.0)); + // r is the approximate log-prob-ratio of voicing, log(p/(1-p)). + BaseFloat p = 1.0 / (1 + Exp(-1.0 * r)); + KALDI_ASSERT(p - p == 0); // Check for NaN/inf + return p; +} + +/** + This function computes some dot products that are required + while computing the NCCF. + For each integer lag from start to end-1, this function + outputs to (*inner_prod)(lag - start), the dot-product + of a window starting at 0 with a window starting at + lag. All windows are of length nccf_window_size. It + outputs to (*norm_prod)(lag - start), e1 * e2, where + e1 is the dot-product of the un-shifted window with itself, + and d2 is the dot-product of the window shifted by "lag" + with itself. + */ +void ComputeCorrelation(const VectorBase &wave, + int32 first_lag, int32 last_lag, + int32 nccf_window_size, + VectorBase *inner_prod, + VectorBase *norm_prod) { + Vector zero_mean_wave(wave); + // TODO: possibly fix this, the mean normalization is done in a strange way. 
+ SubVector wave_part(wave, 0, nccf_window_size); + // subtract mean-frame from wave + zero_mean_wave.Add(-wave_part.Sum() / nccf_window_size); + BaseFloat e1, e2, sum; + SubVector sub_vec1(zero_mean_wave, 0, nccf_window_size); + e1 = VecVec(sub_vec1, sub_vec1); + for (int32 lag = first_lag; lag <= last_lag; lag++) { + SubVector sub_vec2(zero_mean_wave, lag, nccf_window_size); + e2 = VecVec(sub_vec2, sub_vec2); + sum = VecVec(sub_vec1, sub_vec2); + (*inner_prod)(lag - first_lag) = sum; + (*norm_prod)(lag - first_lag) = e1 * e2; + } +} + +/** + Computes the NCCF as a fraction of the numerator term (a dot product between + two vectors) and a denominator term which equals sqrt(e1*e2 + nccf_ballast) + where e1 and e2 are both dot-products of bits of the wave with themselves, + and e1*e2 is supplied as "norm_prod". These quantities are computed by + "ComputeCorrelation". +*/ +void ComputeNccf(const VectorBase &inner_prod, + const VectorBase &norm_prod, + BaseFloat nccf_ballast, + VectorBase *nccf_vec) { + KALDI_ASSERT(inner_prod.Dim() == norm_prod.Dim() && + inner_prod.Dim() == nccf_vec->Dim()); + for (int32 lag = 0; lag < inner_prod.Dim(); lag++) { + BaseFloat numerator = inner_prod(lag), + denominator = pow(norm_prod(lag) + nccf_ballast, 0.5), + nccf; + if (denominator != 0.0) { + nccf = numerator / denominator; + } else { + KALDI_ASSERT(numerator == 0.0); + nccf = 0.0; + } + KALDI_ASSERT(nccf < 1.01 && nccf > -1.01); + (*nccf_vec)(lag) = nccf; + } +} + +/** + This function selects the lags at which we measure the NCCF: we need + to select lags from 1/max_f0 to 1/min_f0, in a geometric progression + with ratio 1 + d. 
+ */ +void SelectLags(const PitchExtractionOptions &opts, + Vector *lags) { + // choose lags relative to acceptable pitch tolerance + BaseFloat min_lag = 1.0 / opts.max_f0, max_lag = 1.0 / opts.min_f0; + + std::vector tmp_lags; + for (BaseFloat lag = min_lag; lag <= max_lag; lag *= 1.0 + opts.delta_pitch) + tmp_lags.push_back(lag); + lags->Resize(tmp_lags.size()); + std::copy(tmp_lags.begin(), tmp_lags.end(), lags->Data()); +} + + +/** + This function computes the local-cost for the Viterbi computation, + see eq. (5) in the paper. + @param opts The options as provided by the user + @param nccf_pitch The nccf as computed for the pitch computation (with ballast). + @param lags The log-spaced lags at which nccf_pitch is sampled. + @param local_cost We output the local-cost to here. +*/ +void ComputeLocalCost(const VectorBase &nccf_pitch, + const VectorBase &lags, + const PitchExtractionOptions &opts, + VectorBase *local_cost) { + // from the paper, eq. 5, local_cost = 1 - Phi(t,i)(1 - soft_min_f0 L_i) + // nccf is the nccf on this frame measured at the lags in "lags". + KALDI_ASSERT(nccf_pitch.Dim() == local_cost->Dim() && + nccf_pitch.Dim() == lags.Dim()); + local_cost->Set(1.0); + // add the term -Phi(t,i): + local_cost->AddVec(-1.0, nccf_pitch); + // add the term soft_min_f0 Phi(t,i) L_i + local_cost->AddVecVec(opts.soft_min_f0, lags, nccf_pitch, 1.0); +} + + + +// class PitchFrameInfo is used inside class OnlinePitchFeatureImpl. +// It stores the information we need to keep around for a single frame +// of the pitch computation. +class PitchFrameInfo { + public: + /// This function resizes the arrays for this object and updates the reference + /// counts for the previous object (by decrementing those reference counts + /// when we destroy a StateInfo object). A StateInfo object is considered to + /// be destroyed when we delete it, not when its reference counts goes to + /// zero. 
+ void Cleanup(PitchFrameInfo *prev_frame); + + /// This function may be called for the last (most recent) PitchFrameInfo + /// object with the best state (obtained from the externally held + /// forward-costs). It traces back as far as needed to set the + /// cur_best_state_, and as it's going it sets the lag-index and pov_nccf in + /// pitch_pov_iter, which when it's called is an iterator to where to put the + /// info for the final state; the iterator will be decremented inside this + /// function. + void SetBestState(int32 best_state, + std::vector > &lag_nccf); + + /// This function may be called on the last (most recent) PitchFrameInfo + /// object; it computes how many frames of latency there is because the + /// traceback has not yet settled on a single value for frames in the past. + /// It actually returns the minimum of max_latency and the actual latency, + /// which is an optimization because we won't care about latency past + /// a user-specified maximum latency. + int32 ComputeLatency(int32 max_latency); + + /// This function updates + bool UpdatePreviousBestState(PitchFrameInfo *prev_frame); + + /// This constructor is used for frame -1; it sets the costs to be all zeros + /// the pov_nccf's to zero and the backpointers to -1. + explicit PitchFrameInfo(int32 num_states); + + /// This constructor is used for subsequent frames (not -1). + PitchFrameInfo(PitchFrameInfo *prev); + + /// Record the nccf_pov value. + /// @param nccf_pov The nccf as computed for the POV computation (without ballast). + void SetNccfPov(const VectorBase &nccf_pov); + + /// This constructor is used for frames apart from frame -1; the bulk of + /// the Viterbi computation takes place inside this constructor. + /// @param opts The options as provided by the user + /// @param nccf_pitch The nccf as computed for the pitch computation + /// (with ballast). + /// @param nccf_pov The nccf as computed for the POV computation + /// (without ballast). 
+ /// @param lags The log-spaced lags at which nccf_pitch and + /// nccf_pov are sampled. + /// @param prev_frame_forward_cost The forward-cost vector for the + /// previous frame. + /// @param index_info A pointer to a temporary vector used by this function + /// @param this_forward_cost The forward-cost vector for this frame + /// (to be computed). + void ComputeBacktraces(const PitchExtractionOptions &opts, + const VectorBase &nccf_pitch, + const VectorBase &lags, + const VectorBase &prev_forward_cost, + std::vector > *index_info, + VectorBase *this_forward_cost); + private: + // struct StateInfo is the information we keep for a single one of the + // log-spaced lags, for a single frame. This is a state in the Viterbi + // computation. + struct StateInfo { + /// The state index on the previous frame that is the best preceding state + /// for this state. + int32 backpointer; + /// the version of the NCCF we keep for the POV computation (without the + /// ballast term). + BaseFloat pov_nccf; + StateInfo(): backpointer(0), pov_nccf(0.0) { } + }; + std::vector state_info_; + /// the state index of the first entry in "state_info"; this will initially be + /// zero, but after cleanup might be nonzero. + int32 state_offset_; + + /// The current best state in the backtrace from the end. + int32 cur_best_state_; + + /// The structure for the previous frame. + PitchFrameInfo *prev_info_; +}; + + +// This constructor is used for frame -1; it sets the costs to be all zeros +// the pov_nccf's to zero and the backpointers to -1. +PitchFrameInfo::PitchFrameInfo(int32 num_states) + :state_info_(num_states), state_offset_(0), + cur_best_state_(-1), prev_info_(NULL) { } + + +bool pitch_use_naive_search = false; // This is used in unit-tests. 
+ + +PitchFrameInfo::PitchFrameInfo(PitchFrameInfo *prev_info): + state_info_(prev_info->state_info_.size()), state_offset_(0), + cur_best_state_(-1), prev_info_(prev_info) { } + +void PitchFrameInfo::SetNccfPov(const VectorBase &nccf_pov) { + int32 num_states = nccf_pov.Dim(); + KALDI_ASSERT(num_states == state_info_.size()); + for (int32 i = 0; i < num_states; i++) + state_info_[i].pov_nccf = nccf_pov(i); +} + +void PitchFrameInfo::ComputeBacktraces( + const PitchExtractionOptions &opts, + const VectorBase &nccf_pitch, + const VectorBase &lags, + const VectorBase &prev_forward_cost_vec, + std::vector > *index_info, + VectorBase *this_forward_cost_vec) { + int32 num_states = nccf_pitch.Dim(); + + Vector local_cost(num_states, kUndefined); + ComputeLocalCost(nccf_pitch, lags, opts, &local_cost); + + const BaseFloat delta_pitch_sq = pow(Log(1.0 + opts.delta_pitch), 2.0), + inter_frame_factor = delta_pitch_sq * opts.penalty_factor; + + // index local_cost, prev_forward_cost and this_forward_cost using raw pointer + // indexing not operator (), since this is the very inner loop and a lot of + // time is taken here. + const BaseFloat *prev_forward_cost = prev_forward_cost_vec.Data(); + BaseFloat *this_forward_cost = this_forward_cost_vec->Data(); + + if (index_info->empty()) + index_info->resize(num_states); + + // make it a reference for more concise indexing. + std::vector > &bounds = *index_info; + + /* bounds[i].first will be a lower bound on the backpointer for state i, + bounds[i].second will be an upper bound on it. We progressively tighten + these bounds till we know the backpointers exactly. + */ + + if (pitch_use_naive_search) { + // This branch is only taken in unit-testing code. 
+ for (int32 i = 0; i < num_states; i++) { + BaseFloat best_cost = std::numeric_limits::infinity(); + int32 best_j = -1; + for (int32 j = 0; j < num_states; j++) { + BaseFloat this_cost = (j - i) * (j - i) * inter_frame_factor + + prev_forward_cost[j]; + if (this_cost < best_cost) { + best_cost = this_cost; + best_j = j; + } + } + this_forward_cost[i] = best_cost; + state_info_[i].backpointer = best_j; + } + } else { + int32 last_backpointer = 0; + for (int32 i = 0; i < num_states; i++) { + int32 start_j = last_backpointer; + BaseFloat best_cost = (start_j - i) * (start_j - i) * inter_frame_factor + + prev_forward_cost[start_j]; + int32 best_j = start_j; + + for (int32 j = start_j + 1; j < num_states; j++) { + BaseFloat this_cost = (j - i) * (j - i) * inter_frame_factor + + prev_forward_cost[j]; + if (this_cost < best_cost) { + best_cost = this_cost; + best_j = j; + } else { // as soon as the costs stop improving, we stop searching. + break; // this is a loose lower bound we're getting. + } + } + state_info_[i].backpointer = best_j; + this_forward_cost[i] = best_cost; + bounds[i].first = best_j; // this is now a lower bound on the + // backpointer. + bounds[i].second = num_states - 1; // we have no meaningful upper bound + // yet. + last_backpointer = best_j; + } + + // We iterate, progressively refining the upper and lower bounds until they + // meet and we know that the resulting backtraces are optimal. Each + // iteration takes time linear in num_states. We won't normally iterate as + // far as num_states; normally we only do two iterations; when printing out + // the number of iterations, it's rarely more than that (once I saw seven + // iterations). Anyway, this part of the computation does not dominate. 
+ for (int32 iter = 0; iter < num_states; iter++) { + bool changed = false; + if (iter % 2 == 0) { // go backwards through the states + last_backpointer = num_states - 1; + for (int32 i = num_states - 1; i >= 0; i--) { + int32 lower_bound = bounds[i].first, + upper_bound = std::min(last_backpointer, bounds[i].second); + if (upper_bound == lower_bound) { + last_backpointer = lower_bound; + continue; + } + BaseFloat best_cost = this_forward_cost[i]; + int32 best_j = state_info_[i].backpointer, initial_best_j = best_j; + + if (best_j == upper_bound) { + // if best_j already equals upper bound, don't bother tightening the + // upper bound, we'll tighten the lower bound when the time comes. + last_backpointer = best_j; + continue; + } + // Below, we have j > lower_bound + 1 because we know we've already + // evaluated lower_bound and lower_bound + 1 [via knowledge of + // this algorithm.] + for (int32 j = upper_bound; j > lower_bound + 1; j--) { + BaseFloat this_cost = (j - i) * (j - i) * inter_frame_factor + + prev_forward_cost[j]; + if (this_cost < best_cost) { + best_cost = this_cost; + best_j = j; + } else { // as soon as the costs stop improving, we stop searching, + // unless the best j is still lower than j, in which case + // we obviously need to keep moving. + if (best_j > j) + break; // this is a loose lower bound we're getting. + } + } + // our "best_j" is now an upper bound on the backpointer. + bounds[i].second = best_j; + if (best_j != initial_best_j) { + this_forward_cost[i] = best_cost; + state_info_[i].backpointer = best_j; + changed = true; + } + last_backpointer = best_j; + } + } else { // go forwards through the states. 
+ last_backpointer = 0; + for (int32 i = 0; i < num_states; i++) { + int32 lower_bound = std::max(last_backpointer, bounds[i].first), + upper_bound = bounds[i].second; + if (upper_bound == lower_bound) { + last_backpointer = lower_bound; + continue; + } + BaseFloat best_cost = this_forward_cost[i]; + int32 best_j = state_info_[i].backpointer, initial_best_j = best_j; + + if (best_j == lower_bound) { + // if best_j already equals lower bound, we don't bother tightening + // the lower bound, we'll tighten the upper bound when the time + // comes. + last_backpointer = best_j; + continue; + } + // Below, we have j < upper_bound because we know we've already + // evaluated that point. + for (int32 j = lower_bound; j < upper_bound - 1; j++) { + BaseFloat this_cost = (j - i) * (j - i) * inter_frame_factor + + prev_forward_cost[j]; + if (this_cost < best_cost) { + best_cost = this_cost; + best_j = j; + } else { // as soon as the costs stop improving, we stop searching, + // unless the best j is still higher than j, in which case + // we obviously need to keep moving. + if (best_j < j) + break; // this is a loose lower bound we're getting. + } + } + // our "best_j" is now a lower bound on the backpointer. + bounds[i].first = best_j; + if (best_j != initial_best_j) { + this_forward_cost[i] = best_cost; + state_info_[i].backpointer = best_j; + changed = true; + } + last_backpointer = best_j; + } + } + if (!changed) + break; + } + } + // The next statement is needed due to RecomputeBacktraces: we have to + // invalidate the previously computed best-state info. + cur_best_state_ = -1; + this_forward_cost_vec->AddVec(1.0, local_cost); +} + +void PitchFrameInfo::SetBestState( + int32 best_state, + std::vector > &lag_nccf) { + + // This function would naturally be recursive, but we have coded this to avoid + // recursion, which would otherwise eat up the stack. Think of it as a static + // member function, except we do use "this" right at the beginning. 
+ + std::vector >::reverse_iterator iter = lag_nccf.rbegin(); + + PitchFrameInfo *this_info = this; // it will change in the loop. + while (this_info != NULL) { + PitchFrameInfo *prev_info = this_info->prev_info_; + if (best_state == this_info->cur_best_state_) + return; // no change + if (prev_info != NULL) // don't write anything for frame -1. + iter->first = best_state; + size_t state_info_index = best_state - this_info->state_offset_; + KALDI_ASSERT(state_info_index < this_info->state_info_.size()); + this_info->cur_best_state_ = best_state; + best_state = this_info->state_info_[state_info_index].backpointer; + if (prev_info != NULL) // don't write anything for frame -1. + iter->second = this_info->state_info_[state_info_index].pov_nccf; + this_info = prev_info; + if (this_info != NULL) ++iter; + } +} + +int32 PitchFrameInfo::ComputeLatency(int32 max_latency) { + if (max_latency <= 0) return 0; + + int32 latency = 0; + + // This function would naturally be recursive, but we have coded this to avoid + // recursion, which would otherwise eat up the stack. Think of it as a static + // member function, except we do use "this" right at the beginning. + // This function is called only on the most recent PitchFrameInfo object. + int32 num_states = state_info_.size(); + int32 min_living_state = 0, max_living_state = num_states - 1; + PitchFrameInfo *this_info = this; // it will change in the loop. 
+ + + for (; this_info != NULL && latency < max_latency;) { + int32 offset = this_info->state_offset_; + KALDI_ASSERT(min_living_state >= offset && + max_living_state - offset < this_info->state_info_.size()); + min_living_state = + this_info->state_info_[min_living_state - offset].backpointer; + max_living_state = + this_info->state_info_[max_living_state - offset].backpointer; + if (min_living_state == max_living_state) { + return latency; + } + this_info = this_info->prev_info_; + if (this_info != NULL) // avoid incrementing latency for frame -1, + latency++; // as it's not a real frame. + } + return latency; +} + +void PitchFrameInfo::Cleanup(PitchFrameInfo *prev_frame) { + KALDI_ERR << "Cleanup not implemented."; +} + + +// struct NccfInfo is used to cache certain quantities that we need for online +// operation, for the first "recompute_frame" frames of the file (e.g. 300); +// after that many frames, or after the user calls InputFinished(), we redo the +// initial backtraces, as we'll then have a better estimate of the average signal +// energy. +struct NccfInfo { + + Vector nccf_pitch_resampled; // resampled nccf_pitch + BaseFloat avg_norm_prod; // average value of e1 * e2. + BaseFloat mean_square_energy; // mean_square energy we used when computing the + // original ballast term for + // "nccf_pitch_resampled". + + NccfInfo(BaseFloat avg_norm_prod, + BaseFloat mean_square_energy): + avg_norm_prod(avg_norm_prod), + mean_square_energy(mean_square_energy) { } +}; + + + +// We could inherit from OnlineBaseFeature as we have the same interface, +// but this will unnecessary force a lot of our functions to be virtual. 
+class OnlinePitchFeatureImpl { + public: + explicit OnlinePitchFeatureImpl(const PitchExtractionOptions &opts); + + int32 Dim() const { return 2; } + + BaseFloat FrameShiftInSeconds() const; + + int32 NumFramesReady() const; + + bool IsLastFrame(int32 frame) const; + + void GetFrame(int32 frame, VectorBase *feat); + + void AcceptWaveform(BaseFloat sampling_rate, + const VectorBase &waveform); + + void InputFinished(); + + ~OnlinePitchFeatureImpl(); + + + // Copy-constructor, can be used to obtain a new copy of this object, + // any state from this utterance. + OnlinePitchFeatureImpl(const OnlinePitchFeatureImpl &other); + + private: + + /// This function works out from the signal how many frames are currently + /// available to process (this is called from inside AcceptWaveform()). + /// Note: the number of frames differs slightly from the number the + /// old pitch code gave. + /// Note: the number this returns depends on whether input_finished_ == true; + /// if it is, it will "force out" a final frame or two. + int32 NumFramesAvailable(int64 num_downsampled_samples, bool snip_edges) const; + + /// This function extracts from the signal the samples numbered from + /// "sample_index" (numbered in the full downsampled signal, not just this + /// part), and of length equal to window->Dim(). It uses the data members + /// downsampled_samples_discarded_ and downsampled_signal_remainder_, as well + /// as the more recent part of the downsampled wave "downsampled_wave_part" + /// which is provided. + /// + /// @param downsampled_wave_part One chunk of the downsampled wave, + /// starting from sample-index downsampled_samples_discarded_. + /// @param sample_index The desired starting sample index (measured from + /// the start of the whole signal, not just this part). + /// @param window The part of the signal is output to here. 
+ void ExtractFrame(const VectorBase &downsampled_wave_part, + int64 frame_index, + VectorBase *window); + + + /// This function is called after we reach frame "recompute_frame", or when + /// InputFinished() is called, whichever comes sooner. It recomputes the + /// backtraces for frames zero through recompute_frame, if needed because the + /// average energy of the signal has changed, affecting the nccf ballast term. + /// It works out the average signal energy from + /// downsampled_samples_processed_, signal_sum_ and signal_sumsq_ (which, if + /// you see the calling code, might include more frames than just + /// "recompute_frame", it might include up to the end of the current chunk). + void RecomputeBacktraces(); + + + /// This function updates downsampled_signal_remainder_, + /// downsampled_samples_processed_, signal_sum_ and signal_sumsq_; it's called + /// from AcceptWaveform(). + void UpdateRemainder(const VectorBase &downsampled_wave_part); + + + // The following variables don't change throughout the lifetime + // of this object. + PitchExtractionOptions opts_; + + // the first lag of the downsampled signal at which we measure NCCF + int32 nccf_first_lag_; + // the last lag of the downsampled signal at which we measure NCCF + int32 nccf_last_lag_; + + // The log-spaced lags at which we will resample the NCCF + Vector lags_; + + // This object is used to resample from evenly spaced to log-evenly-spaced + // nccf values. It's a pointer for convenience of initialization, so we don't + // have to use the initializer from the constructor. + ArbitraryResample *nccf_resampler_; + + // The following objects may change during the lifetime of this object. + + // This object is used to resample the signal. + LinearResample *signal_resampler_; + + // frame_info_ is indexed by [frame-index + 1]. frame_info_[0] is an object + // that corresponds to frame -1, which is not a real frame. 
+ std::vector frame_info_; + + + // nccf_info_ is indexed by frame-index, from frame 0 to at most + // opts_.recompute_frame - 1. It contains some information we'll + // need to recompute the tracebacks after getting a better estimate + // of the average energy of the signal. + std::vector nccf_info_; + + // Current number of frames which we can't output because Viterbi has not + // converged for them, or opts_.max_frames_latency if we have reached that + // limit. + int32 frames_latency_; + + // The forward-cost at the current frame (the last frame in frame_info_); + // this has the same dimension as lags_. We normalize each time so + // the lowest cost is zero, for numerical accuracy and so we can use float. + Vector forward_cost_; + + // stores the constant part of forward_cost_. + double forward_cost_remainder_; + + // The resampled-lag index and the NCCF (as computed for POV, without ballast + // term) for each frame, as determined by Viterbi traceback from the best + // final state. + std::vector > lag_nccf_; + + bool input_finished_; + + /// sum-squared of previously processed parts of signal; used to get NCCF + /// ballast term. Denominator is downsampled_samples_processed_. + double signal_sumsq_; + + /// sum of previously processed parts of signal; used to do mean-subtraction + /// when getting sum-squared, along with signal_sumsq_. + double signal_sum_; + + /// downsampled_samples_processed is the number of samples (after + /// downsampling) that we got in previous calls to AcceptWaveform(). + int64 downsampled_samples_processed_; + /// This is a small remainder of the previous downsampled signal; + /// it's used by ExtractFrame for frames near the boundary of two + /// waveforms supplied to AcceptWaveform(). 
+ Vector downsampled_signal_remainder_; +}; + + +OnlinePitchFeatureImpl::OnlinePitchFeatureImpl( + const PitchExtractionOptions &opts): + opts_(opts), forward_cost_remainder_(0.0), input_finished_(false), + signal_sumsq_(0.0), signal_sum_(0.0), downsampled_samples_processed_(0) { + signal_resampler_ = new LinearResample(opts.samp_freq, opts.resample_freq, + opts.lowpass_cutoff, + opts.lowpass_filter_width); + + double outer_min_lag = 1.0 / opts.max_f0 - + (opts.upsample_filter_width/(2.0 * opts.resample_freq)); + double outer_max_lag = 1.0 / opts.min_f0 + + (opts.upsample_filter_width/(2.0 * opts.resample_freq)); + nccf_first_lag_ = ceil(opts.resample_freq * outer_min_lag); + nccf_last_lag_ = floor(opts.resample_freq * outer_max_lag); + + frames_latency_ = 0; // will be set in AcceptWaveform() + + // Choose the lags at which we resample the NCCF. + SelectLags(opts, &lags_); + + // upsample_cutoff is the filter cutoff for upsampling the NCCF, which is the + // Nyquist of the resampling frequency. The NCCF is (almost completely) + // bandlimited to around "lowpass_cutoff" (1000 by default), and when the + // spectrum of this bandlimited signal is convolved with the spectrum of an + // impulse train with frequency "resample_freq", which are separated by 4kHz, + // we get energy at -5000,-3000, -1000...1000, 3000..5000, etc. Filtering at + // half the Nyquist (2000 by default) is sufficient to get only the first + // repetition. + BaseFloat upsample_cutoff = opts.resample_freq * 0.5; + + + Vector lags_offset(lags_); + // lags_offset equals lags_ (which are the log-spaced lag values we want to + // measure the NCCF at) with nccf_first_lag_ / opts.resample_freq subtracted + // from each element, so we can treat the measured NCCF values as as starting + // from sample zero in a signal that starts at the point start / + // opts.resample_freq. This is necessary because the ArbitraryResample code + // assumes that the input signal starts from sample zero. 
+ lags_offset.Add(-nccf_first_lag_ / opts.resample_freq); + + int32 num_measured_lags = nccf_last_lag_ + 1 - nccf_first_lag_; + + nccf_resampler_ = new ArbitraryResample(num_measured_lags, opts.resample_freq, + upsample_cutoff, lags_offset, + opts.upsample_filter_width); + + // add a PitchInfo object for frame -1 (not a real frame). + frame_info_.push_back(new PitchFrameInfo(lags_.Dim())); + // zeroes forward_cost_; this is what we want for the fake frame -1. + forward_cost_.Resize(lags_.Dim()); +} + + +int32 OnlinePitchFeatureImpl::NumFramesAvailable( + int64 num_downsampled_samples, bool snip_edges) const { + int32 frame_shift = opts_.NccfWindowShift(), + frame_length = opts_.NccfWindowSize(); + // Use the "full frame length" to compute the number + // of frames only if the input is not finished. + if (!input_finished_) + frame_length += nccf_last_lag_; + if (num_downsampled_samples < frame_length) { + return 0; + } else { + if (!snip_edges) { + if (input_finished_) { + return static_cast(num_downsampled_samples * 1.0f / + frame_shift + 0.5f); + } else { + return static_cast((num_downsampled_samples - frame_length / 2) * + 1.0f / frame_shift + 0.5f); + } + } else { + return static_cast((num_downsampled_samples - frame_length) / + frame_shift + 1); + } + } +} + +void OnlinePitchFeatureImpl::UpdateRemainder( + const VectorBase &downsampled_wave_part) { + // frame_info_ has an extra element at frame-1, so subtract + // one from the length. + int64 num_frames = static_cast(frame_info_.size()) - 1, + next_frame = num_frames, + frame_shift = opts_.NccfWindowShift(), + next_frame_sample = frame_shift * next_frame; + + signal_sumsq_ += VecVec(downsampled_wave_part, downsampled_wave_part); + signal_sum_ += downsampled_wave_part.Sum(); + + // next_frame_sample is the first sample index we'll need for the + // next frame. 
+ int64 next_downsampled_samples_processed = + downsampled_samples_processed_ + downsampled_wave_part.Dim(); + + if (next_frame_sample > next_downsampled_samples_processed) { + // this could only happen in the weird situation that the full frame length + // is less than the frame shift. + int32 full_frame_length = opts_.NccfWindowSize() + nccf_last_lag_; + KALDI_ASSERT(full_frame_length < frame_shift && "Code error"); + downsampled_signal_remainder_.Resize(0); + } else { + Vector new_remainder(next_downsampled_samples_processed - + next_frame_sample); + // note: next_frame_sample is the index into the entire signal, of + // new_remainder(0). + // i is the absolute index of the signal. + for (int64 i = next_frame_sample; + i < next_downsampled_samples_processed; i++) { + if (i >= downsampled_samples_processed_) { // in current signal. + new_remainder(i - next_frame_sample) = + downsampled_wave_part(i - downsampled_samples_processed_); + } else { // in old remainder; only reach here if waveform supplied is + new_remainder(i - next_frame_sample) = // tiny. + downsampled_signal_remainder_(i - downsampled_samples_processed_ + + downsampled_signal_remainder_.Dim()); + } + } + downsampled_signal_remainder_.Swap(&new_remainder); + } + downsampled_samples_processed_ = next_downsampled_samples_processed; +} + +void OnlinePitchFeatureImpl::ExtractFrame( + const VectorBase &downsampled_wave_part, + int64 sample_index, + VectorBase *window) { + int32 full_frame_length = window->Dim(); + int32 offset = static_cast(sample_index - + downsampled_samples_processed_); + + // Treat edge cases first + if (sample_index < 0) { + // Part of the frame is before the beginning of the signal. This + // should only happen if opts_.snip_edges == false, when we are + // processing the first few frames of signal. In this case + // we pad with zeros. 
+ KALDI_ASSERT(opts_.snip_edges == false); + int32 sub_frame_length = sample_index + full_frame_length; + int32 sub_frame_index = full_frame_length - sub_frame_length; + KALDI_ASSERT(sub_frame_length > 0 && sub_frame_index > 0); + window->SetZero(); + SubVector sub_window(*window, sub_frame_index, sub_frame_length); + ExtractFrame(downsampled_wave_part, 0, &sub_window); + return; + } + + if (offset + full_frame_length > downsampled_wave_part.Dim()) { + // Requested frame is past end of the signal. This should only happen if + // input_finished_ == true, when we're flushing out the last couple of + // frames of signal. In this case we pad with zeros. + KALDI_ASSERT(input_finished_); + int32 sub_frame_length = downsampled_wave_part.Dim() - offset; + KALDI_ASSERT(sub_frame_length > 0); + window->SetZero(); + SubVector sub_window(*window, 0, sub_frame_length); + ExtractFrame(downsampled_wave_part, sample_index, &sub_window); + return; + } + + // "offset" is the offset of the start of the frame, into this + // signal. + if (offset >= 0) { + // frame is full inside the new part of the signal. + window->CopyFromVec(downsampled_wave_part.Range(offset, full_frame_length)); + } else { + // frame is partly in the remainder and partly in the new part. + int32 remainder_offset = downsampled_signal_remainder_.Dim() + offset; + KALDI_ASSERT(remainder_offset >= 0); // or we didn't keep enough remainder. + KALDI_ASSERT(offset + full_frame_length > 0); // or we should have + // processed this frame last + // time. 
+ + int32 old_length = -offset, new_length = offset + full_frame_length; + window->Range(0, old_length).CopyFromVec( + downsampled_signal_remainder_.Range(remainder_offset, old_length)); + window->Range(old_length, new_length).CopyFromVec( + downsampled_wave_part.Range(0, new_length)); + } + if (opts_.preemph_coeff != 0.0) { + BaseFloat preemph_coeff = opts_.preemph_coeff; + for (int32 i = window->Dim() - 1; i > 0; i--) + (*window)(i) -= preemph_coeff * (*window)(i-1); + (*window)(0) *= (1.0 - preemph_coeff); + } +} + +bool OnlinePitchFeatureImpl::IsLastFrame(int32 frame) const { + int32 T = NumFramesReady(); + KALDI_ASSERT(frame < T); + return (input_finished_ && frame + 1 == T); +} + +BaseFloat OnlinePitchFeatureImpl::FrameShiftInSeconds() const { + return opts_.frame_shift_ms / 1000.0f; +} + +int32 OnlinePitchFeatureImpl::NumFramesReady() const { + int32 num_frames = lag_nccf_.size(), + latency = frames_latency_; + KALDI_ASSERT(latency <= num_frames); + return num_frames - latency; +} + + +void OnlinePitchFeatureImpl::GetFrame(int32 frame, + VectorBase *feat) { + KALDI_ASSERT(frame < NumFramesReady() && feat->Dim() == 2); + (*feat)(0) = lag_nccf_[frame].second; + (*feat)(1) = 1.0 / lags_(lag_nccf_[frame].first); +} + +void OnlinePitchFeatureImpl::InputFinished() { + input_finished_ = true; + // Process an empty waveform; this has an effect because + // after setting input_finished_ to true, NumFramesAvailable() + // will return a slightly larger number. + AcceptWaveform(opts_.samp_freq, Vector()); + int32 num_frames = static_cast(frame_info_.size() - 1); + if (num_frames < opts_.recompute_frame && !opts_.nccf_ballast_online) + RecomputeBacktraces(); + frames_latency_ = 0; + KALDI_VLOG(3) << "Pitch-tracking Viterbi cost is " + << (forward_cost_remainder_ / num_frames) + << " per frame, over " << num_frames << " frames."; +} + +// see comment with declaration. This is only relevant for online +// operation (it gets called for non-online mode, but is a no-op). 
+void OnlinePitchFeatureImpl::RecomputeBacktraces() { + KALDI_ASSERT(!opts_.nccf_ballast_online); + int32 num_frames = static_cast(frame_info_.size()) - 1; + + // The assertion reflects how we believe this function will be called. + KALDI_ASSERT(num_frames <= opts_.recompute_frame); + KALDI_ASSERT(nccf_info_.size() == static_cast(num_frames)); + if (num_frames == 0) + return; + double num_samp = downsampled_samples_processed_, sum = signal_sum_, + sumsq = signal_sumsq_, mean = sum / num_samp; + BaseFloat mean_square = sumsq / num_samp - mean * mean; + + bool must_recompute = false; + BaseFloat threshold = 0.01; + for (int32 frame = 0; frame < num_frames; frame++) + if (!ApproxEqual(nccf_info_[frame]->mean_square_energy, + mean_square, threshold)) + must_recompute = true; + + if (!must_recompute) { + // Nothing to do. We'll reach here, for instance, if everything was in one + // chunk and opts_.nccf_ballast_online == false. This is the case for + // offline processing. + for (size_t i = 0; i < nccf_info_.size(); i++) + delete nccf_info_[i]; + nccf_info_.clear(); + return; + } + + int32 num_states = forward_cost_.Dim(), + basic_frame_length = opts_.NccfWindowSize(); + + BaseFloat new_nccf_ballast = pow(mean_square * basic_frame_length, 2) * + opts_.nccf_ballast; + + double forward_cost_remainder = 0.0; + Vector forward_cost(num_states), // start off at zero. 
+ next_forward_cost(forward_cost); + std::vector > index_info; + + for (int32 frame = 0; frame < num_frames; frame++) { + NccfInfo &nccf_info = *nccf_info_[frame]; + BaseFloat old_mean_square = nccf_info_[frame]->mean_square_energy, + avg_norm_prod = nccf_info_[frame]->avg_norm_prod, + old_nccf_ballast = pow(old_mean_square * basic_frame_length, 2) * + opts_.nccf_ballast, + nccf_scale = pow((old_nccf_ballast + avg_norm_prod) / + (new_nccf_ballast + avg_norm_prod), + static_cast(0.5)); + // The "nccf_scale" is an estimate of the scaling factor by which the NCCF + // would change on this frame, on average, by changing the ballast term from + // "old_nccf_ballast" to "new_nccf_ballast". It's not exact because the + // "avg_norm_prod" is just an average of the product e1 * e2 of frame + // energies of the (frame, shifted-frame), but these won't change that much + // within a frame, and even if they do, the inaccuracy of the scaled NCCF + // will still be very small if the ballast term didn't change much, or if + // it's much larger or smaller than e1*e2. By doing it as a simple scaling, + // we save the overhead of the NCCF resampling, which is a considerable part + // of the whole computation. 
+ nccf_info.nccf_pitch_resampled.Scale(nccf_scale); + + frame_info_[frame + 1]->ComputeBacktraces( + opts_, nccf_info.nccf_pitch_resampled, lags_, + forward_cost, &index_info, &next_forward_cost); + + forward_cost.Swap(&next_forward_cost); + BaseFloat remainder = forward_cost.Min(); + forward_cost_remainder += remainder; + forward_cost.Add(-remainder); + } + KALDI_VLOG(3) << "Forward-cost per frame changed from " + << (forward_cost_remainder_ / num_frames) << " to " + << (forward_cost_remainder / num_frames); + + forward_cost_remainder_ = forward_cost_remainder; + forward_cost_.Swap(&forward_cost); + + int32 best_final_state; + forward_cost_.Min(&best_final_state); + + if (lag_nccf_.size() != static_cast(num_frames)) + lag_nccf_.resize(num_frames); + + frame_info_.back()->SetBestState(best_final_state, lag_nccf_); + frames_latency_ = + frame_info_.back()->ComputeLatency(opts_.max_frames_latency); + for (size_t i = 0; i < nccf_info_.size(); i++) + delete nccf_info_[i]; + nccf_info_.clear(); +} + +OnlinePitchFeatureImpl::~OnlinePitchFeatureImpl() { + delete nccf_resampler_; + delete signal_resampler_; + for (size_t i = 0; i < frame_info_.size(); i++) + delete frame_info_[i]; + for (size_t i = 0; i < nccf_info_.size(); i++) + delete nccf_info_[i]; +} + +void OnlinePitchFeatureImpl::AcceptWaveform( + BaseFloat sampling_rate, + const VectorBase &wave) { + // flush out the last few samples of input waveform only if input_finished_ == + // true. + const bool flush = input_finished_; + + Vector downsampled_wave; + signal_resampler_->Resample(wave, flush, &downsampled_wave); + + // these variables will be used to compute the root-mean-square value of the + // signal for the ballast term. 
+ double cur_sumsq = signal_sumsq_, cur_sum = signal_sum_; + int64 cur_num_samp = downsampled_samples_processed_, + prev_frame_end_sample = 0; + if (!opts_.nccf_ballast_online) { + cur_sumsq += VecVec(downsampled_wave, downsampled_wave); + cur_sum += downsampled_wave.Sum(); + cur_num_samp += downsampled_wave.Dim(); + } + + // end_frame is the total number of frames we can now process, including + // previously processed ones. + int32 end_frame = NumFramesAvailable( + downsampled_samples_processed_ + downsampled_wave.Dim(), opts_.snip_edges); + // "start_frame" is the first frame-index we process + int32 start_frame = frame_info_.size() - 1, + num_new_frames = end_frame - start_frame; + + if (num_new_frames == 0) { + UpdateRemainder(downsampled_wave); + return; + // continuing to the rest of the code would generate + // an error when sizing matrices with zero rows, and + // anyway is a waste of time. + } + + int32 num_measured_lags = nccf_last_lag_ + 1 - nccf_first_lag_, + num_resampled_lags = lags_.Dim(), + frame_shift = opts_.NccfWindowShift(), + basic_frame_length = opts_.NccfWindowSize(), + full_frame_length = basic_frame_length + nccf_last_lag_; + + Vector window(full_frame_length), + inner_prod(num_measured_lags), + norm_prod(num_measured_lags); + Matrix nccf_pitch(num_new_frames, num_measured_lags), + nccf_pov(num_new_frames, num_measured_lags); + + Vector cur_forward_cost(num_resampled_lags); + + + // Because the resampling of the NCCF is more efficient when grouped together, + // we first compute the NCCF for all frames, then resample as a matrix, then + // do the Viterbi [that happens inside the constructor of PitchFrameInfo]. + + for (int32 frame = start_frame; frame < end_frame; frame++) { + // start_sample is index into the whole wave, not just this part. 
+ int64 start_sample; + if (opts_.snip_edges) { + // Usual case: offset starts at 0 + start_sample = static_cast(frame) * frame_shift; + } else { + // When we are not snipping the edges, the first offsets may be + // negative. In this case we will pad with zeros, it should not impact + // the pitch tracker. + start_sample = + static_cast((frame + 0.5) * frame_shift) - full_frame_length / 2; + } + ExtractFrame(downsampled_wave, start_sample, &window); + if (opts_.nccf_ballast_online) { + // use only up to end of current frame to compute root-mean-square value. + // end_sample will be the sample-index into "downsampled_wave", so + // not really comparable to start_sample. + int64 end_sample = start_sample + full_frame_length - + downsampled_samples_processed_; + KALDI_ASSERT(end_sample > 0); // or should have processed this frame last + // time. Note: end_sample is one past last + // sample. + if (end_sample > downsampled_wave.Dim()) { + KALDI_ASSERT(input_finished_); + end_sample = downsampled_wave.Dim(); + } + SubVector new_part(downsampled_wave, prev_frame_end_sample, + end_sample - prev_frame_end_sample); + cur_num_samp += new_part.Dim(); + cur_sumsq += VecVec(new_part, new_part); + cur_sum += new_part.Sum(); + prev_frame_end_sample = end_sample; + } + double mean_square = cur_sumsq / cur_num_samp - + pow(cur_sum / cur_num_samp, 2.0); + + ComputeCorrelation(window, nccf_first_lag_, nccf_last_lag_, + basic_frame_length, &inner_prod, &norm_prod); + double nccf_ballast_pov = 0.0, + nccf_ballast_pitch = pow(mean_square * basic_frame_length, 2) * + opts_.nccf_ballast, + avg_norm_prod = norm_prod.Sum() / norm_prod.Dim(); + SubVector nccf_pitch_row(nccf_pitch, frame - start_frame); + ComputeNccf(inner_prod, norm_prod, nccf_ballast_pitch, + &nccf_pitch_row); + SubVector nccf_pov_row(nccf_pov, frame - start_frame); + ComputeNccf(inner_prod, norm_prod, nccf_ballast_pov, + &nccf_pov_row); + if (frame < opts_.recompute_frame) + nccf_info_.push_back(new 
NccfInfo(avg_norm_prod, mean_square)); + } + + Matrix nccf_pitch_resampled(num_new_frames, num_resampled_lags); + nccf_resampler_->Resample(nccf_pitch, &nccf_pitch_resampled); + nccf_pitch.Resize(0, 0); // no longer needed. + Matrix nccf_pov_resampled(num_new_frames, num_resampled_lags); + nccf_resampler_->Resample(nccf_pov, &nccf_pov_resampled); + nccf_pov.Resize(0, 0); // no longer needed. + + // We've finished dealing with the waveform so we can call UpdateRemainder + // now; we need to call it before we possibly call RecomputeBacktraces() + // below, which is why we don't do it at the very end. + UpdateRemainder(downsampled_wave); + + std::vector > index_info; + + for (int32 frame = start_frame; frame < end_frame; frame++) { + int32 frame_idx = frame - start_frame; + PitchFrameInfo *prev_info = frame_info_.back(), + *cur_info = new PitchFrameInfo(prev_info); + cur_info->SetNccfPov(nccf_pov_resampled.Row(frame_idx)); + cur_info->ComputeBacktraces(opts_, nccf_pitch_resampled.Row(frame_idx), + lags_, forward_cost_, &index_info, + &cur_forward_cost); + forward_cost_.Swap(&cur_forward_cost); + // Renormalize forward_cost so smallest element is zero. + BaseFloat remainder = forward_cost_.Min(); + forward_cost_remainder_ += remainder; + forward_cost_.Add(-remainder); + frame_info_.push_back(cur_info); + if (frame < opts_.recompute_frame) + nccf_info_[frame]->nccf_pitch_resampled = + nccf_pitch_resampled.Row(frame_idx); + if (frame == opts_.recompute_frame - 1 && !opts_.nccf_ballast_online) + RecomputeBacktraces(); + } + + // Trace back the best-path. + int32 best_final_state; + forward_cost_.Min(&best_final_state); + lag_nccf_.resize(frame_info_.size() - 1); // will keep any existing data. 
+ frame_info_.back()->SetBestState(best_final_state, lag_nccf_); + frames_latency_ = + frame_info_.back()->ComputeLatency(opts_.max_frames_latency); + KALDI_VLOG(4) << "Latency is " << frames_latency_; +} + + + +// Some functions that forward from OnlinePitchFeature to +// OnlinePitchFeatureImpl. +int32 OnlinePitchFeature::NumFramesReady() const { + return impl_->NumFramesReady(); +} + +OnlinePitchFeature::OnlinePitchFeature(const PitchExtractionOptions &opts) + :impl_(new OnlinePitchFeatureImpl(opts)) { } + +bool OnlinePitchFeature::IsLastFrame(int32 frame) const { + return impl_->IsLastFrame(frame); +} + +BaseFloat OnlinePitchFeature::FrameShiftInSeconds() const { + return impl_->FrameShiftInSeconds(); +} + +void OnlinePitchFeature::GetFrame(int32 frame, VectorBase *feat) { + impl_->GetFrame(frame, feat); +} + +void OnlinePitchFeature::AcceptWaveform( + BaseFloat sampling_rate, + const VectorBase &waveform) { + impl_->AcceptWaveform(sampling_rate, waveform); +} + +void OnlinePitchFeature::InputFinished() { + impl_->InputFinished(); +} + +OnlinePitchFeature::~OnlinePitchFeature() { + delete impl_; +} + + +/** + This function is called from ComputeKaldiPitch when the user + specifies opts.simulate_first_pass_online == true. It gives + the "first-pass" version of the features, which you would get + on the first decoding pass in an online setting. These may + differ slightly from the final features due to both the + way the Viterbi traceback works (this is affected by + opts.max_frames_latency), and the online way we compute + the average signal energy. 
+*/ +void ComputeKaldiPitchFirstPass( + const PitchExtractionOptions &opts, + const VectorBase &wave, + Matrix *output) { + + int32 cur_rows = 100; + Matrix feats(cur_rows, 2); + + OnlinePitchFeature pitch_extractor(opts); + KALDI_ASSERT(opts.frames_per_chunk > 0 && + "--simulate-first-pass-online option does not make sense " + "unless you specify --frames-per-chunk"); + + int32 cur_offset = 0, cur_frame = 0, samp_per_chunk = + opts.frames_per_chunk * opts.samp_freq * opts.frame_shift_ms / 1000.0f; + + while (cur_offset < wave.Dim()) { + int32 num_samp = std::min(samp_per_chunk, wave.Dim() - cur_offset); + SubVector wave_chunk(wave, cur_offset, num_samp); + pitch_extractor.AcceptWaveform(opts.samp_freq, wave_chunk); + cur_offset += num_samp; + if (cur_offset == wave.Dim()) + pitch_extractor.InputFinished(); + // Get each frame as soon as it is ready. + for (; cur_frame < pitch_extractor.NumFramesReady(); cur_frame++) { + if (cur_frame >= cur_rows) { + cur_rows *= 2; + feats.Resize(cur_rows, 2, kCopyData); + } + SubVector row(feats, cur_frame); + pitch_extractor.GetFrame(cur_frame, &row); + } + } + if (cur_frame == 0) { + KALDI_WARN << "No features output since wave file too short"; + output->Resize(0, 0); + } else { + *output = feats.RowRange(0, cur_frame); + } +} + + + +void ComputeKaldiPitch(const PitchExtractionOptions &opts, + const VectorBase &wave, + Matrix *output) { + if (opts.simulate_first_pass_online) { + ComputeKaldiPitchFirstPass(opts, wave, output); + return; + } + OnlinePitchFeature pitch_extractor(opts); + + if (opts.frames_per_chunk == 0) { + pitch_extractor.AcceptWaveform(opts.samp_freq, wave); + } else { + // the user may set opts.frames_per_chunk for better compatibility with + // online operation. 
+ KALDI_ASSERT(opts.frames_per_chunk > 0); + int32 cur_offset = 0, samp_per_chunk = + opts.frames_per_chunk * opts.samp_freq * opts.frame_shift_ms / 1000.0f; + while (cur_offset < wave.Dim()) { + int32 num_samp = std::min(samp_per_chunk, wave.Dim() - cur_offset); + SubVector wave_chunk(wave, cur_offset, num_samp); + pitch_extractor.AcceptWaveform(opts.samp_freq, wave_chunk); + cur_offset += num_samp; + } + } + pitch_extractor.InputFinished(); + int32 num_frames = pitch_extractor.NumFramesReady(); + if (num_frames == 0) { + KALDI_WARN << "No frames output in pitch extraction"; + output->Resize(0, 0); + return; + } + output->Resize(num_frames, 2); + for (int32 frame = 0; frame < num_frames; frame++) { + SubVector row(*output, frame); + pitch_extractor.GetFrame(frame, &row); + } +} + + +/* + This comment describes our investigation of how much latency the + online-processing algorithm introduces, i.e. how many frames you would + typically have to wait until the traceback converges, if you were to set the + --max-frames-latency to a very large value. + + This was done on a couple of files of language-id data. + + /home/dpovey/kaldi-online/src/featbin/compute-kaldi-pitch-feats --frames-per-chunk=10 --max-frames-latency=100 --verbose=4 --sample-frequency=8000 --resample-frequency=2600 "scp:head -n 2 data/train/wav.scp |" ark:/dev/null 2>&1 | grep Latency | wc + 4871 24355 443991 + /home/dpovey/kaldi-online/src/featbin/compute-kaldi-pitch-feats --frames-per-chunk=10 --max-frames-latency=100 --verbose=4 --sample-frequency=8000 --resample-frequency=2600 "scp:head -n 2 data/train/wav.scp |" ark:/dev/null 2>&1 | grep Latency | grep 100 | wc + 1534 7670 141128 + +# as above, but with 50 instead of 100 in the --max-frames-latency and grep statements. + 2070 10350 188370 +# as above, but with 10 instead of 50.
+ 4067 20335 370097 + + This says that out of 4871 selected frames [we measured the latency every 10 + frames, since --frames-per-chunk=10], in 1534 frames (31%), the latency was + >= 100 frames, i.e. >= 1 second. Including the other numbers, we can see + that + + 31% of frames had latency >= 1 second + 42% of frames had latency >= 0.5 second + 83% of frames had latency >= 0.1 second. + + This doesn't necessarily mean that we actually have a latency of >= 1 second 31% of + the time when using these features, since by using the --max-frames-latency option + (default: 30 frames), it will limit the latency to, say, 0.3 seconds, and trace back + from the best current pitch. Most of the time this will probably cause no change in + the pitch traceback since the best current pitch is probably the "right" point to + trace back from. And anyway, in the online-decoding, we will most likely rescore + the features at the end anyway, and the traceback gets recomputed, so there will + be no inaccuracy (assuming the first-pass lattice had everything we needed). + + Probably the greater source of inaccuracy due to the online algorithm is the + online energy-normalization, which affects the NCCF-ballast term, and which, + for reasons of efficiency, we don't attempt to "correct" in a later rescoring + pass. This will make the most difference in the first few frames of the file, + before the first voicing, where it will tend to produce more pitch movement + than the offline version of the algorithm. +*/ + + +// Function to do data accumulation for on-line usage +template +inline void AppendVector(const VectorBase &src, Vector *dst) { + if (src.Dim() == 0) return; + dst->Resize(dst->Dim() + src.Dim(), kCopyData); + dst->Range(dst->Dim() - src.Dim(), src.Dim()).CopyFromVec(src); +} + +/** + Note on the implementation of OnlineProcessPitch: the + OnlineFeatureInterface allows random access to features (i.e. not necessarily + sequential order), so we need to support that. 
But we don't need to support + it very efficiently, and our implementation is most efficient if frames are + accessed in sequential order. + + Also note: we have to be a bit careful in this implementation because + the input features may change. That is: if we call + src_->GetFrame(t, &vec) from GetFrame(), we can't guarantee that a later + call to src_->GetFrame(t, &vec) from another GetFrame() will return the + same value. In fact, while designing this class we used some knowledge + of how the OnlinePitchFeature class works to minimize the amount of + re-querying we had to do. +*/ +OnlineProcessPitch::OnlineProcessPitch( + const ProcessPitchOptions &opts, + OnlineFeatureInterface *src): + opts_(opts), src_(src), + dim_ ((opts.add_pov_feature ? 1 : 0) + + (opts.add_normalized_log_pitch ? 1 : 0) + + (opts.add_delta_pitch ? 1 : 0) + + (opts.add_raw_log_pitch ? 1 : 0)) { + KALDI_ASSERT(dim_ > 0 && + " At least one of the pitch features should be chosen. " + "Check your post-process-pitch options."); + KALDI_ASSERT(src->Dim() == kRawFeatureDim && + "Input feature must be pitch feature (should have dimension 2)"); +} + + +void OnlineProcessPitch::GetFrame(int32 frame, + VectorBase *feat) { + int32 frame_delayed = frame < opts_.delay ? 
0 : frame - opts_.delay; + KALDI_ASSERT(feat->Dim() == dim_ && + frame_delayed < NumFramesReady()); + int32 index = 0; + if (opts_.add_pov_feature) + (*feat)(index++) = GetPovFeature(frame_delayed); + if (opts_.add_normalized_log_pitch) + (*feat)(index++) = GetNormalizedLogPitchFeature(frame_delayed); + if (opts_.add_delta_pitch) + (*feat)(index++) = GetDeltaPitchFeature(frame_delayed); + if (opts_.add_raw_log_pitch) + (*feat)(index++) = GetRawLogPitchFeature(frame_delayed); + KALDI_ASSERT(index == dim_); +} + +BaseFloat OnlineProcessPitch::GetPovFeature(int32 frame) const { + Vector tmp(kRawFeatureDim); + src_->GetFrame(frame, &tmp); // (NCCF, pitch) from pitch extractor + BaseFloat nccf = tmp(0); + return opts_.pov_scale * NccfToPovFeature(nccf) + + opts_.pov_offset; +} + +BaseFloat OnlineProcessPitch::GetDeltaPitchFeature(int32 frame) { + // Rather than computing the delta pitch directly in code here, + // which might seem easier, we accumulate a small window of features + // and call ComputeDeltas. This might seem like overkill; the reason + // we do it this way is to ensure that the end effects (at file + // beginning and end) are handled in a consistent way. + int32 context = opts_.delta_window; + int32 start_frame = std::max(0, frame - context), + end_frame = std::min(frame + context + 1, src_->NumFramesReady()), + frames_in_window = end_frame - start_frame; + Matrix feats(frames_in_window, 1), + delta_feats; + + for (int32 f = start_frame; f < end_frame; f++) + feats(f - start_frame, 0) = GetRawLogPitchFeature(f); + + DeltaFeaturesOptions delta_opts; + delta_opts.order = 1; + delta_opts.window = opts_.delta_window; + ComputeDeltas(delta_opts, feats, &delta_feats); + while (delta_feature_noise_.size() <= static_cast(frame)) { + delta_feature_noise_.push_back(RandGauss() * + opts_.delta_pitch_noise_stddev); + } + // note: delta_feats will have two columns, second contains deltas. 
+ return (delta_feats(frame - start_frame, 1) + delta_feature_noise_[frame]) * + opts_.delta_pitch_scale; +} + +BaseFloat OnlineProcessPitch::GetRawLogPitchFeature(int32 frame) const { + Vector tmp(kRawFeatureDim); + src_->GetFrame(frame, &tmp); + BaseFloat pitch = tmp(1); + KALDI_ASSERT(pitch > 0); + return Log(pitch); +} + +BaseFloat OnlineProcessPitch::GetNormalizedLogPitchFeature(int32 frame) { + UpdateNormalizationStats(frame); + BaseFloat log_pitch = GetRawLogPitchFeature(frame), + avg_log_pitch = normalization_stats_[frame].sum_log_pitch_pov / + normalization_stats_[frame].sum_pov, + normalized_log_pitch = log_pitch - avg_log_pitch; + return normalized_log_pitch * opts_.pitch_scale; +} + + +// inline +void OnlineProcessPitch::GetNormalizationWindow(int32 t, + int32 src_frames_ready, + int32 *window_begin, + int32 *window_end) const { + int32 left_context = opts_.normalization_left_context; + int32 right_context = opts_.normalization_right_context; + *window_begin = std::max(0, t - left_context); + *window_end = std::min(t + right_context + 1, src_frames_ready); +} + + +// Makes sure the entry in normalization_stats_ for this frame is up to date; +// called from GetNormalizedLogPitchFeature. +// the cur_num_frames and input_finished variables are needed because the +// pitch features for a given frame may change as we see more data. +void OnlineProcessPitch::UpdateNormalizationStats(int32 frame) { + KALDI_ASSERT(frame >= 0); + if (normalization_stats_.size() <= frame) + normalization_stats_.resize(frame + 1); + int32 cur_num_frames = src_->NumFramesReady(); + bool input_finished = src_->IsLastFrame(cur_num_frames - 1); + + NormalizationStats &this_stats = normalization_stats_[frame]; + if (this_stats.cur_num_frames == cur_num_frames && + this_stats.input_finished == input_finished) { + // Stats are fully up-to-date. 
+ return; + } + int32 this_window_begin, this_window_end; + GetNormalizationWindow(frame, cur_num_frames, + &this_window_begin, &this_window_end); + + if (frame > 0) { + const NormalizationStats &prev_stats = normalization_stats_[frame - 1]; + if (prev_stats.cur_num_frames == cur_num_frames && + prev_stats.input_finished == input_finished) { + // we'll derive this_stats efficiently from prev_stats. + // Checking that cur_num_frames and input_finished have not changed + // ensures that the underlying features will not have changed. + this_stats = prev_stats; + int32 prev_window_begin, prev_window_end; + GetNormalizationWindow(frame - 1, cur_num_frames, + &prev_window_begin, &prev_window_end); + if (this_window_begin != prev_window_begin) { + KALDI_ASSERT(this_window_begin == prev_window_begin + 1); + Vector tmp(kRawFeatureDim); + src_->GetFrame(prev_window_begin, &tmp); + BaseFloat accurate_pov = NccfToPov(tmp(0)), + log_pitch = Log(tmp(1)); + this_stats.sum_pov -= accurate_pov; + this_stats.sum_log_pitch_pov -= accurate_pov * log_pitch; + } + if (this_window_end != prev_window_end) { + KALDI_ASSERT(this_window_end == prev_window_end + 1); + Vector tmp(kRawFeatureDim); + src_->GetFrame(prev_window_end, &tmp); + BaseFloat accurate_pov = NccfToPov(tmp(0)), + log_pitch = Log(tmp(1)); + this_stats.sum_pov += accurate_pov; + this_stats.sum_log_pitch_pov += accurate_pov * log_pitch; + } + return; + } + } + // The way we do it here is not the most efficient way to do it; + // we'll see if it becomes a problem. The issue is we have to redo + // this computation from scratch each time we process a new chunk, which + // may be a little inefficient if the chunk-size is very small. 
+ this_stats.cur_num_frames = cur_num_frames; + this_stats.input_finished = input_finished; + this_stats.sum_pov = 0.0; + this_stats.sum_log_pitch_pov = 0.0; + Vector tmp(kRawFeatureDim); + for (int32 f = this_window_begin; f < this_window_end; f++) { + src_->GetFrame(f, &tmp); + BaseFloat accurate_pov = NccfToPov(tmp(0)), + log_pitch = Log(tmp(1)); + this_stats.sum_pov += accurate_pov; + this_stats.sum_log_pitch_pov += accurate_pov * log_pitch; + } +} + +int32 OnlineProcessPitch::NumFramesReady() const { + int32 src_frames_ready = src_->NumFramesReady(); + if (src_frames_ready == 0) { + return 0; + } else if (src_->IsLastFrame(src_frames_ready - 1)) { + return src_frames_ready + opts_.delay; + } else { + return std::max(0, src_frames_ready - + opts_.normalization_right_context + opts_.delay); + } +} + +void ProcessPitch(const ProcessPitchOptions &opts, + const MatrixBase &input, + Matrix *output) { + OnlineMatrixFeature pitch_feat(input); + + OnlineProcessPitch online_process_pitch(opts, &pitch_feat); + + output->Resize(online_process_pitch.NumFramesReady(), + online_process_pitch.Dim()); + for (int32 t = 0; t < online_process_pitch.NumFramesReady(); t++) { + SubVector row(*output, t); + online_process_pitch.GetFrame(t, &row); + } +} + + +void ComputeAndProcessKaldiPitch( + const PitchExtractionOptions &pitch_opts, + const ProcessPitchOptions &process_opts, + const VectorBase &wave, + Matrix *output) { + + OnlinePitchFeature pitch_extractor(pitch_opts); + + if (pitch_opts.simulate_first_pass_online) { + KALDI_ASSERT(pitch_opts.frames_per_chunk > 0 && + "--simulate-first-pass-online option does not make sense " + "unless you specify --frames-per-chunk"); + } + + OnlineProcessPitch post_process(process_opts, &pitch_extractor); + + int32 cur_rows = 100; + Matrix feats(cur_rows, post_process.Dim()); + + int32 cur_offset = 0, cur_frame = 0, + samp_per_chunk = pitch_opts.frames_per_chunk * + pitch_opts.samp_freq * pitch_opts.frame_shift_ms / 1000.0f; + + // We request 
the first-pass features as soon as they are available, + // regardless of whether opts.simulate_first_pass_online == true. If + // opts.simulate_first_pass_online == true this should + // not affect the features generated, but it helps us to test the code + // in a way that's closer to what online decoding would see. + + while (cur_offset < wave.Dim()) { + int32 num_samp; + if (samp_per_chunk > 0) + num_samp = std::min(samp_per_chunk, wave.Dim() - cur_offset); + else // user left opts.frames_per_chunk at zero. + num_samp = wave.Dim(); + SubVector wave_chunk(wave, cur_offset, num_samp); + pitch_extractor.AcceptWaveform(pitch_opts.samp_freq, wave_chunk); + cur_offset += num_samp; + if (cur_offset == wave.Dim()) + pitch_extractor.InputFinished(); + + // Get each frame as soon as it is ready. + for (; cur_frame < post_process.NumFramesReady(); cur_frame++) { + if (cur_frame >= cur_rows) { + cur_rows *= 2; + feats.Resize(cur_rows, post_process.Dim(), kCopyData); + } + SubVector row(feats, cur_frame); + post_process.GetFrame(cur_frame, &row); + } + } + + if (pitch_opts.simulate_first_pass_online) { + if (cur_frame == 0) { + KALDI_WARN << "No features output since wave file too short"; + output->Resize(0, 0); + } else { + *output = feats.RowRange(0, cur_frame); + } + } else { + // want the "final" features for second pass, so get them again. 
+ output->Resize(post_process.NumFramesReady(), post_process.Dim()); + for (int32 frame = 0; frame < post_process.NumFramesReady(); frame++) { + SubVector row(*output, frame); + post_process.GetFrame(frame, &row); + } + } +} + + +} // namespace kaldi diff --git a/speechx/speechx/kaldi/feat/pitch-functions.h b/speechx/speechx/kaldi/feat/pitch-functions.h new file mode 100644 index 00000000..70e85380 --- /dev/null +++ b/speechx/speechx/kaldi/feat/pitch-functions.h @@ -0,0 +1,450 @@ +// feat/pitch-functions.h + +// Copyright 2013 Pegah Ghahremani +// 2014 IMSL, PKU-HKUST (author: Wei Shi) +// 2014 Yanqing Sun, Junjie Wang, +// Daniel Povey, Korbinian Riedhammer +// Xin Lei + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_FEAT_PITCH_FUNCTIONS_H_ +#define KALDI_FEAT_PITCH_FUNCTIONS_H_ + +#include +#include +#include +#include + +#include "base/kaldi-error.h" +#include "feat/mel-computations.h" +#include "itf/online-feature-itf.h" +#include "matrix/matrix-lib.h" +#include "util/common-utils.h" + +namespace kaldi { +/// @addtogroup feat FeatureExtraction +/// @{ + +struct PitchExtractionOptions { + // FrameExtractionOptions frame_opts; + BaseFloat samp_freq; // sample frequency in hertz + BaseFloat frame_shift_ms; // in milliseconds. + BaseFloat frame_length_ms; // in milliseconds. 
+ BaseFloat preemph_coeff; // Preemphasis coefficient. [use is deprecated.] + BaseFloat min_f0; // min f0 to search (Hz) + BaseFloat max_f0; // max f0 to search (Hz) + BaseFloat soft_min_f0; // Minimum f0, applied in soft way, must not + // exceed min-f0 + BaseFloat penalty_factor; // cost factor for F0 change + BaseFloat lowpass_cutoff; // cutoff frequency for Low pass filter + BaseFloat resample_freq; // Frequency that we down-sample the signal to; + // must be more than twice lowpass_cutoff + BaseFloat delta_pitch; // the pitch tolerance in pruning lags + BaseFloat nccf_ballast; // Increasing this factor reduces NCCF for + // quiet frames, helping ensure pitch + // continuity in unvoiced region + int32 lowpass_filter_width; // Integer that determines filter width of + // lowpass filter + int32 upsample_filter_width; // Integer that determines filter width when + // upsampling NCCF + + // Below are newer config variables, not present in the original paper, + // that relate to the online pitch extraction algorithm. + + // The maximum number of frames of latency that we allow the pitch-processing + // to introduce, for online operation. If you set this to a large value, + // there would be no inaccuracy from the Viterbi traceback (but it might make + // you wait to see the pitch). This is not very relevant for the online + // operation: normalization-right-context is more relevant, you + // can just leave this value at zero. + int32 max_frames_latency; + + // Only relevant for the function ComputeKaldiPitch which is called by + // compute-kaldi-pitch-feats. If nonzero, we provide the input as chunks of + // this size. This affects the energy normalization which has a small effect + // on the resulting features, especially at the beginning of a file. For best + // compatibility with online operation (e.g. if you plan to train models for + // the online-decoding setup), you might want to set this to a small value, + // like one frame.
+ int32 frames_per_chunk; + + // Only relevant for the function ComputeKaldiPitch which is called by + // compute-kaldi-pitch-feats, and only relevant if frames_per_chunk is + // nonzero. If true, it will query the features as soon as they are + // available, which simulates the first-pass features you would get in online + // decoding. If false, the features you will get will be the same as those + // available at the end of the utterance, after InputFinished() has been + // called: e.g. during lattice rescoring. + bool simulate_first_pass_online; + + // Only relevant for online operation or when emulating online operation + // (e.g. when setting frames_per_chunk). This is the frame-index on which we + // recompute the NCCF (e.g. frame-index 500 = after 5 seconds); if the + // segment ends before this we do it when the segment ends. We do this by + // re-computing the signal average energy, which affects the NCCF via the + // "ballast term", scaling the resampled NCCF by a factor derived from the + // average change in the "ballast term", and re-doing the backtrace + // computation. Making this infinity would be the most exact, but would + // introduce unwanted latency at the end of long utterances, for little + // benefit. + int32 recompute_frame; + + // This is a "hidden config" used only for testing the online pitch + // extraction. If true, we compute the signal root-mean-squared for the + // ballast term, only up to the current frame, rather than the end of the + // current chunk of signal. This makes the output insensitive to the + // chunking, which is useful for testing purposes. 
+ bool nccf_ballast_online; + bool snip_edges; + PitchExtractionOptions(): + samp_freq(16000), + frame_shift_ms(10.0), + frame_length_ms(25.0), + preemph_coeff(0.0), + min_f0(50), + max_f0(400), + soft_min_f0(10.0), + penalty_factor(0.1), + lowpass_cutoff(1000), + resample_freq(4000), + delta_pitch(0.005), + nccf_ballast(7000), + lowpass_filter_width(1), + upsample_filter_width(5), + max_frames_latency(0), + frames_per_chunk(0), + simulate_first_pass_online(false), + recompute_frame(500), + nccf_ballast_online(false), + snip_edges(true) { } + + void Register(OptionsItf *opts) { + opts->Register("sample-frequency", &samp_freq, + "Waveform data sample frequency (must match the waveform " + "file, if specified there)"); + opts->Register("frame-length", &frame_length_ms, "Frame length in " + "milliseconds"); + opts->Register("frame-shift", &frame_shift_ms, "Frame shift in " + "milliseconds"); + opts->Register("preemphasis-coefficient", &preemph_coeff, + "Coefficient for use in signal preemphasis (deprecated)"); + opts->Register("min-f0", &min_f0, + "min. F0 to search for (Hz)"); + opts->Register("max-f0", &max_f0, + "max. F0 to search for (Hz)"); + opts->Register("soft-min-f0", &soft_min_f0, + "Minimum f0, applied in soft way, must not exceed min-f0"); + opts->Register("penalty-factor", &penalty_factor, + "cost factor for FO change."); + opts->Register("lowpass-cutoff", &lowpass_cutoff, + "cutoff frequency for LowPass filter (Hz) "); + opts->Register("resample-frequency", &resample_freq, + "Frequency that we down-sample the signal to. 
Must be " + "more than twice lowpass-cutoff"); + opts->Register("delta-pitch", &delta_pitch, + "Smallest relative change in pitch that our algorithm " + "measures"); + opts->Register("nccf-ballast", &nccf_ballast, + "Increasing this factor reduces NCCF for quiet frames"); + opts->Register("nccf-ballast-online", &nccf_ballast_online, + "This is useful mainly for debug; it affects how the NCCF " + "ballast is computed."); + opts->Register("lowpass-filter-width", &lowpass_filter_width, + "Integer that determines filter width of " + "lowpass filter, more gives sharper filter"); + opts->Register("upsample-filter-width", &upsample_filter_width, + "Integer that determines filter width when upsampling NCCF"); + opts->Register("frames-per-chunk", &frames_per_chunk, "Only relevant for " + "offline pitch extraction (e.g. compute-kaldi-pitch-feats), " + "you can set it to a small nonzero value, such as 10, for " + "better feature compatibility with online decoding (affects " + "energy normalization in the algorithm)"); + opts->Register("simulate-first-pass-online", &simulate_first_pass_online, + "If true, compute-kaldi-pitch-feats will output features " + "that correspond to what an online decoder would see in the " + "first pass of decoding-- not the final version of the " + "features, which is the default. Relevant if " + "--frames-per-chunk > 0"); + opts->Register("recompute-frame", &recompute_frame, "Only relevant for " + "online pitch extraction, or for compatibility with online " + "pitch extraction. A non-critical parameter; the frame at " + "which we recompute some of the forward pointers, after " + "revising our estimate of the signal energy. 
Relevant if" + "--frames-per-chunk > 0"); + opts->Register("max-frames-latency", &max_frames_latency, "Maximum number " + "of frames of latency that we allow pitch tracking to " + "introduce into the feature processing (affects output only " + "if --frames-per-chunk > 0 and " + "--simulate-first-pass-online=true"); + opts->Register("snip-edges", &snip_edges, "If this is set to false, the " + "incomplete frames near the ending edge won't be snipped, " + "so that the number of frames is the file size divided by " + "the frame-shift. This makes different types of features " + "give the same number of frames."); + } + /// Returns the window-size in samples, after resampling. This is the + /// "basic window size", not the full window size after extending by max-lag. + // Because of floating point representation, it is more reliable to divide + // by 1000 instead of multiplying by 0.001, but it is a bit slower. + int32 NccfWindowSize() const { + return static_cast(resample_freq * frame_length_ms / 1000.0); + } + /// Returns the window-shift in samples, after resampling. + int32 NccfWindowShift() const { + return static_cast(resample_freq * frame_shift_ms / 1000.0); + } +}; + +struct ProcessPitchOptions { + BaseFloat pitch_scale; // the final normalized-log-pitch feature is scaled + // with this value + BaseFloat pov_scale; // the final POV feature is scaled with this value + BaseFloat pov_offset; // An offset that can be added to the final POV + // feature (useful for online-decoding, where we don't + // do CMN to the pitch-derived features. 
+ + BaseFloat delta_pitch_scale; + BaseFloat delta_pitch_noise_stddev; // stddev of noise we add to delta-pitch + int32 normalization_left_context; // left-context used for sliding-window + // normalization + int32 normalization_right_context; // this should be reduced in online + // decoding to reduce latency + + int32 delta_window; + int32 delay; + + bool add_pov_feature; + bool add_normalized_log_pitch; + bool add_delta_pitch; + bool add_raw_log_pitch; + + ProcessPitchOptions() : + pitch_scale(2.0), + pov_scale(2.0), + pov_offset(0.0), + delta_pitch_scale(10.0), + delta_pitch_noise_stddev(0.005), + normalization_left_context(75), + normalization_right_context(75), + delta_window(2), + delay(0), + add_pov_feature(true), + add_normalized_log_pitch(true), + add_delta_pitch(true), + add_raw_log_pitch(false) { } + + + void Register(ParseOptions *opts) { + opts->Register("pitch-scale", &pitch_scale, + "Scaling factor for the final normalized log-pitch value"); + opts->Register("pov-scale", &pov_scale, + "Scaling factor for final POV (probability of voicing) " + "feature"); + opts->Register("pov-offset", &pov_offset, + "This can be used to add an offset to the POV feature. " + "Intended for use in online decoding as a substitute for " + " CMN."); + opts->Register("delta-pitch-scale", &delta_pitch_scale, + "Term to scale the final delta log-pitch feature"); + opts->Register("delta-pitch-noise-stddev", &delta_pitch_noise_stddev, + "Standard deviation for noise we add to the delta log-pitch " + "(before scaling); should be about the same as delta-pitch " + "option to pitch creation. 
The purpose is to get rid of " + "peaks in the delta-pitch caused by discretization of pitch " + "values."); + opts->Register("normalization-left-context", &normalization_left_context, + "Left-context (in frames) for moving window normalization"); + opts->Register("normalization-right-context", &normalization_right_context, + "Right-context (in frames) for moving window normalization"); + opts->Register("delta-window", &delta_window, + "Number of frames on each side of central frame, to use for " + "delta window."); + opts->Register("delay", &delay, + "Number of frames by which the pitch information is " + "delayed."); + opts->Register("add-pov-feature", &add_pov_feature, + "If true, the warped NCCF is added to output features"); + opts->Register("add-normalized-log-pitch", &add_normalized_log_pitch, + "If true, the log-pitch with POV-weighted mean subtraction " + "over 1.5 second window is added to output features"); + opts->Register("add-delta-pitch", &add_delta_pitch, + "If true, time derivative of log-pitch is added to output " + "features"); + opts->Register("add-raw-log-pitch", &add_raw_log_pitch, + "If true, log(pitch) is added to output features"); + } +}; + + +// We don't want to expose the pitch-extraction internals here as it's +// quite complex, so we use a private implementation. +class OnlinePitchFeatureImpl; + + +// Note: to start on a new waveform, just construct a new version +// of this object. +class OnlinePitchFeature: public OnlineBaseFeature { + public: + explicit OnlinePitchFeature(const PitchExtractionOptions &opts); + + virtual int32 Dim() const { return 2; /* (NCCF, pitch) */ } + + virtual int32 NumFramesReady() const; + + virtual BaseFloat FrameShiftInSeconds() const; + + virtual bool IsLastFrame(int32 frame) const; + + /// Outputs the two-dimensional feature consisting of (pitch, NCCF). You + /// should probably post-process this using class OnlineProcessPitch. 
+ virtual void GetFrame(int32 frame, VectorBase *feat); + + virtual void AcceptWaveform(BaseFloat sampling_rate, + const VectorBase &waveform); + + virtual void InputFinished(); + + virtual ~OnlinePitchFeature(); + + private: + OnlinePitchFeatureImpl *impl_; +}; + + +/// This online-feature class implements post processing of pitch features. +/// Inputs are original 2 dims (nccf, pitch). It can produce various +/// kinds of outputs, using the default options it will be (pov-feature, +/// normalized-log-pitch, delta-log-pitch). +class OnlineProcessPitch: public OnlineFeatureInterface { + public: + virtual int32 Dim() const { return dim_; } + + virtual bool IsLastFrame(int32 frame) const { + if (frame <= -1) + return src_->IsLastFrame(-1); + else if (frame < opts_.delay) + return src_->IsLastFrame(-1) == true ? false : src_->IsLastFrame(0); + else + return src_->IsLastFrame(frame - opts_.delay); + } + virtual BaseFloat FrameShiftInSeconds() const { + return src_->FrameShiftInSeconds(); + } + + virtual int32 NumFramesReady() const; + + virtual void GetFrame(int32 frame, VectorBase *feat); + + virtual ~OnlineProcessPitch() { } + + // Does not take ownership of "src". + OnlineProcessPitch(const ProcessPitchOptions &opts, + OnlineFeatureInterface *src); + + private: + enum { kRawFeatureDim = 2}; // anonymous enum to define a constant. + // kRawFeatureDim defines the dimension + // of the input: (nccf, pitch) + + ProcessPitchOptions opts_; + OnlineFeatureInterface *src_; + int32 dim_; // Output feature dimension, set in initializer. + + struct NormalizationStats { + int32 cur_num_frames; // value of src_->NumFramesReady() when + // "mean_pitch" was set. + bool input_finished; // true if input data was finished when + // "mean_pitch" was computed. 
+ double sum_pov; // sum of pov over relevant range + double sum_log_pitch_pov; // sum of log(pitch) * pov over relevant range + + NormalizationStats(): cur_num_frames(-1), input_finished(false), + sum_pov(0.0), sum_log_pitch_pov(0.0) { } + }; + + std::vector delta_feature_noise_; + + std::vector normalization_stats_; + + /// Computes and returns the POV feature for this frame. + /// Called from GetFrame(). + inline BaseFloat GetPovFeature(int32 frame) const; + + /// Computes and returns the delta-log-pitch feature for this frame. + /// Called from GetFrame(). + inline BaseFloat GetDeltaPitchFeature(int32 frame); + + /// Computes and returns the raw log-pitch feature for this frame. + /// Called from GetFrame(). + inline BaseFloat GetRawLogPitchFeature(int32 frame) const; + + /// Computes and returns the mean-subtracted log-pitch feature for this frame. + /// Called from GetFrame(). + inline BaseFloat GetNormalizedLogPitchFeature(int32 frame); + + /// Computes the normalization window sizes. + inline void GetNormalizationWindow(int32 frame, + int32 src_frames_ready, + int32 *window_begin, + int32 *window_end) const; + + /// Makes sure the entry in normalization_stats_ for this frame is up to date; + /// called from GetNormalizedLogPitchFeature. + inline void UpdateNormalizationStats(int32 frame); +}; + + +/// This function extracts (pitch, NCCF) per frame, using the pitch extraction +/// method described in "A Pitch Extraction Algorithm Tuned for Automatic Speech +/// Recognition", Pegah Ghahremani, Bagher BabaAli, Daniel Povey, Korbinian +/// Riedhammer, Jan Trmal and Sanjeev Khudanpur, ICASSP 2014. The output will +/// have as many rows as there are frames, and two columns corresponding to +/// (NCCF, pitch) +void ComputeKaldiPitch(const PitchExtractionOptions &opts, + const VectorBase &wave, + Matrix *output); + +/// This function processes the raw (NCCF, pitch) quantities computed by +/// ComputeKaldiPitch, and processes them into features. 
By default it will +/// output three-dimensional features, (POV-feature, mean-subtracted-log-pitch, +/// delta-of-raw-pitch), but this is configurable in the options. The number of +/// rows of "output" will be the number of frames (rows) in "input", and the +/// number of columns will be the number of different types of features +/// requested (by default, 3; 4 is the max). The four config variables +/// --add-pov-feature, --add-normalized-log-pitch, --add-delta-pitch, +/// --add-raw-log-pitch determine which features we create; by default we create +/// the first three. +void ProcessPitch(const ProcessPitchOptions &opts, + const MatrixBase &input, + Matrix *output); + +/// This function combines ComputeKaldiPitch and ProcessPitch. The reason +/// why we need a separate function to do this is in order to be able to +/// accurately simulate the online pitch-processing, for testing and for +/// training models matched to the "first-pass" features. It is sensitive to +/// the variables in pitch_opts that relate to online processing, +/// i.e. max_frames_latency, frames_per_chunk, simulate_first_pass_online, +/// recompute_frame. 
+void ComputeAndProcessKaldiPitch(const PitchExtractionOptions &pitch_opts, + const ProcessPitchOptions &process_opts, + const VectorBase &wave, + Matrix *output); + + +/// @} End of "addtogroup feat" +} // namespace kaldi +#endif // KALDI_FEAT_PITCH_FUNCTIONS_H_ diff --git a/speechx/speechx/kaldi/feat/resample.cc b/speechx/speechx/kaldi/feat/resample.cc new file mode 100644 index 00000000..11f4c62b --- /dev/null +++ b/speechx/speechx/kaldi/feat/resample.cc @@ -0,0 +1,377 @@ +// feat/resample.cc + +// Copyright 2013 Pegah Ghahremani +// 2014 IMSL, PKU-HKUST (author: Wei Shi) +// 2014 Yanqing Sun, Junjie Wang +// 2014 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ + +#include +#include +#include "feat/feature-functions.h" +#include "matrix/matrix-functions.h" +#include "feat/resample.h" + +namespace kaldi { + + +LinearResample::LinearResample(int32 samp_rate_in_hz, + int32 samp_rate_out_hz, + BaseFloat filter_cutoff_hz, + int32 num_zeros): + samp_rate_in_(samp_rate_in_hz), + samp_rate_out_(samp_rate_out_hz), + filter_cutoff_(filter_cutoff_hz), + num_zeros_(num_zeros) { + KALDI_ASSERT(samp_rate_in_hz > 0.0 && + samp_rate_out_hz > 0.0 && + filter_cutoff_hz > 0.0 && + filter_cutoff_hz*2 <= samp_rate_in_hz && + filter_cutoff_hz*2 <= samp_rate_out_hz && + num_zeros > 0); + + // base_freq is the frequency of the repeating unit, which is the gcd + // of the input frequencies. + int32 base_freq = Gcd(samp_rate_in_, samp_rate_out_); + input_samples_in_unit_ = samp_rate_in_ / base_freq; + output_samples_in_unit_ = samp_rate_out_ / base_freq; + + SetIndexesAndWeights(); + Reset(); +} + +int64 LinearResample::GetNumOutputSamples(int64 input_num_samp, + bool flush) const { + // For exact computation, we measure time in "ticks" of 1.0 / tick_freq, + // where tick_freq is the least common multiple of samp_rate_in_ and + // samp_rate_out_. + int32 tick_freq = Lcm(samp_rate_in_, samp_rate_out_); + int32 ticks_per_input_period = tick_freq / samp_rate_in_; + + // work out the number of ticks in the time interval + // [ 0, input_num_samp/samp_rate_in_ ). + int64 interval_length_in_ticks = input_num_samp * ticks_per_input_period; + if (!flush) { + BaseFloat window_width = num_zeros_ / (2.0 * filter_cutoff_); + // To count the window-width in ticks we take the floor. This + // is because since we're looking for the largest integer num-out-samp + // that fits in the interval, which is open on the right, a reduction + // in interval length of less than a tick will never make a difference. + // For example, the largest integer in the interval [ 0, 2 ) and the + // largest integer in the interval [ 0, 2 - 0.9 ) are the same (both one). 
+ // So when we're subtracting the window-width we can ignore the fractional + // part. + int32 window_width_ticks = floor(window_width * tick_freq); + // The time-period of the output that we can sample gets reduced + // by the window-width (which is actually the distance from the + // center to the edge of the windowing function) if we're not + // "flushing the output". + interval_length_in_ticks -= window_width_ticks; + } + if (interval_length_in_ticks <= 0) + return 0; + int32 ticks_per_output_period = tick_freq / samp_rate_out_; + // Get the last output-sample in the closed interval, i.e. replacing [ ) with + // [ ]. Note: integer division rounds down. See + // http://en.wikipedia.org/wiki/Interval_(mathematics) for an explanation of + // the notation. + int64 last_output_samp = interval_length_in_ticks / ticks_per_output_period; + // We need the last output-sample in the open interval, so if it takes us to + // the end of the interval exactly, subtract one. + if (last_output_samp * ticks_per_output_period == interval_length_in_ticks) + last_output_samp--; + // First output-sample index is zero, so the number of output samples + // is the last output-sample plus one. + int64 num_output_samp = last_output_samp + 1; + return num_output_samp; +} + +void LinearResample::SetIndexesAndWeights() { + first_index_.resize(output_samples_in_unit_); + weights_.resize(output_samples_in_unit_); + + double window_width = num_zeros_ / (2.0 * filter_cutoff_); + + for (int32 i = 0; i < output_samples_in_unit_; i++) { + double output_t = i / static_cast(samp_rate_out_); + double min_t = output_t - window_width, max_t = output_t + window_width; + // we do ceil on the min and floor on the max, because if we did it + // the other way around we would unnecessarily include indexes just + // outside the window, with zero coefficients. It's possible + // if the arguments to the ceil and floor expressions are integers + // (e.g. 
if filter_cutoff_ has an exact ratio with the sample rates), + // that we unnecessarily include something with a zero coefficient, + // but this is only a slight efficiency issue. + int32 min_input_index = ceil(min_t * samp_rate_in_), + max_input_index = floor(max_t * samp_rate_in_), + num_indices = max_input_index - min_input_index + 1; + first_index_[i] = min_input_index; + weights_[i].Resize(num_indices); + for (int32 j = 0; j < num_indices; j++) { + int32 input_index = min_input_index + j; + double input_t = input_index / static_cast(samp_rate_in_), + delta_t = input_t - output_t; + // sign of delta_t doesn't matter. + weights_[i](j) = FilterFunc(delta_t) / samp_rate_in_; + } + } +} + + +// inline +void LinearResample::GetIndexes(int64 samp_out, + int64 *first_samp_in, + int32 *samp_out_wrapped) const { + // A unit is the smallest nonzero amount of time that is an exact + // multiple of the input and output sample periods. The unit index + // is the answer to "which numbered unit we are in". + int64 unit_index = samp_out / output_samples_in_unit_; + // samp_out_wrapped is equal to samp_out % output_samples_in_unit_ + *samp_out_wrapped = static_cast(samp_out - + unit_index * output_samples_in_unit_); + *first_samp_in = first_index_[*samp_out_wrapped] + + unit_index * input_samples_in_unit_; +} + + +void LinearResample::Resample(const VectorBase &input, + bool flush, + Vector *output) { + int32 input_dim = input.Dim(); + int64 tot_input_samp = input_sample_offset_ + input_dim, + tot_output_samp = GetNumOutputSamples(tot_input_samp, flush); + + KALDI_ASSERT(tot_output_samp >= output_sample_offset_); + + output->Resize(tot_output_samp - output_sample_offset_); + + // samp_out is the index into the total output signal, not just the part + // of it we are producing here. 
+ for (int64 samp_out = output_sample_offset_; + samp_out < tot_output_samp; + samp_out++) { + int64 first_samp_in; + int32 samp_out_wrapped; + GetIndexes(samp_out, &first_samp_in, &samp_out_wrapped); + const Vector &weights = weights_[samp_out_wrapped]; + // first_input_index is the first index into "input" that we have a weight + // for. + int32 first_input_index = static_cast(first_samp_in - + input_sample_offset_); + BaseFloat this_output; + if (first_input_index >= 0 && + first_input_index + weights.Dim() <= input_dim) { + SubVector input_part(input, first_input_index, weights.Dim()); + this_output = VecVec(input_part, weights); + } else { // Handle edge cases. + this_output = 0.0; + for (int32 i = 0; i < weights.Dim(); i++) { + BaseFloat weight = weights(i); + int32 input_index = first_input_index + i; + if (input_index < 0 && input_remainder_.Dim() + input_index >= 0) { + this_output += weight * + input_remainder_(input_remainder_.Dim() + input_index); + } else if (input_index >= 0 && input_index < input_dim) { + this_output += weight * input(input_index); + } else if (input_index >= input_dim) { + // We're past the end of the input and are adding zero; should only + // happen if the user specified flush == true, or else we would not + // be trying to output this sample. + KALDI_ASSERT(flush); + } + } + } + int32 output_index = static_cast(samp_out - output_sample_offset_); + (*output)(output_index) = this_output; + } + + if (flush) { + Reset(); // Reset the internal state. + } else { + SetRemainder(input); + input_sample_offset_ = tot_input_samp; + output_sample_offset_ = tot_output_samp; + } +} + +void LinearResample::SetRemainder(const VectorBase &input) { + Vector old_remainder(input_remainder_); + // max_remainder_needed is the width of the filter from side to side, + // measured in input samples. 
you might think it should be half that, + // but you have to consider that you might be wanting to output samples + // that are "in the past" relative to the beginning of the latest + // input... anyway, storing more remainder than needed is not harmful. + int32 max_remainder_needed = ceil(samp_rate_in_ * num_zeros_ / + filter_cutoff_); + input_remainder_.Resize(max_remainder_needed); + for (int32 index = - input_remainder_.Dim(); index < 0; index++) { + // we interpret "index" as an offset from the end of "input" and + // from the end of input_remainder_. + int32 input_index = index + input.Dim(); + if (input_index >= 0) + input_remainder_(index + input_remainder_.Dim()) = input(input_index); + else if (input_index + old_remainder.Dim() >= 0) + input_remainder_(index + input_remainder_.Dim()) = + old_remainder(input_index + old_remainder.Dim()); + // else leave it at zero. + } +} + +void LinearResample::Reset() { + input_sample_offset_ = 0; + output_sample_offset_ = 0; + input_remainder_.Resize(0); +} + +/** Here, t is a time in seconds representing an offset from + the center of the windowed filter function, and FilterFunction(t) + returns the windowed filter function, described + in the header as h(t) = f(t)g(t), evaluated at t. 
+*/ +BaseFloat LinearResample::FilterFunc(BaseFloat t) const { + BaseFloat window, // raised-cosine (Hanning) window of width + // num_zeros_/2*filter_cutoff_ + filter; // sinc filter function + if (fabs(t) < num_zeros_ / (2.0 * filter_cutoff_)) + window = 0.5 * (1 + cos(M_2PI * filter_cutoff_ / num_zeros_ * t)); + else + window = 0.0; // outside support of window function + if (t != 0) + filter = sin(M_2PI * filter_cutoff_ * t) / (M_PI * t); + else + filter = 2 * filter_cutoff_; // limit of the function at t = 0 + return filter * window; +} + + +ArbitraryResample::ArbitraryResample( + int32 num_samples_in, BaseFloat samp_rate_in, + BaseFloat filter_cutoff, const Vector &sample_points, + int32 num_zeros): + num_samples_in_(num_samples_in), + samp_rate_in_(samp_rate_in), + filter_cutoff_(filter_cutoff), + num_zeros_(num_zeros) { + KALDI_ASSERT(num_samples_in > 0 && samp_rate_in > 0.0 && + filter_cutoff > 0.0 && + filter_cutoff * 2.0 <= samp_rate_in + && num_zeros > 0); + // set up weights_ and indices_. Please try to keep all functions short and + SetIndexes(sample_points); + SetWeights(sample_points); +} + + +void ArbitraryResample::Resample(const MatrixBase &input, + MatrixBase *output) const { + // each row of "input" corresponds to the data to resample; + // the corresponding row of "output" is the resampled data. 
+ + KALDI_ASSERT(input.NumRows() == output->NumRows() && + input.NumCols() == num_samples_in_ && + output->NumCols() == weights_.size()); + + Vector output_col(output->NumRows()); + for (int32 i = 0; i < NumSamplesOut(); i++) { + SubMatrix input_part(input, 0, input.NumRows(), + first_index_[i], + weights_[i].Dim()); + const Vector &weight_vec(weights_[i]); + output_col.AddMatVec(1.0, input_part, + kNoTrans, weight_vec, 0.0); + output->CopyColFromVec(output_col, i); + } +} + +void ArbitraryResample::Resample(const VectorBase &input, + VectorBase *output) const { + KALDI_ASSERT(input.Dim() == num_samples_in_ && + output->Dim() == weights_.size()); + + int32 output_dim = output->Dim(); + for (int32 i = 0; i < output_dim; i++) { + SubVector input_part(input, first_index_[i], weights_[i].Dim()); + (*output)(i) = VecVec(input_part, weights_[i]); + } +} + +void ArbitraryResample::SetIndexes(const Vector &sample_points) { + int32 num_samples = sample_points.Dim(); + first_index_.resize(num_samples); + weights_.resize(num_samples); + BaseFloat filter_width = num_zeros_ / (2.0 * filter_cutoff_); + for (int32 i = 0; i < num_samples; i++) { + // the t values are in seconds. + BaseFloat t = sample_points(i), + t_min = t - filter_width, t_max = t + filter_width; + int32 index_min = ceil(samp_rate_in_ * t_min), + index_max = floor(samp_rate_in_ * t_max); + // the ceil on index min and the floor on index_max are because there + // is no point using indices just outside the window (coeffs would be zero). 
+ if (index_min < 0) + index_min = 0; + if (index_max >= num_samples_in_) + index_max = num_samples_in_ - 1; + first_index_[i] = index_min; + weights_[i].Resize(index_max - index_min + 1); + } +} + +void ArbitraryResample::SetWeights(const Vector &sample_points) { + int32 num_samples_out = NumSamplesOut(); + for (int32 i = 0; i < num_samples_out; i++) { + for (int32 j = 0 ; j < weights_[i].Dim(); j++) { + BaseFloat delta_t = sample_points(i) - + (first_index_[i] + j) / samp_rate_in_; + // Include at this point the factor of 1.0 / samp_rate_in_ which + // appears in the math. + weights_[i](j) = FilterFunc(delta_t) / samp_rate_in_; + } + } +} + +/** Here, t is a time in seconds representing an offset from + the center of the windowed filter function, and FilterFunction(t) + returns the windowed filter function, described + in the header as h(t) = f(t)g(t), evaluated at t. +*/ +BaseFloat ArbitraryResample::FilterFunc(BaseFloat t) const { + BaseFloat window, // raised-cosine (Hanning) window of width + // num_zeros_/2*filter_cutoff_ + filter; // sinc filter function + if (fabs(t) < num_zeros_ / (2.0 * filter_cutoff_)) + window = 0.5 * (1 + cos(M_2PI * filter_cutoff_ / num_zeros_ * t)); + else + window = 0.0; // outside support of window function + if (t != 0.0) + filter = sin(M_2PI * filter_cutoff_ * t) / (M_PI * t); + else + filter = 2.0 * filter_cutoff_; // limit of the function at zero. 
+ return filter * window; +} + +void ResampleWaveform(BaseFloat orig_freq, const VectorBase &wave, + BaseFloat new_freq, Vector *new_wave) { + BaseFloat min_freq = std::min(orig_freq, new_freq); + BaseFloat lowpass_cutoff = 0.99 * 0.5 * min_freq; + int32 lowpass_filter_width = 6; + LinearResample resampler(orig_freq, new_freq, + lowpass_cutoff, lowpass_filter_width); + resampler.Resample(wave, true, new_wave); +} +} // namespace kaldi diff --git a/speechx/speechx/kaldi/feat/resample.h b/speechx/speechx/kaldi/feat/resample.h new file mode 100644 index 00000000..e0b4688c --- /dev/null +++ b/speechx/speechx/kaldi/feat/resample.h @@ -0,0 +1,287 @@ +// feat/resample.h + +// Copyright 2013 Pegah Ghahremani +// 2014 IMSL, PKU-HKUST (author: Wei Shi) +// 2014 Yanqing Sun, Junjie Wang +// 2014 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_FEAT_RESAMPLE_H_ +#define KALDI_FEAT_RESAMPLE_H_ + +#include +#include +#include +#include + + +#include "matrix/matrix-lib.h" +#include "util/common-utils.h" +#include "base/kaldi-error.h" + +namespace kaldi { +/// @addtogroup feat FeatureExtraction +/// @{ + +/** + \file[resample.h] + + This header contains declarations of classes for resampling signals. 
The + normal cases of resampling a signal are upsampling and downsampling + (increasing and decreasing the sample rate of a signal, respectively), + although the ArbitraryResample class allows a more generic case where + we want to get samples of a signal at uneven intervals (for instance, + log-spaced). + + The input signal is always evenly spaced, say sampled with frequency S, and + we assume the original signal was band-limited to S/2 or lower. The n'th + input sample x_n (with n = 0, 1, ...) is interpreted as the original + signal's value at time n/S. + + For resampling, it is convenient to view the input signal as a + continuous function x(t) of t, where each sample x_n becomes a delta function + with magnitude x_n/S, at time n/S. If we band limit this to the Nyquist + frequency S/2, we can show that this is the same as the original signal + that was sampled. [assuming the original signal was periodic and band + limited.] In general we want to bandlimit to lower than S/2, because + we don't have a perfect filter and also because if we want to resample + at a lower frequency than S, we need to bandlimit to below half of that. + Anyway, suppose we want to bandlimit to C, with 0 < C < S/2. The perfect + rectangular filter with cutoff C is the sinc function, + \f[ f(t) = 2C sinc(2Ct), \f] + where sinc is the normalized sinc function \f$ sinc(t) = sin(pi t) / (pi t) \f$, with + \f$ sinc(0) = 1 \f$. This is not a practical filter, though, because it has + infinite support. At the cost of less-than-perfect rolloff, we can choose + a suitable windowing function g(t), and use f(t) g(t) as the filter. For + a windowing function we choose raised-cosine (Hanning) window with support + on [-w/2C, w/2C], where w >= 2 is an integer chosen by the user. w = 1 + means we window the sinc function out to its first zero on the left and right, + w = 2 means the second zero, and so on; we normally choose w to be at least two. + We call this num_zeros, not w, in the code. 
+ + Convolving the signal x(t) with this windowed filter h(t) = f(t)g(t) and evaluating the resulting + signal s(t) at an arbitrary time t is easy: we have + \f[ s(t) = 1/S \sum_n x_n h(t - n/S) \f]. + (note: the sign of t - n/S might be wrong, but it doesn't matter as the filter + and window are symmetric). + This is true for arbitrary values of t. What the class ArbitraryResample does + is to allow you to evaluate the signal for specified values of t. +*/ + + +/** + Class ArbitraryResample allows you to resample a signal (assumed zero outside + the sample region, not periodic) at arbitrary specified time values, which + don't have to be linearly spaced. The low-pass filter cutoff + "filter_cutoff_hz" should be less than half the sample rate; + "num_zeros" should probably be at least two preferably more; higher numbers give + sharper filters but will be less efficient. +*/ +class ArbitraryResample { + public: + ArbitraryResample(int32 num_samples_in, + BaseFloat samp_rate_hz, + BaseFloat filter_cutoff_hz, + const Vector &sample_points_secs, + int32 num_zeros); + + int32 NumSamplesIn() const { return num_samples_in_; } + + int32 NumSamplesOut() const { return weights_.size(); } + + /// This function does the resampling. + /// input.NumRows() and output.NumRows() should be equal + /// and nonzero. + /// input.NumCols() should equal NumSamplesIn() + /// and output.NumCols() should equal NumSamplesOut(). + void Resample(const MatrixBase &input, + MatrixBase *output) const; + + /// This version of the Resample function processes just + /// one vector. 
+ void Resample(const VectorBase &input, + VectorBase *output) const; + private: + void SetIndexes(const Vector &sample_points); + + void SetWeights(const Vector &sample_points); + + BaseFloat FilterFunc(BaseFloat t) const; + + int32 num_samples_in_; + BaseFloat samp_rate_in_; + BaseFloat filter_cutoff_; + int32 num_zeros_; + + std::vector first_index_; // The first input-sample index that we sum + // over, for this output-sample index. + std::vector > weights_; +}; + + +/** + LinearResample is a special case of ArbitraryResample, where we want to + resample a signal at linearly spaced intervals (this means we want to + upsample or downsample the signal). It is more efficient than + ArbitraryResample because we can construct it just once. + + We require that the input and output sampling rate be specified as + integers, as this is an easy way to specify that their ratio be rational. +*/ + +class LinearResample { + public: + /// Constructor. We make the input and output sample rates integers, because + /// we are going to need to find a common divisor. This should just remind + /// you that they need to be integers. The filter cutoff needs to be less + /// than samp_rate_in_hz/2 and less than samp_rate_out_hz/2. num_zeros + /// controls the sharpness of the filter, more == sharper but less efficient. + /// We suggest around 4 to 10 for normal use. + LinearResample(int32 samp_rate_in_hz, + int32 samp_rate_out_hz, + BaseFloat filter_cutoff_hz, + int32 num_zeros); + + /// This function does the resampling. If you call it with flush == true and + /// you have never called it with flush == false, it just resamples the input + /// signal (it resizes the output to a suitable number of samples). + /// + /// You can also use this function to process a signal a piece at a time. + /// suppose you break it into piece1, piece2, ... pieceN. 
You can call + /// \code{.cc} + /// Resample(piece1, &output1, false); + /// Resample(piece2, &output2, false); + /// Resample(piece3, &output3, true); + /// \endcode + /// If you call it with flush == false, it won't output the last few samples + /// but will remember them, so that if you later give it a second piece of + /// the input signal it can process it correctly. + /// If your most recent call to the object was with flush == false, it will + /// have internal state; you can remove this by calling Reset(). + /// Empty input is acceptable. + void Resample(const VectorBase &input, + bool flush, + Vector *output); + + /// Calling the function Reset() resets the state of the object prior to + /// processing a new signal; it is only necessary if you have called + /// Resample(x, y, false) for some signal, leading to a remainder of the + /// signal being called, but then abandon processing the signal before calling + /// Resample(x, y, true) for the last piece. Call it unnecessarily between + /// signals will not do any harm. + void Reset(); + + //// Return the input and output sampling rates (for checks, for example) + inline int32 GetInputSamplingRate() { return samp_rate_in_; } + inline int32 GetOutputSamplingRate() { return samp_rate_out_; } + private: + /// This function outputs the number of output samples we will output + /// for a signal with "input_num_samp" input samples. If flush == true, + /// we return the largest n such that + /// (n/samp_rate_out_) is in the interval [ 0, input_num_samp/samp_rate_in_ ), + /// and note that the interval is half-open. If flush == false, + /// define window_width as num_zeros / (2.0 * filter_cutoff_); + /// we return the largest n such that (n/samp_rate_out_) is in the interval + /// [ 0, input_num_samp/samp_rate_in_ - window_width ). 
+ int64 GetNumOutputSamples(int64 input_num_samp, bool flush) const; + + + /// Given an output-sample index, this function outputs to *first_samp_in the + /// first input-sample index that we have a weight on (may be negative), + /// and to *samp_out_wrapped the index into weights_ where we can get the + /// corresponding weights on the input. + inline void GetIndexes(int64 samp_out, + int64 *first_samp_in, + int32 *samp_out_wrapped) const; + + void SetRemainder(const VectorBase &input); + + void SetIndexesAndWeights(); + + BaseFloat FilterFunc(BaseFloat) const; + + // The following variables are provided by the user. + int32 samp_rate_in_; + int32 samp_rate_out_; + BaseFloat filter_cutoff_; + int32 num_zeros_; + + int32 input_samples_in_unit_; ///< The number of input samples in the + ///< smallest repeating unit: num_samp_in_ = + ///< samp_rate_in_hz / Gcd(samp_rate_in_hz, + ///< samp_rate_out_hz) + int32 output_samples_in_unit_; ///< The number of output samples in the + ///< smallest repeating unit: num_samp_out_ = + ///< samp_rate_out_hz / Gcd(samp_rate_in_hz, + ///< samp_rate_out_hz) + + + /// The first input-sample index that we sum over, for this output-sample + /// index. May be negative; any truncation at the beginning is handled + /// separately. This is just for the first few output samples, but we can + /// extrapolate the correct input-sample index for arbitrary output samples. + std::vector first_index_; + + /// Weights on the input samples, for this output-sample index. + std::vector > weights_; + + // the following variables keep track of where we are in a particular signal, + // if it is being provided over multiple calls to Resample(). + + int64 input_sample_offset_; ///< The number of input samples we have + ///< already received for this signal + ///< (including anything in remainder_) + int64 output_sample_offset_; ///< The number of samples we have already + ///< output for this signal. 
+ Vector input_remainder_; ///< A small trailing part of the + ///< previously seen input signal. +}; + +/** + Downsample or upsample a waveform. This is a convenience wrapper for the + class 'LinearResample'. + The low-pass filter cutoff used in 'LinearResample' is 0.99 of the Nyquist, + where the Nyquist is half of the minimum of (orig_freq, new_freq). The + resampling is done with a symmetric FIR filter with N_z (number of zeros) + as 6. + + We compared the downsampling results with those from the sox resampling + toolkit. + Sox's design is inspired by Laurent De Soras' paper, + https://ccrma.stanford.edu/~jos/resample/Implementation.html + + Note: we expect that while orig_freq and new_freq are of type BaseFloat, they + are actually required to have exact integer values (like 16000 or 8000) with + a ratio between them that can be expressed as a rational number with + reasonably small integer factors. +*/ +void ResampleWaveform(BaseFloat orig_freq, const VectorBase &wave, + BaseFloat new_freq, Vector *new_wave); + + +/// This function is deprecated. It is provided for backward compatibility, to avoid +/// breaking older code. +inline void DownsampleWaveForm(BaseFloat orig_freq, const VectorBase &wave, + BaseFloat new_freq, Vector *new_wave) { + ResampleWaveform(orig_freq, wave, new_freq, new_wave); +} + + +/// @} End of "addtogroup feat" +} // namespace kaldi +#endif // KALDI_FEAT_RESAMPLE_H_ diff --git a/speechx/speechx/kaldi/feat/signal.cc b/speechx/speechx/kaldi/feat/signal.cc new file mode 100644 index 00000000..a206d399 --- /dev/null +++ b/speechx/speechx/kaldi/feat/signal.cc @@ -0,0 +1,129 @@ +// feat/signal.cc + +// Copyright 2015 Tom Ko + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "feat/signal.h" + +namespace kaldi { + +void ElementwiseProductOfFft(const Vector &a, Vector *b) { + int32 num_fft_bins = a.Dim() / 2; + for (int32 i = 0; i < num_fft_bins; i++) { + // do complex multiplication + ComplexMul(a(2*i), a(2*i + 1), &((*b)(2*i)), &((*b)(2*i + 1))); + } +} + +void ConvolveSignals(const Vector &filter, Vector *signal) { + int32 signal_length = signal->Dim(); + int32 filter_length = filter.Dim(); + int32 output_length = signal_length + filter_length - 1; + Vector signal_padded(output_length); + signal_padded.SetZero(); + for (int32 i = 0; i < signal_length; i++) { + for (int32 j = 0; j < filter_length; j++) { + signal_padded(i + j) += (*signal)(i) * filter(j); + } + } + signal->Resize(output_length); + signal->CopyFromVec(signal_padded); +} + + +void FFTbasedConvolveSignals(const Vector &filter, Vector *signal) { + int32 signal_length = signal->Dim(); + int32 filter_length = filter.Dim(); + int32 output_length = signal_length + filter_length - 1; + + int32 fft_length = RoundUpToNearestPowerOfTwo(output_length); + KALDI_VLOG(1) << "fft_length for full signal convolution is " << fft_length; + + SplitRadixRealFft srfft(fft_length); + + Vector filter_padded(fft_length); + filter_padded.Range(0, filter_length).CopyFromVec(filter); + srfft.Compute(filter_padded.Data(), true); + + Vector signal_padded(fft_length); + signal_padded.Range(0, signal_length).CopyFromVec(*signal); + 
srfft.Compute(signal_padded.Data(), true); + + ElementwiseProductOfFft(filter_padded, &signal_padded); + + srfft.Compute(signal_padded.Data(), false); + signal_padded.Scale(1.0 / fft_length); + + signal->Resize(output_length); + signal->CopyFromVec(signal_padded.Range(0, output_length)); +} + +void FFTbasedBlockConvolveSignals(const Vector &filter, Vector *signal) { + int32 signal_length = signal->Dim(); + int32 filter_length = filter.Dim(); + int32 output_length = signal_length + filter_length - 1; + signal->Resize(output_length, kCopyData); + + KALDI_VLOG(1) << "Length of the filter is " << filter_length; + + int32 fft_length = RoundUpToNearestPowerOfTwo(4 * filter_length); + KALDI_VLOG(1) << "Best FFT length is " << fft_length; + + int32 block_length = fft_length - filter_length + 1; + KALDI_VLOG(1) << "Block size is " << block_length; + SplitRadixRealFft srfft(fft_length); + + Vector filter_padded(fft_length); + filter_padded.Range(0, filter_length).CopyFromVec(filter); + srfft.Compute(filter_padded.Data(), true); + + Vector temp_pad(filter_length - 1); + temp_pad.SetZero(); + Vector signal_block_padded(fft_length); + + for (int32 po = 0; po < output_length; po += block_length) { + // get a block of the signal + int32 process_length = std::min(block_length, output_length - po); + signal_block_padded.SetZero(); + signal_block_padded.Range(0, process_length).CopyFromVec(signal->Range(po, process_length)); + + srfft.Compute(signal_block_padded.Data(), true); + + ElementwiseProductOfFft(filter_padded, &signal_block_padded); + + srfft.Compute(signal_block_padded.Data(), false); + signal_block_padded.Scale(1.0 / fft_length); + + // combine the block + if (po + block_length < output_length) { // current block is not the last block + signal->Range(po, block_length).CopyFromVec(signal_block_padded.Range(0, block_length)); + signal->Range(po, filter_length - 1).AddVec(1.0, temp_pad); + temp_pad.CopyFromVec(signal_block_padded.Range(block_length, filter_length - 1)); + } 
else { + signal->Range(po, output_length - po).CopyFromVec( + signal_block_padded.Range(0, output_length - po)); + if (filter_length - 1 < output_length - po) + signal->Range(po, filter_length - 1).AddVec(1.0, temp_pad); + else + signal->Range(po, output_length - po).AddVec(1.0, temp_pad.Range(0, output_length - po)); + } + } +} +} + diff --git a/speechx/speechx/kaldi/feat/signal.h b/speechx/speechx/kaldi/feat/signal.h new file mode 100644 index 00000000..c6c3eb50 --- /dev/null +++ b/speechx/speechx/kaldi/feat/signal.h @@ -0,0 +1,58 @@ +// feat/signal.h + +// Copyright 2015 Tom Ko + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_FEAT_SIGNAL_H_ +#define KALDI_FEAT_SIGNAL_H_ + +#include "base/kaldi-common.h" +#include "util/common-utils.h" + +namespace kaldi { + +/* + The following three functions are having the same functionality but + different implementations so as the efficiency. After the convolution, + the length of the signal will be extended to (original signal length + + filter length - 1). +*/ + +/* + This function implements a simple non-FFT-based convolution of two signals. + It is suggested to use the FFT-based convolution function which is more + efficient. +*/ +void ConvolveSignals(const Vector &filter, Vector *signal); + +/* + This function implements FFT-based convolution of two signals. 
+ However, this is a less efficient version of FFTbasedBlockConvolveSignals() + as it processes the entire signal with a single FFT. +*/ +void FFTbasedConvolveSignals(const Vector &filter, Vector *signal); + +/* + This function implements FFT-based block convolution of two signals using + the overlap-add method. This is an efficient way to evaluate the discrete + convolution of a long signal with a finite impulse response filter. +*/ +void FFTbasedBlockConvolveSignals(const Vector &filter, Vector *signal); + +} // namespace kaldi + +#endif // KALDI_FEAT_SIGNAL_H_ diff --git a/speechx/speechx/kaldi/feat/wave-reader.cc b/speechx/speechx/kaldi/feat/wave-reader.cc new file mode 100644 index 00000000..f8259a3a --- /dev/null +++ b/speechx/speechx/kaldi/feat/wave-reader.cc @@ -0,0 +1,387 @@ +// feat/wave-reader.cc + +// Copyright 2009-2011 Karel Vesely; Petr Motlicek +// 2013 Florent Masson +// 2013 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include + +#include "feat/wave-reader.h" +#include "base/kaldi-error.h" +#include "base/kaldi-utils.h" + +namespace kaldi { + +// A utility class for reading wave header.
+struct WaveHeaderReadGofer { + std::istream &is; + bool swap; + char tag[5]; + + WaveHeaderReadGofer(std::istream &is) : is(is), swap(false) { + memset(tag, '\0', sizeof tag); + } + + void Expect4ByteTag(const char *expected) { + is.read(tag, 4); + if (is.fail()) + KALDI_ERR << "WaveData: expected " << expected + << ", failed to read anything"; + if (strcmp(tag, expected)) + KALDI_ERR << "WaveData: expected " << expected << ", got " << tag; + } + + void Read4ByteTag() { + is.read(tag, 4); + if (is.fail()) + KALDI_ERR << "WaveData: expected 4-byte chunk-name, got read error"; + } + + uint32 ReadUint32() { + union { + char result[4]; + uint32 ans; + } u; + is.read(u.result, 4); + if (swap) + KALDI_SWAP4(u.result); + if (is.fail()) + KALDI_ERR << "WaveData: unexpected end of file or read error"; + return u.ans; + } + + uint16 ReadUint16() { + union { + char result[2]; + int16 ans; + } u; + is.read(u.result, 2); + if (swap) + KALDI_SWAP2(u.result); + if (is.fail()) + KALDI_ERR << "WaveData: unexpected end of file or read error"; + return u.ans; + } +}; + +static void WriteUint32(std::ostream &os, int32 i) { + union { + char buf[4]; + int i; + } u; + u.i = i; +#ifdef __BIG_ENDIAN__ + KALDI_SWAP4(u.buf); +#endif + os.write(u.buf, 4); + if (os.fail()) + KALDI_ERR << "WaveData: error writing to stream."; +} + +static void WriteUint16(std::ostream &os, int16 i) { + union { + char buf[2]; + int16 i; + } u; + u.i = i; +#ifdef __BIG_ENDIAN__ + KALDI_SWAP2(u.buf); +#endif + os.write(u.buf, 2); + if (os.fail()) + KALDI_ERR << "WaveData: error writing to stream."; +} + +void WaveInfo::Read(std::istream &is) { + WaveHeaderReadGofer reader(is); + reader.Read4ByteTag(); + if (strcmp(reader.tag, "RIFF") == 0) + reverse_bytes_ = false; + else if (strcmp(reader.tag, "RIFX") == 0) + reverse_bytes_ = true; + else + KALDI_ERR << "WaveData: expected RIFF or RIFX, got " << reader.tag; + +#ifdef __BIG_ENDIAN__ + reverse_bytes_ = !reverse_bytes_; +#endif + reader.swap = reverse_bytes_; + + 
uint32 riff_chunk_size = reader.ReadUint32(); + reader.Expect4ByteTag("WAVE"); + + uint32 riff_chunk_read = 0; + riff_chunk_read += 4; // WAVE included in riff_chunk_size. + + // Possibly skip any RIFF tags between 'WAVE' and 'fmt '. + // Apple devices produce a filler tag 'JUNK' for memory alignment. + reader.Read4ByteTag(); + riff_chunk_read += 4; + while (strcmp(reader.tag,"fmt ") != 0) { + uint32 filler_size = reader.ReadUint32(); + riff_chunk_read += 4; + for (uint32 i = 0; i < filler_size; i++) { + is.get(); // read 1 byte, + } + riff_chunk_read += filler_size; + // get next RIFF tag, + reader.Read4ByteTag(); + riff_chunk_read += 4; + } + + KALDI_ASSERT(strcmp(reader.tag,"fmt ") == 0); + uint32 subchunk1_size = reader.ReadUint32(); + uint16 audio_format = reader.ReadUint16(); + num_channels_ = reader.ReadUint16(); + uint32 sample_rate = reader.ReadUint32(), + byte_rate = reader.ReadUint32(), + block_align = reader.ReadUint16(), + bits_per_sample = reader.ReadUint16(); + samp_freq_ = static_cast(sample_rate); + + uint32 fmt_chunk_read = 16; + if (audio_format == 1) { + if (subchunk1_size < 16) { + KALDI_ERR << "WaveData: expect PCM format data to have fmt chunk " + << "of at least size 16."; + } + } else if (audio_format == 0xFFFE) { // WAVE_FORMAT_EXTENSIBLE + uint16 extra_size = reader.ReadUint16(); + if (subchunk1_size < 40 || extra_size < 22) { + KALDI_ERR << "WaveData: malformed WAVE_FORMAT_EXTENSIBLE format data."; + } + reader.ReadUint16(); // Unused for PCM. + reader.ReadUint32(); // Channel map: we do not care. + uint32 guid1 = reader.ReadUint32(), + guid2 = reader.ReadUint32(), + guid3 = reader.ReadUint32(), + guid4 = reader.ReadUint32(); + fmt_chunk_read = 40; + + // Support only KSDATAFORMAT_SUBTYPE_PCM for now. 
Interesting formats: + // ("00000001-0000-0010-8000-00aa00389b71", KSDATAFORMAT_SUBTYPE_PCM) + // ("00000003-0000-0010-8000-00aa00389b71", KSDATAFORMAT_SUBTYPE_IEEE_FLOAT) + // ("00000006-0000-0010-8000-00aa00389b71", KSDATAFORMAT_SUBTYPE_ALAW) + // ("00000007-0000-0010-8000-00aa00389b71", KSDATAFORMAT_SUBTYPE_MULAW) + if (guid1 != 0x00000001 || guid2 != 0x00100000 || + guid3 != 0xAA000080 || guid4 != 0x719B3800) { + KALDI_ERR << "WaveData: unsupported WAVE_FORMAT_EXTENSIBLE format."; + } + } else { + KALDI_ERR << "WaveData: can read only PCM data, format id in file is: " + << audio_format; + } + + for (uint32 i = fmt_chunk_read; i < subchunk1_size; ++i) + is.get(); // use up extra data. + + if (num_channels_ == 0) + KALDI_ERR << "WaveData: no channels present"; + if (bits_per_sample != 16) + KALDI_ERR << "WaveData: unsupported bits_per_sample = " << bits_per_sample; + if (byte_rate != sample_rate * bits_per_sample/8 * num_channels_) + KALDI_ERR << "Unexpected byte rate " << byte_rate << " vs. " + << sample_rate << " * " << (bits_per_sample/8) + << " * " << num_channels_; + if (block_align != num_channels_ * bits_per_sample/8) + KALDI_ERR << "Unexpected block_align: " << block_align << " vs. " + << num_channels_ << " * " << (bits_per_sample/8); + + riff_chunk_read += 4 + subchunk1_size; + // size of what we just read, 4 for subchunk1_size + subchunk1_size itself. + + // We support an optional "fact" chunk (which is useless but which + // we encountered), and then a single "data" chunk. + + reader.Read4ByteTag(); + riff_chunk_read += 4; + + // Skip any subchunks between "fmt" and "data". Usually there will + // be a single "fact" subchunk, but on Windows there can also be a + // "list" subchunk. + while (strcmp(reader.tag, "data") != 0) { + // We will just ignore the data in these chunks. 
+ uint32 chunk_sz = reader.ReadUint32(); + if (chunk_sz != 4 && strcmp(reader.tag, "fact") == 0) + KALDI_WARN << "Expected fact chunk to be 4 bytes long."; + for (uint32 i = 0; i < chunk_sz; i++) + is.get(); + riff_chunk_read += 4 + chunk_sz; // for chunk_sz (4) + chunk contents (chunk-sz) + + // Now read the next chunk name. + reader.Read4ByteTag(); + riff_chunk_read += 4; + } + + KALDI_ASSERT(strcmp(reader.tag, "data") == 0); + uint32 data_chunk_size = reader.ReadUint32(); + riff_chunk_read += 4; + + // Figure out if the file is going to be read to the end. Values as + // observed in the wild: + bool is_stream_mode = + riff_chunk_size == 0 + || riff_chunk_size == 0xFFFFFFFF + || data_chunk_size == 0 + || data_chunk_size == 0xFFFFFFFF + || data_chunk_size == 0x7FFFF000; // This value is used by SoX. + + if (is_stream_mode) + KALDI_VLOG(1) << "Read in RIFF chunk size: " << riff_chunk_size + << ", data chunk size: " << data_chunk_size + << ". Assume 'stream mode' (reading data to EOF)."; + + if (!is_stream_mode + && std::abs(static_cast(riff_chunk_read) + + static_cast(data_chunk_size) - + static_cast(riff_chunk_size)) > 1) { + // We allow the size to be off by one without warning, because there is a + // weirdness in the format of RIFF files that means that the input may + // sometimes be padded with 1 unused byte to make the total size even. + KALDI_WARN << "Expected " << riff_chunk_size << " bytes in RIFF chunk, but " + << "after first data block there will be " << riff_chunk_read + << " + " << data_chunk_size << " bytes " + << "(we do not support reading multiple data chunks)."; + } + + if (is_stream_mode) + samp_count_ = -1; + else + samp_count_ = data_chunk_size / block_align; +} + +void WaveData::Read(std::istream &is) { + const uint32 kBlockSize = 1024 * 1024; + + WaveInfo header; + header.Read(is); + + data_.Resize(0, 0); // clear the data. + samp_freq_ = header.SampFreq(); + + std::vector buffer; + uint32 bytes_to_go = header.IsStreamed() ? 
kBlockSize : header.DataBytes(); + + // Once in a while header.DataBytes() will report an insane value; + // read the file to the end + while (is && bytes_to_go > 0) { + uint32 block_bytes = std::min(bytes_to_go, kBlockSize); + uint32 offset = buffer.size(); + buffer.resize(offset + block_bytes); + is.read(&buffer[offset], block_bytes); + uint32 bytes_read = is.gcount(); + buffer.resize(offset + bytes_read); + if (!header.IsStreamed()) + bytes_to_go -= bytes_read; + } + + if (is.bad()) + KALDI_ERR << "WaveData: file read error"; + + if (buffer.size() == 0) + KALDI_ERR << "WaveData: empty file (no data)"; + + if (!header.IsStreamed() && buffer.size() < header.DataBytes()) { + KALDI_WARN << "Expected " << header.DataBytes() << " bytes of wave data, " + << "but read only " << buffer.size() << " bytes. " + << "Truncated file?"; + } + + uint16 *data_ptr = reinterpret_cast(&buffer[0]); + + // The matrix is arranged row per channel, column per sample. + data_.Resize(header.NumChannels(), + buffer.size() / header.BlockAlign()); + for (uint32 i = 0; i < data_.NumCols(); ++i) { + for (uint32 j = 0; j < data_.NumRows(); ++j) { + int16 k = *data_ptr++; + if (header.ReverseBytes()) + KALDI_SWAP2(k); + data_(j, i) = k; + } + } +} + + +// Write 16-bit PCM. + +// note: the WAVE chunk contains 2 subchunks. +// +// subchunk2size = data.NumRows() * data.NumCols() * 2. 
+ + +void WaveData::Write(std::ostream &os) const { + os << "RIFF"; + if (data_.NumRows() == 0) + KALDI_ERR << "Error: attempting to write empty WAVE file"; + + int32 num_chan = data_.NumRows(), + num_samp = data_.NumCols(), + bytes_per_samp = 2; + + int32 subchunk2size = (num_chan * num_samp * bytes_per_samp); + int32 chunk_size = 36 + subchunk2size; + WriteUint32(os, chunk_size); + os << "WAVE"; + os << "fmt "; + WriteUint32(os, 16); + WriteUint16(os, 1); + WriteUint16(os, num_chan); + KALDI_ASSERT(samp_freq_ > 0); + WriteUint32(os, static_cast(samp_freq_)); + WriteUint32(os, static_cast(samp_freq_) * num_chan * bytes_per_samp); + WriteUint16(os, num_chan * bytes_per_samp); + WriteUint16(os, 8 * bytes_per_samp); + os << "data"; + WriteUint32(os, subchunk2size); + + const BaseFloat *data_ptr = data_.Data(); + int32 stride = data_.Stride(); + + int num_clipped = 0; + for (int32 i = 0; i < num_samp; i++) { + for (int32 j = 0; j < num_chan; j++) { + int32 elem = static_cast(trunc(data_ptr[j * stride + i])); + int16 elem_16 = static_cast(elem); + if (elem < std::numeric_limits::min()) { + elem_16 = std::numeric_limits::min(); + ++num_clipped; + } else if (elem > std::numeric_limits::max()) { + elem_16 = std::numeric_limits::max(); + ++num_clipped; + } +#ifdef __BIG_ENDIAN__ + KALDI_SWAP2(elem_16); +#endif + os.write(reinterpret_cast(&elem_16), 2); + } + } + if (os.fail()) + KALDI_ERR << "Error writing wave data to stream."; + if (num_clipped > 0) + KALDI_WARN << "WARNING: clipped " << num_clipped + << " samples out of total " << num_chan * num_samp + << ". 
Reduce volume?"; +} + + +} // end namespace kaldi diff --git a/speechx/speechx/kaldi/feat/wave-reader.h b/speechx/speechx/kaldi/feat/wave-reader.h new file mode 100644 index 00000000..dae74139 --- /dev/null +++ b/speechx/speechx/kaldi/feat/wave-reader.h @@ -0,0 +1,248 @@ +// feat/wave-reader.h + +// Copyright 2009-2011 Karel Vesely; Microsoft Corporation +// 2013 Florent Masson +// 2013 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +/* +// THE WAVE FORMAT IS SPECIFIED IN: +// https://ccrma.stanford.edu/courses/422/projects/WaveFormat/ +// +// +// +// RIFF +// | +// WAVE +// | \ \ \ +// fmt_ data ... data +// +// +// Riff is a general container, which usually contains one WAVE chunk +// each WAVE chunk has header sub-chunk 'fmt_' +// and one or more data sub-chunks 'data' +// +// [Note from Dan: to say that the wave format was ever "specified" anywhere is +// not quite right. The guy who invented the wave format attempted to create +// a formal specification but it did not completely make sense. And there +// doesn't seem to be a consensus on what makes a valid wave file, +// particularly where the accuracy of header information is concerned.]
+*/ + + +#ifndef KALDI_FEAT_WAVE_READER_H_ +#define KALDI_FEAT_WAVE_READER_H_ + +#include + +#include "base/kaldi-types.h" +#include "matrix/kaldi-vector.h" +#include "matrix/kaldi-matrix.h" + + +namespace kaldi { + +/// For historical reasons, we scale waveforms to the range +/// (2^15-1)*[-1, 1], not the usual default DSP range [-1, 1]. +const BaseFloat kWaveSampleMax = 32768.0; + +/// This class reads and holds wave file header information. +class WaveInfo { + public: + WaveInfo() : samp_freq_(0), samp_count_(0), + num_channels_(0), reverse_bytes_(0) {} + + /// Is stream size unknown? Duration and SampleCount not valid if true. + bool IsStreamed() const { return samp_count_ < 0; } + + /// Sample frequency, Hz. + BaseFloat SampFreq() const { return samp_freq_; } + + /// Number of samples in stream. Invalid if IsStreamed() is true. + uint32 SampleCount() const { return samp_count_; } + + /// Approximate duration, seconds. Invalid if IsStreamed() is true. + BaseFloat Duration() const { return samp_count_ / samp_freq_; } + + /// Number of channels, 1 to 16. + int32 NumChannels() const { return num_channels_; } + + /// Bytes per sample frame, i.e. 2 bytes for each channel. + size_t BlockAlign() const { return 2 * num_channels_; } + + /// Wave data bytes. Invalid if IsStreamed() is true. + size_t DataBytes() const { return samp_count_ * BlockAlign(); } + + /// Is data file byte order different from machine byte order? + bool ReverseBytes() const { return reverse_bytes_; } + + /// 'is' should be opened in binary mode. Read() will throw on error. + /// On success 'is' will be positioned at the beginning of wave data. + void Read(std::istream &is); + + private: + BaseFloat samp_freq_; + int32 samp_count_; // 0 if empty, -1 if undefined length. + uint8 num_channels_; + bool reverse_bytes_; // File endianness differs from host. +}; + +/// This class's purpose is to read in Wave files.
+class WaveData { + public: + WaveData(BaseFloat samp_freq, const MatrixBase &data) + : data_(data), samp_freq_(samp_freq) {} + + WaveData() : samp_freq_(0.0) {} + + /// Read() will throw on error. It's valid to call Read() more than once-- + /// in this case it will destroy what was there before. + /// "is" should be opened in binary mode. + void Read(std::istream &is); + + /// Write() will throw on error. os should be opened in binary mode. + void Write(std::ostream &os) const; + + // This function returns the wave data-- it's in a matrix + // because there may be multiple channels. In the normal case + // there's just one channel so Data() will have one row. + const Matrix &Data() const { return data_; } + + BaseFloat SampFreq() const { return samp_freq_; } + + // Returns the duration in seconds + BaseFloat Duration() const { return data_.NumCols() / samp_freq_; } + + void CopyFrom(const WaveData &other) { + samp_freq_ = other.samp_freq_; + data_.CopyFromMat(other.data_); + } + + void Clear() { + data_.Resize(0, 0); + samp_freq_ = 0.0; + } + + void Swap(WaveData *other) { + data_.Swap(&(other->data_)); + std::swap(samp_freq_, other->samp_freq_); + } + + private: + static const uint32 kBlockSize = 1024 * 1024; // Use 1M bytes. + Matrix data_; + BaseFloat samp_freq_; +}; + + +// Holder class for .wav files that enables us to read (but not write) .wav +// files. c.f. util/kaldi-holder.h we don't use the KaldiObjectHolder template +// because we don't want to check for the \0B binary header. We could have faked +// it by pretending to read in the wave data in text mode after failing to find +// the \0B header, but that would have been a little ugly. +class WaveHolder { + public: + typedef WaveData T; + + static bool Write(std::ostream &os, bool binary, const T &t) { + // We don't write the binary-mode header here [always binary]. + if (!binary) + KALDI_ERR << "Wave data can only be written in binary mode."; + try { + t.Write(os); // throws exception on failure. 
+ return true; + } catch (const std::exception &e) { + KALDI_WARN << "Exception caught in WaveHolder object (writing). " + << e.what(); + return false; // write failure. + } + } + void Copy(const T &t) { t_.CopyFrom(t); } + + static bool IsReadInBinary() { return true; } + + void Clear() { t_.Clear(); } + + T &Value() { return t_; } + + WaveHolder &operator = (const WaveHolder &other) { + t_.CopyFrom(other.t_); + return *this; + } + WaveHolder(const WaveHolder &other): t_(other.t_) {} + + WaveHolder() {} + + bool Read(std::istream &is) { + // We don't look for the binary-mode header here [always binary] + try { + t_.Read(is); // Throws exception on failure. + return true; + } catch (const std::exception &e) { + KALDI_WARN << "Exception caught in WaveHolder::Read(). " << e.what(); + return false; + } + } + + void Swap(WaveHolder *other) { + t_.Swap(&(other->t_)); + } + + bool ExtractRange(const WaveHolder &other, const std::string &range) { + KALDI_ERR << "ExtractRange is not defined for this type of holder."; + return false; + } + + private: + T t_; +}; + +// This is like WaveHolder but when you just want the metadata- +// it leaves the actual data undefined, it doesn't read it. +class WaveInfoHolder { + public: + typedef WaveInfo T; + + void Clear() { info_ = WaveInfo(); } + void Swap(WaveInfoHolder *other) { std::swap(info_, other->info_); } + T &Value() { return info_; } + static bool IsReadInBinary() { return true; } + + bool Read(std::istream &is) { + try { + info_.Read(is); // Throws exception on failure. + return true; + } catch (const std::exception &e) { + KALDI_WARN << "Exception caught in WaveInfoHolder::Read(). 
" << e.what(); + return false; + } + } + + bool ExtractRange(const WaveInfoHolder &other, const std::string &range) { + KALDI_ERR << "ExtractRange is not defined for this type of holder."; + return false; + } + + private: + WaveInfo info_; +}; + + +} // namespace kaldi + +#endif // KALDI_FEAT_WAVE_READER_H_ diff --git a/speechx/speechx/kaldi/matrix/BUILD b/speechx/speechx/kaldi/matrix/BUILD new file mode 100644 index 00000000..cefac6fc --- /dev/null +++ b/speechx/speechx/kaldi/matrix/BUILD @@ -0,0 +1,39 @@ +# Copyright (c) 2020 PeachLab. All Rights Reserved. +# Author : goat.zhou@qq.com (Yang Zhou) + +package(default_visibility = ["//visibility:public"]) + +cc_library( + name = 'kaldi-matrix', + srcs = [ + 'compressed-matrix.cc', + 'kaldi-matrix.cc', + 'kaldi-vector.cc', + 'matrix-functions.cc', + 'optimization.cc', + 'packed-matrix.cc', + 'qr.cc', + 'sparse-matrix.cc', + 'sp-matrix.cc', + 'srfft.cc', + 'tp-matrix.cc', + ], + hdrs = glob(["*.h"]), + deps = [ + '//base:kaldi-base', + '//common/third_party/openblas:openblas', + ], + linkopts=['-lgfortran'], +) + +cc_binary( + name = 'matrix-lib-test', + srcs = [ + 'matrix-lib-test.cc', + ], + deps = [ + ':kaldi-matrix', + '//util:kaldi-util', + ], +) + diff --git a/speechx/speechx/kaldi/matrix/CMakeLists.txt b/speechx/speechx/kaldi/matrix/CMakeLists.txt new file mode 100644 index 00000000..a4dbde2e --- /dev/null +++ b/speechx/speechx/kaldi/matrix/CMakeLists.txt @@ -0,0 +1,16 @@ + +add_library(kaldi-matrix +compressed-matrix.cc +kaldi-matrix.cc +kaldi-vector.cc +matrix-functions.cc +optimization.cc +packed-matrix.cc +qr.cc +sparse-matrix.cc +sp-matrix.cc +srfft.cc +tp-matrix.cc +) + +target_link_libraries(kaldi-matrix gfortran kaldi-base libopenblas.a) diff --git a/speechx/speechx/kaldi/matrix/cblas-wrappers.h b/speechx/speechx/kaldi/matrix/cblas-wrappers.h new file mode 100644 index 00000000..f869ab7e --- /dev/null +++ b/speechx/speechx/kaldi/matrix/cblas-wrappers.h @@ -0,0 +1,491 @@ +// matrix/cblas-wrappers.h + +// 
Copyright 2012 Johns Hopkins University (author: Daniel Povey); +// Haihua Xu; Wei Shi + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. +#ifndef KALDI_MATRIX_CBLAS_WRAPPERS_H_ +#define KALDI_MATRIX_CBLAS_WRAPPERS_H_ 1 + + +#include +#include "matrix/sp-matrix.h" +#include "matrix/kaldi-vector.h" +#include "matrix/kaldi-matrix.h" +#include "matrix/matrix-functions.h" +#include "matrix/kaldi-blas.h" + +// Do not include this file directly. It is to be included +// by .cc files in this directory. 
+ +namespace kaldi { + + +inline void cblas_Xcopy(const int N, const float *X, const int incX, float *Y, + const int incY) { + cblas_scopy(N, X, incX, Y, incY); +} + +inline void cblas_Xcopy(const int N, const double *X, const int incX, double *Y, + const int incY) { + cblas_dcopy(N, X, incX, Y, incY); +} + + +inline float cblas_Xasum(const int N, const float *X, const int incX) { + return cblas_sasum(N, X, incX); +} + +inline double cblas_Xasum(const int N, const double *X, const int incX) { + return cblas_dasum(N, X, incX); +} + +inline void cblas_Xrot(const int N, float *X, const int incX, float *Y, + const int incY, const float c, const float s) { + cblas_srot(N, X, incX, Y, incY, c, s); +} +inline void cblas_Xrot(const int N, double *X, const int incX, double *Y, + const int incY, const double c, const double s) { + cblas_drot(N, X, incX, Y, incY, c, s); +} +inline float cblas_Xdot(const int N, const float *const X, + const int incX, const float *const Y, + const int incY) { + return cblas_sdot(N, X, incX, Y, incY); +} +inline double cblas_Xdot(const int N, const double *const X, + const int incX, const double *const Y, + const int incY) { + return cblas_ddot(N, X, incX, Y, incY); +} +inline void cblas_Xaxpy(const int N, const float alpha, const float *X, + const int incX, float *Y, const int incY) { + cblas_saxpy(N, alpha, X, incX, Y, incY); +} +inline void cblas_Xaxpy(const int N, const double alpha, const double *X, + const int incX, double *Y, const int incY) { + cblas_daxpy(N, alpha, X, incX, Y, incY); +} +inline void cblas_Xscal(const int N, const float alpha, float *data, + const int inc) { + cblas_sscal(N, alpha, data, inc); +} +inline void cblas_Xscal(const int N, const double alpha, double *data, + const int inc) { + cblas_dscal(N, alpha, data, inc); +} +inline void cblas_Xspmv(const float alpha, const int num_rows, const float *Mdata, + const float *v, const int v_inc, + const float beta, float *y, const int y_inc) { + cblas_sspmv(CblasRowMajor, 
CblasLower, num_rows, alpha, Mdata, v, v_inc, beta, y, y_inc); +} +inline void cblas_Xspmv(const double alpha, const int num_rows, const double *Mdata, + const double *v, const int v_inc, + const double beta, double *y, const int y_inc) { + cblas_dspmv(CblasRowMajor, CblasLower, num_rows, alpha, Mdata, v, v_inc, beta, y, y_inc); +} +inline void cblas_Xtpmv(MatrixTransposeType trans, const float *Mdata, + const int num_rows, float *y, const int y_inc) { + cblas_stpmv(CblasRowMajor, CblasLower, static_cast(trans), + CblasNonUnit, num_rows, Mdata, y, y_inc); +} +inline void cblas_Xtpmv(MatrixTransposeType trans, const double *Mdata, + const int num_rows, double *y, const int y_inc) { + cblas_dtpmv(CblasRowMajor, CblasLower, static_cast(trans), + CblasNonUnit, num_rows, Mdata, y, y_inc); +} + + +inline void cblas_Xtpsv(MatrixTransposeType trans, const float *Mdata, + const int num_rows, float *y, const int y_inc) { + cblas_stpsv(CblasRowMajor, CblasLower, static_cast(trans), + CblasNonUnit, num_rows, Mdata, y, y_inc); +} +inline void cblas_Xtpsv(MatrixTransposeType trans, const double *Mdata, + const int num_rows, double *y, const int y_inc) { + cblas_dtpsv(CblasRowMajor, CblasLower, static_cast(trans), + CblasNonUnit, num_rows, Mdata, y, y_inc); +} + +// x = alpha * M * y + beta * x +inline void cblas_Xspmv(MatrixIndexT dim, float alpha, const float *Mdata, + const float *ydata, MatrixIndexT ystride, + float beta, float *xdata, MatrixIndexT xstride) { + cblas_sspmv(CblasRowMajor, CblasLower, dim, alpha, Mdata, + ydata, ystride, beta, xdata, xstride); +} +inline void cblas_Xspmv(MatrixIndexT dim, double alpha, const double *Mdata, + const double *ydata, MatrixIndexT ystride, + double beta, double *xdata, MatrixIndexT xstride) { + cblas_dspmv(CblasRowMajor, CblasLower, dim, alpha, Mdata, + ydata, ystride, beta, xdata, xstride); +} + +// Implements A += alpha * (x y' + y x'); A is symmetric matrix. 
+inline void cblas_Xspr2(MatrixIndexT dim, float alpha, const float *Xdata, + MatrixIndexT incX, const float *Ydata, MatrixIndexT incY, + float *Adata) { + cblas_sspr2(CblasRowMajor, CblasLower, dim, alpha, Xdata, + incX, Ydata, incY, Adata); +} +inline void cblas_Xspr2(MatrixIndexT dim, double alpha, const double *Xdata, + MatrixIndexT incX, const double *Ydata, MatrixIndexT incY, + double *Adata) { + cblas_dspr2(CblasRowMajor, CblasLower, dim, alpha, Xdata, + incX, Ydata, incY, Adata); +} + +// Implements A += alpha * (x x'); A is symmetric matrix. +inline void cblas_Xspr(MatrixIndexT dim, float alpha, const float *Xdata, + MatrixIndexT incX, float *Adata) { + cblas_sspr(CblasRowMajor, CblasLower, dim, alpha, Xdata, incX, Adata); +} +inline void cblas_Xspr(MatrixIndexT dim, double alpha, const double *Xdata, + MatrixIndexT incX, double *Adata) { + cblas_dspr(CblasRowMajor, CblasLower, dim, alpha, Xdata, incX, Adata); +} + +// sgemv,dgemv: y = alpha M x + beta y. +inline void cblas_Xgemv(MatrixTransposeType trans, MatrixIndexT num_rows, + MatrixIndexT num_cols, float alpha, const float *Mdata, + MatrixIndexT stride, const float *xdata, + MatrixIndexT incX, float beta, float *ydata, MatrixIndexT incY) { + cblas_sgemv(CblasRowMajor, static_cast(trans), num_rows, + num_cols, alpha, Mdata, stride, xdata, incX, beta, ydata, incY); +} +inline void cblas_Xgemv(MatrixTransposeType trans, MatrixIndexT num_rows, + MatrixIndexT num_cols, double alpha, const double *Mdata, + MatrixIndexT stride, const double *xdata, + MatrixIndexT incX, double beta, double *ydata, MatrixIndexT incY) { + cblas_dgemv(CblasRowMajor, static_cast(trans), num_rows, + num_cols, alpha, Mdata, stride, xdata, incX, beta, ydata, incY); +} + +// sgbmv, dgmmv: y = alpha M x + + beta * y. 
+inline void cblas_Xgbmv(MatrixTransposeType trans, MatrixIndexT num_rows, + MatrixIndexT num_cols, MatrixIndexT num_below, + MatrixIndexT num_above, float alpha, const float *Mdata, + MatrixIndexT stride, const float *xdata, + MatrixIndexT incX, float beta, float *ydata, MatrixIndexT incY) { + cblas_sgbmv(CblasRowMajor, static_cast(trans), num_rows, + num_cols, num_below, num_above, alpha, Mdata, stride, xdata, + incX, beta, ydata, incY); +} +inline void cblas_Xgbmv(MatrixTransposeType trans, MatrixIndexT num_rows, + MatrixIndexT num_cols, MatrixIndexT num_below, + MatrixIndexT num_above, double alpha, const double *Mdata, + MatrixIndexT stride, const double *xdata, + MatrixIndexT incX, double beta, double *ydata, MatrixIndexT incY) { + cblas_dgbmv(CblasRowMajor, static_cast(trans), num_rows, + num_cols, num_below, num_above, alpha, Mdata, stride, xdata, + incX, beta, ydata, incY); +} + + +template +inline void Xgemv_sparsevec(MatrixTransposeType trans, MatrixIndexT num_rows, + MatrixIndexT num_cols, Real alpha, const Real *Mdata, + MatrixIndexT stride, const Real *xdata, + MatrixIndexT incX, Real beta, Real *ydata, + MatrixIndexT incY) { + if (trans == kNoTrans) { + if (beta != 1.0) cblas_Xscal(num_rows, beta, ydata, incY); + for (MatrixIndexT i = 0; i < num_cols; i++) { + Real x_i = xdata[i * incX]; + if (x_i == 0.0) continue; + // Add to ydata, the i'th column of M, times alpha * x_i + cblas_Xaxpy(num_rows, x_i * alpha, Mdata + i, stride, ydata, incY); + } + } else { + if (beta != 1.0) cblas_Xscal(num_cols, beta, ydata, incY); + for (MatrixIndexT i = 0; i < num_rows; i++) { + Real x_i = xdata[i * incX]; + if (x_i == 0.0) continue; + // Add to ydata, the i'th row of M, times alpha * x_i + cblas_Xaxpy(num_cols, x_i * alpha, + Mdata + (i * stride), 1, ydata, incY); + } + } +} + +inline void cblas_Xgemm(const float alpha, + MatrixTransposeType transA, + const float *Adata, + MatrixIndexT a_num_rows, MatrixIndexT a_num_cols, MatrixIndexT a_stride, + 
MatrixTransposeType transB, + const float *Bdata, MatrixIndexT b_stride, + const float beta, + float *Mdata, + MatrixIndexT num_rows, MatrixIndexT num_cols,MatrixIndexT stride) { + cblas_sgemm(CblasRowMajor, static_cast(transA), + static_cast(transB), + num_rows, num_cols, transA == kNoTrans ? a_num_cols : a_num_rows, + alpha, Adata, a_stride, Bdata, b_stride, + beta, Mdata, stride); +} +inline void cblas_Xgemm(const double alpha, + MatrixTransposeType transA, + const double *Adata, + MatrixIndexT a_num_rows, MatrixIndexT a_num_cols, MatrixIndexT a_stride, + MatrixTransposeType transB, + const double *Bdata, MatrixIndexT b_stride, + const double beta, + double *Mdata, + MatrixIndexT num_rows, MatrixIndexT num_cols,MatrixIndexT stride) { + cblas_dgemm(CblasRowMajor, static_cast(transA), + static_cast(transB), + num_rows, num_cols, transA == kNoTrans ? a_num_cols : a_num_rows, + alpha, Adata, a_stride, Bdata, b_stride, + beta, Mdata, stride); +} + + +inline void cblas_Xsymm(const float alpha, + MatrixIndexT sz, + const float *Adata,MatrixIndexT a_stride, + const float *Bdata,MatrixIndexT b_stride, + const float beta, + float *Mdata, MatrixIndexT stride) { + cblas_ssymm(CblasRowMajor, CblasLeft, CblasLower, sz, sz, alpha, Adata, + a_stride, Bdata, b_stride, beta, Mdata, stride); +} +inline void cblas_Xsymm(const double alpha, + MatrixIndexT sz, + const double *Adata,MatrixIndexT a_stride, + const double *Bdata,MatrixIndexT b_stride, + const double beta, + double *Mdata, MatrixIndexT stride) { + cblas_dsymm(CblasRowMajor, CblasLeft, CblasLower, sz, sz, alpha, Adata, + a_stride, Bdata, b_stride, beta, Mdata, stride); +} +// ger: M += alpha x y^T. 
+inline void cblas_Xger(MatrixIndexT num_rows, MatrixIndexT num_cols, float alpha, + const float *xdata, MatrixIndexT incX, const float *ydata, + MatrixIndexT incY, float *Mdata, MatrixIndexT stride) { + cblas_sger(CblasRowMajor, num_rows, num_cols, alpha, xdata, 1, ydata, 1, + Mdata, stride); +} +inline void cblas_Xger(MatrixIndexT num_rows, MatrixIndexT num_cols, double alpha, + const double *xdata, MatrixIndexT incX, const double *ydata, + MatrixIndexT incY, double *Mdata, MatrixIndexT stride) { + cblas_dger(CblasRowMajor, num_rows, num_cols, alpha, xdata, 1, ydata, 1, + Mdata, stride); +} + +// syrk: symmetric rank-k update. +// if trans==kNoTrans, then C = alpha A A^T + beta C +// else C = alpha A^T A + beta C. +// note: dim_c is dim(C), other_dim_a is the "other" dimension of A, i.e. +// num-cols(A) if kNoTrans, or num-rows(A) if kTrans. +// We only need the row-major and lower-triangular option of this, and this +// is hard-coded. +inline void cblas_Xsyrk ( + const MatrixTransposeType trans, const MatrixIndexT dim_c, + const MatrixIndexT other_dim_a, const float alpha, const float *A, + const MatrixIndexT a_stride, const float beta, float *C, + const MatrixIndexT c_stride) { + cblas_ssyrk(CblasRowMajor, CblasLower, static_cast(trans), + dim_c, other_dim_a, alpha, A, a_stride, beta, C, c_stride); +} + +inline void cblas_Xsyrk( + const MatrixTransposeType trans, const MatrixIndexT dim_c, + const MatrixIndexT other_dim_a, const double alpha, const double *A, + const MatrixIndexT a_stride, const double beta, double *C, + const MatrixIndexT c_stride) { + cblas_dsyrk(CblasRowMajor, CblasLower, static_cast(trans), + dim_c, other_dim_a, alpha, A, a_stride, beta, C, c_stride); +} + +/// matrix-vector multiply using a banded matrix; we always call this +/// with b = 1 meaning we're multiplying by a diagonal matrix. This is used for +/// elementwise multiplication. We miss some of the arguments out of this +/// wrapper. 
+inline void cblas_Xsbmv1( + const MatrixIndexT dim, + const double *A, + const double alpha, + const double *x, + const double beta, + double *y) { + cblas_dsbmv(CblasRowMajor, CblasLower, dim, 0, alpha, A, + 1, x, 1, beta, y, 1); +} + +inline void cblas_Xsbmv1( + const MatrixIndexT dim, + const float *A, + const float alpha, + const float *x, + const float beta, + float *y) { + cblas_ssbmv(CblasRowMajor, CblasLower, dim, 0, alpha, A, + 1, x, 1, beta, y, 1); +} + +/// This is not really a wrapper for CBLAS as CBLAS does not have this; in future we could +/// extend this somehow. +inline void mul_elements( + const MatrixIndexT dim, + const double *a, + double *b) { // does b *= a, elementwise. + double c1, c2, c3, c4; + MatrixIndexT i; + for (i = 0; i + 4 <= dim; i += 4) { + c1 = a[i] * b[i]; + c2 = a[i+1] * b[i+1]; + c3 = a[i+2] * b[i+2]; + c4 = a[i+3] * b[i+3]; + b[i] = c1; + b[i+1] = c2; + b[i+2] = c3; + b[i+3] = c4; + } + for (; i < dim; i++) + b[i] *= a[i]; +} + +inline void mul_elements( + const MatrixIndexT dim, + const float *a, + float *b) { // does b *= a, elementwise. 
+ float c1, c2, c3, c4; + MatrixIndexT i; + for (i = 0; i + 4 <= dim; i += 4) { + c1 = a[i] * b[i]; + c2 = a[i+1] * b[i+1]; + c3 = a[i+2] * b[i+2]; + c4 = a[i+3] * b[i+3]; + b[i] = c1; + b[i+1] = c2; + b[i+2] = c3; + b[i+3] = c4; + } + for (; i < dim; i++) + b[i] *= a[i]; +} + + + +// add clapack here +#if !defined(HAVE_ATLAS) +inline void clapack_Xtptri(KaldiBlasInt *num_rows, float *Mdata, KaldiBlasInt *result) { + stptri_(const_cast("U"), const_cast("N"), num_rows, Mdata, result); +} +inline void clapack_Xtptri(KaldiBlasInt *num_rows, double *Mdata, KaldiBlasInt *result) { + dtptri_(const_cast("U"), const_cast("N"), num_rows, Mdata, result); +} +// +inline void clapack_Xgetrf2(KaldiBlasInt *num_rows, KaldiBlasInt *num_cols, + float *Mdata, KaldiBlasInt *stride, KaldiBlasInt *pivot, + KaldiBlasInt *result) { + sgetrf_(num_rows, num_cols, Mdata, stride, pivot, result); +} +inline void clapack_Xgetrf2(KaldiBlasInt *num_rows, KaldiBlasInt *num_cols, + double *Mdata, KaldiBlasInt *stride, KaldiBlasInt *pivot, + KaldiBlasInt *result) { + dgetrf_(num_rows, num_cols, Mdata, stride, pivot, result); +} + +// +inline void clapack_Xgetri2(KaldiBlasInt *num_rows, float *Mdata, KaldiBlasInt *stride, + KaldiBlasInt *pivot, float *p_work, + KaldiBlasInt *l_work, KaldiBlasInt *result) { + sgetri_(num_rows, Mdata, stride, pivot, p_work, l_work, result); +} +inline void clapack_Xgetri2(KaldiBlasInt *num_rows, double *Mdata, KaldiBlasInt *stride, + KaldiBlasInt *pivot, double *p_work, + KaldiBlasInt *l_work, KaldiBlasInt *result) { + dgetri_(num_rows, Mdata, stride, pivot, p_work, l_work, result); +} +// +inline void clapack_Xgesvd(char *v, char *u, KaldiBlasInt *num_cols, + KaldiBlasInt *num_rows, float *Mdata, KaldiBlasInt *stride, + float *sv, float *Vdata, KaldiBlasInt *vstride, + float *Udata, KaldiBlasInt *ustride, float *p_work, + KaldiBlasInt *l_work, KaldiBlasInt *result) { + sgesvd_(v, u, + num_cols, num_rows, Mdata, stride, + sv, Vdata, vstride, Udata, ustride, + p_work, 
l_work, result); +} +inline void clapack_Xgesvd(char *v, char *u, KaldiBlasInt *num_cols, + KaldiBlasInt *num_rows, double *Mdata, KaldiBlasInt *stride, + double *sv, double *Vdata, KaldiBlasInt *vstride, + double *Udata, KaldiBlasInt *ustride, double *p_work, + KaldiBlasInt *l_work, KaldiBlasInt *result) { + dgesvd_(v, u, + num_cols, num_rows, Mdata, stride, + sv, Vdata, vstride, Udata, ustride, + p_work, l_work, result); +} +// +void inline clapack_Xsptri(KaldiBlasInt *num_rows, float *Mdata, + KaldiBlasInt *ipiv, float *work, KaldiBlasInt *result) { + ssptri_(const_cast("U"), num_rows, Mdata, ipiv, work, result); +} +void inline clapack_Xsptri(KaldiBlasInt *num_rows, double *Mdata, + KaldiBlasInt *ipiv, double *work, KaldiBlasInt *result) { + dsptri_(const_cast("U"), num_rows, Mdata, ipiv, work, result); +} +// +void inline clapack_Xsptrf(KaldiBlasInt *num_rows, float *Mdata, + KaldiBlasInt *ipiv, KaldiBlasInt *result) { + ssptrf_(const_cast("U"), num_rows, Mdata, ipiv, result); +} +void inline clapack_Xsptrf(KaldiBlasInt *num_rows, double *Mdata, + KaldiBlasInt *ipiv, KaldiBlasInt *result) { + dsptrf_(const_cast("U"), num_rows, Mdata, ipiv, result); +} +#else +inline void clapack_Xgetrf(MatrixIndexT num_rows, MatrixIndexT num_cols, + float *Mdata, MatrixIndexT stride, + int *pivot, int *result) { + *result = clapack_sgetrf(CblasColMajor, num_rows, num_cols, + Mdata, stride, pivot); +} + +inline void clapack_Xgetrf(MatrixIndexT num_rows, MatrixIndexT num_cols, + double *Mdata, MatrixIndexT stride, + int *pivot, int *result) { + *result = clapack_dgetrf(CblasColMajor, num_rows, num_cols, + Mdata, stride, pivot); +} +// +inline int clapack_Xtrtri(int num_rows, float *Mdata, MatrixIndexT stride) { + return clapack_strtri(CblasColMajor, CblasUpper, CblasNonUnit, num_rows, + Mdata, stride); +} + +inline int clapack_Xtrtri(int num_rows, double *Mdata, MatrixIndexT stride) { + return clapack_dtrtri(CblasColMajor, CblasUpper, CblasNonUnit, num_rows, + Mdata, stride); +} 
+// +inline void clapack_Xgetri(MatrixIndexT num_rows, float *Mdata, MatrixIndexT stride, + int *pivot, int *result) { + *result = clapack_sgetri(CblasColMajor, num_rows, Mdata, stride, pivot); +} +inline void clapack_Xgetri(MatrixIndexT num_rows, double *Mdata, MatrixIndexT stride, + int *pivot, int *result) { + *result = clapack_dgetri(CblasColMajor, num_rows, Mdata, stride, pivot); +} +#endif + +} +// namespace kaldi + +#endif diff --git a/speechx/speechx/kaldi/matrix/compressed-matrix.cc b/speechx/speechx/kaldi/matrix/compressed-matrix.cc new file mode 100644 index 00000000..13214b25 --- /dev/null +++ b/speechx/speechx/kaldi/matrix/compressed-matrix.cc @@ -0,0 +1,876 @@ +// matrix/compressed-matrix.cc + +// Copyright 2012 Johns Hopkins University (author: Daniel Povey) +// Frantisek Skala, Wei Shi +// 2015 Tom Ko + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "matrix/compressed-matrix.h" +#include + +namespace kaldi { + +//static +MatrixIndexT CompressedMatrix::DataSize(const GlobalHeader &header) { + // Returns size in bytes of the data. 
+ DataFormat format = static_cast(header.format); + if (format == kOneByteWithColHeaders) { + return sizeof(GlobalHeader) + + header.num_cols * (sizeof(PerColHeader) + header.num_rows); + } else if (format == kTwoByte) { + return sizeof(GlobalHeader) + + 2 * header.num_rows * header.num_cols; + } else { + KALDI_ASSERT(format == kOneByte); + return sizeof(GlobalHeader) + + header.num_rows * header.num_cols; + } +} + +// scale all element of matrix by scaling floats +// in GlobalHeader with alpha. +void CompressedMatrix::Scale(float alpha) { + if (data_ != NULL) { + GlobalHeader *h = reinterpret_cast(data_); + // scale the floating point values in each PerColHolder + // and leave all integers the same. + h->min_value *= alpha; + h->range *= alpha; + } +} + +template // static inline +void CompressedMatrix::ComputeGlobalHeader( + const MatrixBase &mat, CompressionMethod method, + GlobalHeader *header) { + if (method == kAutomaticMethod) { + if (mat.NumRows() > 8) method = kSpeechFeature; + else method = kTwoByteAuto; + } + + switch (method) { + case kSpeechFeature: + header->format = static_cast(kOneByteWithColHeaders); // 1. + break; + case kTwoByteAuto: case kTwoByteSignedInteger: + header->format = static_cast(kTwoByte); // 2. + break; + case kOneByteAuto: case kOneByteUnsignedInteger: case kOneByteZeroOne: + header->format = static_cast(kOneByte); // 3. + break; + default: + KALDI_ERR << "Invalid compression type: " + << static_cast(method); + } + + header->num_rows = mat.NumRows(); + header->num_cols = mat.NumCols(); + + // Now compute 'min_value' and 'range'. + switch (method) { + case kSpeechFeature: case kTwoByteAuto: case kOneByteAuto: { + float min_value = mat.Min(), max_value = mat.Max(); + // ensure that max_value is strictly greater than min_value, even if matrix is + // constant; this avoids crashes in ComputeColHeader when compressing speech + // featupres. 
+ if (max_value == min_value) + max_value = min_value + (1.0 + fabs(min_value)); + KALDI_ASSERT(min_value - min_value == 0 && + max_value - max_value == 0 && + "Cannot compress a matrix with Nan's or Inf's"); + + header->min_value = min_value; + header->range = max_value - min_value; + + // we previously checked that max_value != min_value, so their + // difference should be nonzero. + KALDI_ASSERT(header->range > 0.0); + break; + } + case kTwoByteSignedInteger: { + header->min_value = -32768.0; + header->range = 65535.0; + break; + } + case kOneByteUnsignedInteger: { + header->min_value = 0.0; + header->range = 255.0; + break; + } + case kOneByteZeroOne: { + header->min_value = 0.0; + header->range = 1.0; + break; + } + default: + KALDI_ERR << "Unknown compression method = " + << static_cast(method); + } + KALDI_COMPILE_TIME_ASSERT(sizeof(*header) == 20); // otherwise + // something weird is happening and our code probably won't work or + // won't be robust across platforms. +} + +template +void CompressedMatrix::CopyFromMat( + const MatrixBase &mat, CompressionMethod method) { + if (data_ != NULL) { + delete [] static_cast(data_); // call delete [] because was allocated with new float[] + data_ = NULL; + } + if (mat.NumRows() == 0) { return; } // Zero-size matrix stored as zero pointer. 
+ + + GlobalHeader global_header; + ComputeGlobalHeader(mat, method, &global_header); + + int32 data_size = DataSize(global_header); + + data_ = AllocateData(data_size); + + *(reinterpret_cast(data_)) = global_header; + + DataFormat format = static_cast(global_header.format); + if (format == kOneByteWithColHeaders) { + PerColHeader *header_data = + reinterpret_cast(static_cast(data_) + + sizeof(GlobalHeader)); + uint8 *byte_data = + reinterpret_cast(header_data + global_header.num_cols); + + const Real *matrix_data = mat.Data(); + + for (int32 col = 0; col < global_header.num_cols; col++) { + CompressColumn(global_header, + matrix_data + col, mat.Stride(), + global_header.num_rows, + header_data, byte_data); + header_data++; + byte_data += global_header.num_rows; + } + } else if (format == kTwoByte) { + uint16 *data = reinterpret_cast(static_cast(data_) + + sizeof(GlobalHeader)); + int32 num_rows = mat.NumRows(), num_cols = mat.NumCols(); + for (int32 r = 0; r < num_rows; r++) { + const Real *row_data = mat.RowData(r); + for (int32 c = 0; c < num_cols; c++) + data[c] = FloatToUint16(global_header, row_data[c]); + data += num_cols; + } + } else { + KALDI_ASSERT(format == kOneByte); + uint8 *data = reinterpret_cast(static_cast(data_) + + sizeof(GlobalHeader)); + int32 num_rows = mat.NumRows(), num_cols = mat.NumCols(); + for (int32 r = 0; r < num_rows; r++) { + const Real *row_data = mat.RowData(r); + for (int32 c = 0; c < num_cols; c++) + data[c] = FloatToUint8(global_header, row_data[c]); + data += num_cols; + } + } +} + +// Instantiate the template for float and double. 
+template +void CompressedMatrix::CopyFromMat(const MatrixBase &mat, + CompressionMethod method); + +template +void CompressedMatrix::CopyFromMat(const MatrixBase &mat, + CompressionMethod method); + + +CompressedMatrix::CompressedMatrix( + const CompressedMatrix &cmat, + const MatrixIndexT row_offset, + const MatrixIndexT num_rows, + const MatrixIndexT col_offset, + const MatrixIndexT num_cols, + bool allow_padding): data_(NULL) { + int32 old_num_rows = cmat.NumRows(), old_num_cols = cmat.NumCols(); + + if (old_num_rows == 0) { + KALDI_ASSERT(num_rows == 0 && num_cols == 0); + // The empty matrix is stored as a zero pointer. + return; + } + + KALDI_ASSERT(row_offset < old_num_rows); + KALDI_ASSERT(col_offset < old_num_cols); + KALDI_ASSERT(row_offset >= 0 || allow_padding); + KALDI_ASSERT(col_offset >= 0); + KALDI_ASSERT(row_offset + num_rows <= old_num_rows || allow_padding); + KALDI_ASSERT(col_offset + num_cols <= old_num_cols); + + if (num_rows == 0 || num_cols == 0) { return; } + + bool padding_is_used = (row_offset < 0 || + row_offset + num_rows > old_num_rows); + + GlobalHeader new_global_header; + KALDI_COMPILE_TIME_ASSERT(sizeof(new_global_header) == 20); + + GlobalHeader *old_global_header = reinterpret_cast(cmat.Data()); + + new_global_header = *old_global_header; + new_global_header.num_cols = num_cols; + new_global_header.num_rows = num_rows; + + // We don't switch format from 1 -> 2 (in case of size reduction) yet; if this + // is needed, we will do this below by creating a temporary Matrix. 
+ new_global_header.format = old_global_header->format; + + data_ = AllocateData(DataSize(new_global_header)); // allocate memory + *(reinterpret_cast(data_)) = new_global_header; + + + DataFormat format = static_cast(old_global_header->format); + if (format == kOneByteWithColHeaders) { + PerColHeader *old_per_col_header = + reinterpret_cast(old_global_header + 1); + uint8 *old_byte_data = + reinterpret_cast(old_per_col_header + + old_global_header->num_cols); + PerColHeader *new_per_col_header = + reinterpret_cast( + reinterpret_cast(data_) + 1); + + memcpy(new_per_col_header, old_per_col_header + col_offset, + sizeof(PerColHeader) * num_cols); + + uint8 *new_byte_data = + reinterpret_cast(new_per_col_header + num_cols); + if (!padding_is_used) { + uint8 *old_start_of_subcol = + old_byte_data + row_offset + (col_offset * old_num_rows), + *new_start_of_col = new_byte_data; + for (int32 i = 0; i < num_cols; i++) { + memcpy(new_start_of_col, old_start_of_subcol, num_rows); + new_start_of_col += num_rows; + old_start_of_subcol += old_num_rows; + } + } else { + uint8 *old_start_of_col = + old_byte_data + (col_offset * old_num_rows), + *new_start_of_col = new_byte_data; + for (int32 i = 0; i < num_cols; i++) { + + for (int32 j = 0; j < num_rows; j++) { + int32 old_j = j + row_offset; + if (old_j < 0) old_j = 0; + else if (old_j >= old_num_rows) old_j = old_num_rows - 1; + new_start_of_col[j] = old_start_of_col[old_j]; + } + new_start_of_col += num_rows; + old_start_of_col += old_num_rows; + } + } + } else if (format == kTwoByte) { + const uint16 *old_data = + reinterpret_cast(old_global_header + 1); + uint16 *new_row_data = + reinterpret_cast(reinterpret_cast(data_) + 1); + + for (int32 row = 0; row < num_rows; row++) { + int32 old_row = row + row_offset; + // The next two lines are only relevant if padding_is_used. 
+ if (old_row < 0) old_row = 0; + else if (old_row >= old_num_rows) old_row = old_num_rows - 1; + const uint16 *old_row_data = + old_data + col_offset + (old_num_cols * old_row); + memcpy(new_row_data, old_row_data, sizeof(uint16) * num_cols); + new_row_data += num_cols; + } + } else { + KALDI_ASSERT(format == kOneByte); + const uint8 *old_data = + reinterpret_cast(old_global_header + 1); + uint8 *new_row_data = + reinterpret_cast(reinterpret_cast(data_) + 1); + + for (int32 row = 0; row < num_rows; row++) { + int32 old_row = row + row_offset; + // The next two lines are only relevant if padding_is_used. + if (old_row < 0) old_row = 0; + else if (old_row >= old_num_rows) old_row = old_num_rows - 1; + const uint8 *old_row_data = + old_data + col_offset + (old_num_cols * old_row); + memcpy(new_row_data, old_row_data, sizeof(uint8) * num_cols); + new_row_data += num_cols; + } + } + + if (num_rows < 8 && format == kOneByteWithColHeaders) { + // format was 1 but we want it to be 2 -> create a temporary + // Matrix (uncompress), re-compress, and swap. + // This gives us almost exact reconstruction while saving + // memory (the elements take more space but there will be + // no per-column headers). + Matrix temp(this->NumRows(), this->NumCols(), + kUndefined); + this->CopyToMat(&temp); + CompressedMatrix temp_cmat(temp, kTwoByteAuto); + this->Swap(&temp_cmat); + } +} + + +template +CompressedMatrix &CompressedMatrix::operator =(const MatrixBase &mat) { + this->CopyFromMat(mat); + return *this; +} + +// Instantiate the template for float and double. +template +CompressedMatrix& CompressedMatrix::operator =(const MatrixBase &mat); + +template +CompressedMatrix& CompressedMatrix::operator =(const MatrixBase &mat); + +inline uint16 CompressedMatrix::FloatToUint16( + const GlobalHeader &global_header, + float value) { + float f = (value - global_header.min_value) / + global_header.range; + if (f > 1.0) f = 1.0; // Note: this should not happen. 
+ if (f < 0.0) f = 0.0; // Note: this should not happen. + return static_cast(f * 65535 + 0.499); // + 0.499 is to + // round to closest int; avoids bias. +} + + +inline uint8 CompressedMatrix::FloatToUint8( + const GlobalHeader &global_header, + float value) { + float f = (value - global_header.min_value) / + global_header.range; + if (f > 1.0) f = 1.0; // Note: this should not happen. + if (f < 0.0) f = 0.0; // Note: this should not happen. + return static_cast(f * 255 + 0.499); // + 0.499 is to + // round to closest int; avoids bias. +} + + +inline float CompressedMatrix::Uint16ToFloat( + const GlobalHeader &global_header, + uint16 value) { + // the constant 1.52590218966964e-05 is 1/65535. + return global_header.min_value + + global_header.range * 1.52590218966964e-05F * value; +} + +template // static +void CompressedMatrix::ComputeColHeader( + const GlobalHeader &global_header, + const Real *data, MatrixIndexT stride, + int32 num_rows, CompressedMatrix::PerColHeader *header) { + KALDI_ASSERT(num_rows > 0); + std::vector sdata(num_rows); // the sorted data. + for (size_t i = 0, size = sdata.size(); i < size; i++) + sdata[i] = data[i*stride]; + + if (num_rows >= 5) { + int quarter_nr = num_rows/4; + // std::sort(sdata.begin(), sdata.end()); + // The elements at positions 0, quarter_nr, + // 3*quarter_nr, and num_rows-1 need to be in sorted order. + std::nth_element(sdata.begin(), sdata.begin() + quarter_nr, sdata.end()); + // Now, sdata.begin() + quarter_nr contains the element that would appear + // in sorted order, in that position. + std::nth_element(sdata.begin(), sdata.begin(), sdata.begin() + quarter_nr); + // Now, sdata.begin() and sdata.begin() + quarter_nr contain the elements + // that would appear at those positions in sorted order. 
+ std::nth_element(sdata.begin() + quarter_nr + 1, + sdata.begin() + (3*quarter_nr), sdata.end()); + // Now, sdata.begin(), sdata.begin() + quarter_nr, and sdata.begin() + + // 3*quarter_nr, contain the elements that would appear at those positions + // in sorted order. + std::nth_element(sdata.begin() + (3*quarter_nr) + 1, sdata.end() - 1, + sdata.end()); + // Now, sdata.begin(), sdata.begin() + quarter_nr, and sdata.begin() + + // 3*quarter_nr, and sdata.end() - 1, contain the elements that would appear + // at those positions in sorted order. + + header->percentile_0 = + std::min(FloatToUint16(global_header, sdata[0]), 65532); + header->percentile_25 = + std::min( + std::max( + FloatToUint16(global_header, sdata[quarter_nr]), + header->percentile_0 + static_cast(1)), 65533); + header->percentile_75 = + std::min( + std::max( + FloatToUint16(global_header, sdata[3*quarter_nr]), + header->percentile_25 + static_cast(1)), 65534); + header->percentile_100 = std::max( + FloatToUint16(global_header, sdata[num_rows-1]), + header->percentile_75 + static_cast(1)); + + } else { // handle this pathological case. + std::sort(sdata.begin(), sdata.end()); + // Note: we know num_rows is at least 1. 
+ header->percentile_0 = + std::min(FloatToUint16(global_header, sdata[0]), + 65532); + if (num_rows > 1) + header->percentile_25 = + std::min( + std::max(FloatToUint16(global_header, sdata[1]), + header->percentile_0 + 1), 65533); + else + header->percentile_25 = header->percentile_0 + 1; + if (num_rows > 2) + header->percentile_75 = + std::min( + std::max(FloatToUint16(global_header, sdata[2]), + header->percentile_25 + 1), 65534); + else + header->percentile_75 = header->percentile_25 + 1; + if (num_rows > 3) + header->percentile_100 = + std::max(FloatToUint16(global_header, sdata[3]), + header->percentile_75 + 1); + else + header->percentile_100 = header->percentile_75 + 1; + } +} + +// static +inline uint8 CompressedMatrix::FloatToChar( + float p0, float p25, float p75, float p100, + float value) { + int ans; + if (value < p25) { // range [ p0, p25 ) covered by + // characters 0 .. 64. We round to the closest int. + float f = (value - p0) / (p25 - p0); + ans = static_cast(f * 64 + 0.5); + // Note: the checks on the next two lines + // are necessary in pathological cases when all the elements in a row + // are the same and the percentile_* values are separated by one. + if (ans < 0) ans = 0; + if (ans > 64) ans = 64; + } else if (value < p75) { // range [ p25, p75 )covered + // by characters 64 .. 192. We round to the closest int. + float f = (value - p25) / (p75 - p25); + ans = 64 + static_cast(f * 128 + 0.5); + if (ans < 64) ans = 64; + if (ans > 192) ans = 192; + } else { // range [ p75, p100 ] covered by + // characters 192 .. 255. Note: this last range + // has fewer characters than the left range, because + // we go up to 255, not 256. 
+ float f = (value - p75) / (p100 - p75); + ans = 192 + static_cast(f * 63 + 0.5); + if (ans < 192) ans = 192; + if (ans > 255) ans = 255; + } + return static_cast(ans); +} + + +// static +inline float CompressedMatrix::CharToFloat( + float p0, float p25, float p75, float p100, + uint8 value) { + if (value <= 64) { + return p0 + (p25 - p0) * value * (1/64.0); + } else if (value <= 192) { + return p25 + (p75 - p25) * (value - 64) * (1/128.0); + } else { + return p75 + (p100 - p75) * (value - 192) * (1/63.0); + } +} + + +template // static +void CompressedMatrix::CompressColumn( + const GlobalHeader &global_header, + const Real *data, MatrixIndexT stride, + int32 num_rows, CompressedMatrix::PerColHeader *header, + uint8 *byte_data) { + ComputeColHeader(global_header, data, stride, + num_rows, header); + + float p0 = Uint16ToFloat(global_header, header->percentile_0), + p25 = Uint16ToFloat(global_header, header->percentile_25), + p75 = Uint16ToFloat(global_header, header->percentile_75), + p100 = Uint16ToFloat(global_header, header->percentile_100); + + for (int32 i = 0; i < num_rows; i++) { + Real this_data = data[i * stride]; + byte_data[i] = FloatToChar(p0, p25, p75, p100, this_data); + } +} + +// static +void* CompressedMatrix::AllocateData(int32 num_bytes) { + KALDI_ASSERT(num_bytes > 0); + KALDI_COMPILE_TIME_ASSERT(sizeof(float) == 4); + // round size up to nearest number of floats. 
+ return reinterpret_cast(new float[(num_bytes/3) + 4]); +} + +void CompressedMatrix::Write(std::ostream &os, bool binary) const { + if (binary) { // Binary-mode write: + if (data_ != NULL) { + GlobalHeader &h = *reinterpret_cast(data_); + DataFormat format = static_cast(h.format); + if (format == kOneByteWithColHeaders) { + WriteToken(os, binary, "CM"); + } else if (format == kTwoByte) { + WriteToken(os, binary, "CM2"); + } else if (format == kOneByte) { + WriteToken(os, binary, "CM3"); + } + MatrixIndexT size = DataSize(h); // total size of data in data_ + // We don't write out the "int32 format", hence the + 4, - 4. + os.write(reinterpret_cast(data_) + 4, size - 4); + } else { // special case: where data_ == NULL, we treat it as an empty + // matrix. + WriteToken(os, binary, "CM"); + GlobalHeader h; + h.range = h.min_value = 0.0; + h.num_rows = h.num_cols = 0; + os.write(reinterpret_cast(&h), sizeof(h)); + } + } else { + // In text mode, just use the same format as a regular matrix. + // This is not compressed. + Matrix temp_mat(this->NumRows(), this->NumCols(), + kUndefined); + this->CopyToMat(&temp_mat); + temp_mat.Write(os, binary); + } + if (os.fail()) + KALDI_ERR << "Error writing compressed matrix to stream."; +} + +void CompressedMatrix::Read(std::istream &is, bool binary) { + if (data_ != NULL) { + delete [] (static_cast(data_)); + data_ = NULL; + } + if (binary) { + int peekval = Peek(is, binary); + if (peekval == 'C') { + std::string tok; // Should be CM (format 1) or CM2 (format 2) + ReadToken(is, binary, &tok); + GlobalHeader h; + if (tok == "CM") { h.format = 1; } // kOneByteWithColHeaders + else if (tok == "CM2") { h.format = 2; } // kTwoByte + else if (tok == "CM3") { h.format = 3; } // kOneByte + else { + KALDI_ERR << "Unexpected token " << tok << ", expecting CM, CM2 or CM3"; + } + // don't read the "format" -> hence + 4, - 4. 
+ is.read(reinterpret_cast(&h) + 4, sizeof(h) - 4); + if (is.fail()) + KALDI_ERR << "Failed to read header"; + if (h.num_cols == 0) // empty matrix. + return; + int32 size = DataSize(h), remaining_size = size - sizeof(GlobalHeader); + data_ = AllocateData(size); + *(reinterpret_cast(data_)) = h; + is.read(reinterpret_cast(data_) + sizeof(GlobalHeader), + remaining_size); + } else { + // Assume that what we're reading is a regular Matrix. This might be the + // case if you changed your code, making a Matrix into a CompressedMatrix, + // and you want back-compatibility for reading. + Matrix M; + M.Read(is, binary); // This will crash if it was not a Matrix. + this->CopyFromMat(M); + } + } else { // Text-mode read. In this case you don't get to + // choose the compression type. Anyway this branch would only + // be taken when debugging. + Matrix temp; + temp.Read(is, binary); + this->CopyFromMat(temp); + } + if (is.fail()) + KALDI_ERR << "Failed to read data."; +} + +template +void CompressedMatrix::CopyToMat(MatrixBase *mat, + MatrixTransposeType trans) const { + if (trans == kTrans) { + Matrix temp(this->NumCols(), this->NumRows()); + CopyToMat(&temp, kNoTrans); + mat->CopyFromMat(temp, kTrans); + return; + } + + if (data_ == NULL) { + KALDI_ASSERT(mat->NumRows() == 0); + KALDI_ASSERT(mat->NumCols() == 0); + return; + } + GlobalHeader *h = reinterpret_cast(data_); + int32 num_cols = h->num_cols, num_rows = h->num_rows; + KALDI_ASSERT(mat->NumRows() == num_rows); + KALDI_ASSERT(mat->NumCols() == num_cols); + + DataFormat format = static_cast(h->format); + if (format == kOneByteWithColHeaders) { + PerColHeader *per_col_header = reinterpret_cast(h+1); + uint8 *byte_data = reinterpret_cast(per_col_header + + h->num_cols); + for (int32 i = 0; i < num_cols; i++, per_col_header++) { + float p0 = Uint16ToFloat(*h, per_col_header->percentile_0), + p25 = Uint16ToFloat(*h, per_col_header->percentile_25), + p75 = Uint16ToFloat(*h, per_col_header->percentile_75), + p100 = 
Uint16ToFloat(*h, per_col_header->percentile_100); + for (int32 j = 0; j < num_rows; j++, byte_data++) { + float f = CharToFloat(p0, p25, p75, p100, *byte_data); + (*mat)(j, i) = f; + } + } + } else if (format == kTwoByte) { + const uint16 *data = reinterpret_cast(h + 1); + float min_value = h->min_value, + increment = h->range * (1.0 / 65535.0); + for (int32 i = 0; i < num_rows; i++) { + Real *row_data = mat->RowData(i); + for (int32 j = 0; j < num_cols; j++) + row_data[j] = min_value + data[j] * increment; + data += num_cols; + } + } else { + KALDI_ASSERT(format == kOneByte); + float min_value = h->min_value, increment = h->range * (1.0 / 255.0); + + const uint8 *data = reinterpret_cast(h + 1); + for (int32 i = 0; i < num_rows; i++) { + Real *row_data = mat->RowData(i); + for (int32 j = 0; j < num_cols; j++) + row_data[j] = min_value + data[j] * increment; + data += num_cols; + } + } +} + +// Instantiate the template for float and double. +template +void CompressedMatrix::CopyToMat(MatrixBase *mat, + MatrixTransposeType trans) const; +template +void CompressedMatrix::CopyToMat(MatrixBase *mat, + MatrixTransposeType trans) const; + +template +void CompressedMatrix::CopyRowToVec(MatrixIndexT row, + VectorBase *v) const { + KALDI_ASSERT(row < this->NumRows()); + KALDI_ASSERT(row >= 0); + KALDI_ASSERT(v->Dim() == this->NumCols()); + + GlobalHeader *h = reinterpret_cast(data_); + DataFormat format = static_cast(h->format); + if (format == kOneByteWithColHeaders) { + PerColHeader *per_col_header = reinterpret_cast(h+1); + uint8 *byte_data = reinterpret_cast(per_col_header + + h->num_cols); + byte_data += row; // point to first value we are interested in + for (int32 i = 0; i < h->num_cols; + i++, per_col_header++, byte_data += h->num_rows) { + float p0 = Uint16ToFloat(*h, per_col_header->percentile_0), + p25 = Uint16ToFloat(*h, per_col_header->percentile_25), + p75 = Uint16ToFloat(*h, per_col_header->percentile_75), + p100 = Uint16ToFloat(*h, 
per_col_header->percentile_100); + float f = CharToFloat(p0, p25, p75, p100, *byte_data); + (*v)(i) = f; + } + } else if (format == kTwoByte) { + int32 num_cols = h->num_cols; + float min_value = h->min_value, + increment = h->range * (1.0 / 65535.0); + const uint16 *row_data = reinterpret_cast(h + 1) + (num_cols * row); + Real *v_data = v->Data(); + for (int32 c = 0; c < num_cols; c++) + v_data[c] = min_value + row_data[c] * increment; + } else { + KALDI_ASSERT(format == kOneByte); + int32 num_cols = h->num_cols; + float min_value = h->min_value, + increment = h->range * (1.0 / 255.0); + const uint8 *row_data = reinterpret_cast(h + 1) + (num_cols * row); + Real *v_data = v->Data(); + for (int32 c = 0; c < num_cols; c++) + v_data[c] = min_value + row_data[c] * increment; + } +} + +template +void CompressedMatrix::CopyColToVec(MatrixIndexT col, + VectorBase *v) const { + KALDI_ASSERT(col < this->NumCols()); + KALDI_ASSERT(col >= 0); + KALDI_ASSERT(v->Dim() == this->NumRows()); + + GlobalHeader *h = reinterpret_cast(data_); + + DataFormat format = static_cast(h->format); + if (format == kOneByteWithColHeaders) { + PerColHeader *per_col_header = reinterpret_cast(h+1); + uint8 *byte_data = reinterpret_cast(per_col_header + + h->num_cols); + byte_data += col*h->num_rows; // point to first value in the column we want + per_col_header += col; + float p0 = Uint16ToFloat(*h, per_col_header->percentile_0), + p25 = Uint16ToFloat(*h, per_col_header->percentile_25), + p75 = Uint16ToFloat(*h, per_col_header->percentile_75), + p100 = Uint16ToFloat(*h, per_col_header->percentile_100); + for (int32 i = 0; i < h->num_rows; i++, byte_data++) { + float f = CharToFloat(p0, p25, p75, p100, *byte_data); + (*v)(i) = f; + } + } else if (format == kTwoByte) { + int32 num_rows = h->num_rows, num_cols = h->num_cols; + float min_value = h->min_value, + increment = h->range * (1.0 / 65535.0); + const uint16 *col_data = reinterpret_cast(h + 1) + col; + Real *v_data = v->Data(); + for (int32 r = 
0; r < num_rows; r++) + v_data[r] = min_value + increment * col_data[r * num_cols]; + } else { + KALDI_ASSERT(format == kOneByte); + int32 num_rows = h->num_rows, num_cols = h->num_cols; + float min_value = h->min_value, + increment = h->range * (1.0 / 255.0); + const uint8 *col_data = reinterpret_cast(h + 1) + col; + Real *v_data = v->Data(); + for (int32 r = 0; r < num_rows; r++) + v_data[r] = min_value + increment * col_data[r * num_cols]; + } +} + +// instantiate the templates. +template void +CompressedMatrix::CopyColToVec(MatrixIndexT, VectorBase *) const; +template void +CompressedMatrix::CopyColToVec(MatrixIndexT, VectorBase *) const; +template void +CompressedMatrix::CopyRowToVec(MatrixIndexT, VectorBase *) const; +template void +CompressedMatrix::CopyRowToVec(MatrixIndexT, VectorBase *) const; + +template +void CompressedMatrix::CopyToMat(int32 row_offset, + int32 col_offset, + MatrixBase *dest) const { + KALDI_PARANOID_ASSERT(row_offset < this->NumRows()); + KALDI_PARANOID_ASSERT(col_offset < this->NumCols()); + KALDI_PARANOID_ASSERT(row_offset >= 0); + KALDI_PARANOID_ASSERT(col_offset >= 0); + KALDI_ASSERT(row_offset+dest->NumRows() <= this->NumRows()); + KALDI_ASSERT(col_offset+dest->NumCols() <= this->NumCols()); + // everything is OK + GlobalHeader *h = reinterpret_cast(data_); + int32 num_rows = h->num_rows, num_cols = h->num_cols, + tgt_cols = dest->NumCols(), tgt_rows = dest->NumRows(); + + DataFormat format = static_cast(h->format); + if (format == kOneByteWithColHeaders) { + PerColHeader *per_col_header = reinterpret_cast(h+1); + uint8 *byte_data = reinterpret_cast(per_col_header + + h->num_cols); + + uint8 *start_of_subcol = byte_data+row_offset; // skip appropriate + // number of rows + start_of_subcol += col_offset*num_rows; // skip appropriate number of columns + + per_col_header += col_offset; // skip the appropriate number of headers + + for (int32 i = 0; + i < tgt_cols; + i++, per_col_header++, start_of_subcol+=num_rows) { + byte_data = 
start_of_subcol; + float p0 = Uint16ToFloat(*h, per_col_header->percentile_0), + p25 = Uint16ToFloat(*h, per_col_header->percentile_25), + p75 = Uint16ToFloat(*h, per_col_header->percentile_75), + p100 = Uint16ToFloat(*h, per_col_header->percentile_100); + for (int32 j = 0; j < tgt_rows; j++, byte_data++) { + float f = CharToFloat(p0, p25, p75, p100, *byte_data); + (*dest)(j, i) = f; + } + } + } else if (format == kTwoByte) { + const uint16 *data = reinterpret_cast(h+1) + col_offset + + (num_cols * row_offset); + float min_value = h->min_value, + increment = h->range * (1.0 / 65535.0); + + for (int32 row = 0; row < tgt_rows; row++) { + Real *dest_row = dest->RowData(row); + for (int32 col = 0; col < tgt_cols; col++) + dest_row[col] = min_value + increment * data[col]; + data += num_cols; + } + } else { + KALDI_ASSERT(format == kOneByte); + const uint8 *data = reinterpret_cast(h+1) + col_offset + + (num_cols * row_offset); + float min_value = h->min_value, + increment = h->range * (1.0 / 255.0); + for (int32 row = 0; row < tgt_rows; row++) { + Real *dest_row = dest->RowData(row); + for (int32 col = 0; col < tgt_cols; col++) + dest_row[col] = min_value + increment * data[col]; + data += num_cols; + } + } +} + +// instantiate the templates. +template void CompressedMatrix::CopyToMat(int32, + int32, + MatrixBase *dest) const; +template void CompressedMatrix::CopyToMat(int32, + int32, + MatrixBase *dest) const; + +void CompressedMatrix::Clear() { + if (data_ != NULL) { + delete [] static_cast(data_); + data_ = NULL; + } +} + +CompressedMatrix::CompressedMatrix(const CompressedMatrix &mat): data_(NULL) { + *this = mat; // use assignment operator. +} + +CompressedMatrix &CompressedMatrix::operator = (const CompressedMatrix &mat) { + Clear(); // now this->data_ == NULL. 
+ if (mat.data_ != NULL) { + MatrixIndexT data_size = DataSize(*static_cast(mat.data_)); + data_ = AllocateData(data_size); + memcpy(static_cast(data_), + static_cast(mat.data_), + data_size); + } + return *this; +} + + +} // namespace kaldi diff --git a/speechx/speechx/kaldi/matrix/compressed-matrix.h b/speechx/speechx/kaldi/matrix/compressed-matrix.h new file mode 100644 index 00000000..78105b9b --- /dev/null +++ b/speechx/speechx/kaldi/matrix/compressed-matrix.h @@ -0,0 +1,283 @@ +// matrix/compressed-matrix.h + +// Copyright 2012 Johns Hopkins University (author: Daniel Povey) +// Frantisek Skala, Wei Shi + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_MATRIX_COMPRESSED_MATRIX_H_ +#define KALDI_MATRIX_COMPRESSED_MATRIX_H_ 1 + +#include "matrix/kaldi-matrix.h" + +namespace kaldi { + +/// \addtogroup matrix_group +/// @{ + + + +/* + The enum CompressionMethod is used when creating a CompressedMatrix (a lossily + compressed matrix) from a regular Matrix. It dictates how we choose the + compressed format and how we choose the ranges of floats that are represented + by particular integers. + + kAutomaticMethod = 1 This is the default when you don't specify the + compression method. It is a shorthand for using + kSpeechFeature if the num-rows is more than 8, and + kTwoByteAuto otherwise. 
+ kSpeechFeature = 2 This is the most complicated of the compression methods, + and was designed for speech features which have a roughly + Gaussian distribution with different ranges for each + dimension. Each element is stored in one byte, but there + is an 8-byte header per column; the spacing of the + integer values is not uniform but is in 3 ranges. + kTwoByteAuto = 3 Each element is stored in two bytes as a uint16, with + the representable range of values chosen automatically + with the minimum and maximum elements of the matrix as + its edges. + kTwoByteSignedInteger = 4 + Each element is stored in two bytes as a uint16, with + the representable range of values chosen to coincide with + what you'd get if you stored signed integers, i.e. + [-32768.0, 32767.0]. Suitable for waveform data that + was previously stored as 16-bit PCM. + kOneByteAuto = 5 Each element is stored in one byte as a uint8, with the + representable range of values chosen automatically with + the minimum and maximum elements of the matrix as its + edges. + kOneByteUnsignedInteger = 6 Each element is stored in + one byte as a uint8, with the representable range of + values equal to [0.0, 255.0]. + kOneByteZeroOne = 7 Each element is stored in + one byte as a uint8, with the representable range of + values equal to [0.0, 1.0]. Suitable for image data + that has previously been compressed as int8. + + // We can add new methods here as needed: if they just imply different ways + // of selecting the min_value and range, and a num-bytes = 1 or 2, they will + // be trivial to implement. +*/ +enum CompressionMethod { + kAutomaticMethod = 1, + kSpeechFeature = 2, + kTwoByteAuto = 3, + kTwoByteSignedInteger = 4, + kOneByteAuto = 5, + kOneByteUnsignedInteger = 6, + kOneByteZeroOne = 7 +}; + + +/* + This class does lossy compression of a matrix. It supports various compression + methods, see enum CompressionMethod. 
+*/ + +class CompressedMatrix { + public: + CompressedMatrix(): data_(NULL) { } + + ~CompressedMatrix() { Clear(); } + + template + explicit CompressedMatrix(const MatrixBase &mat, + CompressionMethod method = kAutomaticMethod): + data_(NULL) { CopyFromMat(mat, method); } + + /// Initializer that can be used to select part of an existing + /// CompressedMatrix without un-compressing and re-compressing (note: unlike + /// similar initializers for class Matrix, it doesn't point to the same memory + /// location). + /// + /// This creates a CompressedMatrix with the size (num_rows, num_cols) + /// starting at (row_offset, col_offset). + /// + /// If you specify allow_padding = true, + /// it is permitted to have row_offset < 0 and + /// row_offset + num_rows > mat.NumRows(), and the result will contain + /// repeats of the first and last rows of 'mat' as necessary. + CompressedMatrix(const CompressedMatrix &mat, + const MatrixIndexT row_offset, + const MatrixIndexT num_rows, + const MatrixIndexT col_offset, + const MatrixIndexT num_cols, + bool allow_padding = false); + + void *Data() const { return this->data_; } + + /// This will resize *this and copy the contents of mat to *this. + template + void CopyFromMat(const MatrixBase &mat, + CompressionMethod method = kAutomaticMethod); + + CompressedMatrix(const CompressedMatrix &mat); + + CompressedMatrix &operator = (const CompressedMatrix &mat); // assignment operator. + + template + CompressedMatrix &operator = (const MatrixBase &mat); // assignment operator. + + /// Copies contents to matrix. Note: mat must have the correct size. + /// The kTrans case uses a temporary. + template + void CopyToMat(MatrixBase *mat, + MatrixTransposeType trans = kNoTrans) const; + + void Write(std::ostream &os, bool binary) const; + + void Read(std::istream &is, bool binary); + + /// Returns number of rows (or zero for empty matrix). + inline MatrixIndexT NumRows() const { return (data_ == NULL) ? 
0 : + (*reinterpret_cast(data_)).num_rows; } + + /// Returns number of columns (or zero for empty matrix). + inline MatrixIndexT NumCols() const { return (data_ == NULL) ? 0 : + (*reinterpret_cast(data_)).num_cols; } + + /// Copies row #row of the matrix into vector v. + /// Note: v must have same size as #cols. + template + void CopyRowToVec(MatrixIndexT row, VectorBase *v) const; + + /// Copies column #col of the matrix into vector v. + /// Note: v must have same size as #rows. + template + void CopyColToVec(MatrixIndexT col, VectorBase *v) const; + + /// Copies submatrix of compressed matrix into matrix dest. + /// Submatrix starts at row row_offset and column column_offset and its size + /// is defined by size of provided matrix dest + template + void CopyToMat(int32 row_offset, + int32 column_offset, + MatrixBase *dest) const; + + void Swap(CompressedMatrix *other) { std::swap(data_, other->data_); } + + void Clear(); + + /// scales all elements of matrix by alpha. + /// It scales the floating point values in GlobalHeader by alpha. + void Scale(float alpha); + + friend class Matrix; + friend class Matrix; + private: + + // This enum describes the different compressed-data formats: these are + // distinct from the compression methods although all of the methods apart + // from kAutomaticMethod dictate a particular compressed-data format. + // + // kOneByteWithColHeaders means there is a GlobalHeader and each + // column has a PerColHeader; the actual data is stored in + // one byte per element, in column-major order (the mapping + // from integers to floats is a little complicated). 
+ // kTwoByte means there is a global header but no PerColHeader; + // the actual data is stored in two bytes per element in + // row-major order; it's decompressed as: + // uint16 i; GlobalHeader g; + // float f = g.min_value + i * (g.range / 65535.0) + // kOneByte means there is a global header but no PerColHeader; + // the data is stored in one byte per element in row-major + // order and is decompressed as: + // uint8 i; GlobalHeader g; + // float f = g.min_value + i * (g.range / 255.0) + enum DataFormat { + kOneByteWithColHeaders = 1, + kTwoByte = 2, + kOneByte = 3 + }; + + + // allocates data using new [], ensures byte alignment + // sufficient for float. + static void *AllocateData(int32 num_bytes); + + struct GlobalHeader { + int32 format; // Represents the enum DataFormat. + float min_value; // min_value and range represent the ranges of the integer + // data in the kTwoByte and kOneByte formats, and the + // range of the PerColHeader uint16's in the + // kOneByteWithColHeaders format. + float range; + int32 num_rows; + int32 num_cols; + }; + + // This function computes the global header for compressing this data. + template + static inline void ComputeGlobalHeader(const MatrixBase &mat, + CompressionMethod method, + GlobalHeader *header); + + + // The number of bytes we need to request when allocating 'data_'. + static MatrixIndexT DataSize(const GlobalHeader &header); + + // This struct is only used in format kOneByteWithColHeaders. 
+ struct PerColHeader { + uint16 percentile_0; + uint16 percentile_25; + uint16 percentile_75; + uint16 percentile_100; + }; + + template + static void CompressColumn(const GlobalHeader &global_header, + const Real *data, MatrixIndexT stride, + int32 num_rows, PerColHeader *header, + uint8 *byte_data); + template + static void ComputeColHeader(const GlobalHeader &global_header, + const Real *data, MatrixIndexT stride, + int32 num_rows, PerColHeader *header); + + static inline uint16 FloatToUint16(const GlobalHeader &global_header, + float value); + + // this is used only in the kOneByte compression format. + static inline uint8 FloatToUint8(const GlobalHeader &global_header, + float value); + + static inline float Uint16ToFloat(const GlobalHeader &global_header, + uint16 value); + + // this is used only in the kOneByteWithColHeaders compression format. + static inline uint8 FloatToChar(float p0, float p25, + float p75, float p100, + float value); + + // this is used only in the kOneByteWithColHeaders compression format. + static inline float CharToFloat(float p0, float p25, + float p75, float p100, + uint8 value); + + void *data_; // first GlobalHeader, then PerColHeader (repeated), then + // the byte data for each column (repeated). Note: don't intersperse + // the byte data with the PerColHeaders, because of alignment issues. + +}; + +/// @} end of \addtogroup matrix_group + + +} // namespace kaldi + + +#endif // KALDI_MATRIX_COMPRESSED_MATRIX_H_ diff --git a/speechx/speechx/kaldi/matrix/jama-eig.h b/speechx/speechx/kaldi/matrix/jama-eig.h new file mode 100644 index 00000000..92d8c27e --- /dev/null +++ b/speechx/speechx/kaldi/matrix/jama-eig.h @@ -0,0 +1,924 @@ +// matrix/jama-eig.h + +// Copyright 2009-2011 Microsoft Corporation + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +// This file consists of a port and modification of materials from +// JAMA: A Java Matrix Package +// under the following notice: This software is a cooperative product of +// The MathWorks and the National Institute of Standards and Technology (NIST) +// which has been released to the public. This notice and the original code are +// available at http://math.nist.gov/javanumerics/jama/domain.notice + + + +#ifndef KALDI_MATRIX_JAMA_EIG_H_ +#define KALDI_MATRIX_JAMA_EIG_H_ 1 + +#include "matrix/kaldi-matrix.h" + +namespace kaldi { + +// This class is not to be used externally. See the Eig function in the Matrix +// class in kaldi-matrix.h. This is the external interface. + +template class EigenvalueDecomposition { + // This class is based on the EigenvalueDecomposition class from the JAMA + // library (version 1.0.2). + public: + EigenvalueDecomposition(const MatrixBase &A); + + ~EigenvalueDecomposition(); // free memory. + + void GetV(MatrixBase *V_out) { // V is what we call P externally; it's the matrix of + // eigenvectors. + KALDI_ASSERT(V_out->NumRows() == static_cast(n_) + && V_out->NumCols() == static_cast(n_)); + for (int i = 0; i < n_; i++) + for (int j = 0; j < n_; j++) + (*V_out)(i, j) = V(i, j); // V(i, j) is member function. + } + void GetRealEigenvalues(VectorBase *r_out) { + // returns real part of eigenvalues. 
+ KALDI_ASSERT(r_out->Dim() == static_cast(n_)); + for (int i = 0; i < n_; i++) + (*r_out)(i) = d_[i]; + } + void GetImagEigenvalues(VectorBase *i_out) { + // returns imaginary part of eigenvalues. + KALDI_ASSERT(i_out->Dim() == static_cast(n_)); + for (int i = 0; i < n_; i++) + (*i_out)(i) = e_[i]; + } + private: + + inline Real &H(int r, int c) { return H_[r*n_ + c]; } + inline Real &V(int r, int c) { return V_[r*n_ + c]; } + + // complex division + inline static void cdiv(Real xr, Real xi, Real yr, Real yi, Real *cdivr, Real *cdivi) { + Real r, d; + if (std::abs(yr) > std::abs(yi)) { + r = yi/yr; + d = yr + r*yi; + *cdivr = (xr + r*xi)/d; + *cdivi = (xi - r*xr)/d; + } else { + r = yr/yi; + d = yi + r*yr; + *cdivr = (r*xr + xi)/d; + *cdivi = (r*xi - xr)/d; + } + } + + // Nonsymmetric reduction from Hessenberg to real Schur form. + void Hqr2 (); + + + int n_; // matrix dimension. + + Real *d_, *e_; // real and imaginary parts of eigenvalues. + Real *V_; // the eigenvectors (P in our external notation) + Real *H_; // the nonsymmetric Hessenberg form. + Real *ort_; // working storage for nonsymmetric algorithm. + + // Symmetric Householder reduction to tridiagonal form. + void Tred2 (); + + // Symmetric tridiagonal QL algorithm. + void Tql2 (); + + // Nonsymmetric reduction to Hessenberg form. + void Orthes (); + +}; + +template class EigenvalueDecomposition; // force instantiation. +template class EigenvalueDecomposition; // force instantiation. + +template void EigenvalueDecomposition::Tred2() { + // This is derived from the Algol procedures tred2 by + // Bowdler, Martin, Reinsch, and Wilkinson, Handbook for + // Auto. Comp., Vol.ii-Linear Algebra, and the corresponding + // Fortran subroutine in EISPACK. + + for (int j = 0; j < n_; j++) { + d_[j] = V(n_-1, j); + } + + // Householder reduction to tridiagonal form. + + for (int i = n_-1; i > 0; i--) { + + // Scale to avoid under/overflow. 
+ + Real scale = 0.0; + Real h = 0.0; + for (int k = 0; k < i; k++) { + scale = scale + std::abs(d_[k]); + } + if (scale == 0.0) { + e_[i] = d_[i-1]; + for (int j = 0; j < i; j++) { + d_[j] = V(i-1, j); + V(i, j) = 0.0; + V(j, i) = 0.0; + } + } else { + + // Generate Householder vector. + + for (int k = 0; k < i; k++) { + d_[k] /= scale; + h += d_[k] * d_[k]; + } + Real f = d_[i-1]; + Real g = std::sqrt(h); + if (f > 0) { + g = -g; + } + e_[i] = scale * g; + h = h - f * g; + d_[i-1] = f - g; + for (int j = 0; j < i; j++) { + e_[j] = 0.0; + } + + // Apply similarity transformation to remaining columns. + + for (int j = 0; j < i; j++) { + f = d_[j]; + V(j, i) = f; + g =e_[j] + V(j, j) * f; + for (int k = j+1; k <= i-1; k++) { + g += V(k, j) * d_[k]; + e_[k] += V(k, j) * f; + } + e_[j] = g; + } + f = 0.0; + for (int j = 0; j < i; j++) { + e_[j] /= h; + f += e_[j] * d_[j]; + } + Real hh = f / (h + h); + for (int j = 0; j < i; j++) { + e_[j] -= hh * d_[j]; + } + for (int j = 0; j < i; j++) { + f = d_[j]; + g = e_[j]; + for (int k = j; k <= i-1; k++) { + V(k, j) -= (f * e_[k] + g * d_[k]); + } + d_[j] = V(i-1, j); + V(i, j) = 0.0; + } + } + d_[i] = h; + } + + // Accumulate transformations. + + for (int i = 0; i < n_-1; i++) { + V(n_-1, i) = V(i, i); + V(i, i) = 1.0; + Real h = d_[i+1]; + if (h != 0.0) { + for (int k = 0; k <= i; k++) { + d_[k] = V(k, i+1) / h; + } + for (int j = 0; j <= i; j++) { + Real g = 0.0; + for (int k = 0; k <= i; k++) { + g += V(k, i+1) * V(k, j); + } + for (int k = 0; k <= i; k++) { + V(k, j) -= g * d_[k]; + } + } + } + for (int k = 0; k <= i; k++) { + V(k, i+1) = 0.0; + } + } + for (int j = 0; j < n_; j++) { + d_[j] = V(n_-1, j); + V(n_-1, j) = 0.0; + } + V(n_-1, n_-1) = 1.0; + e_[0] = 0.0; +} + +template void EigenvalueDecomposition::Tql2() { + // This is derived from the Algol procedures tql2, by + // Bowdler, Martin, Reinsch, and Wilkinson, Handbook for + // Auto. 
Comp., Vol.ii-Linear Algebra, and the corresponding + // Fortran subroutine in EISPACK. + + for (int i = 1; i < n_; i++) { + e_[i-1] = e_[i]; + } + e_[n_-1] = 0.0; + + Real f = 0.0; + Real tst1 = 0.0; + Real eps = std::numeric_limits::epsilon(); + for (int l = 0; l < n_; l++) { + + // Find small subdiagonal element + + tst1 = std::max(tst1, std::abs(d_[l]) + std::abs(e_[l])); + int m = l; + while (m < n_) { + if (std::abs(e_[m]) <= eps*tst1) { + break; + } + m++; + } + + // If m == l, d_[l] is an eigenvalue, + // otherwise, iterate. + + if (m > l) { + int iter = 0; + do { + iter = iter + 1; // (Could check iteration count here.) + + // Compute implicit shift + + Real g = d_[l]; + Real p = (d_[l+1] - g) / (2.0 *e_[l]); + Real r = Hypot(p, static_cast(1.0)); // This is a Kaldi version of hypot that works with templates. + if (p < 0) { + r = -r; + } + d_[l] =e_[l] / (p + r); + d_[l+1] =e_[l] * (p + r); + Real dl1 = d_[l+1]; + Real h = g - d_[l]; + for (int i = l+2; i < n_; i++) { + d_[i] -= h; + } + f = f + h; + + // Implicit QL transformation. + + p = d_[m]; + Real c = 1.0; + Real c2 = c; + Real c3 = c; + Real el1 =e_[l+1]; + Real s = 0.0; + Real s2 = 0.0; + for (int i = m-1; i >= l; i--) { + c3 = c2; + c2 = c; + s2 = s; + g = c *e_[i]; + h = c * p; + r = Hypot(p, e_[i]); // This is a Kaldi version of Hypot that works with templates. + e_[i+1] = s * r; + s =e_[i] / r; + c = p / r; + p = c * d_[i] - s * g; + d_[i+1] = h + s * (c * g + s * d_[i]); + + // Accumulate transformation. + + for (int k = 0; k < n_; k++) { + h = V(k, i+1); + V(k, i+1) = s * V(k, i) + c * h; + V(k, i) = c * V(k, i) - s * h; + } + } + p = -s * s2 * c3 * el1 *e_[l] / dl1; + e_[l] = s * p; + d_[l] = c * p; + + // Check for convergence. + + } while (std::abs(e_[l]) > eps*tst1); + } + d_[l] = d_[l] + f; + e_[l] = 0.0; + } + + // Sort eigenvalues and corresponding vectors. 
+ + for (int i = 0; i < n_-1; i++) { + int k = i; + Real p = d_[i]; + for (int j = i+1; j < n_; j++) { + if (d_[j] < p) { + k = j; + p = d_[j]; + } + } + if (k != i) { + d_[k] = d_[i]; + d_[i] = p; + for (int j = 0; j < n_; j++) { + p = V(j, i); + V(j, i) = V(j, k); + V(j, k) = p; + } + } + } +} + +template +void EigenvalueDecomposition::Orthes() { + + // This is derived from the Algol procedures orthes and ortran, + // by Martin and Wilkinson, Handbook for Auto. Comp., + // Vol.ii-Linear Algebra, and the corresponding + // Fortran subroutines in EISPACK. + + int low = 0; + int high = n_-1; + + for (int m = low+1; m <= high-1; m++) { + + // Scale column. + + Real scale = 0.0; + for (int i = m; i <= high; i++) { + scale = scale + std::abs(H(i, m-1)); + } + if (scale != 0.0) { + + // Compute Householder transformation. + + Real h = 0.0; + for (int i = high; i >= m; i--) { + ort_[i] = H(i, m-1)/scale; + h += ort_[i] * ort_[i]; + } + Real g = std::sqrt(h); + if (ort_[m] > 0) { + g = -g; + } + h = h - ort_[m] * g; + ort_[m] = ort_[m] - g; + + // Apply Householder similarity transformation + // H = (I-u*u'/h)*H*(I-u*u'/h) + + for (int j = m; j < n_; j++) { + Real f = 0.0; + for (int i = high; i >= m; i--) { + f += ort_[i]*H(i, j); + } + f = f/h; + for (int i = m; i <= high; i++) { + H(i, j) -= f*ort_[i]; + } + } + + for (int i = 0; i <= high; i++) { + Real f = 0.0; + for (int j = high; j >= m; j--) { + f += ort_[j]*H(i, j); + } + f = f/h; + for (int j = m; j <= high; j++) { + H(i, j) -= f*ort_[j]; + } + } + ort_[m] = scale*ort_[m]; + H(m, m-1) = scale*g; + } + } + + // Accumulate transformations (Algol's ortran). + + for (int i = 0; i < n_; i++) { + for (int j = 0; j < n_; j++) { + V(i, j) = (i == j ? 
1.0 : 0.0); + } + } + + for (int m = high-1; m >= low+1; m--) { + if (H(m, m-1) != 0.0) { + for (int i = m+1; i <= high; i++) { + ort_[i] = H(i, m-1); + } + for (int j = m; j <= high; j++) { + Real g = 0.0; + for (int i = m; i <= high; i++) { + g += ort_[i] * V(i, j); + } + // Double division avoids possible underflow + g = (g / ort_[m]) / H(m, m-1); + for (int i = m; i <= high; i++) { + V(i, j) += g * ort_[i]; + } + } + } + } +} + +template void EigenvalueDecomposition::Hqr2() { + // This is derived from the Algol procedure hqr2, + // by Martin and Wilkinson, Handbook for Auto. Comp., + // Vol.ii-Linear Algebra, and the corresponding + // Fortran subroutine in EISPACK. + + int nn = n_; + int n = nn-1; + int low = 0; + int high = nn-1; + Real eps = std::numeric_limits::epsilon(); + Real exshift = 0.0; + Real p = 0, q = 0, r = 0, s = 0, z=0, t, w, x, y; + + // Store roots isolated by balanc and compute matrix norm + + Real norm = 0.0; + for (int i = 0; i < nn; i++) { + if (i < low || i > high) { + d_[i] = H(i, i); + e_[i] = 0.0; + } + for (int j = std::max(i-1, 0); j < nn; j++) { + norm = norm + std::abs(H(i, j)); + } + } + + // Outer loop over eigenvalue index + + int iter = 0; + while (n >= low) { + + // Look for single small sub-diagonal element + + int l = n; + while (l > low) { + s = std::abs(H(l-1, l-1)) + std::abs(H(l, l)); + if (s == 0.0) { + s = norm; + } + if (std::abs(H(l, l-1)) < eps * s) { + break; + } + l--; + } + + // Check for convergence + // One root found + + if (l == n) { + H(n, n) = H(n, n) + exshift; + d_[n] = H(n, n); + e_[n] = 0.0; + n--; + iter = 0; + + // Two roots found + + } else if (l == n-1) { + w = H(n, n-1) * H(n-1, n); + p = (H(n-1, n-1) - H(n, n)) / 2.0; + q = p * p + w; + z = std::sqrt(std::abs(q)); + H(n, n) = H(n, n) + exshift; + H(n-1, n-1) = H(n-1, n-1) + exshift; + x = H(n, n); + + // Real pair + + if (q >= 0) { + if (p >= 0) { + z = p + z; + } else { + z = p - z; + } + d_[n-1] = x + z; + d_[n] = d_[n-1]; + if (z != 0.0) { + 
d_[n] = x - w / z; + } + e_[n-1] = 0.0; + e_[n] = 0.0; + x = H(n, n-1); + s = std::abs(x) + std::abs(z); + p = x / s; + q = z / s; + r = std::sqrt(p * p+q * q); + p = p / r; + q = q / r; + + // Row modification + + for (int j = n-1; j < nn; j++) { + z = H(n-1, j); + H(n-1, j) = q * z + p * H(n, j); + H(n, j) = q * H(n, j) - p * z; + } + + // Column modification + + for (int i = 0; i <= n; i++) { + z = H(i, n-1); + H(i, n-1) = q * z + p * H(i, n); + H(i, n) = q * H(i, n) - p * z; + } + + // Accumulate transformations + + for (int i = low; i <= high; i++) { + z = V(i, n-1); + V(i, n-1) = q * z + p * V(i, n); + V(i, n) = q * V(i, n) - p * z; + } + + // Complex pair + + } else { + d_[n-1] = x + p; + d_[n] = x + p; + e_[n-1] = z; + e_[n] = -z; + } + n = n - 2; + iter = 0; + + // No convergence yet + + } else { + + // Form shift + + x = H(n, n); + y = 0.0; + w = 0.0; + if (l < n) { + y = H(n-1, n-1); + w = H(n, n-1) * H(n-1, n); + } + + // Wilkinson's original ad hoc shift + + if (iter == 10) { + exshift += x; + for (int i = low; i <= n; i++) { + H(i, i) -= x; + } + s = std::abs(H(n, n-1)) + std::abs(H(n-1, n-2)); + x = y = 0.75 * s; + w = -0.4375 * s * s; + } + + // MATLAB's new ad hoc shift + + if (iter == 30) { + s = (y - x) / 2.0; + s = s * s + w; + if (s > 0) { + s = std::sqrt(s); + if (y < x) { + s = -s; + } + s = x - w / ((y - x) / 2.0 + s); + for (int i = low; i <= n; i++) { + H(i, i) -= s; + } + exshift += s; + x = y = w = 0.964; + } + } + + iter = iter + 1; // (Could check iteration count here.) 
+ + // Look for two consecutive small sub-diagonal elements + + int m = n-2; + while (m >= l) { + z = H(m, m); + r = x - z; + s = y - z; + p = (r * s - w) / H(m+1, m) + H(m, m+1); + q = H(m+1, m+1) - z - r - s; + r = H(m+2, m+1); + s = std::abs(p) + std::abs(q) + std::abs(r); + p = p / s; + q = q / s; + r = r / s; + if (m == l) { + break; + } + if (std::abs(H(m, m-1)) * (std::abs(q) + std::abs(r)) < + eps * (std::abs(p) * (std::abs(H(m-1, m-1)) + std::abs(z) + + std::abs(H(m+1, m+1))))) { + break; + } + m--; + } + + for (int i = m+2; i <= n; i++) { + H(i, i-2) = 0.0; + if (i > m+2) { + H(i, i-3) = 0.0; + } + } + + // Double QR step involving rows l:n and columns m:n + + for (int k = m; k <= n-1; k++) { + bool notlast = (k != n-1); + if (k != m) { + p = H(k, k-1); + q = H(k+1, k-1); + r = (notlast ? H(k+2, k-1) : 0.0); + x = std::abs(p) + std::abs(q) + std::abs(r); + if (x != 0.0) { + p = p / x; + q = q / x; + r = r / x; + } + } + if (x == 0.0) { + break; + } + s = std::sqrt(p * p + q * q + r * r); + if (p < 0) { + s = -s; + } + if (s != 0) { + if (k != m) { + H(k, k-1) = -s * x; + } else if (l != m) { + H(k, k-1) = -H(k, k-1); + } + p = p + s; + x = p / s; + y = q / s; + z = r / s; + q = q / p; + r = r / p; + + // Row modification + + for (int j = k; j < nn; j++) { + p = H(k, j) + q * H(k+1, j); + if (notlast) { + p = p + r * H(k+2, j); + H(k+2, j) = H(k+2, j) - p * z; + } + H(k, j) = H(k, j) - p * x; + H(k+1, j) = H(k+1, j) - p * y; + } + + // Column modification + + for (int i = 0; i <= std::min(n, k+3); i++) { + p = x * H(i, k) + y * H(i, k+1); + if (notlast) { + p = p + z * H(i, k+2); + H(i, k+2) = H(i, k+2) - p * r; + } + H(i, k) = H(i, k) - p; + H(i, k+1) = H(i, k+1) - p * q; + } + + // Accumulate transformations + + for (int i = low; i <= high; i++) { + p = x * V(i, k) + y * V(i, k+1); + if (notlast) { + p = p + z * V(i, k+2); + V(i, k+2) = V(i, k+2) - p * r; + } + V(i, k) = V(i, k) - p; + V(i, k+1) = V(i, k+1) - p * q; + } + } // (s != 0) + } // k loop + } 
// check convergence + } // while (n >= low) + + // Backsubstitute to find vectors of upper triangular form + + if (norm == 0.0) { + return; + } + + for (n = nn-1; n >= 0; n--) { + p = d_[n]; + q = e_[n]; + + // Real vector + + if (q == 0) { + int l = n; + H(n, n) = 1.0; + for (int i = n-1; i >= 0; i--) { + w = H(i, i) - p; + r = 0.0; + for (int j = l; j <= n; j++) { + r = r + H(i, j) * H(j, n); + } + if (e_[i] < 0.0) { + z = w; + s = r; + } else { + l = i; + if (e_[i] == 0.0) { + if (w != 0.0) { + H(i, n) = -r / w; + } else { + H(i, n) = -r / (eps * norm); + } + + // Solve real equations + + } else { + x = H(i, i+1); + y = H(i+1, i); + q = (d_[i] - p) * (d_[i] - p) +e_[i] *e_[i]; + t = (x * s - z * r) / q; + H(i, n) = t; + if (std::abs(x) > std::abs(z)) { + H(i+1, n) = (-r - w * t) / x; + } else { + H(i+1, n) = (-s - y * t) / z; + } + } + + // Overflow control + + t = std::abs(H(i, n)); + if ((eps * t) * t > 1) { + for (int j = i; j <= n; j++) { + H(j, n) = H(j, n) / t; + } + } + } + } + + // Complex vector + + } else if (q < 0) { + int l = n-1; + + // Last vector component imaginary so matrix is triangular + + if (std::abs(H(n, n-1)) > std::abs(H(n-1, n))) { + H(n-1, n-1) = q / H(n, n-1); + H(n-1, n) = -(H(n, n) - p) / H(n, n-1); + } else { + Real cdivr, cdivi; + cdiv(0.0, -H(n-1, n), H(n-1, n-1)-p, q, &cdivr, &cdivi); + H(n-1, n-1) = cdivr; + H(n-1, n) = cdivi; + } + H(n, n-1) = 0.0; + H(n, n) = 1.0; + for (int i = n-2; i >= 0; i--) { + Real ra, sa, vr, vi; + ra = 0.0; + sa = 0.0; + for (int j = l; j <= n; j++) { + ra = ra + H(i, j) * H(j, n-1); + sa = sa + H(i, j) * H(j, n); + } + w = H(i, i) - p; + + if (e_[i] < 0.0) { + z = w; + r = ra; + s = sa; + } else { + l = i; + if (e_[i] == 0) { + Real cdivr, cdivi; + cdiv(-ra, -sa, w, q, &cdivr, &cdivi); + H(i, n-1) = cdivr; + H(i, n) = cdivi; + } else { + Real cdivr, cdivi; + // Solve complex equations + + x = H(i, i+1); + y = H(i+1, i); + vr = (d_[i] - p) * (d_[i] - p) +e_[i] *e_[i] - q * q; + vi = (d_[i] - p) * 2.0 
* q; + if (vr == 0.0 && vi == 0.0) { + vr = eps * norm * (std::abs(w) + std::abs(q) + + std::abs(x) + std::abs(y) + std::abs(z)); + } + cdiv(x*r-z*ra+q*sa, x*s-z*sa-q*ra, vr, vi, &cdivr, &cdivi); + H(i, n-1) = cdivr; + H(i, n) = cdivi; + if (std::abs(x) > (std::abs(z) + std::abs(q))) { + H(i+1, n-1) = (-ra - w * H(i, n-1) + q * H(i, n)) / x; + H(i+1, n) = (-sa - w * H(i, n) - q * H(i, n-1)) / x; + } else { + cdiv(-r-y*H(i, n-1), -s-y*H(i, n), z, q, &cdivr, &cdivi); + H(i+1, n-1) = cdivr; + H(i+1, n) = cdivi; + } + } + + // Overflow control + + t = std::max(std::abs(H(i, n-1)), std::abs(H(i, n))); + if ((eps * t) * t > 1) { + for (int j = i; j <= n; j++) { + H(j, n-1) = H(j, n-1) / t; + H(j, n) = H(j, n) / t; + } + } + } + } + } + } + + // Vectors of isolated roots + + for (int i = 0; i < nn; i++) { + if (i < low || i > high) { + for (int j = i; j < nn; j++) { + V(i, j) = H(i, j); + } + } + } + + // Back transformation to get eigenvectors of original matrix + + for (int j = nn-1; j >= low; j--) { + for (int i = low; i <= high; i++) { + z = 0.0; + for (int k = low; k <= std::min(j, high); k++) { + z = z + V(i, k) * H(k, j); + } + V(i, j) = z; + } + } +} + +template +EigenvalueDecomposition::EigenvalueDecomposition(const MatrixBase &A) { + KALDI_ASSERT(A.NumCols() == A.NumRows() && A.NumCols() >= 1); + n_ = A.NumRows(); + V_ = new Real[n_*n_]; + d_ = new Real[n_]; + e_ = new Real[n_]; + H_ = NULL; + ort_ = NULL; + if (A.IsSymmetric(0.0)) { + + for (int i = 0; i < n_; i++) + for (int j = 0; j < n_; j++) + V(i, j) = A(i, j); // Note that V(i, j) is a member function; A(i, j) is an operator + // of the matrix A. + // Tridiagonalize. + Tred2(); + + // Diagonalize. + Tql2(); + } else { + H_ = new Real[n_*n_]; + ort_ = new Real[n_]; + for (int i = 0; i < n_; i++) + for (int j = 0; j < n_; j++) + H(i, j) = A(i, j); // as before: H is member function, A(i, j) is operator of matrix. + + // Reduce to Hessenberg form. + Orthes(); + + // Reduce Hessenberg to real Schur form. 
+ Hqr2(); + } +} + +template +EigenvalueDecomposition::~EigenvalueDecomposition() { + delete [] d_; + delete [] e_; + delete [] V_; + delete [] H_; + delete [] ort_; +} + +// see function MatrixBase::Eig in kaldi-matrix.cc + + +} // namespace kaldi + +#endif // KALDI_MATRIX_JAMA_EIG_H_ diff --git a/speechx/speechx/kaldi/matrix/jama-svd.h b/speechx/speechx/kaldi/matrix/jama-svd.h new file mode 100644 index 00000000..8304dac6 --- /dev/null +++ b/speechx/speechx/kaldi/matrix/jama-svd.h @@ -0,0 +1,531 @@ +// matrix/jama-svd.h + +// Copyright 2009-2011 Microsoft Corporation + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +// This file consists of a port and modification of materials from +// JAMA: A Java Matrix Package +// under the following notice: This software is a cooperative product of +// The MathWorks and the National Institute of Standards and Technology (NIST) +// which has been released to the public. 
This notice and the original code are +// available at http://math.nist.gov/javanumerics/jama/domain.notice + + +#ifndef KALDI_MATRIX_JAMA_SVD_H_ +#define KALDI_MATRIX_JAMA_SVD_H_ 1 + + +#include "matrix/kaldi-matrix.h" +#include "matrix/sp-matrix.h" +#include "matrix/cblas-wrappers.h" + +namespace kaldi { + +#if defined(HAVE_ATLAS) || defined(USE_KALDI_SVD) +// using ATLAS as our math library, which doesn't have SVD -> need +// to implement it. + +// This routine is a modified form of jama_svd.h which is part of the TNT distribution. +// (originally comes from JAMA). + +/** Singular Value Decomposition. + *

+ * For an m-by-n matrix A with m >= n, the singular value decomposition is + * an m-by-n orthogonal matrix U, an n-by-n diagonal matrix S, and + * an n-by-n orthogonal matrix V so that A = U*S*V'. + *

+ * The singular values, sigma[k] = S(k, k), are ordered so that + * sigma[0] >= sigma[1] >= ... >= sigma[n-1]. + *

+ * The singular value decomposition always exists, so the constructor will + * never fail. The matrix condition number and the effective numerical + * rank can be computed from this decomposition. + + *

+ * (Adapted from JAMA, a Java Matrix Library, developed by jointly + * by the Mathworks and NIST; see http://math.nist.gov/javanumerics/jama). + */ + + +template +bool MatrixBase::JamaSvd(VectorBase *s_in, + MatrixBase *U_in, + MatrixBase *V_in) { // Destructive! + KALDI_ASSERT(s_in != NULL && U_in != this && V_in != this); + int wantu = (U_in != NULL), wantv = (V_in != NULL); + Matrix Utmp, Vtmp; + MatrixBase &U = (U_in ? *U_in : Utmp), &V = (V_in ? *V_in : Vtmp); + VectorBase &s = *s_in; + + int m = num_rows_, n = num_cols_; + KALDI_ASSERT(m>=n && m != 0 && n != 0); + if (wantu) KALDI_ASSERT((int)U.num_rows_ == m && (int)U.num_cols_ == n); + if (wantv) KALDI_ASSERT((int)V.num_rows_ == n && (int)V.num_cols_ == n); + KALDI_ASSERT((int)s.Dim() == n); // n<=m so n is min. + + int nu = n; + U.SetZero(); // make sure all zero. + Vector e(n); + Vector work(m); + MatrixBase &A(*this); + Real *adata = A.Data(), *workdata = work.Data(), *edata = e.Data(), + *udata = U.Data(), *vdata = V.Data(); + int astride = static_cast(A.Stride()), + ustride = static_cast(U.Stride()), + vstride = static_cast(V.Stride()); + int i = 0, j = 0, k = 0; + + // Reduce A to bidiagonal form, storing the diagonal elements + // in s and the super-diagonal elements in e. + + int nct = std::min(m-1, n); + int nrt = std::max(0, std::min(n-2, m)); + for (k = 0; k < std::max(nct, nrt); k++) { + if (k < nct) { + + // Compute the transformation for the k-th column and + // place the k-th diagonal in s(k). + // Compute 2-norm of k-th column without under/overflow. + s(k) = 0; + for (i = k; i < m; i++) { + s(k) = hypot(s(k), A(i, k)); + } + if (s(k) != 0.0) { + if (A(k, k) < 0.0) { + s(k) = -s(k); + } + for (i = k; i < m; i++) { + A(i, k) /= s(k); + } + A(k, k) += 1.0; + } + s(k) = -s(k); + } + for (j = k+1; j < n; j++) { + if ((k < nct) && (s(k) != 0.0)) { + + // Apply the transformation. 
+ + Real t = cblas_Xdot(m - k, adata + astride*k + k, astride, + adata + astride*k + j, astride); + /*for (i = k; i < m; i++) { + t += adata[i*astride + k]*adata[i*astride + j]; // A(i, k)*A(i, j); // 3 + }*/ + t = -t/A(k, k); + cblas_Xaxpy(m - k, t, adata + k*astride + k, astride, + adata + k*astride + j, astride); + /*for (i = k; i < m; i++) { + adata[i*astride + j] += t*adata[i*astride + k]; // A(i, j) += t*A(i, k); // 5 + }*/ + } + + // Place the k-th row of A into e for the + // subsequent calculation of the row transformation. + + e(j) = A(k, j); + } + if (wantu & (k < nct)) { + + // Place the transformation in U for subsequent back + // multiplication. + + for (i = k; i < m; i++) { + U(i, k) = A(i, k); + } + } + if (k < nrt) { + + // Compute the k-th row transformation and place the + // k-th super-diagonal in e(k). + // Compute 2-norm without under/overflow. + e(k) = 0; + for (i = k+1; i < n; i++) { + e(k) = hypot(e(k), e(i)); + } + if (e(k) != 0.0) { + if (e(k+1) < 0.0) { + e(k) = -e(k); + } + for (i = k+1; i < n; i++) { + e(i) /= e(k); + } + e(k+1) += 1.0; + } + e(k) = -e(k); + if ((k+1 < m) & (e(k) != 0.0)) { + + // Apply the transformation. + + for (i = k+1; i < m; i++) { + work(i) = 0.0; + } + for (j = k+1; j < n; j++) { + for (i = k+1; i < m; i++) { + workdata[i] += edata[j] * adata[i*astride + j]; // work(i) += e(j)*A(i, j); // 5 + } + } + for (j = k+1; j < n; j++) { + Real t(-e(j)/e(k+1)); + cblas_Xaxpy(m - (k+1), t, workdata + (k+1), 1, + adata + (k+1)*astride + j, astride); + /* + for (i = k+1; i < m; i++) { + adata[i*astride + j] += t*workdata[i]; // A(i, j) += t*work(i); // 5 + }*/ + } + } + if (wantv) { + + // Place the transformation in V for subsequent + // back multiplication. + + for (i = k+1; i < n; i++) { + V(i, k) = e(i); + } + } + } + } + + // Set up the final bidiagonal matrix or order p. 
+ + int p = std::min(n, m+1); + if (nct < n) { + s(nct) = A(nct, nct); + } + if (m < p) { + s(p-1) = 0.0; + } + if (nrt+1 < p) { + e(nrt) = A(nrt, p-1); + } + e(p-1) = 0.0; + + // If required, generate U. + + if (wantu) { + for (j = nct; j < nu; j++) { + for (i = 0; i < m; i++) { + U(i, j) = 0.0; + } + U(j, j) = 1.0; + } + for (k = nct-1; k >= 0; k--) { + if (s(k) != 0.0) { + for (j = k+1; j < nu; j++) { + Real t = cblas_Xdot(m - k, udata + k*ustride + k, ustride, udata + k*ustride + j, ustride); + //for (i = k; i < m; i++) { + // t += udata[i*ustride + k]*udata[i*ustride + j]; // t += U(i, k)*U(i, j); // 8 + // } + t = -t/U(k, k); + cblas_Xaxpy(m - k, t, udata + ustride*k + k, ustride, + udata + k*ustride + j, ustride); + /*for (i = k; i < m; i++) { + udata[i*ustride + j] += t*udata[i*ustride + k]; // U(i, j) += t*U(i, k); // 4 + }*/ + } + for (i = k; i < m; i++ ) { + U(i, k) = -U(i, k); + } + U(k, k) = 1.0 + U(k, k); + for (i = 0; i < k-1; i++) { + U(i, k) = 0.0; + } + } else { + for (i = 0; i < m; i++) { + U(i, k) = 0.0; + } + U(k, k) = 1.0; + } + } + } + + // If required, generate V. + + if (wantv) { + for (k = n-1; k >= 0; k--) { + if ((k < nrt) & (e(k) != 0.0)) { + for (j = k+1; j < nu; j++) { + Real t = cblas_Xdot(n - (k+1), vdata + (k+1)*vstride + k, vstride, + vdata + (k+1)*vstride + j, vstride); + /*Real t (0.0); + for (i = k+1; i < n; i++) { + t += vdata[i*vstride + k]*vdata[i*vstride + j]; // t += V(i, k)*V(i, j); // 7 + }*/ + t = -t/V(k+1, k); + cblas_Xaxpy(n - (k+1), t, vdata + (k+1)*vstride + k, vstride, + vdata + (k+1)*vstride + j, vstride); + /*for (i = k+1; i < n; i++) { + vdata[i*vstride + j] += t*vdata[i*vstride + k]; // V(i, j) += t*V(i, k); // 7 + }*/ + } + } + for (i = 0; i < n; i++) { + V(i, k) = 0.0; + } + V(k, k) = 1.0; + } + } + + // Main iteration loop for the singular values. 
+ + int pp = p-1; + int iter = 0; + // note: -52.0 is from Jama code; the -23 is the extension + // to float, because mantissa length in (double, float) + // is (52, 23) bits respectively. + Real eps(pow(2.0, sizeof(Real) == 4 ? -23.0 : -52.0)); + // Note: the -966 was taken from Jama code, but the -120 is a guess + // of how to extend this to float... the exponent in double goes + // from -1022 .. 1023, and in float from -126..127. I'm not sure + // what the significance of 966 is, so -120 just represents a number + // that's a bit less negative than -126. If we get convergence + // failure in float only, this may mean that we have to make the + // -120 value less negative. + Real tiny(pow(2.0, sizeof(Real) == 4 ? -120.0: -966.0 )); + + while (p > 0) { + int k = 0; + int kase = 0; + + if (iter == 500 || iter == 750) { + KALDI_WARN << "Svd taking a long time: making convergence criterion less exact."; + eps = pow(static_cast(0.8), eps); + tiny = pow(static_cast(0.8), tiny); + } + if (iter > 1000) { + KALDI_WARN << "Svd not converging on matrix of size " << m << " by " <= -1; k--) { + if (k == -1) { + break; + } + if (std::abs(e(k)) <= + tiny + eps*(std::abs(s(k)) + std::abs(s(k+1)))) { + e(k) = 0.0; + break; + } + } + if (k == p-2) { + kase = 4; + } else { + int ks; + for (ks = p-1; ks >= k; ks--) { + if (ks == k) { + break; + } + Real t( (ks != p ? std::abs(e(ks)) : 0.) + + (ks != k+1 ? std::abs(e(ks-1)) : 0.)); + if (std::abs(s(ks)) <= tiny + eps*t) { + s(ks) = 0.0; + break; + } + } + if (ks == k) { + kase = 3; + } else if (ks == p-1) { + kase = 1; + } else { + kase = 2; + k = ks; + } + } + k++; + + // Perform the task indicated by kase. + + switch (kase) { + + // Deflate negligible s(p). 
+ + case 1: { + Real f(e(p-2)); + e(p-2) = 0.0; + for (j = p-2; j >= k; j--) { + Real t( hypot(s(j), f)); + Real cs(s(j)/t); + Real sn(f/t); + s(j) = t; + if (j != k) { + f = -sn*e(j-1); + e(j-1) = cs*e(j-1); + } + if (wantv) { + for (i = 0; i < n; i++) { + t = cs*V(i, j) + sn*V(i, p-1); + V(i, p-1) = -sn*V(i, j) + cs*V(i, p-1); + V(i, j) = t; + } + } + } + } + break; + + // Split at negligible s(k). + + case 2: { + Real f(e(k-1)); + e(k-1) = 0.0; + for (j = k; j < p; j++) { + Real t(hypot(s(j), f)); + Real cs( s(j)/t); + Real sn(f/t); + s(j) = t; + f = -sn*e(j); + e(j) = cs*e(j); + if (wantu) { + for (i = 0; i < m; i++) { + t = cs*U(i, j) + sn*U(i, k-1); + U(i, k-1) = -sn*U(i, j) + cs*U(i, k-1); + U(i, j) = t; + } + } + } + } + break; + + // Perform one qr step. + + case 3: { + + // Calculate the shift. + + Real scale = std::max(std::max(std::max(std::max( + std::abs(s(p-1)), std::abs(s(p-2))), std::abs(e(p-2))), + std::abs(s(k))), std::abs(e(k))); + Real sp = s(p-1)/scale; + Real spm1 = s(p-2)/scale; + Real epm1 = e(p-2)/scale; + Real sk = s(k)/scale; + Real ek = e(k)/scale; + Real b = ((spm1 + sp)*(spm1 - sp) + epm1*epm1)/2.0; + Real c = (sp*epm1)*(sp*epm1); + Real shift = 0.0; + if ((b != 0.0) || (c != 0.0)) { + shift = std::sqrt(b*b + c); + if (b < 0.0) { + shift = -shift; + } + shift = c/(b + shift); + } + Real f = (sk + sp)*(sk - sp) + shift; + Real g = sk*ek; + + // Chase zeros. 
+ + for (j = k; j < p-1; j++) { + Real t = hypot(f, g); + Real cs = f/t; + Real sn = g/t; + if (j != k) { + e(j-1) = t; + } + f = cs*s(j) + sn*e(j); + e(j) = cs*e(j) - sn*s(j); + g = sn*s(j+1); + s(j+1) = cs*s(j+1); + if (wantv) { + cblas_Xrot(n, vdata + j, vstride, vdata + j+1, vstride, cs, sn); + /*for (i = 0; i < n; i++) { + t = cs*vdata[i*vstride + j] + sn*vdata[i*vstride + j+1]; // t = cs*V(i, j) + sn*V(i, j+1); // 13 + vdata[i*vstride + j+1] = -sn*vdata[i*vstride + j] + cs*vdata[i*vstride + j+1]; // V(i, j+1) = -sn*V(i, j) + cs*V(i, j+1); // 5 + vdata[i*vstride + j] = t; // V(i, j) = t; // 4 + }*/ + } + t = hypot(f, g); + cs = f/t; + sn = g/t; + s(j) = t; + f = cs*e(j) + sn*s(j+1); + s(j+1) = -sn*e(j) + cs*s(j+1); + g = sn*e(j+1); + e(j+1) = cs*e(j+1); + if (wantu && (j < m-1)) { + cblas_Xrot(m, udata + j, ustride, udata + j+1, ustride, cs, sn); + /*for (i = 0; i < m; i++) { + t = cs*udata[i*ustride + j] + sn*udata[i*ustride + j+1]; // t = cs*U(i, j) + sn*U(i, j+1); // 7 + udata[i*ustride + j+1] = -sn*udata[i*ustride + j] +cs*udata[i*ustride + j+1]; // U(i, j+1) = -sn*U(i, j) + cs*U(i, j+1); // 8 + udata[i*ustride + j] = t; // U(i, j) = t; // 1 + }*/ + } + } + e(p-2) = f; + iter = iter + 1; + } + break; + + // Convergence. + + case 4: { + + // Make the singular values positive. + + if (s(k) <= 0.0) { + s(k) = (s(k) < 0.0 ? -s(k) : 0.0); + if (wantv) { + for (i = 0; i <= pp; i++) { + V(i, k) = -V(i, k); + } + } + } + + // Order the singular values. 
+ + while (k < pp) { + if (s(k) >= s(k+1)) { + break; + } + Real t = s(k); + s(k) = s(k+1); + s(k+1) = t; + if (wantv && (k < n-1)) { + for (i = 0; i < n; i++) { + t = V(i, k+1); V(i, k+1) = V(i, k); V(i, k) = t; + } + } + if (wantu && (k < m-1)) { + for (i = 0; i < m; i++) { + t = U(i, k+1); U(i, k+1) = U(i, k); U(i, k) = t; + } + } + k++; + } + iter = 0; + p--; + } + break; + } + } + return true; +} + +#endif // defined(HAVE_ATLAS) || defined(USE_KALDI_SVD) + +} // namespace kaldi + +#endif // KALDI_MATRIX_JAMA_SVD_H_ diff --git a/speechx/speechx/kaldi/matrix/kaldi-blas.h b/speechx/speechx/kaldi/matrix/kaldi-blas.h new file mode 100644 index 00000000..b08d8c51 --- /dev/null +++ b/speechx/speechx/kaldi/matrix/kaldi-blas.h @@ -0,0 +1,133 @@ +// matrix/kaldi-blas.h + +// Copyright 2009-2011 Ondrej Glembek; Microsoft Corporation + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. +#ifndef KALDI_MATRIX_KALDI_BLAS_H_ +#define KALDI_MATRIX_KALDI_BLAS_H_ + +// This file handles the #includes for BLAS, LAPACK and so on. +// It manipulates the declarations into a common format that kaldi can handle. +// However, the kaldi code will check whether HAVE_ATLAS is defined as that +// code is called a bit differently from CLAPACK that comes from other sources. 
+ +// There are three alternatives: +// (i) you have ATLAS, which includes the ATLAS implementation of CBLAS +// plus a subset of CLAPACK (but with clapack_ in the function declarations). +// In this case, define HAVE_ATLAS and make sure the relevant directories are +// in the include path. + +// (ii) you have CBLAS (some implementation thereof) plus CLAPACK. +// In this case, define HAVE_CLAPACK. +// [Since CLAPACK depends on BLAS, the presence of BLAS is implicit]. + +// (iii) you have the MKL library, which includes CLAPACK and CBLAS. + +// Note that if we are using ATLAS, no Svd implementation is supplied, +// so we define HAVE_Svd to be zero and this directs our implementation to +// supply its own "by hand" implementation which is based on TNT code. + + + +#define HAVE_OPENBLAS + +#if (defined(HAVE_CLAPACK) && (defined(HAVE_ATLAS) || defined(HAVE_MKL))) \ + || (defined(HAVE_ATLAS) && defined(HAVE_MKL)) +#error "Do not define more than one of HAVE_CLAPACK, HAVE_ATLAS and HAVE_MKL" +#endif + +#ifdef HAVE_ATLAS + extern "C" { + #include "cblas.h" + #include "clapack.h" + } +#elif defined(HAVE_CLAPACK) + #ifdef __APPLE__ + #ifndef __has_extension + #define __has_extension(x) 0 + #endif + #define vImage_Utilities_h + #define vImage_CVUtilities_h + #include + typedef __CLPK_integer integer; + typedef __CLPK_logical logical; + typedef __CLPK_real real; + typedef __CLPK_doublereal doublereal; + typedef __CLPK_complex complex; + typedef __CLPK_doublecomplex doublecomplex; + typedef __CLPK_ftnlen ftnlen; + #else + extern "C" { + // May be in /usr/[local]/include if installed; else this uses the one + // from the tools/CLAPACK_include directory. + #include + #include + #include + + // get rid of macros from f2c.h -- these are dangerous. 
+ #undef abs + #undef dabs + #undef min + #undef max + #undef dmin + #undef dmax + #undef bit_test + #undef bit_clear + #undef bit_set + } + #endif +#elif defined(HAVE_MKL) + extern "C" { + #include + } +#elif defined(HAVE_OPENBLAS) + // getting cblas.h and lapacke.h from /. + // putting in "" not <> to search -I before system libraries. + #include "third_party/openblas/cblas.h" + #include "third_party/openblas/lapacke.h" + #undef I + #undef complex + // get rid of macros from f2c.h -- these are dangerous. + #undef abs + #undef dabs + #undef min + #undef max + #undef dmin + #undef dmax + #undef bit_test + #undef bit_clear + #undef bit_set +#else + #error "You need to define (using the preprocessor) either HAVE_CLAPACK or HAVE_ATLAS or HAVE_MKL (but not more than one)" +#endif + +#ifdef HAVE_OPENBLAS +typedef int KaldiBlasInt; // try int. +#endif +#ifdef HAVE_CLAPACK +typedef integer KaldiBlasInt; +#endif +#ifdef HAVE_MKL +typedef MKL_INT KaldiBlasInt; +#endif + +#ifdef HAVE_ATLAS +// in this case there is no need for KaldiBlasInt-- this typedef is only needed +// for Svd code which is not included in ATLAS (we re-implement it). +#endif + + +#endif // KALDI_MATRIX_KALDI_BLAS_H_ diff --git a/speechx/speechx/kaldi/matrix/kaldi-matrix-inl.h b/speechx/speechx/kaldi/matrix/kaldi-matrix-inl.h new file mode 100644 index 00000000..c2ff0079 --- /dev/null +++ b/speechx/speechx/kaldi/matrix/kaldi-matrix-inl.h @@ -0,0 +1,63 @@ +// matrix/kaldi-matrix-inl.h + +// Copyright 2009-2011 Microsoft Corporation; Haihua Xu + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_MATRIX_KALDI_MATRIX_INL_H_ +#define KALDI_MATRIX_KALDI_MATRIX_INL_H_ 1 + +#include "matrix/kaldi-vector.h" + +namespace kaldi { + +/// Empty constructor +template +Matrix::Matrix(): MatrixBase(NULL, 0, 0, 0) { } + + +template<> +template<> +void MatrixBase::AddVecVec(const float alpha, const VectorBase &ra, const VectorBase &rb); + +template<> +template<> +void MatrixBase::AddVecVec(const double alpha, const VectorBase &ra, const VectorBase &rb); + +template +inline std::ostream & operator << (std::ostream & os, const MatrixBase & M) { + M.Write(os, false); + return os; +} + +template +inline std::istream & operator >> (std::istream & is, Matrix & M) { + M.Read(is, false); + return is; +} + + +template +inline std::istream & operator >> (std::istream & is, MatrixBase & M) { + M.Read(is, false); + return is; +} + +}// namespace kaldi + + +#endif // KALDI_MATRIX_KALDI_MATRIX_INL_H_ + diff --git a/speechx/speechx/kaldi/matrix/kaldi-matrix.cc b/speechx/speechx/kaldi/matrix/kaldi-matrix.cc new file mode 100644 index 00000000..faf23cdf --- /dev/null +++ b/speechx/speechx/kaldi/matrix/kaldi-matrix.cc @@ -0,0 +1,3103 @@ +// matrix/kaldi-matrix.cc + +// Copyright 2009-2011 Lukas Burget; Ondrej Glembek; Go Vivace Inc.; +// Microsoft Corporation; Saarland University; +// Yanmin Qian; Petr Schwarz; Jan Silovsky; +// Haihua Xu +// 2017 Shiyin Kang +// 2019 Yiwen Shao + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, 
Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "matrix/kaldi-matrix.h" +#include "matrix/sp-matrix.h" +#include "matrix/jama-svd.h" +#include "matrix/jama-eig.h" +#include "matrix/compressed-matrix.h" +#include "matrix/sparse-matrix.h" + +static_assert(int(kaldi::kNoTrans) == int(CblasNoTrans) && int(kaldi::kTrans) == int(CblasTrans), + "kaldi::kNoTrans and kaldi::kTrans must be equal to the appropriate CBLAS library constants!"); + +namespace kaldi { + +template +void MatrixBase::Invert(Real *log_det, Real *det_sign, + bool inverse_needed) { + KALDI_ASSERT(num_rows_ == num_cols_); + if (num_rows_ == 0) { + if (det_sign) *det_sign = 1; + if (log_det) *log_det = 0.0; + return; + } +#ifndef HAVE_ATLAS + KaldiBlasInt *pivot = new KaldiBlasInt[num_rows_]; + KaldiBlasInt M = num_rows_; + KaldiBlasInt N = num_cols_; + KaldiBlasInt LDA = stride_; + KaldiBlasInt result = -1; + KaldiBlasInt l_work = std::max(1, N); + Real *p_work; + void *temp; + if ((p_work = static_cast( + KALDI_MEMALIGN(16, sizeof(Real)*l_work, &temp))) == NULL) { + delete[] pivot; + throw std::bad_alloc(); + } + + clapack_Xgetrf2(&M, &N, data_, &LDA, pivot, &result); + const int pivot_offset = 1; +#else + int *pivot = new int[num_rows_]; + int result; + clapack_Xgetrf(num_rows_, num_cols_, data_, stride_, pivot, &result); + const int pivot_offset = 0; +#endif + KALDI_ASSERT(result >= 0 && "Call to CLAPACK sgetrf_ or ATLAS clapack_sgetrf " + "called with wrong 
arguments"); + if (result > 0) { + if (inverse_needed) { + KALDI_ERR << "Cannot invert: matrix is singular"; + } else { + if (log_det) *log_det = -std::numeric_limits::infinity(); + if (det_sign) *det_sign = 0; + delete[] pivot; +#ifndef HAVE_ATLAS + KALDI_MEMALIGN_FREE(p_work); +#endif + return; + } + } + if (det_sign != NULL) { + int sign = 1; + for (MatrixIndexT i = 0; i < num_rows_; i++) + if (pivot[i] != static_cast(i) + pivot_offset) sign *= -1; + *det_sign = sign; + } + if (log_det != NULL || det_sign != NULL) { // Compute log determinant. + if (log_det != NULL) *log_det = 0.0; + Real prod = 1.0; + for (MatrixIndexT i = 0; i < num_rows_; i++) { + prod *= (*this)(i, i); + if (i == num_rows_ - 1 || std::fabs(prod) < 1.0e-10 || + std::fabs(prod) > 1.0e+10) { + if (log_det != NULL) *log_det += kaldi::Log(std::fabs(prod)); + if (det_sign != NULL) *det_sign *= (prod > 0 ? 1.0 : -1.0); + prod = 1.0; + } + } + } +#ifndef HAVE_ATLAS + if (inverse_needed) clapack_Xgetri2(&M, data_, &LDA, pivot, p_work, &l_work, + &result); + delete[] pivot; + KALDI_MEMALIGN_FREE(p_work); +#else + if (inverse_needed) + clapack_Xgetri(num_rows_, data_, stride_, pivot, &result); + delete [] pivot; +#endif + KALDI_ASSERT(result == 0 && "Call to CLAPACK sgetri_ or ATLAS clapack_sgetri " + "called with wrong arguments"); +} + +template<> +template<> +void MatrixBase::AddVecVec(const float alpha, + const VectorBase &a, + const VectorBase &rb) { + KALDI_ASSERT(a.Dim() == num_rows_ && rb.Dim() == num_cols_); + cblas_Xger(a.Dim(), rb.Dim(), alpha, a.Data(), 1, rb.Data(), + 1, data_, stride_); +} + +template +template +void MatrixBase::AddVecVec(const Real alpha, + const VectorBase &a, + const VectorBase &b) { + KALDI_ASSERT(a.Dim() == num_rows_ && b.Dim() == num_cols_); + if (num_rows_ * num_cols_ > 100) { // It's probably worth it to allocate + // temporary vectors of the right type and use BLAS. 
+ Vector temp_a(a), temp_b(b); + cblas_Xger(num_rows_, num_cols_, alpha, temp_a.Data(), 1, + temp_b.Data(), 1, data_, stride_); + } else { + const OtherReal *a_data = a.Data(), *b_data = b.Data(); + Real *row_data = data_; + for (MatrixIndexT i = 0; i < num_rows_; i++, row_data += stride_) { + BaseFloat alpha_ai = alpha * a_data[i]; + for (MatrixIndexT j = 0; j < num_cols_; j++) + row_data[j] += alpha_ai * b_data[j]; + } + } +} + +// instantiate the template above. +template +void MatrixBase::AddVecVec(const float alpha, + const VectorBase &a, + const VectorBase &b); +template +void MatrixBase::AddVecVec(const double alpha, + const VectorBase &a, + const VectorBase &b); + +template<> +template<> +void MatrixBase::AddVecVec(const double alpha, + const VectorBase &a, + const VectorBase &rb) { + KALDI_ASSERT(a.Dim() == num_rows_ && rb.Dim() == num_cols_); + if (num_rows_ == 0) return; + cblas_Xger(a.Dim(), rb.Dim(), alpha, a.Data(), 1, rb.Data(), + 1, data_, stride_); +} + +template +void MatrixBase::AddMatMat(const Real alpha, + const MatrixBase& A, + MatrixTransposeType transA, + const MatrixBase& B, + MatrixTransposeType transB, + const Real beta) { + KALDI_ASSERT((transA == kNoTrans && transB == kNoTrans && A.num_cols_ == B.num_rows_ && A.num_rows_ == num_rows_ && B.num_cols_ == num_cols_) + || (transA == kTrans && transB == kNoTrans && A.num_rows_ == B.num_rows_ && A.num_cols_ == num_rows_ && B.num_cols_ == num_cols_) + || (transA == kNoTrans && transB == kTrans && A.num_cols_ == B.num_cols_ && A.num_rows_ == num_rows_ && B.num_rows_ == num_cols_) + || (transA == kTrans && transB == kTrans && A.num_rows_ == B.num_cols_ && A.num_cols_ == num_rows_ && B.num_rows_ == num_cols_)); + KALDI_ASSERT(&A != this && &B != this); + if (num_rows_ == 0) return; + cblas_Xgemm(alpha, transA, A.data_, A.num_rows_, A.num_cols_, A.stride_, + transB, B.data_, B.stride_, beta, data_, num_rows_, num_cols_, stride_); + +} + +template +void MatrixBase::SetMatMatDivMat(const MatrixBase& 
A, + const MatrixBase& B, + const MatrixBase& C) { + KALDI_ASSERT(A.NumRows() == B.NumRows() && A.NumCols() == B.NumCols()); + KALDI_ASSERT(A.NumRows() == C.NumRows() && A.NumCols() == C.NumCols()); + for (int32 r = 0; r < A.NumRows(); r++) { // each frame... + for (int32 c = 0; c < A.NumCols(); c++) { + BaseFloat i = C(r, c), o = B(r, c), od = A(r, c), + id; + if (i != 0.0) { + id = od * (o / i); /// o / i is either zero or "scale". + } else { + id = od; /// Just imagine the scale was 1.0. This is somehow true in + /// expectation; anyway, this case should basically never happen so it doesn't + /// really matter. + } + (*this)(r, c) = id; + } + } +} + + +template +void MatrixBase::CopyLowerToUpper() { + KALDI_ASSERT(num_rows_ == num_cols_); + Real *data = data_; + MatrixIndexT num_rows = num_rows_, stride = stride_; + for (int32 i = 0; i < num_rows; i++) + for (int32 j = 0; j < i; j++) + data[j * stride + i ] = data[i * stride + j]; +} + + +template +void MatrixBase::CopyUpperToLower() { + KALDI_ASSERT(num_rows_ == num_cols_); + Real *data = data_; + MatrixIndexT num_rows = num_rows_, stride = stride_; + for (int32 i = 0; i < num_rows; i++) + for (int32 j = 0; j < i; j++) + data[i * stride + j] = data[j * stride + i]; +} + +template +void MatrixBase::SymAddMat2(const Real alpha, + const MatrixBase &A, + MatrixTransposeType transA, + Real beta) { + KALDI_ASSERT(num_rows_ == num_cols_ && + ((transA == kNoTrans && A.num_rows_ == num_rows_) || + (transA == kTrans && A.num_cols_ == num_cols_))); + KALDI_ASSERT(A.data_ != data_); + if (num_rows_ == 0) return; + + /// When the matrix dimension(this->num_rows_) is not less than 56 + /// and the transpose type transA == kTrans, the cblas_Xsyrk(...) + /// function will produce NaN in the output. This is a bug in the + /// ATLAS library. To overcome this, the AddMatMat function, which calls + /// cblas_Xgemm(...) rather than cblas_Xsyrk(...), is used in this special + /// sitation. 
+ /// Wei Shi: Note this bug is observerd for single precision matrix + /// on a 64-bit machine +#ifdef HAVE_ATLAS + if (transA == kTrans && num_rows_ >= 56) { + this->AddMatMat(alpha, A, kTrans, A, kNoTrans, beta); + return; + } +#endif // HAVE_ATLAS + + MatrixIndexT A_other_dim = (transA == kNoTrans ? A.num_cols_ : A.num_rows_); + + // This function call is hard-coded to update the lower triangle. + cblas_Xsyrk(transA, num_rows_, A_other_dim, alpha, A.Data(), + A.Stride(), beta, this->data_, this->stride_); +} + + +template +void MatrixBase::AddMatSmat(const Real alpha, + const MatrixBase &A, + MatrixTransposeType transA, + const MatrixBase &B, + MatrixTransposeType transB, + const Real beta) { + KALDI_ASSERT((transA == kNoTrans && transB == kNoTrans && A.num_cols_ == B.num_rows_ && A.num_rows_ == num_rows_ && B.num_cols_ == num_cols_) + || (transA == kTrans && transB == kNoTrans && A.num_rows_ == B.num_rows_ && A.num_cols_ == num_rows_ && B.num_cols_ == num_cols_) + || (transA == kNoTrans && transB == kTrans && A.num_cols_ == B.num_cols_ && A.num_rows_ == num_rows_ && B.num_rows_ == num_cols_) + || (transA == kTrans && transB == kTrans && A.num_rows_ == B.num_cols_ && A.num_cols_ == num_rows_ && B.num_rows_ == num_cols_)); + KALDI_ASSERT(&A != this && &B != this); + + // We iterate over the columns of B. + + MatrixIndexT Astride = A.stride_, Bstride = B.stride_, stride = this->stride_, + Arows = A.num_rows_, Acols = A.num_cols_; + Real *data = this->data_, *Adata = A.data_, *Bdata = B.data_; + MatrixIndexT num_cols = this->num_cols_; + if (transB == kNoTrans) { + // Iterate over the columns of *this and of B. + for (MatrixIndexT c = 0; c < num_cols; c++) { + // for each column of *this, do + // [this column] = [alpha * A * this column of B] + [beta * this column] + Xgemv_sparsevec(transA, Arows, Acols, alpha, Adata, Astride, + Bdata + c, Bstride, beta, data + c, stride); + } + } else { + // Iterate over the columns of *this and the rows of B. 
+ for (MatrixIndexT c = 0; c < num_cols; c++) { + // for each column of *this, do + // [this column] = [alpha * A * this row of B] + [beta * this column] + Xgemv_sparsevec(transA, Arows, Acols, alpha, Adata, Astride, + Bdata + (c * Bstride), 1, beta, data + c, stride); + } + } +} + +template +void MatrixBase::AddSmatMat(const Real alpha, + const MatrixBase &A, + MatrixTransposeType transA, + const MatrixBase &B, + MatrixTransposeType transB, + const Real beta) { + KALDI_ASSERT((transA == kNoTrans && transB == kNoTrans && A.num_cols_ == B.num_rows_ && A.num_rows_ == num_rows_ && B.num_cols_ == num_cols_) + || (transA == kTrans && transB == kNoTrans && A.num_rows_ == B.num_rows_ && A.num_cols_ == num_rows_ && B.num_cols_ == num_cols_) + || (transA == kNoTrans && transB == kTrans && A.num_cols_ == B.num_cols_ && A.num_rows_ == num_rows_ && B.num_rows_ == num_cols_) + || (transA == kTrans && transB == kTrans && A.num_rows_ == B.num_cols_ && A.num_cols_ == num_rows_ && B.num_rows_ == num_cols_)); + KALDI_ASSERT(&A != this && &B != this); + + MatrixIndexT Astride = A.stride_, Bstride = B.stride_, stride = this->stride_, + Brows = B.num_rows_, Bcols = B.num_cols_; + MatrixTransposeType invTransB = (transB == kTrans ? kNoTrans : kTrans); + Real *data = this->data_, *Adata = A.data_, *Bdata = B.data_; + MatrixIndexT num_rows = this->num_rows_; + if (transA == kNoTrans) { + // Iterate over the rows of *this and of A. + for (MatrixIndexT r = 0; r < num_rows; r++) { + // for each row of *this, do + // [this row] = [alpha * (this row of A) * B^T] + [beta * this row] + Xgemv_sparsevec(invTransB, Brows, Bcols, alpha, Bdata, Bstride, + Adata + (r * Astride), 1, beta, data + (r * stride), 1); + } + } else { + // Iterate over the rows of *this and the columns of A. 
+ for (MatrixIndexT r = 0; r < num_rows; r++) { + // for each row of *this, do + // [this row] = [alpha * (this column of A) * B^T] + [beta * this row] + Xgemv_sparsevec(invTransB, Brows, Bcols, alpha, Bdata, Bstride, + Adata + r, Astride, beta, data + (r * stride), 1); + } + } +} + +template +void MatrixBase::AddSpSp(const Real alpha, const SpMatrix &A_in, + const SpMatrix &B_in, const Real beta) { + MatrixIndexT sz = num_rows_; + KALDI_ASSERT(sz == num_cols_ && sz == A_in.NumRows() && sz == B_in.NumRows()); + + Matrix A(A_in), B(B_in); + // CblasLower or CblasUpper would work below as symmetric matrix is copied + // fully (to save work, we used the matrix constructor from SpMatrix). + // CblasLeft means A is on the left: C <-- alpha A B + beta C + if (sz == 0) return; + cblas_Xsymm(alpha, sz, A.data_, A.stride_, B.data_, B.stride_, beta, data_, stride_); +} + +template +void MatrixBase::AddMat(const Real alpha, const MatrixBase& A, + MatrixTransposeType transA) { + if (&A == this) { + if (transA == kNoTrans) { + Scale(alpha + 1.0); + } else { + KALDI_ASSERT(num_rows_ == num_cols_ && "AddMat: adding to self (transposed): not symmetric."); + Real *data = data_; + if (alpha == 1.0) { // common case-- handle separately. + for (MatrixIndexT row = 0; row < num_rows_; row++) { + for (MatrixIndexT col = 0; col < row; col++) { + Real *lower = data + (row * stride_) + col, *upper = data + (col + * stride_) + row; + Real sum = *lower + *upper; + *lower = *upper = sum; + } + *(data + (row * stride_) + row) *= 2.0; // diagonal. + } + } else { + for (MatrixIndexT row = 0; row < num_rows_; row++) { + for (MatrixIndexT col = 0; col < row; col++) { + Real *lower = data + (row * stride_) + col, *upper = data + (col + * stride_) + row; + Real lower_tmp = *lower; + *lower += alpha * *upper; + *upper += alpha * lower_tmp; + } + *(data + (row * stride_) + row) *= (1.0 + alpha); // diagonal. 
+ } + } + } + } else { + int aStride = (int) A.stride_, stride = stride_; + Real *adata = A.data_, *data = data_; + if (transA == kNoTrans) { + KALDI_ASSERT(A.num_rows_ == num_rows_ && A.num_cols_ == num_cols_); + if (num_rows_ == 0) return; + for (MatrixIndexT row = 0; row < num_rows_; row++, adata += aStride, + data += stride) { + cblas_Xaxpy(num_cols_, alpha, adata, 1, data, 1); + } + } else { + KALDI_ASSERT(A.num_cols_ == num_rows_ && A.num_rows_ == num_cols_); + if (num_rows_ == 0) return; + for (MatrixIndexT row = 0; row < num_rows_; row++, adata++, data += stride) + cblas_Xaxpy(num_cols_, alpha, adata, aStride, data, 1); + } + } +} + +template +void MatrixBase::AddSmat(Real alpha, const SparseMatrix &A, + MatrixTransposeType trans) { + if (trans == kNoTrans) { + KALDI_ASSERT(NumRows() == A.NumRows()); + KALDI_ASSERT(NumCols() == A.NumCols()); + MatrixIndexT a_num_rows = A.NumRows(); + for (MatrixIndexT i = 0; i < a_num_rows; ++i) { + const SparseVector &row = A.Row(i); + MatrixIndexT num_elems = row.NumElements(); + for (MatrixIndexT id = 0; id < num_elems; ++id) { + (*this)(i, row.GetElement(id).first) += alpha + * row.GetElement(id).second; + } + } + } else { + KALDI_ASSERT(NumRows() == A.NumCols()); + KALDI_ASSERT(NumCols() == A.NumRows()); + MatrixIndexT a_num_rows = A.NumRows(); + for (MatrixIndexT i = 0; i < a_num_rows; ++i) { + const SparseVector &row = A.Row(i); + MatrixIndexT num_elems = row.NumElements(); + for (MatrixIndexT id = 0; id < num_elems; ++id) { + (*this)(row.GetElement(id).first, i) += alpha + * row.GetElement(id).second; + } + } + } +} + +template +void MatrixBase::AddSmatMat(Real alpha, const SparseMatrix &A, + MatrixTransposeType transA, + const MatrixBase &B, Real beta) { + if (transA == kNoTrans) { + KALDI_ASSERT(NumRows() == A.NumRows()); + KALDI_ASSERT(NumCols() == B.NumCols()); + KALDI_ASSERT(A.NumCols() == B.NumRows()); + + this->Scale(beta); + MatrixIndexT a_num_rows = A.NumRows(), + this_num_cols = this->NumCols(); + for 
(MatrixIndexT i = 0; i < a_num_rows; ++i) { + Real *this_row_i = this->RowData(i); + const SparseVector &A_row_i = A.Row(i); + MatrixIndexT num_elems = A_row_i.NumElements(); + for (MatrixIndexT e = 0; e < num_elems; ++e) { + const std::pair &p = A_row_i.GetElement(e); + MatrixIndexT k = p.first; + Real alpha_A_ik = alpha * p.second; + const Real *b_row_k = B.RowData(k); + cblas_Xaxpy(this_num_cols, alpha_A_ik, b_row_k, 1, + this_row_i, 1); + //for (MatrixIndexT j = 0; j < this_num_cols; ++j) + // this_row_i[j] += alpha_A_ik * b_row_k[j]; + } + } + } else { + KALDI_ASSERT(NumRows() == A.NumCols()); + KALDI_ASSERT(NumCols() == B.NumCols()); + KALDI_ASSERT(A.NumRows() == B.NumRows()); + + this->Scale(beta); + Matrix buf(NumRows(), NumCols(), kSetZero); + MatrixIndexT a_num_rows = A.NumRows(), + this_num_cols = this->NumCols(); + for (int k = 0; k < a_num_rows; ++k) { + const Real *b_row_k = B.RowData(k); + const SparseVector &A_row_k = A.Row(k); + MatrixIndexT num_elems = A_row_k.NumElements(); + for (MatrixIndexT e = 0; e < num_elems; ++e) { + const std::pair &p = A_row_k.GetElement(e); + MatrixIndexT i = p.first; + Real alpha_A_ki = alpha * p.second; + Real *this_row_i = this->RowData(i); + cblas_Xaxpy(this_num_cols, alpha_A_ki, b_row_k, 1, + this_row_i, 1); + //for (MatrixIndexT j = 0; j < this_num_cols; ++j) + // this_row_i[j] += alpha_A_ki * b_row_k[j]; + } + } + } +} + +template +void MatrixBase::AddMatSmat(Real alpha, const MatrixBase &A, + const SparseMatrix &B, + MatrixTransposeType transB, Real beta) { + if (transB == kNoTrans) { + KALDI_ASSERT(NumRows() == A.NumRows()); + KALDI_ASSERT(NumCols() == B.NumCols()); + KALDI_ASSERT(A.NumCols() == B.NumRows()); + + this->Scale(beta); + MatrixIndexT b_num_rows = B.NumRows(), + this_num_rows = this->NumRows(); + // Iterate over the rows of sparse matrix B and columns of A. 
+ for (MatrixIndexT k = 0; k < b_num_rows; ++k) { + const SparseVector &B_row_k = B.Row(k); + MatrixIndexT num_elems = B_row_k.NumElements(); + const Real *a_col_k = A.Data() + k; + for (MatrixIndexT e = 0; e < num_elems; ++e) { + const std::pair &p = B_row_k.GetElement(e); + MatrixIndexT j = p.first; + Real alpha_B_kj = alpha * p.second; + Real *this_col_j = this->Data() + j; + // Add to entire 'j'th column of *this at once using cblas_Xaxpy. + // pass stride to write a colmun as matrices are stored in row major order. + cblas_Xaxpy(this_num_rows, alpha_B_kj, a_col_k, A.stride_, + this_col_j, this->stride_); + //for (MatrixIndexT i = 0; i < this_num_rows; ++i) + // this_col_j[i*this->stride_] += alpha_B_kj * a_col_k[i*A.stride_]; + } + } + } else { + KALDI_ASSERT(NumRows() == A.NumRows()); + KALDI_ASSERT(NumCols() == B.NumRows()); + KALDI_ASSERT(A.NumCols() == B.NumCols()); + + this->Scale(beta); + MatrixIndexT b_num_rows = B.NumRows(), + this_num_rows = this->NumRows(); + // Iterate over the rows of sparse matrix B and columns of *this. + for (MatrixIndexT j = 0; j < b_num_rows; ++j) { + const SparseVector &B_row_j = B.Row(j); + MatrixIndexT num_elems = B_row_j.NumElements(); + Real *this_col_j = this->Data() + j; + for (MatrixIndexT e = 0; e < num_elems; ++e) { + const std::pair &p = B_row_j.GetElement(e); + MatrixIndexT k = p.first; + Real alpha_B_jk = alpha * p.second; + const Real *a_col_k = A.Data() + k; + // Add to entire 'j'th column of *this at once using cblas_Xaxpy. + // pass stride to write a column as matrices are stored in row major order. 
+ cblas_Xaxpy(this_num_rows, alpha_B_jk, a_col_k, A.stride_, + this_col_j, this->stride_); + //for (MatrixIndexT i = 0; i < this_num_rows; ++i) + // this_col_j[i*this->stride_] += alpha_B_jk * a_col_k[i*A.stride_]; + } + } + } +} + +template +template +void MatrixBase::AddSp(const Real alpha, const SpMatrix &S) { + KALDI_ASSERT(S.NumRows() == NumRows() && S.NumRows() == NumCols()); + Real *data = data_; const OtherReal *sdata = S.Data(); + MatrixIndexT num_rows = NumRows(), stride = Stride(); + for (MatrixIndexT i = 0; i < num_rows; i++) { + for (MatrixIndexT j = 0; j < i; j++, sdata++) { + data[i*stride + j] += alpha * *sdata; + data[j*stride + i] += alpha * *sdata; + } + data[i*stride + i] += alpha * *sdata++; + } +} + +// instantiate the template above. +template +void MatrixBase::AddSp(const float alpha, const SpMatrix &S); +template +void MatrixBase::AddSp(const double alpha, const SpMatrix &S); +template +void MatrixBase::AddSp(const float alpha, const SpMatrix &S); +template +void MatrixBase::AddSp(const double alpha, const SpMatrix &S); + + +template +void MatrixBase::AddDiagVecMat( + const Real alpha, const VectorBase &v, + const MatrixBase &M, + MatrixTransposeType transM, + Real beta) { + if (beta != 1.0) this->Scale(beta); + + if (transM == kNoTrans) { + KALDI_ASSERT(SameDim(*this, M)); + } else { + KALDI_ASSERT(M.NumRows() == NumCols() && M.NumCols() == NumRows()); + } + KALDI_ASSERT(v.Dim() == this->NumRows()); + + MatrixIndexT M_row_stride = M.Stride(), M_col_stride = 1, stride = stride_, + num_rows = num_rows_, num_cols = num_cols_; + if (transM == kTrans) std::swap(M_row_stride, M_col_stride); + Real *data = data_; + const Real *Mdata = M.Data(), *vdata = v.Data(); + if (num_rows_ == 0) return; + for (MatrixIndexT i = 0; i < num_rows; i++, data += stride, Mdata += M_row_stride, vdata++) + cblas_Xaxpy(num_cols, alpha * *vdata, Mdata, M_col_stride, data, 1); +} + +template +void MatrixBase::AddMatDiagVec( + const Real alpha, + const MatrixBase &M, 
MatrixTransposeType transM, + VectorBase &v, + Real beta) { + + if (beta != 1.0) this->Scale(beta); + + if (transM == kNoTrans) { + KALDI_ASSERT(SameDim(*this, M)); + } else { + KALDI_ASSERT(M.NumRows() == NumCols() && M.NumCols() == NumRows()); + } + KALDI_ASSERT(v.Dim() == this->NumCols()); + + MatrixIndexT M_row_stride = M.Stride(), + M_col_stride = 1, + stride = stride_, + num_rows = num_rows_, + num_cols = num_cols_; + + if (transM == kTrans) + std::swap(M_row_stride, M_col_stride); + + Real *data = data_; + const Real *Mdata = M.Data(), *vdata = v.Data(); + if (num_rows_ == 0) return; + for (MatrixIndexT i = 0; i < num_rows; i++){ + for(MatrixIndexT j = 0; j < num_cols; j ++ ){ + data[i*stride + j] += alpha * vdata[j] * Mdata[i*M_row_stride + j*M_col_stride]; + } + } +} + +template +void MatrixBase::AddMatMatElements(const Real alpha, + const MatrixBase& A, + const MatrixBase& B, + const Real beta) { + KALDI_ASSERT(A.NumRows() == B.NumRows() && A.NumCols() == B.NumCols()); + KALDI_ASSERT(A.NumRows() == NumRows() && A.NumCols() == NumCols()); + Real *data = data_; + const Real *dataA = A.Data(); + const Real *dataB = B.Data(); + + for (MatrixIndexT i = 0; i < num_rows_; i++) { + for (MatrixIndexT j = 0; j < num_cols_; j++) { + data[j] = beta*data[j] + alpha*dataA[j]*dataB[j]; + } + data += Stride(); + dataA += A.Stride(); + dataB += B.Stride(); + } +} + +#if !defined(HAVE_ATLAS) && !defined(USE_KALDI_SVD) +// **************************************************************************** +// **************************************************************************** +template +void MatrixBase::LapackGesvd(VectorBase *s, MatrixBase *U_in, + MatrixBase *V_in) { + KALDI_ASSERT(s != NULL && U_in != this && V_in != this); + + Matrix tmpU, tmpV; + if (U_in == NULL) tmpU.Resize(this->num_rows_, 1); // work-space if U_in empty. + if (V_in == NULL) tmpV.Resize(1, this->num_cols_); // work-space if V_in empty. 
+ + /// Impementation notes: + /// Lapack works in column-order, therefore the dimensions of *this are + /// swapped as well as the U and V matrices. + + KaldiBlasInt M = num_cols_; + KaldiBlasInt N = num_rows_; + KaldiBlasInt LDA = Stride(); + + KALDI_ASSERT(N>=M); // NumRows >= columns. + + if (U_in) { + KALDI_ASSERT((int)U_in->num_rows_ == N && (int)U_in->num_cols_ == M); + } + if (V_in) { + KALDI_ASSERT((int)V_in->num_rows_ == M && (int)V_in->num_cols_ == M); + } + KALDI_ASSERT((int)s->Dim() == std::min(M, N)); + + MatrixBase *U = (U_in ? U_in : &tmpU); + MatrixBase *V = (V_in ? V_in : &tmpV); + + KaldiBlasInt V_stride = V->Stride(); + KaldiBlasInt U_stride = U->Stride(); + + // Original LAPACK recipe + // KaldiBlasInt l_work = std::max(std::max + // (1, 3*std::min(M, N)+std::max(M, N)), 5*std::min(M, N))*2; + KaldiBlasInt l_work = -1; + Real work_query; + KaldiBlasInt result; + + // query for work space + char *u_job = const_cast(U_in ? "s" : "N"); // "s" == skinny, "N" == "none." + char *v_job = const_cast(V_in ? "s" : "N"); // "s" == skinny, "N" == "none." + clapack_Xgesvd(v_job, u_job, + &M, &N, data_, &LDA, + s->Data(), + V->Data(), &V_stride, + U->Data(), &U_stride, + &work_query, &l_work, + &result); + + KALDI_ASSERT(result >= 0 && "Call to CLAPACK dgesvd_ called with wrong arguments"); + + l_work = static_cast(work_query); + Real *p_work; + void *temp; + if ((p_work = static_cast( + KALDI_MEMALIGN(16, sizeof(Real)*l_work, &temp))) == NULL) + throw std::bad_alloc(); + + // perform svd + clapack_Xgesvd(v_job, u_job, + &M, &N, data_, &LDA, + s->Data(), + V->Data(), &V_stride, + U->Data(), &U_stride, + p_work, &l_work, + &result); + + KALDI_ASSERT(result >= 0 && "Call to CLAPACK dgesvd_ called with wrong arguments"); + + if (result != 0) { + KALDI_WARN << "CLAPACK sgesvd_ : some weird convergence not satisfied"; + } + KALDI_MEMALIGN_FREE(p_work); +} + +#endif + +// Copy constructor. Copies data to newly allocated memory. 
+template +Matrix::Matrix (const MatrixBase & M, + MatrixTransposeType trans/*=kNoTrans*/) + : MatrixBase() { + if (trans == kNoTrans) { + Resize(M.num_rows_, M.num_cols_); + this->CopyFromMat(M); + } else { + Resize(M.num_cols_, M.num_rows_); + this->CopyFromMat(M, kTrans); + } +} + +// Copy constructor. Copies data to newly allocated memory. +template +Matrix::Matrix (const Matrix & M): + MatrixBase() { + Resize(M.num_rows_, M.num_cols_); + this->CopyFromMat(M); +} + +/// Copy constructor from another type. +template +template +Matrix::Matrix(const MatrixBase & M, + MatrixTransposeType trans) : MatrixBase() { + if (trans == kNoTrans) { + Resize(M.NumRows(), M.NumCols()); + this->CopyFromMat(M); + } else { + Resize(M.NumCols(), M.NumRows()); + this->CopyFromMat(M, kTrans); + } +} + +// Instantiate this constructor for float->double and double->float. +template +Matrix::Matrix(const MatrixBase & M, + MatrixTransposeType trans); +template +Matrix::Matrix(const MatrixBase & M, + MatrixTransposeType trans); + +template +inline void Matrix::Init(const MatrixIndexT rows, + const MatrixIndexT cols, + const MatrixStrideType stride_type) { + if (rows * cols == 0) { + KALDI_ASSERT(rows == 0 && cols == 0); + this->num_rows_ = 0; + this->num_cols_ = 0; + this->stride_ = 0; + this->data_ = NULL; + return; + } + KALDI_ASSERT(rows > 0 && cols > 0); + MatrixIndexT skip, stride; + size_t size; + void *data; // aligned memory block + void *temp; // memory block to be really freed + + // compute the size of skip and real cols + skip = ((16 / sizeof(Real)) - cols % (16 / sizeof(Real))) + % (16 / sizeof(Real)); + stride = cols + skip; + size = static_cast(rows) * static_cast(stride) + * sizeof(Real); + + // allocate the memory and set the right dimensions and parameters + if (NULL != (data = KALDI_MEMALIGN(16, size, &temp))) { + MatrixBase::data_ = static_cast (data); + MatrixBase::num_rows_ = rows; + MatrixBase::num_cols_ = cols; + MatrixBase::stride_ = (stride_type == kDefaultStride 
? stride : cols); + } else { + throw std::bad_alloc(); + } +} + +template +void Matrix::Resize(const MatrixIndexT rows, + const MatrixIndexT cols, + MatrixResizeType resize_type, + MatrixStrideType stride_type) { + // the next block uses recursion to handle what we have to do if + // resize_type == kCopyData. + if (resize_type == kCopyData) { + if (this->data_ == NULL || rows == 0) resize_type = kSetZero; // nothing to copy. + else if (rows == this->num_rows_ && cols == this->num_cols_ && + (stride_type == kDefaultStride || this->stride_ == this->num_cols_)) { return; } // nothing to do. + else { + // set tmp to a matrix of the desired size; if new matrix + // is bigger in some dimension, zero it. + MatrixResizeType new_resize_type = + (rows > this->num_rows_ || cols > this->num_cols_) ? kSetZero : kUndefined; + Matrix tmp(rows, cols, new_resize_type, stride_type); + MatrixIndexT rows_min = std::min(rows, this->num_rows_), + cols_min = std::min(cols, this->num_cols_); + tmp.Range(0, rows_min, 0, cols_min). + CopyFromMat(this->Range(0, rows_min, 0, cols_min)); + tmp.Swap(this); + // and now let tmp go out of scope, deleting what was in *this. + return; + } + } + // At this point, resize_type == kSetZero or kUndefined. + + if (MatrixBase::data_ != NULL) { + if (rows == MatrixBase::num_rows_ + && cols == MatrixBase::num_cols_) { + if (resize_type == kSetZero) + this->SetZero(); + return; + } + else + Destroy(); + } + Init(rows, cols, stride_type); + if (resize_type == kSetZero) MatrixBase::SetZero(); +} + +template +template +void MatrixBase::CopyFromMat(const MatrixBase &M, + MatrixTransposeType Trans) { + if (sizeof(Real) == sizeof(OtherReal) && + static_cast(M.Data()) == + static_cast(this->Data())) { + // CopyFromMat called on same data. Nothing to do (except sanity checks). 
+ KALDI_ASSERT(Trans == kNoTrans && M.NumRows() == NumRows() && + M.NumCols() == NumCols() && M.Stride() == Stride()); + return; + } + if (Trans == kNoTrans) { + KALDI_ASSERT(num_rows_ == M.NumRows() && num_cols_ == M.NumCols()); + for (MatrixIndexT i = 0; i < num_rows_; i++) + (*this).Row(i).CopyFromVec(M.Row(i)); + } else { + KALDI_ASSERT(num_cols_ == M.NumRows() && num_rows_ == M.NumCols()); + int32 this_stride = stride_, other_stride = M.Stride(); + Real *this_data = data_; + const OtherReal *other_data = M.Data(); + for (MatrixIndexT i = 0; i < num_rows_; i++) + for (MatrixIndexT j = 0; j < num_cols_; j++) + this_data[i * this_stride + j] = other_data[j * other_stride + i]; + } +} + +// template instantiations. +template +void MatrixBase::CopyFromMat(const MatrixBase & M, + MatrixTransposeType Trans); +template +void MatrixBase::CopyFromMat(const MatrixBase & M, + MatrixTransposeType Trans); +template +void MatrixBase::CopyFromMat(const MatrixBase & M, + MatrixTransposeType Trans); +template +void MatrixBase::CopyFromMat(const MatrixBase & M, + MatrixTransposeType Trans); + +// Specialize the template for CopyFromSp for float, float. +template<> +template<> +void MatrixBase::CopyFromSp(const SpMatrix & M) { + KALDI_ASSERT(num_rows_ == M.NumRows() && num_cols_ == num_rows_); + MatrixIndexT num_rows = num_rows_, stride = stride_; + const float *Mdata = M.Data(); + float *row_data = data_, *col_data = data_; + for (MatrixIndexT i = 0; i < num_rows; i++) { + cblas_scopy(i+1, Mdata, 1, row_data, 1); // copy to the row. + cblas_scopy(i, Mdata, 1, col_data, stride); // copy to the column. + Mdata += i+1; + row_data += stride; + col_data += 1; + } +} + +// Specialize the template for CopyFromSp for double, double. 
+template<> +template<> +void MatrixBase::CopyFromSp(const SpMatrix & M) { + KALDI_ASSERT(num_rows_ == M.NumRows() && num_cols_ == num_rows_); + MatrixIndexT num_rows = num_rows_, stride = stride_; + const double *Mdata = M.Data(); + double *row_data = data_, *col_data = data_; + for (MatrixIndexT i = 0; i < num_rows; i++) { + cblas_dcopy(i+1, Mdata, 1, row_data, 1); // copy to the row. + cblas_dcopy(i, Mdata, 1, col_data, stride); // copy to the column. + Mdata += i+1; + row_data += stride; + col_data += 1; + } +} + + +template +template +void MatrixBase::CopyFromSp(const SpMatrix & M) { + KALDI_ASSERT(num_rows_ == M.NumRows() && num_cols_ == num_rows_); + // MORE EFFICIENT IF LOWER TRIANGULAR! Reverse code otherwise. + for (MatrixIndexT i = 0; i < num_rows_; i++) { + for (MatrixIndexT j = 0; j < i; j++) { + (*this)(j, i) = (*this)(i, j) = M(i, j); + } + (*this)(i, i) = M(i, i); + } +} + +// Instantiate this function +template +void MatrixBase::CopyFromSp(const SpMatrix & M); +template +void MatrixBase::CopyFromSp(const SpMatrix & M); + + +template +template +void MatrixBase::CopyFromTp(const TpMatrix & M, + MatrixTransposeType Trans) { + if (Trans == kNoTrans) { + KALDI_ASSERT(num_rows_ == M.NumRows() && num_cols_ == num_rows_); + SetZero(); + Real *out_i = data_; + const OtherReal *in_i = M.Data(); + for (MatrixIndexT i = 0; i < num_rows_; i++, out_i += stride_, in_i += i) { + for (MatrixIndexT j = 0; j <= i; j++) + out_i[j] = in_i[j]; + } + } else { + SetZero(); + KALDI_ASSERT(num_rows_ == M.NumRows() && num_cols_ == num_rows_); + MatrixIndexT stride = stride_; + Real *out_i = data_; + const OtherReal *in_i = M.Data(); + for (MatrixIndexT i = 0; i < num_rows_; i++, out_i ++, in_i += i) { + for (MatrixIndexT j = 0; j <= i; j++) + out_i[j*stride] = in_i[j]; + } + } +} + +template +void MatrixBase::CopyFromTp(const TpMatrix & M, + MatrixTransposeType trans); +template +void MatrixBase::CopyFromTp(const TpMatrix & M, + MatrixTransposeType trans); +template +void 
MatrixBase::CopyFromTp(const TpMatrix & M, + MatrixTransposeType trans); +template +void MatrixBase::CopyFromTp(const TpMatrix & M, + MatrixTransposeType trans); + + +template +void MatrixBase::CopyRowsFromVec(const VectorBase &rv) { + if (rv.Dim() == num_rows_*num_cols_) { + if (stride_ == num_cols_) { + // one big copy operation. + const Real *rv_data = rv.Data(); + std::memcpy(data_, rv_data, sizeof(Real)*num_rows_*num_cols_); + } else { + const Real *rv_data = rv.Data(); + for (MatrixIndexT r = 0; r < num_rows_; r++) { + Real *row_data = RowData(r); + for (MatrixIndexT c = 0; c < num_cols_; c++) { + row_data[c] = rv_data[c]; + } + rv_data += num_cols_; + } + } + } else if (rv.Dim() == num_cols_) { + const Real *rv_data = rv.Data(); + for (MatrixIndexT r = 0; r < num_rows_; r++) + std::memcpy(RowData(r), rv_data, sizeof(Real)*num_cols_); + } else { + KALDI_ERR << "Wrong sized arguments"; + } +} + +template +template +void MatrixBase::CopyRowsFromVec(const VectorBase &rv) { + if (rv.Dim() == num_rows_*num_cols_) { + const OtherReal *rv_data = rv.Data(); + for (MatrixIndexT r = 0; r < num_rows_; r++) { + Real *row_data = RowData(r); + for (MatrixIndexT c = 0; c < num_cols_; c++) { + row_data[c] = static_cast(rv_data[c]); + } + rv_data += num_cols_; + } + } else if (rv.Dim() == num_cols_) { + const OtherReal *rv_data = rv.Data(); + Real *first_row_data = RowData(0); + for (MatrixIndexT c = 0; c < num_cols_; c++) + first_row_data[c] = rv_data[c]; + for (MatrixIndexT r = 1; r < num_rows_; r++) + std::memcpy(RowData(r), first_row_data, sizeof(Real)*num_cols_); + } else { + KALDI_ERR << "Wrong sized arguments."; + } +} + + +template +void MatrixBase::CopyRowsFromVec(const VectorBase &rv); +template +void MatrixBase::CopyRowsFromVec(const VectorBase &rv); + +template +void MatrixBase::CopyColsFromVec(const VectorBase &rv) { + if (rv.Dim() == num_rows_*num_cols_) { + const Real *v_inc_data = rv.Data(); + Real *m_inc_data = data_; + + for (MatrixIndexT c = 0; c < 
num_cols_; c++) { + for (MatrixIndexT r = 0; r < num_rows_; r++) { + m_inc_data[r * stride_] = v_inc_data[r]; + } + v_inc_data += num_rows_; + m_inc_data ++; + } + } else if (rv.Dim() == num_rows_) { + const Real *v_inc_data = rv.Data(); + Real *m_inc_data = data_; + for (MatrixIndexT r = 0; r < num_rows_; r++) { + Real value = *(v_inc_data++); + for (MatrixIndexT c = 0; c < num_cols_; c++) + m_inc_data[c] = value; + m_inc_data += stride_; + } + } else { + KALDI_ERR << "Wrong size of arguments."; + } +} + + +template +void MatrixBase::CopyRowFromVec(const VectorBase &rv, const MatrixIndexT row) { + KALDI_ASSERT(rv.Dim() == num_cols_ && + static_cast(row) < + static_cast(num_rows_)); + + const Real *rv_data = rv.Data(); + Real *row_data = RowData(row); + + std::memcpy(row_data, rv_data, num_cols_ * sizeof(Real)); +} + +template +void MatrixBase::CopyDiagFromVec(const VectorBase &rv) { + KALDI_ASSERT(rv.Dim() == std::min(num_cols_, num_rows_)); + const Real *rv_data = rv.Data(), *rv_end = rv_data + rv.Dim(); + Real *my_data = this->Data(); + for (; rv_data != rv_end; rv_data++, my_data += (this->stride_+1)) + *my_data = *rv_data; +} + +template +void MatrixBase::CopyColFromVec(const VectorBase &rv, + const MatrixIndexT col) { + KALDI_ASSERT(rv.Dim() == num_rows_ && + static_cast(col) < + static_cast(num_cols_)); + + const Real *rv_data = rv.Data(); + Real *col_data = data_ + col; + + for (MatrixIndexT r = 0; r < num_rows_; r++) + col_data[r * stride_] = rv_data[r]; +} + + + +template +void Matrix::RemoveRow(MatrixIndexT i) { + KALDI_ASSERT(static_cast(i) < + static_cast(MatrixBase::num_rows_) + && "Access out of matrix"); + for (MatrixIndexT j = i + 1; j < MatrixBase::num_rows_; j++) + MatrixBase::Row(j-1).CopyFromVec( MatrixBase::Row(j)); + MatrixBase::num_rows_--; +} + +template +void Matrix::Destroy() { + // we need to free the data block if it was defined + if (NULL != MatrixBase::data_) + KALDI_MEMALIGN_FREE( MatrixBase::data_); + MatrixBase::data_ = NULL; + 
MatrixBase::num_rows_ = MatrixBase::num_cols_ + = MatrixBase::stride_ = 0; +} + + + +template +void MatrixBase::MulElements(const MatrixBase &a) { + KALDI_ASSERT(a.NumRows() == num_rows_ && a.NumCols() == num_cols_); + + if (num_cols_ == stride_ && num_cols_ == a.stride_) { + mul_elements(num_rows_ * num_cols_, a.data_, data_); + } else { + MatrixIndexT a_stride = a.stride_, stride = stride_; + Real *data = data_, *a_data = a.data_; + for (MatrixIndexT i = 0; i < num_rows_; i++) { + mul_elements(num_cols_, a_data, data); + a_data += a_stride; + data += stride; + } + } +} + +template +void MatrixBase::DivElements(const MatrixBase &a) { + KALDI_ASSERT(a.NumRows() == num_rows_ && a.NumCols() == num_cols_); + MatrixIndexT i; + MatrixIndexT j; + + for (i = 0; i < num_rows_; i++) { + for (j = 0; j < num_cols_; j++) { + (*this)(i, j) /= a(i, j); + } + } +} + +template +Real MatrixBase::Sum() const { + double sum = 0.0; + + for (MatrixIndexT i = 0; i < num_rows_; i++) { + for (MatrixIndexT j = 0; j < num_cols_; j++) { + sum += (*this)(i, j); + } + } + + return (Real)sum; +} + +template void MatrixBase::Max(const MatrixBase &A) { + KALDI_ASSERT(A.NumRows() == NumRows() && A.NumCols() == NumCols()); + for (MatrixIndexT row = 0; row < num_rows_; row++) { + Real *row_data = RowData(row); + const Real *other_row_data = A.RowData(row); + MatrixIndexT num_cols = num_cols_; + for (MatrixIndexT col = 0; col < num_cols; col++) { + row_data[col] = std::max(row_data[col], + other_row_data[col]); + } + } +} + +template void MatrixBase::Min(const MatrixBase &A) { + KALDI_ASSERT(A.NumRows() == NumRows() && A.NumCols() == NumCols()); + for (MatrixIndexT row = 0; row < num_rows_; row++) { + Real *row_data = RowData(row); + const Real *other_row_data = A.RowData(row); + MatrixIndexT num_cols = num_cols_; + for (MatrixIndexT col = 0; col < num_cols; col++) { + row_data[col] = std::min(row_data[col], + other_row_data[col]); + } + } +} + + +template void MatrixBase::Scale(Real alpha) { + if 
(alpha == 1.0) return; + if (num_rows_ == 0) return; + if (num_cols_ == stride_) { + cblas_Xscal(static_cast(num_rows_) * static_cast(num_cols_), + alpha, data_,1); + } else { + Real *data = data_; + for (MatrixIndexT i = 0; i < num_rows_; ++i, data += stride_) { + cblas_Xscal(num_cols_, alpha, data,1); + } + } +} + +template // scales each row by scale[i]. +void MatrixBase::MulRowsVec(const VectorBase &scale) { + KALDI_ASSERT(scale.Dim() == num_rows_); + MatrixIndexT M = num_rows_, N = num_cols_; + + for (MatrixIndexT i = 0; i < M; i++) { + Real this_scale = scale(i); + for (MatrixIndexT j = 0; j < N; j++) { + (*this)(i, j) *= this_scale; + } + } +} + + +template +void MatrixBase::MulRowsGroupMat(const MatrixBase &src) { + KALDI_ASSERT(src.NumRows() == this->NumRows() && + this->NumCols() % src.NumCols() == 0); + int32 group_size = this->NumCols() / src.NumCols(), + num_groups = this->NumCols() / group_size, + num_rows = this->NumRows(); + + for (MatrixIndexT i = 0; i < num_rows; i++) { + Real *data = this->RowData(i); + for (MatrixIndexT j = 0; j < num_groups; j++, data += group_size) { + Real scale = src(i, j); + cblas_Xscal(group_size, scale, data, 1); + } + } +} + +template +void MatrixBase::GroupPnormDeriv(const MatrixBase &input, + const MatrixBase &output, + Real power) { + KALDI_ASSERT(input.NumCols() == this->NumCols() && input.NumRows() == this->NumRows()); + KALDI_ASSERT(this->NumCols() % output.NumCols() == 0 && + this->NumRows() == output.NumRows()); + + int group_size = this->NumCols() / output.NumCols(), + num_rows = this->NumRows(), num_cols = this->NumCols(); + + if (power == 1.0) { + for (MatrixIndexT i = 0; i < num_rows; i++) { + for (MatrixIndexT j = 0; j < num_cols; j++) { + Real input_val = input(i, j); + (*this)(i, j) = (input_val == 0 ? 0 : (input_val > 0 ? 
1 : -1)); + } + } + } else if (power == std::numeric_limits::infinity()) { + for (MatrixIndexT i = 0; i < num_rows; i++) { + for (MatrixIndexT j = 0; j < num_cols; j++) { + Real output_val = output(i, j / group_size), input_val = input(i, j); + if (output_val == 0) + (*this)(i, j) = 0; + else + (*this)(i, j) = (std::abs(input_val) == output_val ? 1.0 : 0.0) + * (input_val >= 0 ? 1 : -1); + } + } + } else { + for (MatrixIndexT i = 0; i < num_rows; i++) { + for (MatrixIndexT j = 0; j < num_cols; j++) { + Real output_val = output(i, j / group_size), + input_val = input(i, j); + if (output_val == 0) + (*this)(i, j) = 0; + else + (*this)(i, j) = pow(std::abs(input_val), power - 1) * + pow(output_val, 1 - power) * (input_val >= 0 ? 1 : -1) ; + } + } + } +} + +template +void MatrixBase::GroupMaxDeriv(const MatrixBase &input, + const MatrixBase &output) { + KALDI_ASSERT(input.NumCols() == this->NumCols() && + input.NumRows() == this->NumRows()); + KALDI_ASSERT(this->NumCols() % output.NumCols() == 0 && + this->NumRows() == output.NumRows()); + + int group_size = this->NumCols() / output.NumCols(), + num_rows = this->NumRows(), num_cols = this->NumCols(); + + for (MatrixIndexT i = 0; i < num_rows; i++) { + for (MatrixIndexT j = 0; j < num_cols; j++) { + Real input_val = input(i, j); + Real output_val = output(i, j / group_size); + (*this)(i, j) = (input_val == output_val ? 1 : 0); + } + } +} + +template // scales each column by scale[i]. 
+void MatrixBase::MulColsVec(const VectorBase &scale) { + KALDI_ASSERT(scale.Dim() == num_cols_); + for (MatrixIndexT i = 0; i < num_rows_; i++) { + for (MatrixIndexT j = 0; j < num_cols_; j++) { + Real this_scale = scale(j); + (*this)(i, j) *= this_scale; + } + } +} + +template +void MatrixBase::SetZero() { + if (num_cols_ == stride_) + memset(data_, 0, sizeof(Real)*num_rows_*num_cols_); + else + for (MatrixIndexT row = 0; row < num_rows_; row++) + memset(data_ + row*stride_, 0, sizeof(Real)*num_cols_); +} + +template +void MatrixBase::Set(Real value) { + for (MatrixIndexT row = 0; row < num_rows_; row++) { + for (MatrixIndexT col = 0; col < num_cols_; col++) { + (*this)(row, col) = value; + } + } +} + +template +void MatrixBase::SetUnit() { + SetZero(); + for (MatrixIndexT row = 0; row < std::min(num_rows_, num_cols_); row++) + (*this)(row, row) = 1.0; +} + +template +void MatrixBase::SetRandn() { + kaldi::RandomState rstate; + for (MatrixIndexT row = 0; row < num_rows_; row++) { + Real *row_data = this->RowData(row); + MatrixIndexT nc = (num_cols_ % 2 == 1) ? num_cols_ - 1 : num_cols_; + for (MatrixIndexT col = 0; col < nc; col += 2) { + kaldi::RandGauss2(row_data + col, row_data + col + 1, &rstate); + } + if (nc != num_cols_) row_data[nc] = static_cast(kaldi::RandGauss(&rstate)); + } +} + +template +void MatrixBase::SetRandUniform() { + kaldi::RandomState rstate; + for (MatrixIndexT row = 0; row < num_rows_; row++) { + Real *row_data = this->RowData(row); + for (MatrixIndexT col = 0; col < num_cols_; col++, row_data++) { + *row_data = static_cast(kaldi::RandUniform(&rstate)); + } + } +} + +template +void MatrixBase::Write(std::ostream &os, bool binary) const { + if (!os.good()) { + KALDI_ERR << "Failed to write matrix to stream: stream not good"; + } + if (binary) { // Use separate binary and text formats, + // since in binary mode we need to know if it's float or double. + std::string my_token = (sizeof(Real) == 4 ? 
"FM" : "DM"); + + WriteToken(os, binary, my_token); + { + int32 rows = this->num_rows_; // make the size 32-bit on disk. + int32 cols = this->num_cols_; + KALDI_ASSERT(this->num_rows_ == (MatrixIndexT) rows); + KALDI_ASSERT(this->num_cols_ == (MatrixIndexT) cols); + WriteBasicType(os, binary, rows); + WriteBasicType(os, binary, cols); + } + if (Stride() == NumCols()) + os.write(reinterpret_cast (Data()), sizeof(Real) + * static_cast(num_rows_) * static_cast(num_cols_)); + else + for (MatrixIndexT i = 0; i < num_rows_; i++) + os.write(reinterpret_cast (RowData(i)), sizeof(Real) + * num_cols_); + if (!os.good()) { + KALDI_ERR << "Failed to write matrix to stream"; + } + } else { // text mode. + if (num_cols_ == 0) { + os << " [ ]\n"; + } else { + os << " ["; + for (MatrixIndexT i = 0; i < num_rows_; i++) { + os << "\n "; + for (MatrixIndexT j = 0; j < num_cols_; j++) + os << (*this)(i, j) << " "; + } + os << "]\n"; + } + } +} + + +template +void MatrixBase::Read(std::istream & is, bool binary, bool add) { + if (add) { + Matrix tmp(num_rows_, num_cols_); + tmp.Read(is, binary, false); // read without adding. + if (tmp.num_rows_ != this->num_rows_ || tmp.num_cols_ != this->num_cols_) + KALDI_ERR << "MatrixBase::Read, size mismatch " + << this->num_rows_ << ", " << this->num_cols_ + << " vs. " << tmp.num_rows_ << ", " << tmp.num_cols_; + this->AddMat(1.0, tmp); + return; + } + // now assume add == false. + + // In order to avoid rewriting this, we just declare a Matrix and + // use it to read the data, then copy. + Matrix tmp; + tmp.Read(is, binary, false); + if (tmp.NumRows() != NumRows() || tmp.NumCols() != NumCols()) { + KALDI_ERR << "MatrixBase::Read, size mismatch " + << NumRows() << " x " << NumCols() << " versus " + << tmp.NumRows() << " x " << tmp.NumCols(); + } + CopyFromMat(tmp); +} + + +template +void Matrix::Read(std::istream & is, bool binary, bool add) { + if (add) { + Matrix tmp; + tmp.Read(is, binary, false); // read without adding. 
+ if (this->num_rows_ == 0) this->Resize(tmp.num_rows_, tmp.num_cols_); + else { + if (this->num_rows_ != tmp.num_rows_ || this->num_cols_ != tmp.num_cols_) { + if (tmp.num_rows_ == 0) return; // do nothing in this case. + else KALDI_ERR << "Matrix::Read, size mismatch " + << this->num_rows_ << ", " << this->num_cols_ + << " vs. " << tmp.num_rows_ << ", " << tmp.num_cols_; + } + } + this->AddMat(1.0, tmp); + return; + } + + // now assume add == false. + MatrixIndexT pos_at_start = is.tellg(); + std::ostringstream specific_error; + + if (binary) { // Read in binary mode. + int peekval = Peek(is, binary); + if (peekval == 'C') { + // This code enables us to read CompressedMatrix as a regular matrix. + CompressedMatrix compressed_mat; + compressed_mat.Read(is, binary); // at this point, add == false. + this->Resize(compressed_mat.NumRows(), compressed_mat.NumCols()); + compressed_mat.CopyToMat(this); + return; + } + const char *my_token = (sizeof(Real) == 4 ? "FM" : "DM"); + char other_token_start = (sizeof(Real) == 4 ? 'D' : 'F'); + if (peekval == other_token_start) { // need to instantiate the other type to read it. + typedef typename OtherReal::Real OtherType; // if Real == float, OtherType == double, and vice versa. + Matrix other(this->num_rows_, this->num_cols_); + other.Read(is, binary, false); // add is false at this point anyway. + this->Resize(other.NumRows(), other.NumCols()); + this->CopyFromMat(other); + return; + } + std::string token; + ReadToken(is, binary, &token); + if (token != my_token) { + if (token.length() > 20) token = token.substr(0, 17) + "..."; + specific_error << ": Expected token " << my_token << ", got " << token; + goto bad; + } + int32 rows, cols; + ReadBasicType(is, binary, &rows); // throws on error. + ReadBasicType(is, binary, &cols); // throws on error. 
+ if ((MatrixIndexT)rows != this->num_rows_ || (MatrixIndexT)cols != this->num_cols_) { + this->Resize(rows, cols); + } + if (this->Stride() == this->NumCols() && rows*cols!=0) { + is.read(reinterpret_cast(this->Data()), + sizeof(Real)*rows*cols); + if (is.fail()) goto bad; + } else { + for (MatrixIndexT i = 0; i < (MatrixIndexT)rows; i++) { + is.read(reinterpret_cast(this->RowData(i)), sizeof(Real)*cols); + if (is.fail()) goto bad; + } + } + if (is.eof()) return; + if (is.fail()) goto bad; + return; + } else { // Text mode. + std::string str; + is >> str; // get a token + if (is.fail()) { specific_error << ": Expected \"[\", got EOF"; goto bad; } + // if ((str.compare("DM") == 0) || (str.compare("FM") == 0)) { // Back compatibility. + // is >> str; // get #rows + // is >> str; // get #cols + // is >> str; // get "[" + // } + if (str == "[]") { Resize(0, 0); return; } // Be tolerant of variants. + else if (str != "[") { + if (str.length() > 20) str = str.substr(0, 17) + "..."; + specific_error << ": Expected \"[\", got \"" << str << '"'; + goto bad; + } + // At this point, we have read "[". + std::vector* > data; + std::vector *cur_row = new std::vector; + while (1) { + int i = is.peek(); + if (i == -1) { specific_error << "Got EOF while reading matrix data"; goto cleanup; } + else if (static_cast(i) == ']') { // Finished reading matrix. + is.get(); // eat the "]". + i = is.peek(); + if (static_cast(i) == '\r') { + is.get(); + is.get(); // get \r\n (must eat what we wrote) + } else if (static_cast(i) == '\n') { is.get(); } // get \n (must eat what we wrote) + if (is.fail()) { + KALDI_WARN << "After end of matrix data, read error."; + // we got the data we needed, so just warn for this error. + } + // Now process the data. 
+ if (!cur_row->empty()) data.push_back(cur_row); + else delete(cur_row); + cur_row = NULL; + if (data.empty()) { this->Resize(0, 0); return; } + else { + int32 num_rows = data.size(), num_cols = data[0]->size(); + this->Resize(num_rows, num_cols); + for (int32 i = 0; i < num_rows; i++) { + if (static_cast(data[i]->size()) != num_cols) { + specific_error << "Matrix has inconsistent #cols: " << num_cols + << " vs." << data[i]->size() << " (processing row" + << i << ")"; + goto cleanup; + } + for (int32 j = 0; j < num_cols; j++) + (*this)(i, j) = (*(data[i]))[j]; + delete data[i]; + data[i] = NULL; + } + } + return; + } else if (static_cast(i) == '\n' || static_cast(i) == ';') { + // End of matrix row. + is.get(); + if (cur_row->size() != 0) { + data.push_back(cur_row); + cur_row = new std::vector; + cur_row->reserve(data.back()->size()); + } + } else if ( (i >= '0' && i <= '9') || i == '-' ) { // A number... + Real r; + is >> r; + if (is.fail()) { + specific_error << "Stream failure/EOF while reading matrix data."; + goto cleanup; + } + cur_row->push_back(r); + } else if (isspace(i)) { + is.get(); // eat the space and do nothing. + } else { // NaN or inf or error. + std::string str; + is >> str; + if (!KALDI_STRCASECMP(str.c_str(), "inf") || + !KALDI_STRCASECMP(str.c_str(), "infinity")) { + cur_row->push_back(std::numeric_limits::infinity()); + KALDI_WARN << "Reading infinite value into matrix."; + } else if (!KALDI_STRCASECMP(str.c_str(), "nan")) { + cur_row->push_back(std::numeric_limits::quiet_NaN()); + KALDI_WARN << "Reading NaN value into matrix."; + } else { + if (str.length() > 20) str = str.substr(0, 17) + "..."; + specific_error << "Expecting numeric matrix data, got " << str; + goto cleanup; + } + } + } + // Note, we never leave the while () loop before this + // line (we return from it.) + cleanup: // We only reach here in case of error in the while loop above. 
+ if(cur_row != NULL) + delete cur_row; + for (size_t i = 0; i < data.size(); i++) + if(data[i] != NULL) + delete data[i]; + // and then go on to "bad" below, where we print error. + } +bad: + KALDI_ERR << "Failed to read matrix from stream. " << specific_error.str() + << " File position at start is " + << pos_at_start << ", currently " << is.tellg(); +} + + +// Constructor... note that this is not const-safe as it would +// be quite complicated to implement a "const SubMatrix" class that +// would not allow its contents to be changed. +template +SubMatrix::SubMatrix(const MatrixBase &M, + const MatrixIndexT ro, + const MatrixIndexT r, + const MatrixIndexT co, + const MatrixIndexT c) { + if (r == 0 || c == 0) { + // we support the empty sub-matrix as a special case. + KALDI_ASSERT(c == 0 && r == 0); + this->data_ = NULL; + this->num_cols_ = 0; + this->num_rows_ = 0; + this->stride_ = 0; + return; + } + KALDI_ASSERT(static_cast(ro) < + static_cast(M.num_rows_) && + static_cast(co) < + static_cast(M.num_cols_) && + static_cast(r) <= + static_cast(M.num_rows_ - ro) && + static_cast(c) <= + static_cast(M.num_cols_ - co)); + // point to the begining of window + MatrixBase::num_rows_ = r; + MatrixBase::num_cols_ = c; + MatrixBase::stride_ = M.Stride(); + MatrixBase::data_ = M.Data_workaround() + + static_cast(co) + + static_cast(ro) * static_cast(M.Stride()); +} + + +template +SubMatrix::SubMatrix(Real *data, + MatrixIndexT num_rows, + MatrixIndexT num_cols, + MatrixIndexT stride): + MatrixBase(data, num_cols, num_rows, stride) { // caution: reversed order! 
+ if (data == NULL) { + KALDI_ASSERT(num_rows * num_cols == 0); + this->num_rows_ = 0; + this->num_cols_ = 0; + this->stride_ = 0; + } else { + KALDI_ASSERT(this->stride_ >= this->num_cols_); + } +} + + +template +void MatrixBase::Add(const Real alpha) { + Real *data = data_; + MatrixIndexT stride = stride_; + for (MatrixIndexT r = 0; r < num_rows_; r++) + for (MatrixIndexT c = 0; c < num_cols_; c++) + data[c + stride*r] += alpha; +} + +template +void MatrixBase::AddToDiag(const Real alpha) { + Real *data = data_; + MatrixIndexT this_stride = stride_ + 1, + num_to_add = std::min(num_rows_, num_cols_); + for (MatrixIndexT r = 0; r < num_to_add; r++) + data[r * this_stride] += alpha; +} + + +template +Real MatrixBase::Cond() const { + KALDI_ASSERT(num_rows_ > 0&&num_cols_ > 0); + Vector singular_values(std::min(num_rows_, num_cols_)); + Svd(&singular_values); // Get singular values... + Real min = singular_values(0), max = singular_values(0); // both absolute values... + for (MatrixIndexT i = 1;i < singular_values.Dim();i++) { + min = std::min((Real)std::abs(singular_values(i)), min); max = std::max((Real)std::abs(singular_values(i)), max); + } + if (min > 0) return max/min; + else return std::numeric_limits::infinity(); +} + +template +Real MatrixBase::Trace(bool check_square) const { + KALDI_ASSERT(!check_square || num_rows_ == num_cols_); + Real ans = 0.0; + for (MatrixIndexT r = 0;r < std::min(num_rows_, num_cols_);r++) ans += data_ [r + stride_*r]; + return ans; +} + +template +Real MatrixBase::Max() const { + KALDI_ASSERT(num_rows_ > 0 && num_cols_ > 0); + Real ans= *data_; + for (MatrixIndexT r = 0; r < num_rows_; r++) + for (MatrixIndexT c = 0; c < num_cols_; c++) + if (data_[c + stride_*r] > ans) + ans = data_[c + stride_*r]; + return ans; +} + +template +Real MatrixBase::Min() const { + KALDI_ASSERT(num_rows_ > 0 && num_cols_ > 0); + Real ans= *data_; + for (MatrixIndexT r = 0; r < num_rows_; r++) + for (MatrixIndexT c = 0; c < num_cols_; c++) + if (data_[c 
+ stride_*r] < ans) + ans = data_[c + stride_*r]; + return ans; +} + + + +template +void MatrixBase::AddMatMatMat(Real alpha, + const MatrixBase &A, MatrixTransposeType transA, + const MatrixBase &B, MatrixTransposeType transB, + const MatrixBase &C, MatrixTransposeType transC, + Real beta) { + // Note on time taken with different orders of computation. Assume not transposed in this / + // discussion. Firstly, normalize expressions using A.NumCols == B.NumRows and B.NumCols == C.NumRows, prefer + // rows where there is a choice. + // time taken for (AB) is: A.NumRows*B.NumRows*C.Rows + // time taken for (AB)C is A.NumRows*C.NumRows*C.Cols + // so this order is A.NumRows*B.NumRows*C.NumRows + A.NumRows*C.NumRows*C.NumCols. + + // time taken for (BC) is: B.NumRows*C.NumRows*C.Cols + // time taken for A(BC) is: A.NumRows*B.NumRows*C.Cols + // so this order is B.NumRows*C.NumRows*C.NumCols + A.NumRows*B.NumRows*C.Cols + + MatrixIndexT ARows = A.num_rows_, ACols = A.num_cols_, BRows = B.num_rows_, BCols = B.num_cols_, + CRows = C.num_rows_, CCols = C.num_cols_; + if (transA == kTrans) std::swap(ARows, ACols); + if (transB == kTrans) std::swap(BRows, BCols); + if (transC == kTrans) std::swap(CRows, CCols); + + MatrixIndexT AB_C_time = ARows*BRows*CRows + ARows*CRows*CCols; + MatrixIndexT A_BC_time = BRows*CRows*CCols + ARows*BRows*CCols; + + if (AB_C_time < A_BC_time) { + Matrix AB(ARows, BCols); + AB.AddMatMat(1.0, A, transA, B, transB, 0.0); // AB = A * B. + (*this).AddMatMat(alpha, AB, kNoTrans, C, transC, beta); + } else { + Matrix BC(BRows, CCols); + BC.AddMatMat(1.0, B, transB, C, transC, 0.0); // BC = B * C. + (*this).AddMatMat(alpha, A, transA, BC, kNoTrans, beta); + } +} + + + + +template +void MatrixBase::DestructiveSvd(VectorBase *s, MatrixBase *U, MatrixBase *Vt) { + // Svd, *this = U*diag(s)*Vt. 
+ // With (*this).num_rows_ == m, (*this).num_cols_ == n, + // Support only skinny Svd with m>=n (NumRows>=NumCols), and zero sizes for U and Vt mean + // we do not want that output. We expect that s.Dim() == m, + // U is either 0 by 0 or m by n, and rv is either 0 by 0 or n by n. + // Throws exception on error. + + KALDI_ASSERT(num_rows_>=num_cols_ && "Svd requires that #rows by >= #cols."); // For compatibility with JAMA code. + KALDI_ASSERT(s->Dim() == num_cols_); // s should be the smaller dim. + KALDI_ASSERT(U == NULL || (U->num_rows_ == num_rows_&&U->num_cols_ == num_cols_)); + KALDI_ASSERT(Vt == NULL || (Vt->num_rows_ == num_cols_&&Vt->num_cols_ == num_cols_)); + + Real prescale = 1.0; + if ( std::abs((*this)(0, 0) ) < 1.0e-30) { // Very tiny value... can cause problems in Svd. + Real max_elem = LargestAbsElem(); + if (max_elem != 0) { + prescale = 1.0 / max_elem; + if (std::abs(prescale) == std::numeric_limits::infinity()) { prescale = 1.0e+40; } + (*this).Scale(prescale); + } + } + +#if !defined(HAVE_ATLAS) && !defined(USE_KALDI_SVD) + // "S" == skinny Svd (only one we support because of compatibility with Jama one which is only skinny), + // "N"== no eigenvectors wanted. + LapackGesvd(s, U, Vt); +#else + /* if (num_rows_ > 1 && num_cols_ > 1 && (*this)(0, 0) == (*this)(1, 1) + && Max() == Min() && (*this)(0, 0) != 0.0) { // special case that JamaSvd sometimes crashes on. + KALDI_WARN << "Jama SVD crashes on this type of matrix, perturbing it to prevent crash."; + for(int32 i = 0; i < NumRows(); i++) + (*this)(i, i) *= 1.00001; + }*/ + bool ans = JamaSvd(s, U, Vt); + if (Vt != NULL) Vt->Transpose(); // possibly to do: change this and also the transpose inside the JamaSvd routine. note, Vt is square. + if (!ans) { + KALDI_ERR << "Error doing Svd"; // This one will be caught. 
+ } +#endif + if (prescale != 1.0) s->Scale(1.0/prescale); +} + +template +void MatrixBase::Svd(VectorBase *s, MatrixBase *U, MatrixBase *Vt) const { + try { + if (num_rows_ >= num_cols_) { + Matrix tmp(*this); + tmp.DestructiveSvd(s, U, Vt); + } else { + Matrix tmp(*this, kTrans); // transpose of *this. + // rVt will have different dim so cannot transpose in-place --> use a temp matrix. + Matrix Vt_Trans(Vt ? Vt->num_cols_ : 0, Vt ? Vt->num_rows_ : 0); + // U will be transpose + tmp.DestructiveSvd(s, Vt ? &Vt_Trans : NULL, U); + if (U) U->Transpose(); + if (Vt) Vt->CopyFromMat(Vt_Trans, kTrans); // copy with transpose. + } + } catch (...) { + KALDI_ERR << "Error doing Svd (did not converge), first part of matrix is\n" + << SubMatrix(*this, 0, std::min((MatrixIndexT)10, num_rows_), + 0, std::min((MatrixIndexT)10, num_cols_)) + << ", min and max are: " << Min() << ", " << Max(); + } +} + +template +bool MatrixBase::IsSymmetric(Real cutoff) const { + MatrixIndexT R = num_rows_, C = num_cols_; + if (R != C) return false; + Real bad_sum = 0.0, good_sum = 0.0; + for (MatrixIndexT i = 0;i < R;i++) { + for (MatrixIndexT j = 0;j < i;j++) { + Real a = (*this)(i, j), b = (*this)(j, i), avg = 0.5*(a+b), diff = 0.5*(a-b); + good_sum += std::abs(avg); bad_sum += std::abs(diff); + } + good_sum += std::abs((*this)(i, i)); + } + if (bad_sum > cutoff*good_sum) return false; + return true; +} + +template +bool MatrixBase::IsDiagonal(Real cutoff) const{ + MatrixIndexT R = num_rows_, C = num_cols_; + Real bad_sum = 0.0, good_sum = 0.0; + for (MatrixIndexT i = 0;i < R;i++) { + for (MatrixIndexT j = 0;j < C;j++) { + if (i == j) good_sum += std::abs((*this)(i, j)); + else bad_sum += std::abs((*this)(i, j)); + } + } + return (!(bad_sum > good_sum * cutoff)); +} + +// This does nothing, it's designed to trigger Valgrind errors +// if any memory is uninitialized. 
+template +void MatrixBase::TestUninitialized() const { + MatrixIndexT R = num_rows_, C = num_cols_, positive = 0; + for (MatrixIndexT i = 0; i < R; i++) + for (MatrixIndexT j = 0; j < C; j++) + if ((*this)(i, j) > 0.0) positive++; + if (positive > R * C) + KALDI_ERR << "Error...."; +} + + +template +bool MatrixBase::IsUnit(Real cutoff) const { + MatrixIndexT R = num_rows_, C = num_cols_; + Real bad_max = 0.0; + for (MatrixIndexT i = 0; i < R;i++) + for (MatrixIndexT j = 0; j < C;j++) + bad_max = std::max(bad_max, static_cast(std::abs( (*this)(i, j) - (i == j?1.0:0.0)))); + return (bad_max <= cutoff); +} + +template +bool MatrixBase::IsZero(Real cutoff)const { + MatrixIndexT R = num_rows_, C = num_cols_; + Real bad_max = 0.0; + for (MatrixIndexT i = 0;i < R;i++) + for (MatrixIndexT j = 0;j < C;j++) + bad_max = std::max(bad_max, static_cast(std::abs( (*this)(i, j) ))); + return (bad_max <= cutoff); +} + +template +Real MatrixBase::FrobeniusNorm() const{ + return std::sqrt(TraceMatMat(*this, *this, kTrans)); +} + +template +bool MatrixBase::ApproxEqual(const MatrixBase &other, float tol) const { + if (num_rows_ != other.num_rows_ || num_cols_ != other.num_cols_) + KALDI_ERR << "ApproxEqual: size mismatch."; + Matrix tmp(*this); + tmp.AddMat(-1.0, other); + return (tmp.FrobeniusNorm() <= static_cast(tol) * + this->FrobeniusNorm()); +} + +template +bool MatrixBase::Equal(const MatrixBase &other) const { + if (num_rows_ != other.num_rows_ || num_cols_ != other.num_cols_) + KALDI_ERR << "Equal: size mismatch."; + for (MatrixIndexT i = 0; i < num_rows_; i++) + for (MatrixIndexT j = 0; j < num_cols_; j++) + if ( (*this)(i, j) != other(i, j)) + return false; + return true; +} + + +template +Real MatrixBase::LargestAbsElem() const{ + MatrixIndexT R = num_rows_, C = num_cols_; + Real largest = 0.0; + for (MatrixIndexT i = 0;i < R;i++) + for (MatrixIndexT j = 0;j < C;j++) + largest = std::max(largest, (Real)std::abs((*this)(i, j))); + return largest; +} + + +template +void 
MatrixBase::OrthogonalizeRows() { + KALDI_ASSERT(NumRows() <= NumCols()); + MatrixIndexT num_rows = num_rows_; + for (MatrixIndexT i = 0; i < num_rows; i++) { + int32 counter = 0; + while (1) { + Real start_prod = VecVec(this->Row(i), this->Row(i)); + if (start_prod - start_prod != 0.0 || start_prod == 0.0) { + KALDI_WARN << "Self-product of row " << i << " of matrix is " + << start_prod << ", randomizing."; + this->Row(i).SetRandn(); + counter++; + continue; // loop again. + } + for (MatrixIndexT j = 0; j < i; j++) { + Real prod = VecVec(this->Row(i), this->Row(j)); + this->Row(i).AddVec(-prod, this->Row(j)); + } + Real end_prod = VecVec(this->Row(i), this->Row(i)); + if (end_prod <= 0.01 * start_prod) { // We removed + // almost all of the vector during orthogonalization, + // so we have reason to doubt (for roundoff reasons) + // that it's still orthogonal to the other vectors. + // We need to orthogonalize again. + if (end_prod == 0.0) { // Row is exactly zero: + // generate random direction. + this->Row(i).SetRandn(); + } + counter++; + if (counter > 100) + KALDI_ERR << "Loop detected while orthogalizing matrix."; + } else { + this->Row(i).Scale(1.0 / std::sqrt(end_prod)); + break; + } + } + } +} + + +// Uses Svd to compute the eigenvalue decomposition of a symmetric positive semidefinite +// matrix: +// (*this) = rU * diag(rs) * rU^T, with rU an orthogonal matrix so rU^{-1} = rU^T. +// Does this by computing svd (*this) = U diag(rs) V^T ... answer is just U diag(rs) U^T. +// Throws exception if this failed to within supplied precision (typically because *this was not +// symmetric positive definite). + +template +void MatrixBase::SymPosSemiDefEig(VectorBase *rs, MatrixBase *rU, Real check_thresh) // e.g. 
check_thresh = 0.001 +{ + const MatrixIndexT D = num_rows_; + + KALDI_ASSERT(num_rows_ == num_cols_); + KALDI_ASSERT(IsSymmetric() && "SymPosSemiDefEig: expecting input to be symmetrical."); + KALDI_ASSERT(rU->num_rows_ == D && rU->num_cols_ == D && rs->Dim() == D); + + Matrix Vt(D, D); + Svd(rs, rU, &Vt); + + // First just zero any singular values if the column of U and V do not have +ve dot product-- + // this may mean we have small negative eigenvalues, and if we zero them the result will be closer to correct. + for (MatrixIndexT i = 0;i < D;i++) { + Real sum = 0.0; + for (MatrixIndexT j = 0;j < D;j++) sum += (*rU)(j, i) * Vt(i, j); + if (sum < 0.0) (*rs)(i) = 0.0; + } + + { + Matrix tmpU(*rU); Vector tmps(*rs); tmps.ApplyPow(0.5); + tmpU.MulColsVec(tmps); + SpMatrix tmpThis(D); + tmpThis.AddMat2(1.0, tmpU, kNoTrans, 0.0); + Matrix tmpThisFull(tmpThis); + float new_norm = tmpThisFull.FrobeniusNorm(); + float old_norm = (*this).FrobeniusNorm(); + tmpThisFull.AddMat(-1.0, (*this)); + + if (!(old_norm == 0 && new_norm == 0)) { + float diff_norm = tmpThisFull.FrobeniusNorm(); + if (std::abs(new_norm-old_norm) > old_norm*check_thresh || diff_norm > old_norm*check_thresh) { + KALDI_WARN << "SymPosSemiDefEig seems to have failed " << diff_norm << " !<< " + << check_thresh << "*" << old_norm << ", maybe matrix was not " + << "positive semi definite. Continuing anyway."; + } + } + } +} + + +template +Real MatrixBase::LogDet(Real *det_sign) const { + Real log_det; + Matrix tmp(*this); + tmp.Invert(&log_det, det_sign, false); // false== output not needed (saves some computation). 
+ return log_det; +} + +template +void MatrixBase::InvertDouble(Real *log_det, Real *det_sign, + bool inverse_needed) { + double log_det_tmp, det_sign_tmp; + Matrix dmat(*this); + dmat.Invert(&log_det_tmp, &det_sign_tmp, inverse_needed); + if (inverse_needed) (*this).CopyFromMat(dmat); + if (log_det) *log_det = log_det_tmp; + if (det_sign) *det_sign = det_sign_tmp; +} + +template +void MatrixBase::CopyFromMat(const CompressedMatrix &mat) { + mat.CopyToMat(this); +} + +template +Matrix::Matrix(const CompressedMatrix &M): MatrixBase() { + Resize(M.NumRows(), M.NumCols(), kUndefined); + M.CopyToMat(this); +} + + + +template +void MatrixBase::InvertElements() { + for (MatrixIndexT r = 0; r < num_rows_; r++) { + for (MatrixIndexT c = 0; c < num_cols_; c++) { + (*this)(r, c) = static_cast(1.0 / (*this)(r, c)); + } + } +} + +template +void MatrixBase::Transpose() { + KALDI_ASSERT(num_rows_ == num_cols_); + MatrixIndexT M = num_rows_; + for (MatrixIndexT i = 0;i < M;i++) + for (MatrixIndexT j = 0;j < i;j++) { + Real &a = (*this)(i, j), &b = (*this)(j, i); + std::swap(a, b); + } +} + + +template +void Matrix::Transpose() { + if (this->num_rows_ != this->num_cols_) { + Matrix tmp(*this, kTrans); + Resize(this->num_cols_, this->num_rows_); + this->CopyFromMat(tmp); + } else { + (static_cast&>(*this)).Transpose(); + } +} + +template +void MatrixBase::Heaviside(const MatrixBase &src) { + KALDI_ASSERT(SameDim(*this, src)); + MatrixIndexT num_rows = num_rows_, num_cols = num_cols_; + Real *row_data = data_; + const Real *src_row_data = src.Data(); + for (MatrixIndexT row = 0; row < num_rows; + row++,row_data += stride_, src_row_data += src.stride_) { + for (MatrixIndexT col = 0; col < num_cols; col++) + row_data[col] = (src_row_data[col] > 0 ? 
1.0 : 0.0);
+  }
+}
+
+// Sets each element of *this to exp() of the matching element of src.
+// src must have the same dimensions as *this (asserted).
+template<typename Real>
+void MatrixBase<Real>::Exp(const MatrixBase<Real> &src) {
+  KALDI_ASSERT(SameDim(*this, src));
+  MatrixIndexT num_rows = num_rows_, num_cols = num_cols_;
+  Real *row_data = data_;
+  const Real *src_row_data = src.Data();
+  // Both row pointers advance by their own stride, so the two matrices may
+  // have different strides.
+  for (MatrixIndexT row = 0; row < num_rows;
+       row++, row_data += stride_, src_row_data += src.stride_) {
+    for (MatrixIndexT col = 0; col < num_cols; col++)
+      row_data[col] = kaldi::Exp(src_row_data[col]);
+  }
+}
+
+// Sets each element of *this to the matching element of src raised to
+// "power".  src must have the same dimensions as *this (asserted).
+template<typename Real>
+void MatrixBase<Real>::Pow(const MatrixBase<Real> &src, Real power) {
+  KALDI_ASSERT(SameDim(*this, src));
+  MatrixIndexT num_rows = num_rows_, num_cols = num_cols_;
+  Real *row_data = data_;
+  const Real *src_row_data = src.Data();
+  for (MatrixIndexT row = 0; row < num_rows;
+       row++, row_data += stride_, src_row_data += src.stride_) {
+    for (MatrixIndexT col = 0; col < num_cols; col++) {
+      row_data[col] = pow(src_row_data[col], power);
+    }
+  }
+}
+
+// Sets each element of *this to |src element|^power.  If include_sign is
+// true, the result is negated wherever the source element is negative.
+template<typename Real>
+void MatrixBase<Real>::PowAbs(const MatrixBase<Real> &src, Real power,
+                              bool include_sign) {
+  KALDI_ASSERT(SameDim(*this, src));
+  MatrixIndexT num_rows = num_rows_, num_cols = num_cols_;
+  Real *row_data = data_;
+  const Real *src_row_data = src.Data();
+  for (MatrixIndexT row = 0; row < num_rows;
+       row++, row_data += stride_, src_row_data += src.stride_) {
+    for (MatrixIndexT col = 0; col < num_cols; col ++) {
+      if (include_sign == true && src_row_data[col] < 0) {
+        row_data[col] = -pow(std::abs(src_row_data[col]), power);
+      } else {
+        row_data[col] = pow(std::abs(src_row_data[col]), power);
+      }
+    }
+  }
+}
+
+// Element-wise max(src element, floor_val), written into *this.
+template<typename Real>
+void MatrixBase<Real>::Floor(const MatrixBase<Real> &src, Real floor_val) {
+  KALDI_ASSERT(SameDim(*this, src));
+  MatrixIndexT num_rows = num_rows_, num_cols = num_cols_;
+  Real *row_data = data_;
+  const Real *src_row_data = src.Data();
+  for (MatrixIndexT row = 0; row < num_rows;
+       row++, row_data += stride_, src_row_data += src.stride_) {
+    for (MatrixIndexT col = 0; col < num_cols; col++)
+      row_data[col] = (src_row_data[col] < floor_val ?
+                       floor_val : src_row_data[col]);
+  }
+}
+
+// Element-wise min(src element, ceiling_val), written into *this.
+template<typename Real>
+void MatrixBase<Real>::Ceiling(const MatrixBase<Real> &src, Real ceiling_val) {
+  KALDI_ASSERT(SameDim(*this, src));
+  MatrixIndexT num_rows = num_rows_, num_cols = num_cols_;
+  Real *row_data = data_;
+  const Real *src_row_data = src.Data();
+  for (MatrixIndexT row = 0; row < num_rows;
+       row++, row_data += stride_, src_row_data += src.stride_) {
+    for (MatrixIndexT col = 0; col < num_cols; col++)
+      row_data[col] = (src_row_data[col] > ceiling_val ?
+                       ceiling_val : src_row_data[col]);
+  }
+}
+
+// Sets each element of *this to kaldi::Log() of the matching element of src.
+template<typename Real>
+void MatrixBase<Real>::Log(const MatrixBase<Real> &src) {
+  KALDI_ASSERT(SameDim(*this, src));
+  MatrixIndexT num_rows = num_rows_, num_cols = num_cols_;
+  Real *row_data = data_;
+  const Real *src_row_data = src.Data();
+  for (MatrixIndexT row = 0; row < num_rows;
+       row++, row_data += stride_, src_row_data += src.stride_) {
+    for (MatrixIndexT col = 0; col < num_cols; col++)
+      row_data[col] = kaldi::Log(src_row_data[col]);
+  }
+}
+
+// Piecewise function of each element x of src, written into *this:
+// exp(x) for x < 0, and x + 1 otherwise.
+template<typename Real>
+void MatrixBase<Real>::ExpSpecial(const MatrixBase<Real> &src) {
+  KALDI_ASSERT(SameDim(*this, src));
+  MatrixIndexT num_rows = num_rows_, num_cols = num_cols_;
+  Real *row_data = data_;
+  const Real *src_row_data = src.Data();
+  for (MatrixIndexT row = 0; row < num_rows;
+       row++, row_data += stride_, src_row_data += src.stride_) {
+    for (MatrixIndexT col = 0; col < num_cols; col++)
+      row_data[col] = (src_row_data[col] < Real(0) ?
+                       kaldi::Exp(src_row_data[col]) :
+                       (src_row_data[col] + Real(1)));
+  }
+}
+
+// Sets each element of *this to exp(x), where x is the matching element of
+// src clamped to [lower_limit, upper_limit] before exponentiation.
+template<typename Real>
+void MatrixBase<Real>::ExpLimited(const MatrixBase<Real> &src,
+                                  Real lower_limit, Real upper_limit) {
+  KALDI_ASSERT(SameDim(*this, src));
+  MatrixIndexT num_rows = num_rows_, num_cols = num_cols_;
+  Real *row_data = data_;
+  const Real *src_row_data = src.Data();
+  for (MatrixIndexT row = 0; row < num_rows;
+       row++, row_data += stride_, src_row_data += src.stride_) {
+    for (MatrixIndexT col = 0; col < num_cols; col++) {
+      const Real x = src_row_data[col];
+      // Written as !(x >= lower_limit) rather than (x < lower_limit) so that
+      // a NaN input also takes this branch.
+      if (!(x >= lower_limit))
+        row_data[col] = kaldi::Exp(lower_limit);
+      else if (x > upper_limit)
+        row_data[col] = kaldi::Exp(upper_limit);
+      else
+        row_data[col] = kaldi::Exp(x);
+    }
+  }
+}
+
+// Replaces the square matrix *this with (*this)^power, computed through the
+// eigenvalue decomposition P D P^{-1}.  Returns false (leaving *this
+// unchanged) if AttemptComplexPower() rejects any eigenvalue.
+template<typename Real>
+bool MatrixBase<Real>::Power(Real power) {
+  KALDI_ASSERT(num_rows_ > 0 && num_rows_ == num_cols_);
+  MatrixIndexT n = num_rows_;
+  Matrix<Real> P(n, n);
+  Vector<Real> re(n), im(n);
+  this->Eig(&P, &re, &im);
+  // Now attempt to take the complex eigenvalues to this power.
+  for (MatrixIndexT i = 0; i < n; i++)
+    if (!AttemptComplexPower(&(re(i)), &(im(i)), power))
+      return false;  // e.g. real and negative, or zero, eigenvalues.
+
+  Matrix<Real> D(n, n);  // D to the power.
+  CreateEigenvalueMatrix(re, im, &D);
+
+  Matrix<Real> tmp(n, n);  // P times D
+  tmp.AddMatMat(1.0, P, kNoTrans, D, kNoTrans, 0.0);  // tmp := P*D
+  P.Invert();
+  // next line is: *this = tmp * P^{-1} = P * D * P^{-1}
+  (*this).AddMatMat(1.0, tmp, kNoTrans, P, kNoTrans, 0.0);
+  return true;
+}
+
+// Exchanges the data pointer and the dimensions/stride of *this and *other;
+// no element data is copied.
+template<typename Real>
+void Matrix<Real>::Swap(Matrix<Real> *other) {
+  std::swap(this->data_, other->data_);
+  std::swap(this->num_cols_, other->num_cols_);
+  std::swap(this->num_rows_, other->num_rows_);
+  std::swap(this->stride_, other->stride_);
+}
+
+// Repeating this comment that appeared in the header:
+// Eigenvalue Decomposition of a square NxN matrix into the form (*this) = P D
+// P^{-1}.  Be careful: the relationship of D to the eigenvalues we output is
+// slightly complicated, due to the need for P to be real.  In the symmetric
+// case D is diagonal and real, but in
+// the non-symmetric case there may be complex-conjugate pairs of eigenvalues.
+// In this case, for the equation (*this) = P D P^{-1} to hold, D must actually
+// be block diagonal, with 2x2 blocks corresponding to any such pairs.  If a
+// pair is lambda +- i*mu, D will have a corresponding 2x2 block
+// [lambda, mu; -mu, lambda].
+// Note that if the input matrix (*this) is non-invertible, P may not be invertible
+// so in this case instead of the equation (*this) = P D P^{-1} holding, we have
+// instead (*this) P = P D.
+//
+// By making the pointer arguments non-NULL or NULL, the user can choose to take
+// or not to take the eigenvalues directly, and/or the matrix D which is
+// block-diagonal with 2x2 blocks.
+template<typename Real>
+void MatrixBase<Real>::Eig(MatrixBase<Real> *P,
+                           VectorBase<Real> *r,
+                           VectorBase<Real> *i) const {
+  EigenvalueDecomposition<Real> eig(*this);
+  if (P) eig.GetV(P);
+  if (r) eig.GetRealEigenvalues(r);
+  if (i) eig.GetImagEigenvalues(i);
+}
+
+
+// Begin non-member function definitions.
+
+//  /**
+//   * @brief Extension of the HTK header
+//   */
+//  struct HtkHeaderExt
+//  {
+//  INT_32 mHeaderSize;
+//  INT_32 mVersion;
+//  INT_32 mSampSize;
+//  };
+
+// Reads one HTK feature file from "is" into *M_ptr (resized to fit).
+//   is          input stream positioned at the start of an HTK header.
+//   M_ptr       receives the feature matrix, one row per sample.
+//   header_ptr  if non-NULL, receives the header with its fields byte-swapped
+//               and the checksum flag cleared.
+// Returns false (after a warning) if the header or the data cannot be read.
+// Calls KALDI_ERR for compressed/VQ features and for waveform, Irefc or
+// discrete parameter kinds.  A trailing checksum, if flagged, is consumed but
+// not verified.
+template<typename Real>
+bool ReadHtk(std::istream &is, Matrix<Real> *M_ptr, HtkHeader *header_ptr)
+{
+  // check instantiated with double or float.
+  KALDI_ASSERT_IS_FLOATING_TYPE(Real);
+  Matrix<Real> &M = *M_ptr;
+  HtkHeader htk_hdr;
+
+  // TODO(arnab): compressed HTK files are rejected below, and a CRC checksum
+  // is skipped rather than verified.
+  is.read((char*)&htk_hdr, sizeof(htk_hdr));  // we're being really POSIX here!
+  if (is.fail()) {
+    KALDI_WARN << "Could not read header from HTK feature file ";
+    return false;
+  }
+
+  // NOTE(review): the header fields are swapped unconditionally, whereas the
+  // data below is swapped only if MachineIsLittleEndian() -- confirm the
+  // intended behaviour on big-endian hosts.
+  KALDI_SWAP4(htk_hdr.mNSamples);
+  KALDI_SWAP4(htk_hdr.mSamplePeriod);
+  KALDI_SWAP2(htk_hdr.mSampleSize);
+  KALDI_SWAP2(htk_hdr.mSampleKind);
+
+  bool has_checksum = false;
+  {
+    // See HParm.h in HTK code for sources of these things.
+    enum BaseParmKind{
+      Waveform, Lpc, Lprefc, Lpcepstra, Lpdelcep,
+      Irefc, Mfcc, Fbank, Melspec, User, Discrete, Plp, Anon };
+
+    const int32 IsCompressed = 02000, HasChecksum = 010000, HasVq = 040000,
+        Problem = IsCompressed | HasVq;
+    int32 base_parm = htk_hdr.mSampleKind & (077);
+    // The checksum flag (010000) lies outside the 6-bit base-kind field, so
+    // it has to be tested on the full sample kind; testing it on base_parm
+    // can never succeed and would leave the checksum bytes in the stream.
+    has_checksum = (htk_hdr.mSampleKind & HasChecksum) != 0;
+    htk_hdr.mSampleKind &= ~HasChecksum;  // We don't support writing with
+                                          // checksum so turn it off.
+    if (htk_hdr.mSampleKind & Problem)
+      KALDI_ERR << "Code to read HTK features does not support compressed "
+          "features, or features with VQ.";
+    if (base_parm == Waveform || base_parm == Irefc || base_parm == Discrete)
+      KALDI_ERR << "Attempting to read HTK features from unsupported type "
+          "(e.g. waveform or discrete features.";
+  }
+
+  KALDI_VLOG(3) << "HTK header: Num Samples: " << htk_hdr.mNSamples
+                << "; Sample period: " << htk_hdr.mSamplePeriod
+                << "; Sample size: " << htk_hdr.mSampleSize
+                << "; Sample kind: " << htk_hdr.mSampleKind;
+
+  M.Resize(htk_hdr.mNSamples, htk_hdr.mSampleSize / sizeof(float));
+
+  MatrixIndexT i;
+  MatrixIndexT j;
+  if (sizeof(Real) == sizeof(float)) {
+    // Same width as the on-disk floats: read straight into each row.
+    for (i = 0; i< M.NumRows(); i++) {
+      is.read((char*)M.RowData(i), sizeof(float)*M.NumCols());
+      if (is.fail()) {
+        KALDI_WARN << "Could not read data from HTK feature file ";
+        return false;
+      }
+      if (MachineIsLittleEndian()) {
+        MatrixIndexT C = M.NumCols();
+        for (j = 0; j < C; j++) {
+          KALDI_SWAP4((M(i, j)));  // The HTK standard is big-endian!
+        }
+      }
+    }
+  } else {
+    // Different width: read each row into a float buffer and convert.
+    float *pmem = new float[M.NumCols()];
+    for (i = 0; i < M.NumRows(); i++) {
+      is.read((char*)pmem, sizeof(float)*M.NumCols());
+      if (is.fail()) {
+        KALDI_WARN << "Could not read data from HTK feature file ";
+        delete [] pmem;
+        return false;
+      }
+      MatrixIndexT C = M.NumCols();
+      for (j = 0; j < C; j++) {
+        if (MachineIsLittleEndian())  // HTK standard is big-endian!
+          KALDI_SWAP4(pmem[j]);
+        M(i, j) = static_cast<Real>(pmem[j]);
+      }
+    }
+    delete [] pmem;
+  }
+  if (header_ptr) *header_ptr = htk_hdr;
+  if (has_checksum) {
+    int16 checksum;
+    is.read((char*)&checksum, sizeof(checksum));
+    if (is.fail())
+      KALDI_WARN << "Could not read checksum from HTK feature file ";
+    // We ignore the checksum.
+  }
+  return true;
+}
+
+
+template
+bool ReadHtk(std::istream &is, Matrix<float> *M, HtkHeader *header_ptr);
+
+template
+bool ReadHtk(std::istream &is, Matrix<double> *M, HtkHeader *header_ptr);
+
+// Writes M to "os" as an HTK feature file: the header, then the rows as
+// big-endian floats.  The header's sample count and sample size must agree
+// with the dimensions of M (asserted).  Returns false after a warning if any
+// write fails.
+template<typename Real>
+bool WriteHtk(std::ostream &os, const MatrixBase<Real> &M, HtkHeader htk_hdr) // header may be derived from a previous call to ReadHtk.  Must be in binary mode.
+{
+  KALDI_ASSERT(M.NumRows() == static_cast<MatrixIndexT>(htk_hdr.mNSamples));
+  KALDI_ASSERT(M.NumCols() == static_cast<MatrixIndexT>(htk_hdr.mSampleSize) /
+               static_cast<MatrixIndexT>(sizeof(float)));
+
+  // htk_hdr was passed by value, so swapping here does not affect the caller.
+  KALDI_SWAP4(htk_hdr.mNSamples);
+  KALDI_SWAP4(htk_hdr.mSamplePeriod);
+  KALDI_SWAP2(htk_hdr.mSampleSize);
+  KALDI_SWAP2(htk_hdr.mSampleKind);
+
+  os.write((char*)&htk_hdr, sizeof(htk_hdr));
+  if (os.fail())  goto bad;
+
+  MatrixIndexT i;
+  MatrixIndexT j;
+  if (sizeof(Real) == sizeof(float) && !MachineIsLittleEndian()) {
+    for (i = 0; i< M.NumRows(); i++) {  // Unlikely to reach here ever!
+      os.write((char*)M.RowData(i), sizeof(float)*M.NumCols());
+      if (os.fail()) goto bad;
+    }
+  } else {
+    // Convert each row to float, byte-swap if needed, then write it.
+    float *pmem = new float[M.NumCols()];
+
+    for (i = 0; i < M.NumRows(); i++) {
+      const Real *rowData = M.RowData(i);
+      for (j = 0;j < M.NumCols();j++)
+        pmem[j] =  static_cast<float> ( rowData[j] );
+      if (MachineIsLittleEndian())
+        for (j = 0;j < M.NumCols();j++)
+          KALDI_SWAP4(pmem[j]);
+      os.write((char*)pmem, sizeof(float)*M.NumCols());
+      if (os.fail()) {
+        delete [] pmem;
+        goto bad;
+      }
+    }
+    delete [] pmem;
+  }
+  return true;
+bad:
+  KALDI_WARN << "Could not write to HTK feature file ";
+  return false;
+}
+
+template
+bool WriteHtk(std::ostream &os, const MatrixBase<float> &M, HtkHeader htk_hdr);
+
+template
+bool WriteHtk(std::ostream &os, const MatrixBase<double> &M, HtkHeader htk_hdr);
+
+// Writes M to "os" as a CMUSphinx mfc file: an int holding the number of
+// floats, then the rows as little-endian floats.  Returns false after a
+// warning if any write fails.
+template<class Real>
+bool WriteSphinx(std::ostream &os, const MatrixBase<Real> &M)
+{
+  // CMUSphinx mfc file header contains count of the floats, followed
+  // by the data in float little endian format.
+
+  // NOTE(review): the count is written in host byte order, whereas the data
+  // is always converted to little-endian -- confirm for big-endian hosts.
+  int size = M.NumRows() * M.NumCols();
+  os.write((char*)&size, sizeof(int));
+  if (os.fail())  goto bad;
+
+  MatrixIndexT i;
+  MatrixIndexT j;
+  if (sizeof(Real) == sizeof(float) && MachineIsLittleEndian()) {
+    // Float matrix on a little-endian host: rows already have the on-disk
+    // layout, so write them directly.
+    for (i = 0; i< M.NumRows(); i++) {
+      os.write((char*)M.RowData(i), sizeof(float)*M.NumCols());
+      if (os.fail()) goto bad;
+    }
+  } else {
+    // Convert each row to float, byte-swap if needed, then write it.
+    float *pmem = new float[M.NumCols()];
+
+    for (i = 0; i < M.NumRows(); i++) {
+      const Real *rowData = M.RowData(i);
+      for (j = 0;j < M.NumCols();j++)
+        pmem[j] =  static_cast<float> ( rowData[j] );
+      if (!MachineIsLittleEndian())
+        for (j = 0;j < M.NumCols();j++)
+          KALDI_SWAP4(pmem[j]);
+      os.write((char*)pmem, sizeof(float)*M.NumCols());
+      if (os.fail()) {
+        delete [] pmem;
+        goto bad;
+      }
+    }
+    delete [] pmem;
+  }
+  return true;
+bad:
+  KALDI_WARN << "Could not write to Sphinx feature file";
+  return false;
+}
+
+template
+bool WriteSphinx(std::ostream &os, const MatrixBase<float> &M);
+
+template
+bool WriteSphinx(std::ostream &os, const MatrixBase<double> &M);
+
+// Returns tr(A B C), with each argument optionally transposed.  The product
+// with the smallest intermediate matrix is formed first, using the fact that
+// the trace is invariant under cyclic permutation of the factors.
+template <typename Real>
+Real TraceMatMatMat(const MatrixBase<Real> &A, MatrixTransposeType transA,
+                    const MatrixBase<Real> &B, MatrixTransposeType transB,
+                    const MatrixBase<Real> &C, MatrixTransposeType transC) {
+  MatrixIndexT ARows = A.NumRows(), ACols = A.NumCols(),
+      BRows = B.NumRows(), BCols = B.NumCols(),
+      CRows = C.NumRows(), CCols = C.NumCols();
+  if (transA == kTrans) std::swap(ARows, ACols);
+  if (transB == kTrans) std::swap(BRows, BCols);
+  if (transC == kTrans) std::swap(CRows, CCols);
+  KALDI_ASSERT( CCols == ARows && ACols == BRows && BCols == CRows && "TraceMatMatMat: args have mismatched dimensions.");
+  if (ARows*BCols < std::min(BRows*CCols, CRows*ACols)) {
+    Matrix<Real> AB(ARows, BCols);
+    AB.AddMatMat(1.0, A, transA, B, transB, 0.0);  // AB = A * B.
+    return TraceMatMat(AB, C, transC);
+  } else if ( BRows*CCols < CRows*ACols) {
+    Matrix<Real> BC(BRows, CCols);
+    BC.AddMatMat(1.0, B, transB, C, transC, 0.0);  // BC = B * C.
+    return TraceMatMat(BC, A, transA);
+  } else {
+    Matrix<Real> CA(CRows, ACols);
+    CA.AddMatMat(1.0, C, transC, A, transA, 0.0);  // CA = C * A
+    return TraceMatMat(CA, B, transB);
+  }
+}
+
+template
+float TraceMatMatMat(const MatrixBase<float> &A, MatrixTransposeType transA,
+                     const MatrixBase<float> &B, MatrixTransposeType transB,
+                     const MatrixBase<float> &C, MatrixTransposeType transC);
+
+template
+double TraceMatMatMat(const MatrixBase<double> &A, MatrixTransposeType transA,
+                      const MatrixBase<double> &B, MatrixTransposeType transB,
+                      const MatrixBase<double> &C, MatrixTransposeType transC);
+
+
+// Returns tr(A B C D), with each argument optionally transposed.  Forms the
+// adjacent pair with the smallest product first, then defers to
+// TraceMatMatMat() on the three remaining factors.
+template <typename Real>
+Real TraceMatMatMatMat(const MatrixBase<Real> &A, MatrixTransposeType transA,
+                       const MatrixBase<Real> &B, MatrixTransposeType transB,
+                       const MatrixBase<Real> &C, MatrixTransposeType transC,
+                       const MatrixBase<Real> &D, MatrixTransposeType transD) {
+  MatrixIndexT ARows = A.NumRows(), ACols = A.NumCols(),
+      BRows = B.NumRows(), BCols = B.NumCols(),
+      CRows = C.NumRows(), CCols = C.NumCols(),
+      DRows = D.NumRows(), DCols = D.NumCols();
+  if (transA == kTrans) std::swap(ARows, ACols);
+  if (transB == kTrans) std::swap(BRows, BCols);
+  if (transC == kTrans) std::swap(CRows, CCols);
+  if (transD == kTrans) std::swap(DRows, DCols);
+  KALDI_ASSERT( DCols == ARows && ACols == BRows && BCols == CRows && CCols == DRows && "TraceMatMatMatMat: args have mismatched dimensions.");
+  if (ARows*BCols < std::min(BRows*CCols, std::min(CRows*DCols, DRows*ACols))) {
+    Matrix<Real> AB(ARows, BCols);
+    AB.AddMatMat(1.0, A, transA, B, transB, 0.0);  // AB = A * B.
+    return TraceMatMatMat(AB, kNoTrans, C, transC, D, transD);
+  } else if ((BRows*CCols) < std::min(CRows*DCols, DRows*ACols)) {
+    Matrix<Real> BC(BRows, CCols);
+    BC.AddMatMat(1.0, B, transB, C, transC, 0.0);  // BC = B * C.
+    return TraceMatMatMat(BC, kNoTrans, D, transD, A, transA);
+  } else if (CRows*DCols < DRows*ACols) {
+    Matrix<Real> CD(CRows, DCols);
+    CD.AddMatMat(1.0, C, transC, D, transD, 0.0);  // CD = C * D
+    return TraceMatMatMat(CD, kNoTrans, A, transA, B, transB);
+  } else {
+    Matrix<Real> DA(DRows, ACols);
+    DA.AddMatMat(1.0, D, transD, A, transA, 0.0);  // DA = D * A
+    return TraceMatMatMat(DA, kNoTrans, B, transB, C, transC);
+  }
+}
+
+template
+float TraceMatMatMatMat(const MatrixBase<float> &A, MatrixTransposeType transA,
+                        const MatrixBase<float> &B, MatrixTransposeType transB,
+                        const MatrixBase<float> &C, MatrixTransposeType transC,
+                        const MatrixBase<float> &D, MatrixTransposeType transD);
+
+template
+double TraceMatMatMatMat(const MatrixBase<double> &A, MatrixTransposeType transA,
+                         const MatrixBase<double> &B, MatrixTransposeType transB,
+                         const MatrixBase<double> &C, MatrixTransposeType transC,
+                         const MatrixBase<double> &D, MatrixTransposeType transD);
+
+// Reorders the singular values in *s from greatest to least (by absolute
+// value if sort_on_absolute_value is true), applying the same permutation to
+// the columns of *U and the rows of *Vt.  U and Vt may each be NULL.
+template<typename Real> void  SortSvd(VectorBase<Real> *s, MatrixBase<Real> *U,
+                                      MatrixBase<Real> *Vt, bool sort_on_absolute_value) {
+  /// Makes sure the Svd is sorted (from greatest to least absolute value).
+  MatrixIndexT num_singval = s->Dim();
+  KALDI_ASSERT(U == NULL || U->NumCols() == num_singval);
+  KALDI_ASSERT(Vt == NULL || Vt->NumRows() == num_singval);
+
+  std::vector<std::pair<Real, MatrixIndexT> > vec(num_singval);
+  // negative because we want reverse order.
+  for (MatrixIndexT d = 0; d < num_singval; d++) {
+    Real val = (*s)(d),
+        sort_val = -(sort_on_absolute_value ? std::abs(val) : val);
+    vec[d] = std::pair<Real, MatrixIndexT>(sort_val, d);
+  }
+  std::sort(vec.begin(), vec.end());
+  // Permute from copies, since the destination is overwritten in place.
+  Vector<Real> s_copy(*s);
+  for (MatrixIndexT d = 0; d < num_singval; d++)
+    (*s)(d) = s_copy(vec[d].second);
+  if (U != NULL) {
+    Matrix<Real> Utmp(*U);
+    MatrixIndexT dim = Utmp.NumRows();
+    for (MatrixIndexT d = 0; d < num_singval; d++) {
+      MatrixIndexT oldidx = vec[d].second;
+      for (MatrixIndexT e = 0; e < dim; e++)
+        (*U)(e, d) = Utmp(e, oldidx);
+    }
+  }
+  if (Vt != NULL) {
+    Matrix<Real> Vttmp(*Vt);
+    for (MatrixIndexT d = 0; d < num_singval; d++)
+      (*Vt).Row(d).CopyFromVec(Vttmp.Row(vec[d].second));
+  }
+}
+
+template
+void SortSvd(VectorBase<float> *s, MatrixBase<float> *U,
+             MatrixBase<float> *Vt, bool);
+
+template
+void SortSvd(VectorBase<double> *s, MatrixBase<double> *U,
+             MatrixBase<double> *Vt, bool);
+
+// Builds the real block-diagonal matrix *D from eigenvalues given as real
+// parts "re" and imaginary parts "im".  A real eigenvalue gives a 1x1 block;
+// a complex pair lambda +- i*mu (two consecutive entries) gives the 2x2 block
+// [lambda, mu; -mu, lambda].  *D must already be n by n.
+template<typename Real>
+void CreateEigenvalueMatrix(const VectorBase<Real> &re, const VectorBase<Real> &im,
+                            MatrixBase<Real> *D) {
+  MatrixIndexT n = re.Dim();
+  KALDI_ASSERT(im.Dim() == n && D->NumRows() == n && D->NumCols() == n);
+
+  MatrixIndexT j = 0;
+  D->SetZero();
+  while (j < n) {
+    if (im(j) == 0) {  // Real eigenvalue
+      (*D)(j, j) = re(j);
+      j++;
+    } else {  // First of a complex pair
+      KALDI_ASSERT(j+1 < n && ApproxEqual(im(j+1), -im(j))
+                   && ApproxEqual(re(j+1), re(j)));
+      /// if (im(j) < 0.0) KALDI_WARN << "Negative first im part of pair"; // TEMP
+      Real lambda = re(j), mu = im(j);
+      // create 2x2 block [lambda, mu; -mu, lambda]
+      (*D)(j, j) = lambda;
+      (*D)(j, j+1) = mu;
+      (*D)(j+1, j) = -mu;
+      (*D)(j+1, j+1) = lambda;
+      j += 2;
+    }
+  }
+}
+
+template
+void CreateEigenvalueMatrix(const VectorBase<float> &re, const VectorBase<float> &im,
+                            MatrixBase<float> *D);
+template
+void CreateEigenvalueMatrix(const VectorBase<double> &re, const VectorBase<double> &im,
+                            MatrixBase<double> *D);
+
+
+
+template<typename Real>
+bool AttemptComplexPower(Real *x_re, Real *x_im, Real power) {
+  // Used in Matrix<Real>::Power().
+  // Attempts to take the complex value x to the power "power",
+  // assuming that power is fractional (i.e. we don't treat integers as a
+  // special case).  Returns false if this is not possible, either
+  // because x is negative and real (hence there is no obvious answer
+  // that is "closest to 1", and anyway this case does not make sense
+  // in the Matrix<Real>::Power() routine);
+  // or because power is negative, and x is zero.
+  // On success *x_re and *x_im are overwritten with the result.
+
+  // First solve for r and theta in
+  // x_re = r*cos(theta), x_im = r*sin(theta)
+  if (*x_re < 0.0 && *x_im == 0.0) return false;  // can't do
+  // it for negative real values.
+  Real r = std::sqrt((*x_re * *x_re) + (*x_im * *x_im));  // r == radius.
+  if (power < 0.0 && r == 0.0) return false;
+  Real theta = std::atan2(*x_im, *x_re);
+  // Take the power.
+  r = std::pow(r, power);
+  theta *= power;
+  *x_re = r * std::cos(theta);
+  *x_im = r * std::sin(theta);
+  return true;
+}
+
+template
+bool AttemptComplexPower(float *x_re, float *x_im, float power);
+template
+bool AttemptComplexPower(double *x_re, double *x_im, double power);
+
+
+
+// Returns tr(A B) if trans == kNoTrans, or tr(A B^T) otherwise, computed as
+// one dot product per row of A without forming the product matrix.
+template <typename Real>
+Real TraceMatMat(const MatrixBase<Real> &A,
+                 const MatrixBase<Real> &B,
+                 MatrixTransposeType trans) {  // tr(A B), equivalent to sum of each element of A times same element in B'
+  MatrixIndexT aStride = A.stride_, bStride = B.stride_;
+  if (trans == kNoTrans) {
+    KALDI_ASSERT(A.NumRows() == B.NumCols() && A.NumCols() == B.NumRows());
+    Real ans = 0.0;
+    Real *adata = A.data_, *bdata = B.data_;
+    MatrixIndexT arows = A.NumRows(), acols = A.NumCols();
+    // Row "row" of A is dotted with column "row" of B: bdata advances by one
+    // element per row and is stepped through with stride bStride.
+    for (MatrixIndexT row = 0;row < arows;row++, adata+=aStride, bdata++)
+      ans += cblas_Xdot(acols, adata, 1, bdata, bStride);
+    return ans;
+  } else {
+    KALDI_ASSERT(A.NumRows() == B.NumRows() && A.NumCols() == B.NumCols());
+    Real ans = 0.0;
+    Real *adata = A.data_, *bdata = B.data_;
+    MatrixIndexT arows = A.NumRows(), acols = A.NumCols();
+    // Row "row" of A is dotted with row "row" of B.
+    for (MatrixIndexT row = 0;row < arows;row++, adata+=aStride, bdata+=bStride)
+      ans += cblas_Xdot(acols, adata, 1, bdata, 1);
+    return ans;
+  }
+}
+
+
+// Instantiate the template above for float and double.
+template
+float TraceMatMat<float>(const MatrixBase<float> &A,
+                         const MatrixBase<float> &B,
+                         MatrixTransposeType trans);
+template
+double TraceMatMat<double>(const MatrixBase<double> &A,
+                           const MatrixBase<double> &B,
+                           MatrixTransposeType trans);
+
+
+// Returns log(sum_ij exp((*this)(i, j))), computed relative to the largest
+// element.  Elements below a cutoff under the maximum are skipped: the
+// cutoff is max + kMinLogDiffFloat/Double, tightened to max - prune when
+// prune > 0 and that is the larger value.
+template<typename Real>
+Real MatrixBase<Real>::LogSumExp(Real prune) const {
+  Real max_elem = Max(), cutoff;
+  if (sizeof(Real) == 4) cutoff = max_elem + kMinLogDiffFloat;
+  else cutoff = max_elem + kMinLogDiffDouble;
+  if (prune > 0.0 && max_elem - prune > cutoff)  // explicit pruning...
+    cutoff = max_elem - prune;
+
+  double sum_relto_max_elem = 0.0;
+
+  for (MatrixIndexT i = 0; i < num_rows_; i++) {
+    for (MatrixIndexT j = 0; j < num_cols_; j++) {
+      // Keep the element in its own type: copying it into a BaseFloat would
+      // lose precision whenever Real is wider than BaseFloat.
+      Real f = (*this)(i, j);
+      if (f >= cutoff)
+        sum_relto_max_elem += kaldi::Exp(f - max_elem);
+    }
+  }
+  return max_elem + kaldi::Log(sum_relto_max_elem);
+}
+
+// Replaces *this with its softmax taken over all elements (not per row),
+// and returns the log of the normalizer, i.e. log(sum_ij exp(x_ij)).
+template<typename Real>
+Real MatrixBase<Real>::ApplySoftMax() {
+  Real max = this->Max(), sum = 0.0;
+  // the 'max' helps to get in good numeric range.
+  for (MatrixIndexT i = 0; i < num_rows_; i++)
+    for (MatrixIndexT j = 0; j < num_cols_; j++)
+      sum += ((*this)(i, j) = kaldi::Exp((*this)(i, j) - max));
+  this->Scale(1.0 / sum);
+  return max + kaldi::Log(sum);
+}
+
+// Applies the vector-level Tanh() from src into *this.
+template<typename Real>
+void MatrixBase<Real>::Tanh(const MatrixBase<Real> &src) {
+  KALDI_ASSERT(SameDim(*this, src));
+
+  if (num_cols_ == stride_ && src.num_cols_ == src.stride_) {
+    // Neither matrix has row padding, so each can be processed as one
+    // contiguous vector in a single call.
+    SubVector<Real> src_vec(src.data_, num_rows_ * num_cols_),
+        dst_vec(this->data_, num_rows_ * num_cols_);
+    dst_vec.Tanh(src_vec);
+  } else {
+    for (MatrixIndexT r = 0; r < num_rows_; r++) {
+      SubVector<Real> src_vec(src, r), dest_vec(*this, r);
+      dest_vec.Tanh(src_vec);
+    }
+  }
+}
+
+// Sets each element of *this to log(1 + exp(x)), where x is the matching
+// element of src; for x > 10 the result is x itself.
+template<typename Real>
+void MatrixBase<Real>::SoftHinge(const MatrixBase<Real> &src) {
+  KALDI_ASSERT(SameDim(*this, src));
+  int32 num_rows = num_rows_, num_cols = num_cols_;
+  for (MatrixIndexT r = 0; r < num_rows; r++) {
+    Real *row_data = this->RowData(r);
+    const Real *src_row_data = src.RowData(r);
+    for (MatrixIndexT c = 0; c < num_cols; c++) {
+      Real x = src_row_data[c], y;
+      if (x > 10.0) y = x;  // avoid exponentiating large numbers; function
+      // approaches y=x.
+      else y = Log1p(kaldi::Exp(x));  // these defined in kaldi-math.h
+      row_data[c] = y;
+    }
+  }
+}
+
+// Sets (*this)(i, j) to the "power"-norm of the j'th group of consecutive
+// columns in row i of src.  The group size is src.NumCols() / NumCols(),
+// which must divide exactly (asserted).
+template<typename Real>
+void MatrixBase<Real>::GroupPnorm(const MatrixBase<Real> &src, Real power) {
+  KALDI_ASSERT(src.NumCols() % this->NumCols() == 0 &&
+               src.NumRows() == this->NumRows());
+  int group_size = src.NumCols() / this->NumCols(),
+      num_rows = this->NumRows(), num_cols = this->NumCols();
+  for (MatrixIndexT i = 0; i < num_rows; i++)
+    for (MatrixIndexT j = 0; j < num_cols; j++)
+      (*this)(i, j) = src.Row(i).Range(j * group_size, group_size).Norm(power);
+}
+
+// Sets (*this)(i, j) to the maximum over the j'th group of consecutive
+// columns in row i of src (group size src.NumCols() / NumCols()).
+template<typename Real>
+void MatrixBase<Real>::GroupMax(const MatrixBase<Real> &src) {
+  KALDI_ASSERT(src.NumCols() % this->NumCols() == 0 &&
+               src.NumRows() == this->NumRows());
+  int group_size = src.NumCols() / this->NumCols(),
+      num_rows = this->NumRows(), num_cols = this->NumCols();
+  for (MatrixIndexT i = 0; i < num_rows; i++) {
+    const Real *src_row_data = src.RowData(i);
+    for (MatrixIndexT j = 0; j < num_cols; j++) {
+      // NOTE(review): -1e20 acts as "minus infinity"; a group whose elements
+      // are all below -1e20 would yield -1e20 instead of its true maximum.
+      Real max_val = -1e20;
+      for (MatrixIndexT k = 0; k < group_size; k++) {
+        Real src_data = src_row_data[j * group_size + k];
+        if (src_data > max_val)
+          max_val = src_data;
+      }
+      (*this)(i, j) = max_val;
+    }
+  }
+}
+
+// Sets column c of *this to column indices[c] of src, or to zero where
+// indices[c] is negative.  "indices" must hold NumCols() entries.
+template<typename Real>
+void MatrixBase<Real>::CopyCols(const MatrixBase<Real> &src,
+                                const MatrixIndexT *indices) {
+  KALDI_ASSERT(NumRows() == src.NumRows());
+  MatrixIndexT num_rows = num_rows_, num_cols = num_cols_,
+      this_stride = stride_, src_stride = src.stride_;
+  Real *this_data = this->data_;
+  const Real *src_data = src.data_;
+#ifdef KALDI_PARANOID
+  MatrixIndexT src_cols = src.NumCols();
+  for (MatrixIndexT i = 0; i < num_cols; i++)
+    KALDI_ASSERT(indices[i] >= -1 && indices[i] < src_cols);
+#endif
+
+  // For the sake of memory locality we do this row by row, rather
+  // than doing it column-wise using cblas_Xcopy
+  for (MatrixIndexT r = 0; r < num_rows;
+       r++, this_data += this_stride, src_data += src_stride) {
+    const MatrixIndexT *index_ptr = &(indices[0]);
+    for (MatrixIndexT c = 0; c < num_cols; c++, index_ptr++) {
+      if (*index_ptr < 0) this_data[c] = 0;
+      else this_data[c] = src_data[*index_ptr];
+    }
+  }
+}
+
+
+// Adds column indices[c] of src to column c of *this; columns whose index is
+// negative are left unchanged.  "indices" must hold NumCols() entries.
+template<typename Real>
+void MatrixBase<Real>::AddCols(const MatrixBase<Real> &src,
+                               const MatrixIndexT *indices) {
+  KALDI_ASSERT(NumRows() == src.NumRows());
+  MatrixIndexT num_rows = num_rows_, num_cols = num_cols_,
+      this_stride = stride_, src_stride = src.stride_;
+  Real *this_data = this->data_;
+  const Real *src_data = src.data_;
+#ifdef KALDI_PARANOID
+  MatrixIndexT src_cols = src.NumCols();
+  for (MatrixIndexT i = 0; i < num_cols; i++)
+    KALDI_ASSERT(indices[i] >= -1 && indices[i] < src_cols);
+#endif
+
+  // For the sake of memory locality we do this row by row, rather
+  // than doing it column-wise using cblas_Xcopy
+  for (MatrixIndexT r = 0; r < num_rows;
+       r++, this_data += this_stride, src_data += src_stride) {
+    const MatrixIndexT *index_ptr = &(indices[0]);
+    for (MatrixIndexT c = 0; c < num_cols; c++, index_ptr++) {
+      if (*index_ptr >= 0)
+        this_data[c] += src_data[*index_ptr];
+    }
+  }
+}
+
+// Sets row r of *this to row indices[r] of src, or to zero where indices[r]
+// is negative.  "indices" must hold NumRows() entries.
+template<typename Real>
+void MatrixBase<Real>::CopyRows(const MatrixBase<Real> &src,
+                                const MatrixIndexT *indices) {
+  KALDI_ASSERT(NumCols() == src.NumCols());
+  MatrixIndexT num_rows = num_rows_, num_cols = num_cols_,
+      this_stride = stride_;
+  Real *this_data = this->data_;
+
+  for (MatrixIndexT r = 0; r < num_rows; r++, this_data += this_stride) {
+    MatrixIndexT index = indices[r];
+    if (index < 0) memset(this_data, 0, sizeof(Real) * num_cols);
+    else cblas_Xcopy(num_cols, src.RowData(index), 1, this_data, 1);
+  }
+}
+
+// Sets row r of *this from the NumCols() contiguous values at src[r], or to
+// zero where src[r] is NULL.  "src" must hold NumRows() pointers.
+template<typename Real>
+void MatrixBase<Real>::CopyRows(const Real *const *src) {
+  MatrixIndexT num_rows = num_rows_,
+      num_cols = num_cols_, this_stride = stride_;
+  Real *this_data = this->data_;
+
+  for (MatrixIndexT r = 0; r < num_rows; r++, this_data += this_stride) {
+    const Real *const src_data = src[r];
+    if (src_data == NULL) memset(this_data, 0, sizeof(Real) * num_cols);
+    else cblas_Xcopy(num_cols, src_data, 1, this_data, 1);
+  }
+}
+
+// Copies row r of *this to the NumCols() contiguous values at dst[r]; rows
+// whose destination pointer is NULL are skipped.
+template<typename Real>
+void MatrixBase<Real>::CopyToRows(Real *const *dst) const {
+  MatrixIndexT num_rows = num_rows_,
+      num_cols = num_cols_, this_stride = stride_;
+  const Real *this_data = this->data_;
+
+  for (MatrixIndexT r = 0; r < num_rows; r++, this_data += this_stride) {
+    Real *const dst_data = dst[r];
+    if (dst_data != NULL)
+      cblas_Xcopy(num_cols, this_data, 1, dst_data, 1);
+  }
+}
+
+// Adds alpha times row indexes[r] of src to row r of *this; rows whose index
+// is -1 are left unchanged.  Any other out-of-range index fails the assert.
+template<typename Real>
+void MatrixBase<Real>::AddRows(Real alpha,
+                               const MatrixBase<Real> &src,
+                               const MatrixIndexT *indexes) {
+  KALDI_ASSERT(NumCols() == src.NumCols());
+  MatrixIndexT num_rows = num_rows_,
+      num_cols = num_cols_, this_stride = stride_;
+  Real *this_data = this->data_;
+
+  for (MatrixIndexT r = 0; r < num_rows; r++, this_data += this_stride) {
+    MatrixIndexT index = indexes[r];
+    KALDI_ASSERT(index >= -1 && index < src.NumRows());
+    if (index != -1)
+      cblas_Xaxpy(num_cols, alpha, src.RowData(index), 1, this_data, 1);
+  }
+}
+
+// Adds alpha times the NumCols() contiguous values at src[r] to row r of
+// *this; rows whose source pointer is NULL are left unchanged.
+template<typename Real>
+void MatrixBase<Real>::AddRows(Real alpha, const Real *const *src) {
+  MatrixIndexT num_rows = num_rows_,
+      num_cols = num_cols_, this_stride = stride_;
+  Real *this_data = this->data_;
+
+  for (MatrixIndexT r = 0; r < num_rows; r++, this_data += this_stride) {
+    const Real *const src_data = src[r];
+    if (src_data != NULL)
+      cblas_Xaxpy(num_cols, alpha, src_data, 1, this_data, 1);
+  }
+}
+
+// Adds alpha times row r of *this to row indexes[r] of *dst; rows whose index
+// is -1 are skipped.  Any other out-of-range index fails the assert.
+template<typename Real>
+void MatrixBase<Real>::AddToRows(Real alpha,
+                                 const MatrixIndexT *indexes,
+                                 MatrixBase<Real> *dst) const {
+  KALDI_ASSERT(NumCols() == dst->NumCols());
+  MatrixIndexT num_rows = num_rows_,
+      num_cols = num_cols_, this_stride = stride_;
+  // *this is only read here, so access its data through a const pointer.
+  const Real *this_data = this->data_;
+
+  for (MatrixIndexT r = 0; r < num_rows; r++, this_data += this_stride) {
+    MatrixIndexT index = indexes[r];
+    KALDI_ASSERT(index >= -1 && index < dst->NumRows());
+    if (index != -1)
+      cblas_Xaxpy(num_cols, alpha, this_data, 1, dst->RowData(index), 1);
+  }
+}
+
+// Adds alpha times row r of *this to the NumCols() contiguous values at
+// dst[r]; rows whose destination pointer is NULL are skipped.
+template<typename Real>
+void MatrixBase<Real>::AddToRows(Real alpha, Real *const *dst) const {
+  MatrixIndexT num_rows = num_rows_,
+      num_cols = num_cols_, this_stride = stride_;
+  const Real *this_data = this->data_;
+
+  for (MatrixIndexT r = 0; r < num_rows; r++, this_data += this_stride) {
+    Real *const dst_data = dst[r];
+    if (dst_data != NULL)
+      cblas_Xaxpy(num_cols, alpha, this_data, 1, dst_data, 1);
+  }
+}
+
+// Applies the vector-level Sigmoid() from src into *this.
+template<typename Real>
+void MatrixBase<Real>::Sigmoid(const MatrixBase<Real> &src) {
+  KALDI_ASSERT(SameDim(*this, src));
+
+  if (num_cols_ == stride_ && src.num_cols_ == src.stride_) {
+    // Neither matrix has row padding, so each can be processed as one
+    // contiguous vector in a single call.
+    SubVector<Real> src_vec(src.data_, num_rows_ * num_cols_),
+        dst_vec(this->data_, num_rows_ * num_cols_);
+    dst_vec.Sigmoid(src_vec);
+  } else {
+    for (MatrixIndexT r = 0; r < num_rows_; r++) {
+      SubVector<Real> src_vec(src, r), dest_vec(*this, r);
+      dest_vec.Sigmoid(src_vec);
+    }
+  }
+}
+
+// Sets each element of *this to diff * value * (1 - value), taken
+// element-wise from "diff" and "value".
+template<typename Real>
+void MatrixBase<Real>::DiffSigmoid(const MatrixBase<Real> &value,
+                                   const MatrixBase<Real> &diff) {
+  KALDI_ASSERT(SameDim(*this, value) && SameDim(*this, diff));
+  MatrixIndexT num_rows = num_rows_, num_cols = num_cols_,
+      stride = stride_, value_stride = value.stride_, diff_stride = diff.stride_;
+  Real *data = data_;
+  const Real *value_data = value.data_, *diff_data = diff.data_;
+  for (MatrixIndexT r = 0; r < num_rows; r++) {
+    for (MatrixIndexT c = 0; c < num_cols; c++)
+      data[c] = diff_data[c] * value_data[c] * (1.0 - value_data[c]);
+    data += stride;
+    value_data += value_stride;
+    diff_data += diff_stride;
+  }
+}
+
+// Sets each element of *this to diff * (1 - value^2), taken element-wise
+// from "diff" and "value".
+template<typename Real>
+void MatrixBase<Real>::DiffTanh(const MatrixBase<Real> &value,
+                                const MatrixBase<Real> &diff) {
+  KALDI_ASSERT(SameDim(*this, value) && SameDim(*this, diff));
+  MatrixIndexT num_rows = num_rows_, num_cols = num_cols_,
+      stride = stride_, value_stride = value.stride_, diff_stride = diff.stride_;
+  Real *data = data_;
+  const Real *value_data = value.data_, *diff_data = diff.data_;
+  for (MatrixIndexT r = 0; r < num_rows; r++) {
+    for (MatrixIndexT c = 0; c < num_cols; c++)
+      data[c] = diff_data[c] * (1.0 - (value_data[c] * value_data[c]));
+    data += stride;
+    value_data += value_stride;
+    diff_data += diff_stride;
+  }
+}
+
+
+// Adds alpha * v to every row of *this; v.Dim() must equal NumCols().
+template<typename Real>
+template<typename OtherReal>
+void MatrixBase<Real>::AddVecToRows(const Real alpha,
+                                    const VectorBase<OtherReal> &v) {
+  const MatrixIndexT num_rows = num_rows_, num_cols = num_cols_,
+      stride = stride_;
+  KALDI_ASSERT(v.Dim() == num_cols);
+  if(num_cols <= 64) {
+    // Up to 64 columns: plain loops.
+    Real *data = data_;
+    const OtherReal *vdata = v.Data();
+    for (MatrixIndexT i = 0; i < num_rows; i++, data += stride) {
+      for (MatrixIndexT j = 0; j < num_cols; j++)
+        data[j] += alpha * vdata[j];
+    }
+
+  } else {
+    // Otherwise express the update as the outer product ones * v^T.
+    Vector<OtherReal> ones(num_rows);
+    ones.Set(1.0);
+    this->AddVecVec(alpha, ones, v);
+  }
+}
+
+template void MatrixBase<float>::AddVecToRows(const float alpha,
+                                              const VectorBase<float> &v);
+template void MatrixBase<float>::AddVecToRows(const float alpha,
+                                              const VectorBase<double> &v);
+template void MatrixBase<double>::AddVecToRows(const double alpha,
+                                               const VectorBase<float> &v);
+template void MatrixBase<double>::AddVecToRows(const double alpha,
+                                               const VectorBase<double> &v);
+
+
+// Adds alpha * v to every column of *this; v.Dim() must equal NumRows().
+template<typename Real>
+template<typename OtherReal>
+void MatrixBase<Real>::AddVecToCols(const Real alpha,
+                                    const VectorBase<OtherReal> &v) {
+  const MatrixIndexT num_rows = num_rows_, num_cols = num_cols_,
+      stride = stride_;
+  KALDI_ASSERT(v.Dim() == num_rows);
+
+  if (num_rows <= 64) {
+    // Up to 64 rows: plain loops.
+    Real *data = data_;
+    const OtherReal *vdata = v.Data();
+    for (MatrixIndexT i = 0; i < num_rows; i++, data += stride) {
+      Real to_add = alpha * vdata[i];
+      for (MatrixIndexT j = 0; j < num_cols; j++)
+        data[j] += to_add;
+    }
+
+  } else {
+    // Otherwise express the update as the outer product v * ones^T.
+    Vector<OtherReal> ones(num_cols);
+    ones.Set(1.0);
+    this->AddVecVec(alpha, v, ones);
+  }
+}
+
+template void MatrixBase<float>::AddVecToCols(const float alpha,
+                                              const VectorBase<float> &v);
+template void MatrixBase<float>::AddVecToCols(const float alpha,
+                                              const VectorBase<double> &v);
+template void MatrixBase<double>::AddVecToCols(const double alpha,
+                                               const VectorBase<float> &v);
+template void MatrixBase<double>::AddVecToCols(const double alpha,
+                                               const VectorBase<double> &v);
+
+//Explicit instantiation of the classes
+//Apparently, it seems to be necessary that the instantiation
+//happens at the end of the file. Otherwise, not all the member
+//functions will get instantiated.
+ +template class Matrix; +template class Matrix; +template class MatrixBase; +template class MatrixBase; +template class SubMatrix; +template class SubMatrix; + +} // namespace kaldi diff --git a/speechx/speechx/kaldi/matrix/kaldi-matrix.h b/speechx/speechx/kaldi/matrix/kaldi-matrix.h new file mode 100644 index 00000000..4387538c --- /dev/null +++ b/speechx/speechx/kaldi/matrix/kaldi-matrix.h @@ -0,0 +1,1122 @@ +// matrix/kaldi-matrix.h + +// Copyright 2009-2011 Ondrej Glembek; Microsoft Corporation; Lukas Burget; +// Saarland University; Petr Schwarz; Yanmin Qian; +// Karel Vesely; Go Vivace Inc.; Haihua Xu +// 2017 Shiyin Kang +// 2019 Yiwen Shao + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_MATRIX_KALDI_MATRIX_H_ +#define KALDI_MATRIX_KALDI_MATRIX_H_ 1 + +#include + +#include "matrix/matrix-common.h" + +namespace kaldi { + +/// @{ \addtogroup matrix_funcs_scalar + +/// We need to declare this here as it will be a friend function. +/// tr(A B), or tr(A B^T). +template +Real TraceMatMat(const MatrixBase &A, const MatrixBase &B, + MatrixTransposeType trans = kNoTrans); +/// @} + +/// \addtogroup matrix_group +/// @{ + +/// Base class which provides matrix operations not involving resizing +/// or allocation. Classes Matrix and SubMatrix inherit from it and take care +/// of allocation and resizing. 
+template +class MatrixBase { + public: + // so this child can access protected members of other instances. + friend class Matrix; + // friend declarations for CUDA matrices (see ../cudamatrix/) + friend class CuMatrixBase; + friend class CuMatrix; + friend class CuSubMatrix; + friend class CuPackedMatrix; + friend class PackedMatrix; + friend class SparseMatrix; + friend class SparseMatrix; + friend class SparseMatrix; + + /// Returns number of rows (or zero for empty matrix). + inline MatrixIndexT NumRows() const { return num_rows_; } + + /// Returns number of columns (or zero for empty matrix). + inline MatrixIndexT NumCols() const { return num_cols_; } + + /// Stride (distance in memory between each row). Will be >= NumCols. + inline MatrixIndexT Stride() const { return stride_; } + + /// Returns size in bytes of the data held by the matrix. + size_t SizeInBytes() const { + return static_cast(num_rows_) * static_cast(stride_) * + sizeof(Real); + } + + /// Gives pointer to raw data (const). + inline const Real* Data() const { + return data_; + } + + /// Gives pointer to raw data (non-const). + inline Real* Data() { return data_; } + + /// Returns pointer to data for one row (non-const) + inline Real* RowData(MatrixIndexT i) { + KALDI_ASSERT(static_cast(i) < + static_cast(num_rows_)); + return data_ + i * stride_; + } + + /// Returns pointer to data for one row (const) + inline const Real* RowData(MatrixIndexT i) const { + KALDI_ASSERT(static_cast(i) < + static_cast(num_rows_)); + return data_ + i * stride_; + } + + /// Indexing operator, non-const + /// (only checks sizes if compiled with -DKALDI_PARANOID) + inline Real& operator() (MatrixIndexT r, MatrixIndexT c) { + KALDI_PARANOID_ASSERT(static_cast(r) < + static_cast(num_rows_) && + static_cast(c) < + static_cast(num_cols_)); + return *(data_ + r * stride_ + c); + } + /// Indexing operator, provided for ease of debugging (gdb doesn't work + /// with parenthesis operator). 
+ Real &Index (MatrixIndexT r, MatrixIndexT c) { return (*this)(r, c); } + + /// Indexing operator, const + /// (only checks sizes if compiled with -DKALDI_PARANOID) + inline const Real operator() (MatrixIndexT r, MatrixIndexT c) const { + KALDI_PARANOID_ASSERT(static_cast(r) < + static_cast(num_rows_) && + static_cast(c) < + static_cast(num_cols_)); + return *(data_ + r * stride_ + c); + } + + /* Basic setting-to-special values functions. */ + + /// Sets matrix to zero. + void SetZero(); + /// Sets all elements to a specific value. + void Set(Real); + /// Sets to zero, except ones along diagonal [for non-square matrices too] + void SetUnit(); + /// Sets to random values of a normal distribution + void SetRandn(); + /// Sets to numbers uniformly distributed on (0, 1) + void SetRandUniform(); + + /* Copying functions. These do not resize the matrix! */ + + + /// Copy given matrix. (no resize is done). + template + void CopyFromMat(const MatrixBase & M, + MatrixTransposeType trans = kNoTrans); + + /// Copy from compressed matrix. + void CopyFromMat(const CompressedMatrix &M); + + /// Copy given spmatrix. (no resize is done). + template + void CopyFromSp(const SpMatrix &M); + + /// Copy given tpmatrix. (no resize is done). + template + void CopyFromTp(const TpMatrix &M, + MatrixTransposeType trans = kNoTrans); + + /// Copy from CUDA matrix. Implemented in ../cudamatrix/cu-matrix.h + template + void CopyFromMat(const CuMatrixBase &M, + MatrixTransposeType trans = kNoTrans); + + /// This function has two modes of operation. If v.Dim() == NumRows() * + /// NumCols(), then treats the vector as a row-by-row concatenation of a + /// matrix and copies to *this. + /// if v.Dim() == NumCols(), it sets each row of *this to a copy of v. 
+ void CopyRowsFromVec(const VectorBase &v); + + /// This version of CopyRowsFromVec is implemented in ../cudamatrix/cu-vector.cc + void CopyRowsFromVec(const CuVectorBase &v); + + template + void CopyRowsFromVec(const VectorBase &v); + + /// Copies vector into matrix, column-by-column. + /// Note that rv.Dim() must either equal NumRows()*NumCols() or NumRows(); + /// this has two modes of operation. + void CopyColsFromVec(const VectorBase &v); + + /// Copy vector into specific column of matrix. + void CopyColFromVec(const VectorBase &v, const MatrixIndexT col); + /// Copy vector into specific row of matrix. + void CopyRowFromVec(const VectorBase &v, const MatrixIndexT row); + /// Copy vector into diagonal of matrix. + void CopyDiagFromVec(const VectorBase &v); + + /* Accessing of sub-parts of the matrix. */ + + /// Return specific row of matrix [const]. + inline const SubVector Row(MatrixIndexT i) const { + KALDI_ASSERT(static_cast(i) < + static_cast(num_rows_)); + return SubVector(data_ + (i * stride_), NumCols()); + } + + /// Return specific row of matrix. + inline SubVector Row(MatrixIndexT i) { + KALDI_ASSERT(static_cast(i) < + static_cast(num_rows_)); + return SubVector(data_ + (i * stride_), NumCols()); + } + + /// Return a sub-part of matrix. + inline SubMatrix Range(const MatrixIndexT row_offset, + const MatrixIndexT num_rows, + const MatrixIndexT col_offset, + const MatrixIndexT num_cols) const { + return SubMatrix(*this, row_offset, num_rows, + col_offset, num_cols); + } + inline SubMatrix RowRange(const MatrixIndexT row_offset, + const MatrixIndexT num_rows) const { + return SubMatrix(*this, row_offset, num_rows, 0, num_cols_); + } + inline SubMatrix ColRange(const MatrixIndexT col_offset, + const MatrixIndexT num_cols) const { + return SubMatrix(*this, 0, num_rows_, col_offset, num_cols); + } + + /* Various special functions. */ + /// Returns sum of all elements in matrix. + Real Sum() const; + /// Returns trace of matrix. 
+ Real Trace(bool check_square = true) const; + // If check_square = true, will crash if matrix is not square. + + /// Returns maximum element of matrix. + Real Max() const; + /// Returns minimum element of matrix. + Real Min() const; + + /// Element by element multiplication with a given matrix. + void MulElements(const MatrixBase &A); + + /// Divide each element by the corresponding element of a given matrix. + void DivElements(const MatrixBase &A); + + /// Multiply each element with a scalar value. + void Scale(Real alpha); + + /// Set, element-by-element, *this = max(*this, A) + void Max(const MatrixBase &A); + /// Set, element-by-element, *this = min(*this, A) + void Min(const MatrixBase &A); + + /// Equivalent to (*this) = (*this) * diag(scale). Scaling + /// each column by a scalar taken from that dimension of the vector. + void MulColsVec(const VectorBase &scale); + + /// Equivalent to (*this) = diag(scale) * (*this). Scaling + /// each row by a scalar taken from that dimension of the vector. + void MulRowsVec(const VectorBase &scale); + + /// Divide each row into src.NumCols() equal groups, and then scale i'th row's + /// j'th group of elements by src(i, j). Requires src.NumRows() == + /// this->NumRows() and this->NumCols() % src.NumCols() == 0. + void MulRowsGroupMat(const MatrixBase &src); + + /// Returns logdet of matrix. + Real LogDet(Real *det_sign = NULL) const; + + /// matrix inverse. + /// if inverse_needed = false, will fill matrix with garbage. + /// (only useful if logdet wanted). + void Invert(Real *log_det = NULL, Real *det_sign = NULL, + bool inverse_needed = true); + /// matrix inverse [double]. + /// if inverse_needed = false, will fill matrix with garbage + /// (only useful if logdet wanted). + /// Does inversion in double precision even if matrix was not double. 
+ void InvertDouble(Real *LogDet = NULL, Real *det_sign = NULL, + bool inverse_needed = true); + + /// Inverts all the elements of the matrix + void InvertElements(); + + /// Transpose the matrix. This one is only + /// applicable to square matrices (the one in the + /// Matrix child class works also for non-square). + void Transpose(); + + /// Copies column r from column indices[r] of src. + /// As a special case, if indexes[i] == -1, sets column i to zero. + /// all elements of "indices" must be in [-1, src.NumCols()-1], + /// and src.NumRows() must equal this.NumRows() + void CopyCols(const MatrixBase &src, + const MatrixIndexT *indices); + + /// Copies row r from row indices[r] of src. + /// As a special case, if indexes[i] == -1, sets row i to zero. + /// all elements of "indices" must be in [-1, src.NumRows()-1], + /// and src.NumCols() must equal this.NumCols() + void CopyRows(const MatrixBase &src, + const MatrixIndexT *indices); + + /// Add column indices[r] of src to column r. + /// As a special case, if indexes[i] == -1, skip column i + /// indices.size() must equal this->NumCols(), + /// all elements of "indices" must be in [-1, src.NumCols()-1], + /// and src.NumRows() must equal this.NumRows() + void AddCols(const MatrixBase &src, + const MatrixIndexT *indices); + + /// Copies row r of this matrix from an array of floats at the location given + /// by src[r]. If any src[r] is NULL then this.Row(r) will be set to zero. + /// Note: we are using "pointer to const pointer to const object" for "src", + /// because we may create "src" by calling Data() of const CuArray + void CopyRows(const Real *const *src); + + /// Copies row r of this matrix to the array of floats at the location given + /// by dst[r]. If dst[r] is NULL, does not copy anywhere. Requires that none + /// of the memory regions pointed to by the pointers in "dst" overlap (e.g. + /// none of the pointers should be the same).
+ void CopyToRows(Real *const *dst) const; + + /// Does for each row r, this.Row(r) += alpha * src.row(indexes[r]). + /// If indexes[r] < 0, does not add anything. all elements of "indexes" must + /// be in [-1, src.NumRows()-1], and src.NumCols() must equal this.NumCols(). + void AddRows(Real alpha, + const MatrixBase &src, + const MatrixIndexT *indexes); + + /// Does for each row r, this.Row(r) += alpha * src[r], treating src[r] as the + /// beginning of a region of memory representing a vector of floats, of the + /// same length as this.NumCols(). If src[r] is NULL, does not add anything. + void AddRows(Real alpha, const Real *const *src); + + /// For each row r of this matrix, adds it (times alpha) to the array of + /// floats at the location given by dst[r]. If dst[r] is NULL, does not do + /// anything for that row. Requires that none of the memory regions pointed + /// to by the pointers in "dst" overlap (e.g. none of the pointers should be + /// the same). + void AddToRows(Real alpha, Real *const *dst) const; + + /// For each row i of *this, adds this->Row(i) to + /// dst->Row(indexes(i)) if indexes(i) >= 0, else do nothing. + /// Requires that all the indexes[i] that are >= 0 + /// be distinct, otherwise the behavior is undefined. 
+ void AddToRows(Real alpha, + const MatrixIndexT *indexes, + MatrixBase *dst) const; + + inline void ApplyPow(Real power) { + this -> Pow(*this, power); + } + + + inline void ApplyPowAbs(Real power, bool include_sign=false) { + this -> PowAbs(*this, power, include_sign); + } + + inline void ApplyHeaviside() { + this -> Heaviside(*this); + } + + inline void ApplyFloor(Real floor_val) { + this -> Floor(*this, floor_val); + } + + inline void ApplyCeiling(Real ceiling_val) { + this -> Ceiling(*this, ceiling_val); + } + + inline void ApplyExp() { + this -> Exp(*this); + } + + inline void ApplyExpSpecial() { + this -> ExpSpecial(*this); + } + + inline void ApplyExpLimited(Real lower_limit, Real upper_limit) { + this -> ExpLimited(*this, lower_limit, upper_limit); + } + + inline void ApplyLog() { + this -> Log(*this); + } + + /// Eigenvalue Decomposition of a square NxN matrix into the form (*this) = P D + /// P^{-1}. Be careful: the relationship of D to the eigenvalues we output is + /// slightly complicated, due to the need for P to be real. In the symmetric + /// case D is diagonal and real, but in + /// the non-symmetric case there may be complex-conjugate pairs of eigenvalues. + /// In this case, for the equation (*this) = P D P^{-1} to hold, D must actually + /// be block diagonal, with 2x2 blocks corresponding to any such pairs. If a + /// pair is lambda +- i*mu, D will have a corresponding 2x2 block + /// [lambda, mu; -mu, lambda]. + /// Note that if the input matrix (*this) is non-invertible, P may not be invertible + /// so in this case instead of the equation (*this) = P D P^{-1} holding, we have + /// instead (*this) P = P D. + /// + /// The non-member function CreateEigenvalueMatrix creates D from eigs_real and eigs_imag. + void Eig(MatrixBase *P, + VectorBase *eigs_real, + VectorBase *eigs_imag) const; + + /// The Power method attempts to take the matrix to a power using a method that + /// works in general for fractional and negative powers. 
The input matrix must + /// be invertible and have reasonable condition (or we don't guarantee the + /// results). The method is based on the eigenvalue decomposition. It will + /// return false and leave the matrix unchanged, if at entry the matrix had + /// real negative eigenvalues (or if it had zero eigenvalues and the power was + /// negative). + bool Power(Real pow); + + /** Singular value decomposition + Major limitations: + For nonsquare matrices, we assume m>=n (NumRows >= NumCols), and we return + the "skinny" Svd, i.e. the matrix in the middle is diagonal, and the + one on the left is rectangular. + + In Svd, *this = U*diag(S)*Vt. + Null pointers for U and/or Vt at input mean we do not want that output. We + expect that S.Dim() == m, U is either NULL or m by n, + and v is either NULL or n by n. + The singular values are not sorted (use SortSvd for that). */ + void DestructiveSvd(VectorBase *s, MatrixBase *U, + MatrixBase *Vt); // Destroys calling matrix. + + /// Compute SVD (*this) = U diag(s) Vt. Note that the V in the call is already + /// transposed; the normal formulation is U diag(s) V^T. + /// Null pointers for U or V mean we don't want that output (this saves + /// compute). The singular values are not sorted (use SortSvd for that). + void Svd(VectorBase *s, MatrixBase *U, + MatrixBase *Vt) const; + /// Compute SVD but only retain the singular values. + void Svd(VectorBase *s) const { Svd(s, NULL, NULL); } + + + /// Returns smallest singular value. + Real MinSingularValue() const { + Vector tmp(std::min(NumRows(), NumCols())); + Svd(&tmp); + return tmp.Min(); + } + + void TestUninitialized() const; // This function is designed so that if any element + // of the matrix is uninitialized memory, valgrind will complain. + + /// Returns condition number by computing Svd. Works even if cols > rows. + /// Returns infinity if all singular values are zero. + Real Cond() const; + + /// Returns true if matrix is Symmetric.
+ bool IsSymmetric(Real cutoff = 1.0e-05) const; // replace magic number + + /// Returns true if matrix is Diagonal. + bool IsDiagonal(Real cutoff = 1.0e-05) const; // replace magic number + + /// Returns true if the matrix is all zeros, except for ones on diagonal. (it + /// does not have to be square). More specifically, this function returns + /// false if for any i, j, (*this)(i, j) differs by more than cutoff from the + /// expression (i == j ? 1 : 0). + bool IsUnit(Real cutoff = 1.0e-05) const; // replace magic number + + /// Returns true if matrix is all zeros. + bool IsZero(Real cutoff = 1.0e-05) const; // replace magic number + + /// Frobenius norm, which is the sqrt of sum of square elements. Same as Schatten 2-norm, + /// or just "2-norm". + Real FrobeniusNorm() const; + + /// Returns true if ((*this)-other).FrobeniusNorm() + /// <= tol * (*this).FrobeniusNorm(). + bool ApproxEqual(const MatrixBase &other, float tol = 0.01) const; + + /// Tests for exact equality. It's usually preferable to use ApproxEqual. + bool Equal(const MatrixBase &other) const; + + /// largest absolute value. + Real LargestAbsElem() const; // largest absolute value. + + /// Returns log(sum(exp())) without exp overflow + /// If prune > 0.0, it uses a pruning beam, discarding + /// terms less than (max - prune). Note: in future + /// we may change this so that if prune = 0.0, it takes + /// the max, so use -1 if you don't want to prune. + Real LogSumExp(Real prune = -1.0) const; + + /// Apply soft-max to the collection of all elements of the + /// matrix and return normalizer (log sum of exponentials). + Real ApplySoftMax(); + + /// Set each element to the sigmoid of the corresponding element of "src". + void Sigmoid(const MatrixBase &src); + + /// Sets each element to the Heaviside step function (x > 0 ? 1 : 0) of the + /// corresponding element in "src". Note: in general you can make different + /// choices for x = 0, but for now please leave it as it (i.e. 
returning zero) + /// because it affects the RectifiedLinearComponent in the neural net code. + void Heaviside(const MatrixBase &src); + + void Exp(const MatrixBase &src); + + void Pow(const MatrixBase &src, Real power); + + void Log(const MatrixBase &src); + + /// Apply power to the absolute value of each element. + /// If include_sign is true, the result will be multiplied with + /// the sign of the input value. + /// If the power is negative and the input to the power is zero, + /// The output will be set zero. If include_sign is true, it will + /// multiply the result by the sign of the input. + void PowAbs(const MatrixBase &src, Real power, bool include_sign=false); + + void Floor(const MatrixBase &src, Real floor_val); + + void Ceiling(const MatrixBase &src, Real ceiling_val); + + /// For each element x of the matrix, set it to + /// (x < 0 ? exp(x) : x + 1). This function is used + /// in our RNNLM training. + void ExpSpecial(const MatrixBase &src); + + /// This is equivalent to running: + /// Floor(src, lower_limit); + /// Ceiling(src, upper_limit); + /// Exp(src) + void ExpLimited(const MatrixBase &src, Real lower_limit, Real upper_limit); + + /// Set each element to y = log(1 + exp(x)) + void SoftHinge(const MatrixBase &src); + + /// Apply the function y(i) = (sum_{j = i*G}^{(i+1)*G-1} x_j^(power))^(1 / p). + /// Requires src.NumRows() == this->NumRows() and src.NumCols() % this->NumCols() == 0. + void GroupPnorm(const MatrixBase &src, Real power); + + /// Calculate derivatives for the GroupPnorm function above... + /// if "input" is the input to the GroupPnorm function above (i.e. the "src" variable), + /// and "output" is the result of the computation (i.e. the "this" of that function + /// call), and *this has the same dimension as "input", then it sets each element + /// of *this to the derivative d(output-elem)/d(input-elem) for each element of "input", where + /// "output-elem" is whichever element of output depends on that input element. 
+ void GroupPnormDeriv(const MatrixBase &input, const MatrixBase &output, + Real power); + + /// Apply the function y(i) = (max_{j = i*G}^{(i+1)*G-1} x_j) + /// Requires src.NumRows() == this->NumRows() and src.NumCols() % this->NumCols() == 0. + void GroupMax(const MatrixBase &src); + + /// Calculate derivatives for the GroupMax function above, where + /// "input" is the input to the GroupMax function above (i.e. the "src" variable), + /// and "output" is the result of the computation (i.e. the "this" of that function + /// call), and *this must have the same dimension as "input". Each element + /// of *this will be set to 1 if the corresponding input equals the output of + /// the group, and 0 otherwise. This equals the function derivative where it is + /// defined (it's not defined where multiple inputs in the group are equal to the output). + void GroupMaxDeriv(const MatrixBase &input, const MatrixBase &output); + + /// Set each element to the tanh of the corresponding element of "src". + void Tanh(const MatrixBase &src); + + // Function used in backpropagating derivatives of the sigmoid function: + // element-by-element, set *this = diff * value * (1.0 - value). + void DiffSigmoid(const MatrixBase &value, + const MatrixBase &diff); + + // Function used in backpropagating derivatives of the tanh function: + // element-by-element, set *this = diff * (1.0 - value^2). + void DiffTanh(const MatrixBase &value, + const MatrixBase &diff); + + /** Uses Svd to compute the eigenvalue decomposition of a symmetric positive + * semi-definite matrix: (*this) = rP * diag(rS) * rP^T, with rP an + * orthogonal matrix so rP^{-1} = rP^T. Throws exception if input was not + * positive semi-definite (check_thresh controls how stringent the check is; + * set it to 2 to ensure it won't ever complain, but it will zero out negative + * dimensions in your matrix).
+ * + * Caution: if you want the eigenvalues, it may make more sense to convert to + * SpMatrix and use Eig() function there, which uses eigenvalue decomposition + * directly rather than SVD. + */ + void SymPosSemiDefEig(VectorBase *s, MatrixBase *P, + Real check_thresh = 0.001); + + friend Real kaldi::TraceMatMat(const MatrixBase &A, + const MatrixBase &B, MatrixTransposeType trans); // tr (A B) + + // so it can get around const restrictions on the pointer to data_. + friend class SubMatrix; + + /// Add a scalar to each element + void Add(const Real alpha); + + /// Add a scalar to each diagonal element. + void AddToDiag(const Real alpha); + + /// *this += alpha * a * b^T + template + void AddVecVec(const Real alpha, const VectorBase &a, + const VectorBase &b); + + /// [each row of *this] += alpha * v + template + void AddVecToRows(const Real alpha, const VectorBase &v); + + /// [each col of *this] += alpha * v + template + void AddVecToCols(const Real alpha, const VectorBase &v); + + /// *this += alpha * M [or M^T] + void AddMat(const Real alpha, const MatrixBase &M, + MatrixTransposeType transA = kNoTrans); + + /// *this += alpha * A [or A^T]. + void AddSmat(Real alpha, const SparseMatrix &A, + MatrixTransposeType trans = kNoTrans); + + /// (*this) = alpha * op(A) * B + beta * (*this), where A is sparse. + /// Multiplication of sparse with dense matrix. See also AddMatSmat. + void AddSmatMat(Real alpha, const SparseMatrix &A, + MatrixTransposeType transA, const MatrixBase &B, + Real beta); + + /// (*this) = alpha * A * op(B) + beta * (*this), where B is sparse + /// and op(B) is either B or trans(B) depending on the 'transB' argument. + /// This is multiplication of a dense by a sparse matrix. See also + /// AddSmatMat. + void AddMatSmat(Real alpha, const MatrixBase &A, + const SparseMatrix &B, MatrixTransposeType transB, + Real beta); + + /// *this = beta * *this + alpha * M M^T, for symmetric matrices. It only + /// updates the lower triangle of *this. 
It will leave the matrix asymmetric; + /// if you need it symmetric as a regular matrix, do CopyLowerToUpper(). + void SymAddMat2(const Real alpha, const MatrixBase &M, + MatrixTransposeType transA, Real beta); + + /// *this = beta * *this + alpha * diag(v) * M [or M^T]. + /// The same as adding M but scaling each row M_i by v(i). + void AddDiagVecMat(const Real alpha, const VectorBase &v, + const MatrixBase &M, MatrixTransposeType transM, + Real beta = 1.0); + + /// *this = beta * *this + alpha * M [or M^T] * diag(v) + /// The same as adding M but scaling each column M_j by v(j). + void AddMatDiagVec(const Real alpha, + const MatrixBase &M, MatrixTransposeType transM, + VectorBase &v, + Real beta = 1.0); + + /// *this = beta * *this + alpha * A .* B (.* element by element multiplication) + void AddMatMatElements(const Real alpha, + const MatrixBase& A, + const MatrixBase& B, + const Real beta); + + /// *this += alpha * S + template + void AddSp(const Real alpha, const SpMatrix &S); + + void AddMatMat(const Real alpha, + const MatrixBase& A, MatrixTransposeType transA, + const MatrixBase& B, MatrixTransposeType transB, + const Real beta); + + /// *this = a * b / c (by element; when c = 0, *this = a) + void SetMatMatDivMat(const MatrixBase& A, + const MatrixBase& B, + const MatrixBase& C); + + /// A version of AddMatMat specialized for when the second argument + /// contains a lot of zeroes. + void AddMatSmat(const Real alpha, + const MatrixBase& A, MatrixTransposeType transA, + const MatrixBase& B, MatrixTransposeType transB, + const Real beta); + + /// A version of AddMatMat specialized for when the first argument + /// contains a lot of zeroes. + void AddSmatMat(const Real alpha, + const MatrixBase& A, MatrixTransposeType transA, + const MatrixBase& B, MatrixTransposeType transB, + const Real beta); + + /// this <-- beta*this + alpha*A*B*C. 
+ void AddMatMatMat(const Real alpha, + const MatrixBase& A, MatrixTransposeType transA, + const MatrixBase& B, MatrixTransposeType transB, + const MatrixBase& C, MatrixTransposeType transC, + const Real beta); + + /// this <-- beta*this + alpha*SpA*B. + // This and the routines below are really + // stubs that need to be made more efficient. + void AddSpMat(const Real alpha, + const SpMatrix& A, + const MatrixBase& B, MatrixTransposeType transB, + const Real beta) { + Matrix M(A); + return AddMatMat(alpha, M, kNoTrans, B, transB, beta); + } + /// this <-- beta*this + alpha*A*B. + void AddTpMat(const Real alpha, + const TpMatrix& A, MatrixTransposeType transA, + const MatrixBase& B, MatrixTransposeType transB, + const Real beta) { + Matrix M(A); + return AddMatMat(alpha, M, transA, B, transB, beta); + } + /// this <-- beta*this + alpha*A*B. + void AddMatSp(const Real alpha, + const MatrixBase& A, MatrixTransposeType transA, + const SpMatrix& B, + const Real beta) { + Matrix M(B); + return AddMatMat(alpha, A, transA, M, kNoTrans, beta); + } + /// this <-- beta*this + alpha*A*B*C. + void AddSpMatSp(const Real alpha, + const SpMatrix &A, + const MatrixBase& B, MatrixTransposeType transB, + const SpMatrix& C, + const Real beta) { + Matrix M(A), N(C); + return AddMatMatMat(alpha, M, kNoTrans, B, transB, N, kNoTrans, beta); + } + /// this <-- beta*this + alpha*A*B. + void AddMatTp(const Real alpha, + const MatrixBase& A, MatrixTransposeType transA, + const TpMatrix& B, MatrixTransposeType transB, + const Real beta) { + Matrix M(B); + return AddMatMat(alpha, A, transA, M, transB, beta); + } + + /// this <-- beta*this + alpha*A*B. + void AddTpTp(const Real alpha, + const TpMatrix& A, MatrixTransposeType transA, + const TpMatrix& B, MatrixTransposeType transB, + const Real beta) { + Matrix M(A), N(B); + return AddMatMat(alpha, M, transA, N, transB, beta); + } + + /// this <-- beta*this + alpha*A*B. + // This one is more efficient, not like the others above. 
+ void AddSpSp(const Real alpha, + const SpMatrix& A, const SpMatrix& B, + const Real beta); + + /// Copy lower triangle to upper triangle (symmetrize) + void CopyLowerToUpper(); + + /// Copy upper triangle to lower triangle (symmetrize) + void CopyUpperToLower(); + + /// This function orthogonalizes the rows of a matrix using the Gram-Schmidt + /// process. It is only applicable if NumRows() <= NumCols(). It will use + /// random number generation to fill in rows with something nonzero, in cases + /// where the original matrix was of deficient row rank. + void OrthogonalizeRows(); + + /// stream read. + /// Use instead of stream<<*this, if you want to add to existing contents. + // Will throw exception on failure. + void Read(std::istream & in, bool binary, bool add = false); + /// write to stream. + void Write(std::ostream & out, bool binary) const; + + // Below is internal methods for Svd, user does not have to know about this. +#if !defined(HAVE_ATLAS) && !defined(USE_KALDI_SVD) + // protected: + // Should be protected but used directly in testing routine. + // destroys *this! + void LapackGesvd(VectorBase *s, MatrixBase *U, + MatrixBase *Vt); +#else + protected: + // destroys *this! + bool JamaSvd(VectorBase *s, MatrixBase *U, + MatrixBase *V); + +#endif + protected: + + /// Initializer, callable only from child. + explicit MatrixBase(Real *data, MatrixIndexT cols, MatrixIndexT rows, MatrixIndexT stride) : + data_(data), num_cols_(cols), num_rows_(rows), stride_(stride) { + KALDI_ASSERT_IS_FLOATING_TYPE(Real); + } + + /// Initializer, callable only from child. + /// Empty initializer, for un-initialized matrix. + explicit MatrixBase(): data_(NULL) { + KALDI_ASSERT_IS_FLOATING_TYPE(Real); + } + + // Make sure pointers to MatrixBase cannot be deleted. + ~MatrixBase() { } + + /// A workaround that allows SubMatrix to get a pointer to non-const data + /// for const Matrix. 
Unfortunately C++ does not allow us to declare a + /// "public const" inheritance or anything like that, so it would require + /// a lot of work to make the SubMatrix class totally const-correct-- + /// we would have to override many of the Matrix functions. + inline Real* Data_workaround() const { + return data_; + } + + /// data memory area + Real* data_; + + /// these attributes store the real matrix size as it is stored in memory + /// including memalignment + MatrixIndexT num_cols_; /// < Number of columns + MatrixIndexT num_rows_; /// < Number of rows + /** True number of columns for the internal matrix. This number may differ + * from num_cols_ as memory alignment might be used. */ + MatrixIndexT stride_; + private: + KALDI_DISALLOW_COPY_AND_ASSIGN(MatrixBase); +}; + +/// A class for storing matrices. +template +class Matrix : public MatrixBase { + public: + + /// Empty constructor. + Matrix(); + + /// Basic constructor. + Matrix(const MatrixIndexT r, const MatrixIndexT c, + MatrixResizeType resize_type = kSetZero, + MatrixStrideType stride_type = kDefaultStride): + MatrixBase() { Resize(r, c, resize_type, stride_type); } + + /// Copy constructor from CUDA matrix + /// This is defined in ../cudamatrix/cu-matrix.h + template + explicit Matrix(const CuMatrixBase &cu, + MatrixTransposeType trans = kNoTrans); + + + /// Swaps the contents of *this and *other. Shallow swap. + void Swap(Matrix *other); + + /// Defined in ../cudamatrix/cu-matrix.cc + void Swap(CuMatrix *mat); + + /// Constructor from any MatrixBase. Can also copy with transpose. + /// Allocates new memory. + explicit Matrix(const MatrixBase & M, + MatrixTransposeType trans = kNoTrans); + + /// Same as above, but need to avoid default copy constructor. + Matrix(const Matrix & M); // (cannot make explicit) + + /// Copy constructor: as above, but from another type. + template + explicit Matrix(const MatrixBase & M, + MatrixTransposeType trans = kNoTrans); + + /// Copy constructor taking SpMatrix... 
+ /// It is symmetric, so no option for transpose, and NumRows == Cols + template + explicit Matrix(const SpMatrix & M) : MatrixBase() { + Resize(M.NumRows(), M.NumRows(), kUndefined); + this->CopyFromSp(M); + } + + /// Constructor from CompressedMatrix + explicit Matrix(const CompressedMatrix &C); + + /// Copy constructor taking TpMatrix... + template + explicit Matrix(const TpMatrix & M, + MatrixTransposeType trans = kNoTrans) : MatrixBase() { + if (trans == kNoTrans) { + Resize(M.NumRows(), M.NumCols(), kUndefined); + this->CopyFromTp(M); + } else { + Resize(M.NumCols(), M.NumRows(), kUndefined); + this->CopyFromTp(M, kTrans); + } + } + + /// read from stream. + // Unlike one in base, allows resizing. + void Read(std::istream & in, bool binary, bool add = false); + + /// Remove a specified row. + void RemoveRow(MatrixIndexT i); + + /// Transpose the matrix. Works for non-square + /// matrices as well as square ones. + void Transpose(); + + /// Destructor to free matrices. + ~Matrix() { Destroy(); } + + /// Sets matrix to a specified size (zero is OK as long as both r and c are + /// zero). The value of the new data depends on resize_type: + /// -if kSetZero, the new data will be zero + /// -if kUndefined, the new data will be undefined + /// -if kCopyData, the new data will be the same as the old data in any + /// shared positions, and zero elsewhere. + /// + /// You can set stride_type to kStrideEqualNumCols to force the stride + /// to equal the number of columns; by default it is set so that the stride + /// in bytes is a multiple of 16. + /// + /// This function takes time proportional to the number of data elements. + void Resize(const MatrixIndexT r, + const MatrixIndexT c, + MatrixResizeType resize_type = kSetZero, + MatrixStrideType stride_type = kDefaultStride); + + /// Assignment operator that takes MatrixBase.
+ Matrix &operator = (const MatrixBase &other) { + if (MatrixBase::NumRows() != other.NumRows() || + MatrixBase::NumCols() != other.NumCols()) + Resize(other.NumRows(), other.NumCols(), kUndefined); + MatrixBase::CopyFromMat(other); + return *this; + } + + /// Assignment operator. Needed for inclusion in std::vector. + Matrix &operator = (const Matrix &other) { + if (MatrixBase::NumRows() != other.NumRows() || + MatrixBase::NumCols() != other.NumCols()) + Resize(other.NumRows(), other.NumCols(), kUndefined); + MatrixBase::CopyFromMat(other); + return *this; + } + + + private: + /// Deallocates memory and sets to empty matrix (dimension 0, 0). + void Destroy(); + + /// Init assumes the current class contents are invalid (i.e. junk or have + /// already been freed), and it sets the matrix to newly allocated memory with + /// the specified number of rows and columns. r == c == 0 is acceptable. The data + /// memory contents will be undefined. + void Init(const MatrixIndexT r, + const MatrixIndexT c, + const MatrixStrideType stride_type); + +}; +/// @} end "addtogroup matrix_group" + +/// \addtogroup matrix_funcs_io +/// @{ + +/// A structure containing the HTK header. +/// [TODO: change the style of the variables to Kaldi-compliant] +struct HtkHeader { + /// Number of samples. + int32 mNSamples; + /// Sample period. + int32 mSamplePeriod; + /// Sample size + int16 mSampleSize; + /// Sample kind. + uint16 mSampleKind; +}; + +// Read HTK formatted features from file into matrix. +template +bool ReadHtk(std::istream &is, Matrix *M, HtkHeader *header_ptr); + +// Write (HTK format) features to file from matrix. +template +bool WriteHtk(std::ostream &os, const MatrixBase &M, HtkHeader htk_hdr); + +// Write (CMUSphinx format) features to file from matrix. +template +bool WriteSphinx(std::ostream &os, const MatrixBase &M); + +/// @} end of "addtogroup matrix_funcs_io" + +/** + Sub-matrix representation. + Can work with sub-parts of a matrix using this class. 
+ Note that SubMatrix is not very const-correct-- it allows you to + change the contents of a const Matrix. Be careful! +*/ + +template +class SubMatrix : public MatrixBase { + public: + // Initialize a SubMatrix from part of a matrix; this is + // a bit like A(b:c, d:e) in Matlab. + // This initializer is against the proper semantics of "const", since + // SubMatrix can change its contents. It would be hard to implement + // a "const-safe" version of this class. + SubMatrix(const MatrixBase& T, + const MatrixIndexT ro, // row offset, 0 < ro < NumRows() + const MatrixIndexT r, // number of rows, r > 0 + const MatrixIndexT co, // column offset, 0 < co < NumCols() + const MatrixIndexT c); // number of columns, c > 0 + + // This initializer is mostly intended for use in CuMatrix and related + // classes. Be careful! + SubMatrix(Real *data, + MatrixIndexT num_rows, + MatrixIndexT num_cols, + MatrixIndexT stride); + + ~SubMatrix() {} + + /// This type of constructor is needed for Range() to work [in Matrix base + /// class]. Cannot make it explicit. + SubMatrix (const SubMatrix &other): + MatrixBase (other.data_, other.num_cols_, other.num_rows_, + other.stride_) {} + + private: + /// Disallow assignment. + SubMatrix &operator = (const SubMatrix &other); +}; +/// @} End of "addtogroup matrix_funcs_io". + +/// \addtogroup matrix_funcs_scalar +/// @{ + +// Some declarations. These are traces of products. + + +template +bool ApproxEqual(const MatrixBase &A, + const MatrixBase &B, Real tol = 0.01) { + return A.ApproxEqual(B, tol); +} + +template +inline void AssertEqual(const MatrixBase &A, const MatrixBase &B, + float tol = 0.01) { + KALDI_ASSERT(A.ApproxEqual(B, tol)); +} + +/// Returns trace of matrix. 
+template +double TraceMat(const MatrixBase &A) { return A.Trace(); } + + +/// Returns tr(A B C) +template +Real TraceMatMatMat(const MatrixBase &A, MatrixTransposeType transA, + const MatrixBase &B, MatrixTransposeType transB, + const MatrixBase &C, MatrixTransposeType transC); + +/// Returns tr(A B C D) +template +Real TraceMatMatMatMat(const MatrixBase &A, MatrixTransposeType transA, + const MatrixBase &B, MatrixTransposeType transB, + const MatrixBase &C, MatrixTransposeType transC, + const MatrixBase &D, MatrixTransposeType transD); + +/// @} end "addtogroup matrix_funcs_scalar" + + +/// \addtogroup matrix_funcs_misc +/// @{ + + +/// Function to ensure that SVD is sorted. This function is made as generic as +/// possible, to be applicable to other types of problems. s->Dim() should be +/// the same as U->NumCols(), and we sort s from greatest to least absolute +/// value (if sort_on_absolute_value == true) or greatest to least value +/// otherwise, moving the columns of U, if it exists, and the rows of Vt, if it +/// exists, around in the same way. Note: the "absolute value" part won't matter +/// if this is an actual SVD, since singular values are non-negative. +template void SortSvd(VectorBase *s, MatrixBase *U, + MatrixBase* Vt = NULL, + bool sort_on_absolute_value = true); + +/// Creates the eigenvalue matrix D that is part of the decomposition used Matrix::Eig. +/// D will be block-diagonal with blocks of size 1 (for real eigenvalues) or 2x2 +/// for complex pairs. If a complex pair is lambda +- i*mu, D will have a corresponding +/// 2x2 block [lambda, mu; -mu, lambda]. +/// This function will throw if any complex eigenvalues are not in complex conjugate +/// pairs (or the members of such pairs are not consecutively numbered). 
+template +void CreateEigenvalueMatrix(const VectorBase &real, const VectorBase &imag, + MatrixBase *D); + +/// The following function is used in Matrix::Power, and separately tested, so we +/// declare it here mainly for the testing code to see. It takes a complex value to +/// a power using a method that will work for noninteger powers (but will fail if the +/// complex value is real and negative). +template +bool AttemptComplexPower(Real *x_re, Real *x_im, Real power); + + + +/// @} end of addtogroup matrix_funcs_misc + +/// \addtogroup matrix_funcs_io +/// @{ +template +std::ostream & operator << (std::ostream & Out, const MatrixBase & M); + +template +std::istream & operator >> (std::istream & In, MatrixBase & M); + +// The Matrix read allows resizing, so we override the MatrixBase one. +template +std::istream & operator >> (std::istream & In, Matrix & M); + + +template +bool SameDim(const MatrixBase &M, const MatrixBase &N) { + return (M.NumRows() == N.NumRows() && M.NumCols() == N.NumCols()); +} + +/// @} end of \addtogroup matrix_funcs_io + + +} // namespace kaldi + + + +// we need to include the implementation and some +// template specializations. +#include "matrix/kaldi-matrix-inl.h" + + +#endif // KALDI_MATRIX_KALDI_MATRIX_H_ diff --git a/speechx/speechx/kaldi/matrix/kaldi-vector-inl.h b/speechx/speechx/kaldi/matrix/kaldi-vector-inl.h new file mode 100644 index 00000000..c3a4f52f --- /dev/null +++ b/speechx/speechx/kaldi/matrix/kaldi-vector-inl.h @@ -0,0 +1,58 @@ +// matrix/kaldi-vector-inl.h + +// Copyright 2009-2011 Ondrej Glembek; Microsoft Corporation; +// Haihua Xu + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +// This is an internal header file, included by other library headers. +// You should not attempt to use it directly. + +#ifndef KALDI_MATRIX_KALDI_VECTOR_INL_H_ +#define KALDI_MATRIX_KALDI_VECTOR_INL_H_ 1 + +namespace kaldi { + +template +std::ostream & operator << (std::ostream &os, const VectorBase &rv) { + rv.Write(os, false); + return os; +} + +template +std::istream &operator >> (std::istream &is, VectorBase &rv) { + rv.Read(is, false); + return is; +} + +template +std::istream &operator >> (std::istream &is, Vector &rv) { + rv.Read(is, false); + return is; +} + +template<> +template<> +void VectorBase::AddVec(const float alpha, const VectorBase &rv); + +template<> +template<> +void VectorBase::AddVec(const double alpha, + const VectorBase &rv); + +} // namespace kaldi + +#endif // KALDI_MATRIX_KALDI_VECTOR_INL_H_ diff --git a/speechx/speechx/kaldi/matrix/kaldi-vector.cc b/speechx/speechx/kaldi/matrix/kaldi-vector.cc new file mode 100644 index 00000000..ccc7e89b --- /dev/null +++ b/speechx/speechx/kaldi/matrix/kaldi-vector.cc @@ -0,0 +1,1355 @@ +// matrix/kaldi-vector.cc + +// Copyright 2009-2011 Microsoft Corporation; Lukas Burget; +// Saarland University; Go Vivace Inc.; Ariya Rastrow; +// Petr Schwarz; Yanmin Qian; Jan Silovsky; +// Haihua Xu; Wei Shi +// 2015 Guoguo Chen +// 2017 Daniel Galvez +// 2019 Yiwen Shao + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use 
this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "matrix/cblas-wrappers.h" +#include "matrix/kaldi-vector.h" +#include "matrix/kaldi-matrix.h" +#include "matrix/sp-matrix.h" +#include "matrix/sparse-matrix.h" + +namespace kaldi { + +template +Real VecVec(const VectorBase &a, + const VectorBase &b) { + MatrixIndexT adim = a.Dim(); + KALDI_ASSERT(adim == b.Dim()); + return cblas_Xdot(adim, a.Data(), 1, b.Data(), 1); +} + +template +float VecVec<>(const VectorBase &a, + const VectorBase &b); +template +double VecVec<>(const VectorBase &a, + const VectorBase &b); + +template +Real VecVec(const VectorBase &ra, + const VectorBase &rb) { + MatrixIndexT adim = ra.Dim(); + KALDI_ASSERT(adim == rb.Dim()); + const Real *a_data = ra.Data(); + const OtherReal *b_data = rb.Data(); + Real sum = 0.0; + for (MatrixIndexT i = 0; i < adim; i++) + sum += a_data[i]*b_data[i]; + return sum; +} + +// instantiate the template above. 
+template +float VecVec<>(const VectorBase &ra, + const VectorBase &rb); +template +double VecVec<>(const VectorBase &ra, + const VectorBase &rb); + + +template<> +template<> +void VectorBase::AddVec(const float alpha, + const VectorBase &v) { + KALDI_ASSERT(dim_ == v.dim_); + KALDI_ASSERT(&v != this); + cblas_Xaxpy(dim_, alpha, v.Data(), 1, data_, 1); +} + +template<> +template<> +void VectorBase::AddVec(const double alpha, + const VectorBase &v) { + KALDI_ASSERT(dim_ == v.dim_); + KALDI_ASSERT(&v != this); + cblas_Xaxpy(dim_, alpha, v.Data(), 1, data_, 1); +} + +template +void VectorBase::AddMatVec(const Real alpha, + const MatrixBase &M, + MatrixTransposeType trans, + const VectorBase &v, + const Real beta) { + KALDI_ASSERT((trans == kNoTrans && M.NumCols() == v.dim_ && M.NumRows() == dim_) + || (trans == kTrans && M.NumRows() == v.dim_ && M.NumCols() == dim_)); + KALDI_ASSERT(&v != this); + cblas_Xgemv(trans, M.NumRows(), M.NumCols(), alpha, M.Data(), M.Stride(), + v.Data(), 1, beta, data_, 1); +} + +template +void VectorBase::AddMatSvec(const Real alpha, + const MatrixBase &M, + MatrixTransposeType trans, + const VectorBase &v, + const Real beta) { + KALDI_ASSERT((trans == kNoTrans && M.NumCols() == v.dim_ && M.NumRows() == dim_) + || (trans == kTrans && M.NumRows() == v.dim_ && M.NumCols() == dim_)); + KALDI_ASSERT(&v != this); + Xgemv_sparsevec(trans, M.NumRows(), M.NumCols(), alpha, M.Data(), M.Stride(), + v.Data(), 1, beta, data_, 1); + return; + /* + MatrixIndexT this_dim = this->dim_, v_dim = v.dim_, + M_stride = M.Stride(); + Real *this_data = this->data_; + const Real *M_data = M.Data(), *v_data = v.data_; + if (beta != 1.0) this->Scale(beta); + if (trans == kNoTrans) { + for (MatrixIndexT i = 0; i < v_dim; i++) { + Real v_i = v_data[i]; + if (v_i == 0.0) continue; + // Add to *this, the i'th column of the Matrix, times v_i. 
+ cblas_Xaxpy(this_dim, v_i * alpha, M_data + i, M_stride, this_data, 1); + } + } else { // The transposed case is slightly more efficient, I guess. + for (MatrixIndexT i = 0; i < v_dim; i++) { + Real v_i = v.data_[i]; + if (v_i == 0.0) continue; + // Add to *this, the i'th row of the Matrix, times v_i. + cblas_Xaxpy(this_dim, v_i * alpha, + M_data + (i * M_stride), 1, this_data, 1); + } + }*/ +} + +template +void VectorBase::AddSpVec(const Real alpha, + const SpMatrix &M, + const VectorBase &v, + const Real beta) { + KALDI_ASSERT(M.NumRows() == v.dim_ && dim_ == v.dim_); + KALDI_ASSERT(&v != this); + cblas_Xspmv(alpha, M.NumRows(), M.Data(), v.Data(), 1, beta, data_, 1); +} + + +template +void VectorBase::MulTp(const TpMatrix &M, + const MatrixTransposeType trans) { + KALDI_ASSERT(M.NumRows() == dim_); + cblas_Xtpmv(trans,M.Data(),M.NumRows(),data_,1); +} + +template +void VectorBase::Solve(const TpMatrix &M, + const MatrixTransposeType trans) { + KALDI_ASSERT(M.NumRows() == dim_); + cblas_Xtpsv(trans, M.Data(), M.NumRows(), data_, 1); +} + + +template +inline void Vector::Init(const MatrixIndexT dim) { + KALDI_ASSERT(dim >= 0); + if (dim == 0) { + this->dim_ = 0; + this->data_ = NULL; + return; + } + MatrixIndexT size; + void *data; + void *free_data; + + size = dim * sizeof(Real); + + if ((data = KALDI_MEMALIGN(16, size, &free_data)) != NULL) { + this->data_ = static_cast (data); + this->dim_ = dim; + } else { + throw std::bad_alloc(); + } +} + + +template +void Vector::Resize(const MatrixIndexT dim, MatrixResizeType resize_type) { + + // the next block uses recursion to handle what we have to do if + // resize_type == kCopyData. + if (resize_type == kCopyData) { + if (this->data_ == NULL || dim == 0) resize_type = kSetZero; // nothing to copy. + else if (this->dim_ == dim) { return; } // nothing to do. + else { + // set tmp to a vector of the desired size. 
+ Vector tmp(dim, kUndefined); + if (dim > this->dim_) { + memcpy(tmp.data_, this->data_, sizeof(Real)*this->dim_); + memset(tmp.data_+this->dim_, 0, sizeof(Real)*(dim-this->dim_)); + } else { + memcpy(tmp.data_, this->data_, sizeof(Real)*dim); + } + tmp.Swap(this); + // and now let tmp go out of scope, deleting what was in *this. + return; + } + } + // At this point, resize_type == kSetZero or kUndefined. + + if (this->data_ != NULL) { + if (this->dim_ == dim) { + if (resize_type == kSetZero) this->SetZero(); + return; + } else { + Destroy(); + } + } + Init(dim); + if (resize_type == kSetZero) this->SetZero(); +} + + +/// Copy data from another vector +template +void VectorBase::CopyFromVec(const VectorBase &v) { + KALDI_ASSERT(Dim() == v.Dim()); + if (data_ != v.data_) { + std::memcpy(this->data_, v.data_, dim_ * sizeof(Real)); + } +} + +template +template +void VectorBase::CopyFromPacked(const PackedMatrix& M) { + SubVector v(M); + this->CopyFromVec(v); +} +// instantiate the template. +template void VectorBase::CopyFromPacked(const PackedMatrix &other); +template void VectorBase::CopyFromPacked(const PackedMatrix &other); +template void VectorBase::CopyFromPacked(const PackedMatrix &other); +template void VectorBase::CopyFromPacked(const PackedMatrix &other); + +/// Load data into the vector +template +void VectorBase::CopyFromPtr(const Real *data, MatrixIndexT sz) { + KALDI_ASSERT(dim_ == sz); + std::memcpy(this->data_, data, Dim() * sizeof(Real)); +} + +template +template +void VectorBase::CopyFromVec(const VectorBase &other) { + KALDI_ASSERT(dim_ == other.Dim()); + Real * __restrict__ ptr = data_; + const OtherReal * __restrict__ other_ptr = other.Data(); + for (MatrixIndexT i = 0; i < dim_; i++) + ptr[i] = other_ptr[i]; +} + +template void VectorBase::CopyFromVec(const VectorBase &other); +template void VectorBase::CopyFromVec(const VectorBase &other); + +// Remove element from the vector. 
The vector is not reallocated +template +void Vector::RemoveElement(MatrixIndexT i) { + KALDI_ASSERT(i < this->dim_ && "Access out of vector"); + for (MatrixIndexT j = i + 1; j < this->dim_; j++) + this->data_[j-1] = this->data_[j]; + this->dim_--; +} + + +/// Deallocates memory and sets object to empty vector. +template +void Vector::Destroy() { + /// we need to free the data block if it was defined + if (this->data_ != NULL) + KALDI_MEMALIGN_FREE(this->data_); + this->data_ = NULL; + this->dim_ = 0; +} + +template +void VectorBase::SetZero() { + std::memset(data_, 0, dim_ * sizeof(Real)); +} + +template +bool VectorBase::IsZero(Real cutoff) const { + Real abs_max = 0.0; + for (MatrixIndexT i = 0; i < Dim(); i++) + abs_max = std::max(std::abs(data_[i]), abs_max); + return (abs_max <= cutoff); +} + +template +void VectorBase::SetRandn() { + kaldi::RandomState rstate; + MatrixIndexT last = (Dim() % 2 == 1) ? Dim() - 1 : Dim(); + for (MatrixIndexT i = 0; i < last; i += 2) { + kaldi::RandGauss2(data_ + i, data_ + i + 1, &rstate); + } + if (Dim() != last) data_[last] = static_cast(kaldi::RandGauss(&rstate)); +} + +template +void VectorBase::SetRandUniform() { + kaldi::RandomState rstate; + for (MatrixIndexT i = 0; i < Dim(); i++) { + *(data_+i) = RandUniform(&rstate); + } +} + +template +MatrixIndexT VectorBase::RandCategorical() const { + kaldi::RandomState rstate; + Real sum = this->Sum(); + KALDI_ASSERT(this->Min() >= 0.0 && sum > 0.0); + Real r = RandUniform(&rstate) * sum; + Real *data = this->data_; + MatrixIndexT dim = this->dim_; + Real running_sum = 0.0; + for (MatrixIndexT i = 0; i < dim; i++) { + running_sum += data[i]; + if (r < running_sum) return i; + } + return dim_ - 1; // Should only happen if RandUniform() + // returns exactly 1, or due to roundoff. +} + +template +void VectorBase::Set(Real f) { + // Why not use memset here? + // The basic unit of memset is a byte. + // If f != 0 and sizeof(Real) > 1, then we cannot use memset. 
+ if (f == 0) { + this->SetZero(); // calls std::memset + } else { + for (MatrixIndexT i = 0; i < dim_; i++) { data_[i] = f; } + } +} + +template +void VectorBase::CopyRowsFromMat(const MatrixBase &mat) { + KALDI_ASSERT(dim_ == mat.NumCols() * mat.NumRows()); + + Real *inc_data = data_; + const MatrixIndexT cols = mat.NumCols(), rows = mat.NumRows(); + + if (mat.Stride() == mat.NumCols()) { + memcpy(inc_data, mat.Data(), cols*rows*sizeof(Real)); + } else { + for (MatrixIndexT i = 0; i < rows; i++) { + // copy the data to the propper position + memcpy(inc_data, mat.RowData(i), cols * sizeof(Real)); + // set new copy position + inc_data += cols; + } + } +} + +template +template +void VectorBase::CopyRowsFromMat(const MatrixBase &mat) { + KALDI_ASSERT(dim_ == mat.NumCols() * mat.NumRows()); + Real *vec_data = data_; + const MatrixIndexT cols = mat.NumCols(), + rows = mat.NumRows(); + + for (MatrixIndexT i = 0; i < rows; i++) { + const OtherReal *mat_row = mat.RowData(i); + for (MatrixIndexT j = 0; j < cols; j++) { + vec_data[j] = static_cast(mat_row[j]); + } + vec_data += cols; + } +} + +template +void VectorBase::CopyRowsFromMat(const MatrixBase &mat); +template +void VectorBase::CopyRowsFromMat(const MatrixBase &mat); + + +template +void VectorBase::CopyColsFromMat(const MatrixBase &mat) { + KALDI_ASSERT(dim_ == mat.NumCols() * mat.NumRows()); + + Real* inc_data = data_; + const MatrixIndexT cols = mat.NumCols(), rows = mat.NumRows(), stride = mat.Stride(); + const Real *mat_inc_data = mat.Data(); + + for (MatrixIndexT i = 0; i < cols; i++) { + for (MatrixIndexT j = 0; j < rows; j++) { + inc_data[j] = mat_inc_data[j*stride]; + } + mat_inc_data++; + inc_data += rows; + } +} + +template +void VectorBase::CopyRowFromMat(const MatrixBase &mat, MatrixIndexT row) { + KALDI_ASSERT(row < mat.NumRows()); + KALDI_ASSERT(dim_ == mat.NumCols()); + const Real *mat_row = mat.RowData(row); + memcpy(data_, mat_row, sizeof(Real)*dim_); +} + +template +template +void 
VectorBase::CopyRowFromMat(const MatrixBase &mat, MatrixIndexT row) { + KALDI_ASSERT(row < mat.NumRows()); + KALDI_ASSERT(dim_ == mat.NumCols()); + const OtherReal *mat_row = mat.RowData(row); + for (MatrixIndexT i = 0; i < dim_; i++) + data_[i] = static_cast(mat_row[i]); +} + +template +void VectorBase::CopyRowFromMat(const MatrixBase &mat, MatrixIndexT row); +template +void VectorBase::CopyRowFromMat(const MatrixBase &mat, MatrixIndexT row); + +template +template +void VectorBase::CopyRowFromSp(const SpMatrix &sp, MatrixIndexT row) { + KALDI_ASSERT(row < sp.NumRows()); + KALDI_ASSERT(dim_ == sp.NumCols()); + + const OtherReal *sp_data = sp.Data(); + + sp_data += (row*(row+1)) / 2; // takes us to beginning of this row. + MatrixIndexT i; + for (i = 0; i < row; i++) // copy consecutive elements. + data_[i] = static_cast(*(sp_data++)); + for(; i < dim_; ++i, sp_data += i) + data_[i] = static_cast(*sp_data); +} + +template +void VectorBase::CopyRowFromSp(const SpMatrix &mat, MatrixIndexT row); +template +void VectorBase::CopyRowFromSp(const SpMatrix &mat, MatrixIndexT row); +template +void VectorBase::CopyRowFromSp(const SpMatrix &mat, MatrixIndexT row); +template +void VectorBase::CopyRowFromSp(const SpMatrix &mat, MatrixIndexT row); + + +#ifdef HAVE_MKL +template<> +void VectorBase::Pow(const VectorBase &v, float power) { + vsPowx(dim_, data_, power, v.data_); +} +template<> +void VectorBase::Pow(const VectorBase &v, double power) { + vdPowx(dim_, data_, power, v.data_); +} +#else + +// takes elements to a power. Does not check output. +template +void VectorBase::Pow(const VectorBase &v, Real power) { + KALDI_ASSERT(dim_ == v.dim_); + for (MatrixIndexT i = 0; i < dim_; i++) { + data_[i] = pow(v.data_[i], power); + } +} +#endif + +// takes absolute value of the elements to a power. +// Throws exception if could not (but only for power != 1 and power != 2). 
+template +void VectorBase::ApplyPowAbs(Real power, bool include_sign) { + if (power == 1.0) + for (MatrixIndexT i = 0; i < dim_; i++) + data_[i] = (include_sign && data_[i] < 0 ? -1 : 1) * std::abs(data_[i]); + if (power == 2.0) { + for (MatrixIndexT i = 0; i < dim_; i++) + data_[i] = (include_sign && data_[i] < 0 ? -1 : 1) * data_[i] * data_[i]; + } else if (power == 0.5) { + for (MatrixIndexT i = 0; i < dim_; i++) { + data_[i] = (include_sign && data_[i] < 0 ? -1 : 1) * std::sqrt(std::abs(data_[i])); + } + } else if (power < 0.0) { + for (MatrixIndexT i = 0; i < dim_; i++) { + data_[i] = (data_[i] == 0.0 ? 0.0 : pow(std::abs(data_[i]), power)); + data_[i] *= (include_sign && data_[i] < 0 ? -1 : 1); + if (data_[i] == HUGE_VAL) { // HUGE_VAL is what errno returns on error. + KALDI_ERR << "Could not raise element " << i << "to power " + << power << ": returned value = " << data_[i]; + } + } + } else { + for (MatrixIndexT i = 0; i < dim_; i++) { + data_[i] = (include_sign && data_[i] < 0 ? -1 : 1) * pow(std::abs(data_[i]), power); + if (data_[i] == HUGE_VAL) { // HUGE_VAL is what errno returns on error. + KALDI_ERR << "Could not raise element " << i << "to power " + << power << ": returned value = " << data_[i]; + } + } + } +} + +// Computes the p-th norm. Throws exception if could not. 
+template +Real VectorBase::Norm(Real p) const { + KALDI_ASSERT(p >= 0.0); + Real sum = 0.0; + if (p == 0.0) { + for (MatrixIndexT i = 0; i < dim_; i++) + if (data_[i] != 0.0) sum += 1.0; + return sum; + } else if (p == 1.0) { + for (MatrixIndexT i = 0; i < dim_; i++) + sum += std::abs(data_[i]); + return sum; + } else if (p == 2.0) { + for (MatrixIndexT i = 0; i < dim_; i++) + sum += data_[i] * data_[i]; + return std::sqrt(sum); + } else if (p == std::numeric_limits::infinity()){ + for (MatrixIndexT i = 0; i < dim_; i++) + sum = std::max(sum, std::abs(data_[i])); + return sum; + } else { + Real tmp; + bool ok = true; + for (MatrixIndexT i = 0; i < dim_; i++) { + tmp = pow(std::abs(data_[i]), p); + if (tmp == HUGE_VAL) // HUGE_VAL is what pow returns on error. + ok = false; + sum += tmp; + } + tmp = pow(sum, static_cast(1.0/p)); + KALDI_ASSERT(tmp != HUGE_VAL); // should not happen here. + if (ok) { + return tmp; + } else { + Real maximum = this->Max(), minimum = this->Min(), + max_abs = std::max(maximum, -minimum); + KALDI_ASSERT(max_abs > 0); // Or should not have reached here. + Vector tmp(*this); + tmp.Scale(1.0 / max_abs); + return tmp.Norm(p) * max_abs; + } + } +} + +template +bool VectorBase::ApproxEqual(const VectorBase &other, float tol) const { + if (dim_ != other.dim_) KALDI_ERR << "ApproxEqual: size mismatch " + << dim_ << " vs. " << other.dim_; + KALDI_ASSERT(tol >= 0.0); + if (tol != 0.0) { + Vector tmp(*this); + tmp.AddVec(-1.0, other); + return (tmp.Norm(2.0) <= static_cast(tol) * this->Norm(2.0)); + } else { // Test for exact equality. 
+ const Real *data = data_; + const Real *other_data = other.data_; + for (MatrixIndexT dim = dim_, i = 0; i < dim; i++) + if (data[i] != other_data[i]) return false; + return true; + } +} + +template +Real VectorBase::Max() const { + Real ans = - std::numeric_limits::infinity(); + const Real *data = data_; + MatrixIndexT i, dim = dim_; + for (i = 0; i + 4 <= dim; i += 4) { + Real a1 = data[i], a2 = data[i+1], a3 = data[i+2], a4 = data[i+3]; + if (a1 > ans || a2 > ans || a3 > ans || a4 > ans) { + Real b1 = (a1 > a2 ? a1 : a2), b2 = (a3 > a4 ? a3 : a4); + if (b1 > ans) ans = b1; + if (b2 > ans) ans = b2; + } + } + for (; i < dim; i++) + if (data[i] > ans) ans = data[i]; + return ans; +} + +template +Real VectorBase::Max(MatrixIndexT *index_out) const { + if (dim_ == 0) KALDI_ERR << "Empty vector"; + Real ans = - std::numeric_limits::infinity(); + MatrixIndexT index = 0; + const Real *data = data_; + MatrixIndexT i, dim = dim_; + for (i = 0; i + 4 <= dim; i += 4) { + Real a1 = data[i], a2 = data[i+1], a3 = data[i+2], a4 = data[i+3]; + if (a1 > ans || a2 > ans || a3 > ans || a4 > ans) { + if (a1 > ans) { ans = a1; index = i; } + if (a2 > ans) { ans = a2; index = i + 1; } + if (a3 > ans) { ans = a3; index = i + 2; } + if (a4 > ans) { ans = a4; index = i + 3; } + } + } + for (; i < dim; i++) + if (data[i] > ans) { ans = data[i]; index = i; } + *index_out = index; + return ans; +} + +template +Real VectorBase::Min() const { + Real ans = std::numeric_limits::infinity(); + const Real *data = data_; + MatrixIndexT i, dim = dim_; + for (i = 0; i + 4 <= dim; i += 4) { + Real a1 = data[i], a2 = data[i+1], a3 = data[i+2], a4 = data[i+3]; + if (a1 < ans || a2 < ans || a3 < ans || a4 < ans) { + Real b1 = (a1 < a2 ? a1 : a2), b2 = (a3 < a4 ? 
a3 : a4); + if (b1 < ans) ans = b1; + if (b2 < ans) ans = b2; + } + } + for (; i < dim; i++) + if (data[i] < ans) ans = data[i]; + return ans; +} + +template +Real VectorBase::Min(MatrixIndexT *index_out) const { + if (dim_ == 0) KALDI_ERR << "Empty vector"; + Real ans = std::numeric_limits::infinity(); + MatrixIndexT index = 0; + const Real *data = data_; + MatrixIndexT i, dim = dim_; + for (i = 0; i + 4 <= dim; i += 4) { + Real a1 = data[i], a2 = data[i+1], a3 = data[i+2], a4 = data[i+3]; + if (a1 < ans || a2 < ans || a3 < ans || a4 < ans) { + if (a1 < ans) { ans = a1; index = i; } + if (a2 < ans) { ans = a2; index = i + 1; } + if (a3 < ans) { ans = a3; index = i + 2; } + if (a4 < ans) { ans = a4; index = i + 3; } + } + } + for (; i < dim; i++) + if (data[i] < ans) { ans = data[i]; index = i; } + *index_out = index; + return ans; +} + + +template +template +void VectorBase::CopyColFromMat(const MatrixBase &mat, MatrixIndexT col) { + KALDI_ASSERT(col < mat.NumCols()); + KALDI_ASSERT(dim_ == mat.NumRows()); + for (MatrixIndexT i = 0; i < dim_; i++) + data_[i] = mat(i, col); + // can't do this very efficiently so don't really bother. could improve this though. +} +// instantiate the template above. +template +void VectorBase::CopyColFromMat(const MatrixBase &mat, MatrixIndexT col); +template +void VectorBase::CopyColFromMat(const MatrixBase &mat, MatrixIndexT col); +template +void VectorBase::CopyColFromMat(const MatrixBase &mat, MatrixIndexT col); +template +void VectorBase::CopyColFromMat(const MatrixBase &mat, MatrixIndexT col); + +template +void VectorBase::CopyDiagFromMat(const MatrixBase &M) { + KALDI_ASSERT(dim_ == std::min(M.NumRows(), M.NumCols())); + cblas_Xcopy(dim_, M.Data(), M.Stride() + 1, data_, 1); +} + +template +void VectorBase::CopyDiagFromPacked(const PackedMatrix &M) { + KALDI_ASSERT(dim_ == M.NumCols()); + for (MatrixIndexT i = 0; i < dim_; i++) + data_[i] = M(i, i); + // could make this more efficient. 
+} + +template +Real VectorBase::Sum() const { + // Do a dot-product with a size-1 array with a stride of 0 to + // implement sum. This allows us to access SIMD operations in a + // cross-platform way via your BLAS library. + Real one(1); + return cblas_Xdot(dim_, data_, 1, &one, 0); +} + +template +Real VectorBase::SumLog() const { + double sum_log = 0.0; + double prod = 1.0; + for (MatrixIndexT i = 0; i < dim_; i++) { + prod *= data_[i]; + // Possible future work (arnab): change these magic values to pre-defined + // constants + if (prod < 1.0e-10 || prod > 1.0e+10) { + sum_log += Log(prod); + prod = 1.0; + } + } + if (prod != 1.0) sum_log += Log(prod); + return sum_log; +} + +template +void VectorBase::AddRowSumMat(Real alpha, const MatrixBase &M, Real beta) { + KALDI_ASSERT(dim_ == M.NumCols()); + MatrixIndexT num_rows = M.NumRows(), stride = M.Stride(), dim = dim_; + Real *data = data_; + + // implement the function according to a dimension cutoff for computation efficiency + if (num_rows <= 64) { + cblas_Xscal(dim, beta, data, 1); + const Real *m_data = M.Data(); + for (MatrixIndexT i = 0; i < num_rows; i++, m_data += stride) + cblas_Xaxpy(dim, alpha, m_data, 1, data, 1); + + } else { + Vector ones(M.NumRows()); + ones.Set(1.0); + this->AddMatVec(alpha, M, kTrans, ones, beta); + } +} + +template +void VectorBase::AddColSumMat(Real alpha, const MatrixBase &M, Real beta) { + KALDI_ASSERT(dim_ == M.NumRows()); + MatrixIndexT num_cols = M.NumCols(); + + // implement the function according to a dimension cutoff for computation efficiency + if (num_cols <= 64) { + for (MatrixIndexT i = 0; i < dim_; i++) { + double sum = 0.0; + const Real *src = M.RowData(i); + for (MatrixIndexT j = 0; j < num_cols; j++) + sum += src[j]; + data_[i] = alpha * sum + beta * data_[i]; + } + } else { + Vector ones(M.NumCols()); + ones.Set(1.0); + this->AddMatVec(alpha, M, kNoTrans, ones, beta); + } +} + +template +Real VectorBase::LogSumExp(Real prune) const { + Real sum; + if 
(sizeof(sum) == 8) sum = kLogZeroDouble; + else sum = kLogZeroFloat; + Real max_elem = Max(), cutoff; + if (sizeof(Real) == 4) cutoff = max_elem + kMinLogDiffFloat; + else cutoff = max_elem + kMinLogDiffDouble; + if (prune > 0.0 && max_elem - prune > cutoff) // explicit pruning... + cutoff = max_elem - prune; + + double sum_relto_max_elem = 0.0; + + for (MatrixIndexT i = 0; i < dim_; i++) { + BaseFloat f = data_[i]; + if (f >= cutoff) + sum_relto_max_elem += Exp(f - max_elem); + } + return max_elem + Log(sum_relto_max_elem); +} + +template +void VectorBase::InvertElements() { + for (MatrixIndexT i = 0; i < dim_; i++) { + data_[i] = static_cast(1 / data_[i]); + } +} + +template +void VectorBase::ApplyLog() { + for (MatrixIndexT i = 0; i < dim_; i++) { + if (data_[i] < 0.0) + KALDI_ERR << "Trying to take log of a negative number."; + data_[i] = Log(data_[i]); + } +} + +template +void VectorBase::ApplyLogAndCopy(const VectorBase &v) { + KALDI_ASSERT(dim_ == v.Dim()); + for (MatrixIndexT i = 0; i < dim_; i++) { + data_[i] = Log(v(i)); + } +} + +template +void VectorBase::ApplyExp() { + for (MatrixIndexT i = 0; i < dim_; i++) { + data_[i] = Exp(data_[i]); + } +} + +template +void VectorBase::ApplyAbs() { + for (MatrixIndexT i = 0; i < dim_; i++) { data_[i] = std::abs(data_[i]); } +} + +template +void VectorBase::Floor(const VectorBase &v, Real floor_val, MatrixIndexT *floored_count) { + KALDI_ASSERT(dim_ == v.dim_); + if (floored_count == nullptr) { + for (MatrixIndexT i = 0; i < dim_; i++) { + data_[i] = std::max(v.data_[i], floor_val); + } + } else { + MatrixIndexT num_floored = 0; + for (MatrixIndexT i = 0; i < dim_; i++) { + if (v.data_[i] < floor_val) { + data_[i] = floor_val; + num_floored++; + } else { + data_[i] = v.data_[i]; + } + } + *floored_count = num_floored; + } +} + +template +void VectorBase::Ceiling(const VectorBase &v, Real ceil_val, MatrixIndexT *ceiled_count) { + KALDI_ASSERT(dim_ == v.dim_); + if (ceiled_count == nullptr) { + for (MatrixIndexT i = 
0; i < dim_; i++) { + data_[i] = std::min(v.data_[i], ceil_val); + } + } else { + MatrixIndexT num_changed = 0; + for (MatrixIndexT i = 0; i < dim_; i++) { + if (v.data_[i] > ceil_val) { + data_[i] = ceil_val; + num_changed++; + } else { + data_[i] = v.data_[i]; + } + } + *ceiled_count = num_changed; + } +} + +template +MatrixIndexT VectorBase::ApplyFloor(const VectorBase &floor_vec) { + KALDI_ASSERT(floor_vec.Dim() == dim_); + MatrixIndexT num_floored = 0; + for (MatrixIndexT i = 0; i < dim_; i++) { + if (data_[i] < floor_vec(i)) { + data_[i] = floor_vec(i); + num_floored++; + } + } + return num_floored; +} + +template +Real VectorBase::ApplySoftMax() { + Real max = this->Max(), sum = 0.0; + for (MatrixIndexT i = 0; i < dim_; i++) { + sum += (data_[i] = Exp(data_[i] - max)); + } + this->Scale(1.0 / sum); + return max + Log(sum); +} + +template +Real VectorBase::ApplyLogSoftMax() { + Real max = this->Max(), sum = 0.0; + for (MatrixIndexT i = 0; i < dim_; i++) { + sum += Exp((data_[i] -= max)); + } + sum = Log(sum); + this->Add(-1.0 * sum); + return max + sum; +} + +#ifdef HAVE_MKL +template<> +void VectorBase::Tanh(const VectorBase &src) { + KALDI_ASSERT(dim_ == src.dim_); + vsTanh(dim_, src.data_, data_); +} +template<> +void VectorBase::Tanh(const VectorBase &src) { + KALDI_ASSERT(dim_ == src.dim_); + vdTanh(dim_, src.data_, data_); +} +#else +template +void VectorBase::Tanh(const VectorBase &src) { + KALDI_ASSERT(dim_ == src.dim_); + for (MatrixIndexT i = 0; i < dim_; i++) { + Real x = src.data_[i]; + if (x > 0.0) { + Real inv_expx = Exp(-x); + x = -1.0 + 2.0 / (1.0 + inv_expx * inv_expx); + } else { + Real expx = Exp(x); + x = 1.0 - 2.0 / (1.0 + expx * expx); + } + data_[i] = x; + } +} +#endif + +#ifdef HAVE_MKL +// Implementing sigmoid based on tanh. 
+template<> +void VectorBase::Sigmoid(const VectorBase &src) { + KALDI_ASSERT(dim_ == src.dim_); + this->CopyFromVec(src); + this->Scale(0.5); + vsTanh(dim_, data_, data_); + this->Add(1.0); + this->Scale(0.5); +} +template<> +void VectorBase::Sigmoid(const VectorBase &src) { + KALDI_ASSERT(dim_ == src.dim_); + this->CopyFromVec(src); + this->Scale(0.5); + vdTanh(dim_, data_, data_); + this->Add(1.0); + this->Scale(0.5); +} +#else +template +void VectorBase::Sigmoid(const VectorBase &src) { + KALDI_ASSERT(dim_ == src.dim_); + for (MatrixIndexT i = 0; i < dim_; i++) { + Real x = src.data_[i]; + // We aim to avoid floating-point overflow here. + if (x > 0.0) { + x = 1.0 / (1.0 + Exp(-x)); + } else { + Real ex = Exp(x); + x = ex / (ex + 1.0); + } + data_[i] = x; + } +} +#endif + + +template +void VectorBase::Add(Real c) { + for (MatrixIndexT i = 0; i < dim_; i++) { + data_[i] += c; + } +} + +template +void VectorBase::Scale(Real alpha) { + cblas_Xscal(dim_, alpha, data_, 1); +} + +template +void VectorBase::MulElements(const VectorBase &v) { + KALDI_ASSERT(dim_ == v.dim_); + for (MatrixIndexT i = 0; i < dim_; i++) { + data_[i] *= v.data_[i]; + } +} + +template // Set each element to y = (x == orig ? changed : x). +void VectorBase::ReplaceValue(Real orig, Real changed) { + Real *data = data_; + for (MatrixIndexT i = 0; i < dim_; i++) + if (data[i] == orig) data[i] = changed; +} + + +template +template +void VectorBase::MulElements(const VectorBase &v) { + KALDI_ASSERT(dim_ == v.Dim()); + const OtherReal *other_ptr = v.Data(); + for (MatrixIndexT i = 0; i < dim_; i++) { + data_[i] *= other_ptr[i]; + } +} +// instantiate template. 
+template +void VectorBase::MulElements(const VectorBase &v); +template +void VectorBase::MulElements(const VectorBase &v); + + +template +void VectorBase::AddVecVec(Real alpha, const VectorBase &v, + const VectorBase &r, Real beta) { + KALDI_ASSERT(v.data_ != this->data_ && r.data_ != this->data_); + // We pretend that v is a band-diagonal matrix. + KALDI_ASSERT(dim_ == v.dim_ && dim_ == r.dim_); + cblas_Xgbmv(kNoTrans, dim_, dim_, 0, 0, alpha, v.data_, 1, + r.data_, 1, beta, this->data_, 1); +} + + +template +void VectorBase::DivElements(const VectorBase &v) { + KALDI_ASSERT(dim_ == v.dim_); + for (MatrixIndexT i = 0; i < dim_; i++) { + data_[i] /= v.data_[i]; + } +} + +template +template +void VectorBase::DivElements(const VectorBase &v) { + KALDI_ASSERT(dim_ == v.Dim()); + const OtherReal *other_ptr = v.Data(); + for (MatrixIndexT i = 0; i < dim_; i++) { + data_[i] /= other_ptr[i]; + } +} +// instantiate template. +template +void VectorBase::DivElements(const VectorBase &v); +template +void VectorBase::DivElements(const VectorBase &v); + +template +void VectorBase::AddVecDivVec(Real alpha, const VectorBase &v, + const VectorBase &rr, Real beta) { + KALDI_ASSERT((dim_ == v.dim_ && dim_ == rr.dim_)); + for (MatrixIndexT i = 0; i < dim_; i++) { + data_[i] = alpha * v.data_[i]/rr.data_[i] + beta * data_[i] ; + } +} + +template +template +void VectorBase::AddVec(const Real alpha, const VectorBase &v) { + KALDI_ASSERT(dim_ == v.dim_); + // remove __restrict__ if it causes compilation problems. 
+ Real *__restrict__ data = data_; + OtherReal *__restrict__ other_data = v.data_; + MatrixIndexT dim = dim_; + if (alpha != 1.0) + for (MatrixIndexT i = 0; i < dim; i++) + data[i] += alpha * other_data[i]; + else + for (MatrixIndexT i = 0; i < dim; i++) + data[i] += other_data[i]; +} + +template +void VectorBase::AddVec(const float alpha, const VectorBase &v); +template +void VectorBase::AddVec(const double alpha, const VectorBase &v); + +template +template +void VectorBase::AddVec2(const Real alpha, const VectorBase &v) { + KALDI_ASSERT(dim_ == v.dim_); + // remove __restrict__ if it causes compilation problems. + Real *__restrict__ data = data_; + OtherReal *__restrict__ other_data = v.data_; + MatrixIndexT dim = dim_; + if (alpha != 1.0) + for (MatrixIndexT i = 0; i < dim; i++) + data[i] += alpha * other_data[i] * other_data[i]; + else + for (MatrixIndexT i = 0; i < dim; i++) + data[i] += other_data[i] * other_data[i]; +} + +template +void VectorBase::AddVec2(const float alpha, const VectorBase &v); +template +void VectorBase::AddVec2(const double alpha, const VectorBase &v); + + +template +void VectorBase::Read(std::istream &is, bool binary, bool add) { + if (add) { + Vector tmp(Dim()); + tmp.Read(is, binary, false); // read without adding. + if (this->Dim() != tmp.Dim()) { + KALDI_ERR << "VectorBase::Read, size mismatch " << this->Dim()<<" vs. "<AddVec(1.0, tmp); + return; + } // now assume add == false. + + // In order to avoid rewriting this, we just declare a Vector and + // use it to read the data, then copy. + Vector tmp; + tmp.Read(is, binary, false); + if (tmp.Dim() != Dim()) + KALDI_ERR << "VectorBase::Read, size mismatch " + << Dim() << " vs. " << tmp.Dim(); + CopyFromVec(tmp); +} + + +template +void Vector::Read(std::istream &is, bool binary, bool add) { + if (add) { + Vector tmp(this->Dim()); + tmp.Read(is, binary, false); // read without adding. 
+ if (this->Dim() == 0) this->Resize(tmp.Dim()); + if (this->Dim() != tmp.Dim()) { + KALDI_ERR << "Vector::Read, adding but dimensions mismatch " + << this->Dim() << " vs. " << tmp.Dim(); + } + this->AddVec(1.0, tmp); + return; + } // now assume add == false. + + std::ostringstream specific_error; + MatrixIndexT pos_at_start = is.tellg(); + + if (binary) { + int peekval = Peek(is, binary); + const char *my_token = (sizeof(Real) == 4 ? "FV" : "DV"); + char other_token_start = (sizeof(Real) == 4 ? 'D' : 'F'); + if (peekval == other_token_start) { // need to instantiate the other type to read it. + typedef typename OtherReal::Real OtherType; // if Real == float, OtherType == double, and vice versa. + Vector other(this->Dim()); + other.Read(is, binary, false); // add is false at this point. + if (this->Dim() != other.Dim()) this->Resize(other.Dim()); + this->CopyFromVec(other); + return; + } + std::string token; + ReadToken(is, binary, &token); + if (token != my_token) { + if (token.length() > 20) token = token.substr(0, 17) + "..."; + specific_error << ": Expected token " << my_token << ", got " << token; + goto bad; + } + int32 size; + ReadBasicType(is, binary, &size); // throws on error. + if ((MatrixIndexT)size != this->Dim()) this->Resize(size); + if (size > 0) + is.read(reinterpret_cast(this->data_), sizeof(Real)*size); + if (is.fail()) { + specific_error << "Error reading vector data (binary mode); truncated " + "stream? (size = " << size << ")"; + goto bad; + } + return; + } else { // Text mode reading; format is " [ 1.1 2.0 3.4 ]\n" + std::string s; + is >> s; + // if ((s.compare("DV") == 0) || (s.compare("FV") == 0)) { // Back compatibility. + // is >> s; // get dimension + // is >> s; // get "[" + // } + if (is.fail()) { specific_error << "EOF while trying to read vector."; goto bad; } + if (s.compare("[]") == 0) { Resize(0); return; } // tolerate this variant. 
+ if (s.compare("[")) { + if (s.length() > 20) s = s.substr(0, 17) + "..."; + specific_error << "Expected \"[\" but got " << s; + goto bad; + } + std::vector data; + while (1) { + int i = is.peek(); + if (i == '-' || (i >= '0' && i <= '9')) { // common cases first. + Real r; + is >> r; + if (is.fail()) { specific_error << "Failed to read number."; goto bad; } + if (! std::isspace(is.peek()) && is.peek() != ']') { + specific_error << "Expected whitespace after number."; goto bad; + } + data.push_back(r); + // But don't eat whitespace... we want to check that it's not newlines + // which would be valid only for a matrix. + } else if (i == ' ' || i == '\t') { + is.get(); + } else if (i == ']') { + is.get(); // eat the ']' + this->Resize(data.size()); + for (size_t j = 0; j < data.size(); j++) + this->data_[j] = data[j]; + i = is.peek(); + if (static_cast(i) == '\r') { + is.get(); + is.get(); // get \r\n (must eat what we wrote) + } else if (static_cast(i) == '\n') { is.get(); } // get \n (must eat what we wrote) + if (is.fail()) { + KALDI_WARN << "After end of vector data, read error."; + // we got the data we needed, so just warn for this error. + } + return; // success. + } else if (i == -1) { + specific_error << "EOF while reading vector data."; + goto bad; + } else if (i == '\n' || i == '\r') { + specific_error << "Newline found while reading vector (maybe it's a matrix?)"; + goto bad; + } else { + is >> s; // read string. 
+ if (!KALDI_STRCASECMP(s.c_str(), "inf") || + !KALDI_STRCASECMP(s.c_str(), "infinity")) { + data.push_back(std::numeric_limits::infinity()); + KALDI_WARN << "Reading infinite value into vector."; + } else if (!KALDI_STRCASECMP(s.c_str(), "nan")) { + data.push_back(std::numeric_limits::quiet_NaN()); + KALDI_WARN << "Reading NaN value into vector."; + } else { + if (s.length() > 20) s = s.substr(0, 17) + "..."; + specific_error << "Expecting numeric vector data, got " << s; + goto bad; + } + } + } + } + // we never reach this line (the while loop returns directly). +bad: + KALDI_ERR << "Failed to read vector from stream. " << specific_error.str() + << " File position at start is " + << pos_at_start<<", currently "< +void VectorBase::Write(std::ostream & os, bool binary) const { + if (!os.good()) { + KALDI_ERR << "Failed to write vector to stream: stream not good"; + } + if (binary) { + std::string my_token = (sizeof(Real) == 4 ? "FV" : "DV"); + WriteToken(os, binary, my_token); + + int32 size = Dim(); // make the size 32-bit on disk. + KALDI_ASSERT(Dim() == (MatrixIndexT) size); + WriteBasicType(os, binary, size); + os.write(reinterpret_cast(Data()), sizeof(Real) * size); + } else { + os << " [ "; + for (MatrixIndexT i = 0; i < Dim(); i++) + os << (*this)(i) << " "; + os << "]\n"; + } + if (!os.good()) + KALDI_ERR << "Failed to write vector to stream"; +} + + +template +void VectorBase::AddVec2(const Real alpha, const VectorBase &v) { + KALDI_ASSERT(dim_ == v.dim_); + for (MatrixIndexT i = 0; i < dim_; i++) + data_[i] += alpha * v.data_[i] * v.data_[i]; +} + +// this <-- beta*this + alpha*M*v. 
+template +void VectorBase::AddTpVec(const Real alpha, const TpMatrix &M, + const MatrixTransposeType trans, + const VectorBase &v, + const Real beta) { + KALDI_ASSERT(dim_ == v.dim_ && dim_ == M.NumRows()); + if (beta == 0.0) { + if (&v != this) CopyFromVec(v); + MulTp(M, trans); + if (alpha != 1.0) Scale(alpha); + } else { + Vector tmp(v); + tmp.MulTp(M, trans); + if (beta != 1.0) Scale(beta); // *this <-- beta * *this + AddVec(alpha, tmp); // *this += alpha * M * v + } +} + +template +Real VecMatVec(const VectorBase &v1, const MatrixBase &M, + const VectorBase &v2) { + KALDI_ASSERT(v1.Dim() == M.NumRows() && v2.Dim() == M.NumCols()); + Vector vtmp(M.NumRows()); + vtmp.AddMatVec(1.0, M, kNoTrans, v2, 0.0); + return VecVec(v1, vtmp); +} + +template +float VecMatVec(const VectorBase &v1, const MatrixBase &M, + const VectorBase &v2); +template +double VecMatVec(const VectorBase &v1, const MatrixBase &M, + const VectorBase &v2); + +template +void Vector::Swap(Vector *other) { + std::swap(this->data_, other->data_); + std::swap(this->dim_, other->dim_); +} + + +template +void VectorBase::AddDiagMat2( + Real alpha, const MatrixBase &M, + MatrixTransposeType trans, Real beta) { + if (trans == kNoTrans) { + KALDI_ASSERT(this->dim_ == M.NumRows()); + MatrixIndexT rows = this->dim_, cols = M.NumCols(), + mat_stride = M.Stride(); + Real *data = this->data_; + const Real *mat_data = M.Data(); + for (MatrixIndexT i = 0; i < rows; i++, mat_data += mat_stride, data++) + *data = beta * *data + alpha * cblas_Xdot(cols,mat_data,1,mat_data,1); + } else { + KALDI_ASSERT(this->dim_ == M.NumCols()); + MatrixIndexT rows = M.NumRows(), cols = this->dim_, + mat_stride = M.Stride(); + Real *data = this->data_; + const Real *mat_data = M.Data(); + for (MatrixIndexT i = 0; i < cols; i++, mat_data++, data++) + *data = beta * *data + alpha * cblas_Xdot(rows, mat_data, mat_stride, + mat_data, mat_stride); + } +} + +template +void VectorBase::AddDiagMatMat( + Real alpha, + const MatrixBase &M, 
MatrixTransposeType transM, + const MatrixBase &N, MatrixTransposeType transN, + Real beta) { + MatrixIndexT dim = this->dim_, + M_col_dim = (transM == kTrans ? M.NumRows() : M.NumCols()), + N_row_dim = (transN == kTrans ? N.NumCols() : N.NumRows()); + KALDI_ASSERT(M_col_dim == N_row_dim); // this is the dimension we sum over + MatrixIndexT M_row_stride = M.Stride(), M_col_stride = 1; + if (transM == kTrans) std::swap(M_row_stride, M_col_stride); + MatrixIndexT N_row_stride = N.Stride(), N_col_stride = 1; + if (transN == kTrans) std::swap(N_row_stride, N_col_stride); + + Real *data = this->data_; + const Real *Mdata = M.Data(), *Ndata = N.Data(); + for (MatrixIndexT i = 0; i < dim; i++, Mdata += M_row_stride, Ndata += N_col_stride, data++) { + *data = beta * *data + alpha * cblas_Xdot(M_col_dim, Mdata, M_col_stride, Ndata, N_row_stride); + } +} + + +template class Vector; +template class Vector; +template class VectorBase; +template class VectorBase; + +} // namespace kaldi diff --git a/speechx/speechx/kaldi/matrix/kaldi-vector.h b/speechx/speechx/kaldi/matrix/kaldi-vector.h new file mode 100644 index 00000000..2a032354 --- /dev/null +++ b/speechx/speechx/kaldi/matrix/kaldi-vector.h @@ -0,0 +1,612 @@ +// matrix/kaldi-vector.h + +// Copyright 2009-2012 Ondrej Glembek; Microsoft Corporation; Lukas Burget; +// Saarland University (Author: Arnab Ghoshal); +// Ariya Rastrow; Petr Schwarz; Yanmin Qian; +// Karel Vesely; Go Vivace Inc.; Arnab Ghoshal +// Wei Shi; +// 2015 Guoguo Chen +// 2017 Daniel Galvez +// 2019 Yiwen Shao + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_MATRIX_KALDI_VECTOR_H_ +#define KALDI_MATRIX_KALDI_VECTOR_H_ 1 + +#include "matrix/matrix-common.h" + +namespace kaldi { + +/// \addtogroup matrix_group +/// @{ + +/// Provides a vector abstraction class. +/// This class provides a way to work with vectors in kaldi. +/// It encapsulates basic operations and memory optimizations. +template +class VectorBase { + public: + /// Set vector to all zeros. + void SetZero(); + + /// Returns true if matrix is all zeros. + bool IsZero(Real cutoff = 1.0e-06) const; // replace magic number + + /// Set all members of a vector to a specified value. + void Set(Real f); + + /// Set vector to random normally-distributed noise. + void SetRandn(); + + /// Sets to numbers uniformly distributed on (0,1) + void SetRandUniform(); + + /// This function returns a random index into this vector, + /// chosen with probability proportional to the corresponding + /// element. Requires that this->Min() >= 0 and this->Sum() > 0. + MatrixIndexT RandCategorical() const; + + /// Returns the dimension of the vector. + inline MatrixIndexT Dim() const { return dim_; } + + /// Returns the size in memory of the vector, in bytes. + inline MatrixIndexT SizeInBytes() const { return (dim_*sizeof(Real)); } + + /// Returns a pointer to the start of the vector's data. + inline Real* Data() { return data_; } + + /// Returns a pointer to the start of the vector's data (const). + inline const Real* Data() const { return data_; } + + /// Indexing operator (const). 
+ inline Real operator() (MatrixIndexT i) const { + KALDI_PARANOID_ASSERT(static_cast(i) < + static_cast(dim_)); + return *(data_ + i); + } + + /// Indexing operator (non-const). + inline Real & operator() (MatrixIndexT i) { + KALDI_PARANOID_ASSERT(static_cast(i) < + static_cast(dim_)); + return *(data_ + i); + } + + /** @brief Returns a sub-vector of a vector (a range of elements). + * @param o [in] Origin, 0 < o < Dim() + * @param l [in] Length 0 < l < Dim()-o + * @return A SubVector object that aliases the data of the Vector object. + * See @c SubVector class for details */ + SubVector Range(const MatrixIndexT o, const MatrixIndexT l) { + return SubVector(*this, o, l); + } + + /** @brief Returns a const sub-vector of a vector (a range of elements). + * @param o [in] Origin, 0 < o < Dim() + * @param l [in] Length 0 < l < Dim()-o + * @return A SubVector object that aliases the data of the Vector object. + * See @c SubVector class for details */ + const SubVector Range(const MatrixIndexT o, + const MatrixIndexT l) const { + return SubVector(*this, o, l); + } + + /// Copy data from another vector (must match own size). + void CopyFromVec(const VectorBase &v); + + /// Copy data from a SpMatrix or TpMatrix (must match own size). + template + void CopyFromPacked(const PackedMatrix &M); + + /// Copy data from another vector of different type (double vs. float) + template + void CopyFromVec(const VectorBase &v); + + /// Copy from CuVector. This is defined in ../cudamatrix/cu-vector.h + template + void CopyFromVec(const CuVectorBase &v); + + /// Applies floor to all elements. Returns number of elements + /// floored in floored_count if it is non-null. + void Floor(const VectorBase &v, Real floor_val, MatrixIndexT *floored_count = nullptr); + + /// Applies ceiling to all elements. Returns number of elements + /// changed in ceiled_count if it is non-null. 
+ void Ceiling(const VectorBase &v, Real ceil_val, MatrixIndexT *ceiled_count = nullptr); + + void Pow(const VectorBase &v, Real power); + + /// Apply natural log to all elements. Throw if any element of + /// the vector is negative (but doesn't complain about zero; the + /// log will be -infinity + void ApplyLog(); + + /// Apply natural log to another vector and put result in *this. + void ApplyLogAndCopy(const VectorBase &v); + + /// Apply exponential to each value in vector. + void ApplyExp(); + + /// Take absolute value of each of the elements + void ApplyAbs(); + + /// Applies floor to all elements. Returns number of elements + /// floored in floored_count if it is non-null. + inline void ApplyFloor(Real floor_val, MatrixIndexT *floored_count = nullptr) { + this->Floor(*this, floor_val, floored_count); + }; + + /// Applies ceiling to all elements. Returns number of elements + /// changed in ceiled_count if it is non-null. + inline void ApplyCeiling(Real ceil_val, MatrixIndexT *ceiled_count = nullptr) { + this->Ceiling(*this, ceil_val, ceiled_count); + }; + + /// Applies floor to all elements. Returns number of elements floored. + MatrixIndexT ApplyFloor(const VectorBase &floor_vec); + + /// Apply soft-max to vector and return normalizer (log sum of exponentials). + /// This is the same as: \f$ x(i) = exp(x(i)) / \sum_i exp(x(i)) \f$ + Real ApplySoftMax(); + + /// Applies log soft-max to vector and returns normalizer (log sum of + /// exponentials). + /// This is the same as: \f$ x(i) = x(i) - log(\sum_i exp(x(i))) \f$ + Real ApplyLogSoftMax(); + + /// Sets each element of *this to the tanh of the corresponding element of "src". + void Tanh(const VectorBase &src); + + /// Sets each element of *this to the sigmoid function of the corresponding + /// element of "src". + void Sigmoid(const VectorBase &src); + + /// Take all elements of vector to a power. 
+ inline void ApplyPow(Real power) { + this->Pow(*this, power); + }; + + /// Take the absolute value of all elements of a vector to a power. + /// Include the sign of the input element if include_sign == true. + /// If power is negative and the input value is zero, the output is set zero. + void ApplyPowAbs(Real power, bool include_sign=false); + + /// Compute the p-th norm of the vector. + Real Norm(Real p) const; + + /// Returns true if ((*this)-other).Norm(2.0) <= tol * (*this).Norm(2.0). + bool ApproxEqual(const VectorBase &other, float tol = 0.01) const; + + /// Invert all elements. + void InvertElements(); + + /// Add vector : *this = *this + alpha * rv (with casting between floats and + /// doubles) + template + void AddVec(const Real alpha, const VectorBase &v); + + /// Add vector : *this = *this + alpha * rv^2 [element-wise squaring]. + void AddVec2(const Real alpha, const VectorBase &v); + + /// Add vector : *this = *this + alpha * rv^2 [element-wise squaring], + /// with casting between floats and doubles. + template + void AddVec2(const Real alpha, const VectorBase &v); + + /// Add matrix times vector : this <-- beta*this + alpha*M*v. + /// Calls BLAS GEMV. + void AddMatVec(const Real alpha, const MatrixBase &M, + const MatrixTransposeType trans, const VectorBase &v, + const Real beta); // **beta previously defaulted to 0.0** + + /// This is as AddMatVec, except optimized for where v contains a lot + /// of zeros. + void AddMatSvec(const Real alpha, const MatrixBase &M, + const MatrixTransposeType trans, const VectorBase &v, + const Real beta); // **beta previously defaulted to 0.0** + + + /// Add symmetric positive definite matrix times vector: + /// this <-- beta*this + alpha*M*v. Calls BLAS SPMV. + void AddSpVec(const Real alpha, const SpMatrix &M, + const VectorBase &v, const Real beta); // **beta previously defaulted to 0.0** + + /// Add triangular matrix times vector: this <-- beta*this + alpha*M*v. + /// Works even if rv == *this. 
+ void AddTpVec(const Real alpha, const TpMatrix &M, + const MatrixTransposeType trans, const VectorBase &v, + const Real beta); // **beta previously defaulted to 0.0** + + /// Set each element to y = (x == orig ? changed : x). + void ReplaceValue(Real orig, Real changed); + + /// Multiply element-by-element by another vector. + void MulElements(const VectorBase &v); + /// Multiply element-by-element by another vector of different type. + template + void MulElements(const VectorBase &v); + + /// Divide element-by-element by a vector. + void DivElements(const VectorBase &v); + /// Divide element-by-element by a vector of different type. + template + void DivElements(const VectorBase &v); + + /// Add a constant to each element of a vector. + void Add(Real c); + + /// Add element-by-element product of vectors: + // this <-- alpha * v .* r + beta*this . + void AddVecVec(Real alpha, const VectorBase &v, + const VectorBase &r, Real beta); + + /// Add element-by-element quotient of two vectors. + /// this <---- alpha*v/r + beta*this + void AddVecDivVec(Real alpha, const VectorBase &v, + const VectorBase &r, Real beta); + + /// Multiplies all elements by this constant. + void Scale(Real alpha); + + /// Multiplies this vector by lower-triangular matrix: *this <-- *this *M + void MulTp(const TpMatrix &M, const MatrixTransposeType trans); + + /// If trans == kNoTrans, solves M x = b, where b is the value of *this at input + /// and x is the value of *this at output. + /// If trans == kTrans, solves M' x = b. + /// Does not test for M being singular or near-singular, so test it before + /// calling this routine. 
+ void Solve(const TpMatrix &M, const MatrixTransposeType trans); + + /// Performs a row stack of the matrix M + void CopyRowsFromMat(const MatrixBase &M); + template + void CopyRowsFromMat(const MatrixBase &M); + + /// The following is implemented in ../cudamatrix/cu-matrix.cc + void CopyRowsFromMat(const CuMatrixBase &M); + + /// Performs a column stack of the matrix M + void CopyColsFromMat(const MatrixBase &M); + + /// Extracts a row of the matrix M. Could also do this with + /// this->Copy(M[row]). + void CopyRowFromMat(const MatrixBase &M, MatrixIndexT row); + /// Extracts a row of the matrix M with type conversion. + template + void CopyRowFromMat(const MatrixBase &M, MatrixIndexT row); + + /// Extracts a row of the symmetric matrix S. + template + void CopyRowFromSp(const SpMatrix &S, MatrixIndexT row); + + /// Extracts a column of the matrix M. + template + void CopyColFromMat(const MatrixBase &M , MatrixIndexT col); + + /// Extracts the diagonal of the matrix M. + void CopyDiagFromMat(const MatrixBase &M); + + /// Extracts the diagonal of a packed matrix M; works for Sp or Tp. + void CopyDiagFromPacked(const PackedMatrix &M); + + + /// Extracts the diagonal of a symmetric matrix. + inline void CopyDiagFromSp(const SpMatrix &M) { CopyDiagFromPacked(M); } + + /// Extracts the diagonal of a triangular matrix. + inline void CopyDiagFromTp(const TpMatrix &M) { CopyDiagFromPacked(M); } + + /// Returns the maximum value of any element, or -infinity for the empty vector. + Real Max() const; + + /// Returns the maximum value of any element, and the associated index. + /// Error if vector is empty. + Real Max(MatrixIndexT *index) const; + + /// Returns the minimum value of any element, or +infinity for the empty vector. + Real Min() const; + + /// Returns the minimum value of any element, and the associated index. + /// Error if vector is empty. 
+ Real Min(MatrixIndexT *index) const; + + /// Returns sum of the elements + Real Sum() const; + + /// Returns sum of the logs of the elements. More efficient than + /// just taking log of each. Will return NaN if any elements are + /// negative. + Real SumLog() const; + + /// Does *this = alpha * (sum of rows of M) + beta * *this. + void AddRowSumMat(Real alpha, const MatrixBase &M, Real beta = 1.0); + + /// Does *this = alpha * (sum of columns of M) + beta * *this. + void AddColSumMat(Real alpha, const MatrixBase &M, Real beta = 1.0); + + /// Add the diagonal of a matrix times itself: + /// *this = diag(M M^T) + beta * *this (if trans == kNoTrans), or + /// *this = diag(M^T M) + beta * *this (if trans == kTrans). + void AddDiagMat2(Real alpha, const MatrixBase &M, + MatrixTransposeType trans = kNoTrans, Real beta = 1.0); + + /// Add the diagonal of a matrix product: *this = diag(M N), assuming the + /// "trans" arguments are both kNoTrans; for transpose arguments, it behaves + /// as you would expect. + void AddDiagMatMat(Real alpha, const MatrixBase &M, MatrixTransposeType transM, + const MatrixBase &N, MatrixTransposeType transN, + Real beta = 1.0); + + /// Returns log(sum(exp())) without exp overflow + /// If prune > 0.0, ignores terms less than the max - prune. + /// [Note: in future, if prune = 0.0, it will take the max. + /// For now, use -1 if you don't want it to prune.] + Real LogSumExp(Real prune = -1.0) const; + + /// Reads from C++ stream (option to add to existing contents). + /// Throws exception on failure + void Read(std::istream &in, bool binary, bool add = false); + + /// Writes to C++ stream (option to write in binary). + void Write(std::ostream &Out, bool binary) const; + + friend class VectorBase; + friend class VectorBase; + friend class CuVectorBase; + friend class CuVector; + protected: + /// Destructor; does not deallocate memory, this is handled by child classes. 
+ /// This destructor is protected so this object can only be + /// deleted via a child. + ~VectorBase() {} + + /// Empty initializer, corresponds to vector of zero size. + explicit VectorBase(): data_(NULL), dim_(0) { + KALDI_ASSERT_IS_FLOATING_TYPE(Real); + } + +// Took this out since it is not currently used, and it is possible to create +// objects where the allocated memory is not the same size as dim_ : Arnab +// /// Initializer from a pointer and a size; keeps the pointer internally +// /// (ownership or non-ownership depends on the child class). +// explicit VectorBase(Real* data, MatrixIndexT dim) +// : data_(data), dim_(dim) {} + + // Arnab : made this protected since it is unsafe too. + /// Load data into the vector: sz must match own size. + void CopyFromPtr(const Real* Data, MatrixIndexT sz); + + /// data memory area + Real* data_; + /// dimension of vector + MatrixIndexT dim_; + KALDI_DISALLOW_COPY_AND_ASSIGN(VectorBase); +}; // class VectorBase + +/** @brief A class representing a vector. + * + * This class provides a way to work with vectors in kaldi. + * It encapsulates basic operations and memory optimizations. */ +template +class Vector: public VectorBase { + public: + /// Constructor that takes no arguments. Initializes to empty. + Vector(): VectorBase() {} + + /// Constructor with specific size. Sets to all-zero by default + /// if set_zero == false, memory contents are undefined. + explicit Vector(const MatrixIndexT s, + MatrixResizeType resize_type = kSetZero) + : VectorBase() { Resize(s, resize_type); } + + /// Copy constructor from CUDA vector + /// This is defined in ../cudamatrix/cu-vector.h + template + explicit Vector(const CuVectorBase &cu); + + /// Copy constructor. The need for this is controversial. + Vector(const Vector &v) : VectorBase() { // (cannot be explicit) + Resize(v.Dim(), kUndefined); + this->CopyFromVec(v); + } + + /// Copy-constructor from base-class, needed to copy from SubVector. 
+ explicit Vector(const VectorBase &v) : VectorBase() { + Resize(v.Dim(), kUndefined); + this->CopyFromVec(v); + } + + /// Type conversion constructor. + template + explicit Vector(const VectorBase &v): VectorBase() { + Resize(v.Dim(), kUndefined); + this->CopyFromVec(v); + } + +// Took this out since it is unsafe : Arnab +// /// Constructor from a pointer and a size; copies the data to a location +// /// it owns. +// Vector(const Real* Data, const MatrixIndexT s): VectorBase() { +// Resize(s); + // CopyFromPtr(Data, s); +// } + + + /// Swaps the contents of *this and *other. Shallow swap. + void Swap(Vector *other); + + /// Destructor. Deallocates memory. + ~Vector() { Destroy(); } + + /// Read function using C++ streams. Can also add to existing contents + /// of matrix. + void Read(std::istream &in, bool binary, bool add = false); + + /// Set vector to a specified size (can be zero). + /// The value of the new data depends on resize_type: + /// -if kSetZero, the new data will be zero + /// -if kUndefined, the new data will be undefined + /// -if kCopyData, the new data will be the same as the old data in any + /// shared positions, and zero elsewhere. + /// This function takes time proportional to the number of data elements. + void Resize(MatrixIndexT length, MatrixResizeType resize_type = kSetZero); + + /// Remove one element and shifts later elements down. + void RemoveElement(MatrixIndexT i); + + /// Assignment operator. + Vector &operator = (const Vector &other) { + Resize(other.Dim(), kUndefined); + this->CopyFromVec(other); + return *this; + } + + /// Assignment operator that takes VectorBase. + Vector &operator = (const VectorBase &other) { + Resize(other.Dim(), kUndefined); + this->CopyFromVec(other); + return *this; + } + private: + /// Init assumes the current contents of the class are invalid (i.e. junk or + /// has already been freed), and it sets the vector to newly allocated memory + /// with the specified dimension. dim == 0 is acceptable. 
The memory contents + /// pointed to by data_ will be undefined. + void Init(const MatrixIndexT dim); + + /// Destroy function, called internally. + void Destroy(); + +}; + + +/// Represents a non-allocating general vector which can be defined +/// as a sub-vector of higher-level vector [or as the row of a matrix]. +template +class SubVector : public VectorBase { + public: + /// Constructor from a Vector or SubVector. + /// SubVectors are not const-safe and it's very hard to make them + /// so for now we just give up. This function contains const_cast. + SubVector(const VectorBase &t, const MatrixIndexT origin, + const MatrixIndexT length) : VectorBase() { + // following assert equiv to origin>=0 && length>=0 && + // origin+length <= rt.dim_ + KALDI_ASSERT(static_cast(origin)+ + static_cast(length) <= + static_cast(t.Dim())); + VectorBase::data_ = const_cast (t.Data()+origin); + VectorBase::dim_ = length; + } + + /// This constructor initializes the vector to point at the contents + /// of this packed matrix (SpMatrix or TpMatrix). + SubVector(const PackedMatrix &M) { + VectorBase::data_ = const_cast (M.Data()); + VectorBase::dim_ = (M.NumRows()*(M.NumRows()+1))/2; + } + + /// Copy constructor + SubVector(const SubVector &other) : VectorBase () { + // this copy constructor needed for Range() to work in base class. + VectorBase::data_ = other.data_; + VectorBase::dim_ = other.dim_; + } + + /// Constructor from a pointer to memory and a length. Keeps a pointer + /// to the data but does not take ownership (will never delete). + /// Caution: this constructor enables you to evade const constraints. + SubVector(const Real *data, MatrixIndexT length) : VectorBase () { + VectorBase::data_ = const_cast(data); + VectorBase::dim_ = length; + } + + /// This operation does not preserve const-ness, so be careful. 
+ SubVector(const MatrixBase &matrix, MatrixIndexT row) { + VectorBase::data_ = const_cast(matrix.RowData(row)); + VectorBase::dim_ = matrix.NumCols(); + } + + ~SubVector() {} ///< Destructor (does nothing; no pointers are owned here). + + private: + /// Disallow assignment operator. + SubVector & operator = (const SubVector &other) {} +}; + +/// @} end of "addtogroup matrix_group" +/// \addtogroup matrix_funcs_io +/// @{ +/// Output to a C++ stream. Non-binary by default (use Write for +/// binary output). +template +std::ostream & operator << (std::ostream & out, const VectorBase & v); + +/// Input from a C++ stream. Will automatically read text or +/// binary data from the stream. +template +std::istream & operator >> (std::istream & in, VectorBase & v); + +/// Input from a C++ stream. Will automatically read text or +/// binary data from the stream. +template +std::istream & operator >> (std::istream & in, Vector & v); +/// @} end of \addtogroup matrix_funcs_io + +/// \addtogroup matrix_funcs_scalar +/// @{ + + +template +bool ApproxEqual(const VectorBase &a, + const VectorBase &b, Real tol = 0.01) { + return a.ApproxEqual(b, tol); +} + +template +inline void AssertEqual(VectorBase &a, VectorBase &b, + float tol = 0.01) { + KALDI_ASSERT(a.ApproxEqual(b, tol)); +} + + +/// Returns dot product between v1 and v2. +template +Real VecVec(const VectorBase &v1, const VectorBase &v2); + +template +Real VecVec(const VectorBase &v1, const VectorBase &v2); + + +/// Returns \f$ v_1^T M v_2 \f$ . +/// Not as efficient as it could be where v1 == v2. 
+template +Real VecMatVec(const VectorBase &v1, const MatrixBase &M, + const VectorBase &v2); + +/// @} End of "addtogroup matrix_funcs_scalar" + + +} // namespace kaldi + +// we need to include the implementation +#include "matrix/kaldi-vector-inl.h" + + + +#endif // KALDI_MATRIX_KALDI_VECTOR_H_ diff --git a/speechx/speechx/kaldi/matrix/matrix-common.h b/speechx/speechx/kaldi/matrix/matrix-common.h new file mode 100644 index 00000000..f7047d71 --- /dev/null +++ b/speechx/speechx/kaldi/matrix/matrix-common.h @@ -0,0 +1,111 @@ +// matrix/matrix-common.h + +// Copyright 2009-2011 Microsoft Corporation + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. +#ifndef KALDI_MATRIX_MATRIX_COMMON_H_ +#define KALDI_MATRIX_MATRIX_COMMON_H_ + +// This file contains some #includes, forward declarations +// and typedefs that are needed by all the main header +// files in this directory. 
+ +#include "base/kaldi-common.h" + +namespace kaldi { +// this enums equal to CblasTrans and CblasNoTrans constants from CBLAS library +// we are writing them as literals because we don't want to include here matrix/kaldi-blas.h, +// which puts many symbols into global scope (like "real") via the header f2c.h +typedef enum { + kTrans = 112, // = CblasTrans + kNoTrans = 111 // = CblasNoTrans +} MatrixTransposeType; + +typedef enum { + kSetZero, + kUndefined, + kCopyData +} MatrixResizeType; + + +typedef enum { + kDefaultStride, + kStrideEqualNumCols, +} MatrixStrideType; + +typedef enum { + kTakeLower, + kTakeUpper, + kTakeMean, + kTakeMeanAndCheck +} SpCopyType; + +template class VectorBase; +template class Vector; +template class SubVector; +template class MatrixBase; +template class SubMatrix; +template class Matrix; +template class SpMatrix; +template class TpMatrix; +template class PackedMatrix; +template class SparseMatrix; + +// these are classes that won't be defined in this +// directory; they're mostly needed for friend declarations. +template class CuMatrixBase; +template class CuSubMatrix; +template class CuMatrix; +template class CuVectorBase; +template class CuSubVector; +template class CuVector; +template class CuPackedMatrix; +template class CuSpMatrix; +template class CuTpMatrix; +template class CuSparseMatrix; + +class CompressedMatrix; +class GeneralMatrix; + +/// This class provides a way for switching between double and float types. +template class OtherReal { }; // useful in reading+writing routines + // to switch double and float. +/// A specialized class for switching from float to double. +template<> class OtherReal { + public: + typedef double Real; +}; +/// A specialized class for switching from double to float. 
+template<> class OtherReal { + public: + typedef float Real; +}; + + +typedef int32 MatrixIndexT; +typedef int32 SignedMatrixIndexT; +typedef uint32 UnsignedMatrixIndexT; + +// If you want to use size_t for the index type, do as follows instead: +//typedef size_t MatrixIndexT; +//typedef ssize_t SignedMatrixIndexT; +//typedef size_t UnsignedMatrixIndexT; + +} + + + +#endif // KALDI_MATRIX_MATRIX_COMMON_H_ diff --git a/speechx/speechx/kaldi/matrix/matrix-functions-inl.h b/speechx/speechx/kaldi/matrix/matrix-functions-inl.h new file mode 100644 index 00000000..9fac851e --- /dev/null +++ b/speechx/speechx/kaldi/matrix/matrix-functions-inl.h @@ -0,0 +1,56 @@ +// matrix/matrix-functions-inl.h + +// Copyright 2009-2011 Microsoft Corporation +// +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. +// +// (*) incorporates, with permission, FFT code from his book +// "Signal Processing with Lapped Transforms", Artech, 1992. + + + +#ifndef KALDI_MATRIX_MATRIX_FUNCTIONS_INL_H_ +#define KALDI_MATRIX_MATRIX_FUNCTIONS_INL_H_ + +namespace kaldi { + +//! ComplexMul implements, inline, the complex multiplication b *= a. 
+template inline void ComplexMul(const Real &a_re, const Real &a_im, + Real *b_re, Real *b_im) { + Real tmp_re = (*b_re * a_re) - (*b_im * a_im); + *b_im = *b_re * a_im + *b_im * a_re; + *b_re = tmp_re; +} + +template inline void ComplexAddProduct(const Real &a_re, const Real &a_im, + const Real &b_re, const Real &b_im, + Real *c_re, Real *c_im) { + *c_re += b_re*a_re - b_im*a_im; + *c_im += b_re*a_im + b_im*a_re; +} + + +template inline void ComplexImExp(Real x, Real *a_re, Real *a_im) { + *a_re = std::cos(x); + *a_im = std::sin(x); +} + + +} // end namespace kaldi + + +#endif // KALDI_MATRIX_MATRIX_FUNCTIONS_INL_H_ + diff --git a/speechx/speechx/kaldi/matrix/matrix-functions.cc b/speechx/speechx/kaldi/matrix/matrix-functions.cc new file mode 100644 index 00000000..496c09f5 --- /dev/null +++ b/speechx/speechx/kaldi/matrix/matrix-functions.cc @@ -0,0 +1,773 @@ +// matrix/matrix-functions.cc + +// Copyright 2009-2011 Microsoft Corporation; Go Vivace Inc.; Jan Silovsky +// Yanmin Qian; Saarland University; Johns Hopkins University (Author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. +// +// (*) incorporates, with permission, FFT code from his book +// "Signal Processing with Lapped Transforms", Artech, 1992. 
+ +#include "matrix/matrix-functions.h" +#include "matrix/sp-matrix.h" + +namespace kaldi { + +template void ComplexFt (const VectorBase &in, + VectorBase *out, bool forward) { + int exp_sign = (forward ? -1 : 1); + KALDI_ASSERT(out != NULL); + KALDI_ASSERT(in.Dim() == out->Dim()); + KALDI_ASSERT(in.Dim() % 2 == 0); + int twoN = in.Dim(), N = twoN / 2; + const Real *data_in = in.Data(); + Real *data_out = out->Data(); + + Real exp1N_re, exp1N_im; // forward -> exp(-2pi / N), backward -> exp(2pi / N). + Real fraction = exp_sign * M_2PI / static_cast(N); // forward -> -2pi/N, backward->-2pi/N + ComplexImExp(fraction, &exp1N_re, &exp1N_im); + + Real expm_re = 1.0, expm_im = 0.0; // forward -> exp(-2pi m / N). + + for (int two_m = 0; two_m < twoN; two_m+=2) { // For each output component. + Real expmn_re = 1.0, expmn_im = 0.0; // forward -> exp(-2pi m n / N). + Real sum_re = 0.0, sum_im = 0.0; // complex output for index m (the sum expression) + for (int two_n = 0; two_n < twoN; two_n+=2) { + ComplexAddProduct(data_in[two_n], data_in[two_n+1], + expmn_re, expmn_im, + &sum_re, &sum_im); + ComplexMul(expm_re, expm_im, &expmn_re, &expmn_im); + } + data_out[two_m] = sum_re; + data_out[two_m + 1] = sum_im; + + + if (two_m % 10 == 0) { // occasionally renew "expm" from scratch to avoid + // loss of precision. + int nextm = 1 + two_m/2; + Real fraction_mult = fraction * nextm; + ComplexImExp(fraction_mult, &expm_re, &expm_im); + } else { + ComplexMul(exp1N_re, exp1N_im, &expm_re, &expm_im); + } + } +} + +template +void ComplexFt (const VectorBase &in, + VectorBase *out, bool forward); +template +void ComplexFt (const VectorBase &in, + VectorBase *out, bool forward); + + +#define KALDI_COMPLEXFFT_BLOCKSIZE 8192 +// This #define affects how we recurse in ComplexFftRecursive. +// We assume that memory-caching happens on a scale at +// least as small as this. + + +//! ComplexFftRecursive is a recursive function that computes the +//! complex FFT of size N. 
The "nffts" arguments specifies how many +//! separate FFTs to compute in parallel (we assume the data for +//! each one is consecutive in memory). The "forward argument" +//! specifies whether to do the FFT (true) or IFFT (false), although +//! note that we do not include the factor of 1/N (the user should +//! do this if required. The iterators factor_begin and factor_end +//! point to the beginning and end (i.e. one past the last element) +//! of an array of small factors of N (typically prime factors). +//! See the comments below this code for the detailed equations +//! of the recursion. + + +template +void ComplexFftRecursive (Real *data, int nffts, int N, + const int *factor_begin, + const int *factor_end, bool forward, + Vector *tmp_vec) { + if (factor_begin == factor_end) { + KALDI_ASSERT(N == 1); + return; + } + + { // an optimization: compute in smaller blocks. + // this block of code could be removed and it would still work. + MatrixIndexT size_perblock = N * 2 * sizeof(Real); + if (nffts > 1 && size_perblock*nffts > KALDI_COMPLEXFFT_BLOCKSIZE) { // can break it up... + // Break up into multiple blocks. This is an optimization. We make + // no progress on the FFT when we do this. + int block_skip = KALDI_COMPLEXFFT_BLOCKSIZE / size_perblock; // n blocks per call + if (block_skip == 0) block_skip = 1; + if (block_skip < nffts) { + int blocks_left = nffts; + while (blocks_left > 0) { + int skip_now = std::min(blocks_left, block_skip); + ComplexFftRecursive(data, skip_now, N, factor_begin, factor_end, forward, tmp_vec); + blocks_left -= skip_now; + data += skip_now * N*2; + } + return; + } // else do the actual algorithm. + } // else do the actual algorithm. + } + + int P = *factor_begin; + KALDI_ASSERT(P > 1); + int Q = N / P; + + + if (P > 1 && Q > 1) { // Do the rearrangement. C.f. eq. (8) below. Transform + // (a) to (b). 
+ Real *data_thisblock = data; + if (tmp_vec->Dim() < (MatrixIndexT)N) tmp_vec->Resize(N); + Real *data_tmp = tmp_vec->Data(); + for (int thisfft = 0; thisfft < nffts; thisfft++, data_thisblock+=N*2) { + for (int offset = 0; offset < 2; offset++) { // 0 == real, 1 == im. + for (int p = 0; p < P; p++) { + for (int q = 0; q < Q; q++) { + int aidx = q*P + p, bidx = p*Q + q; + data_tmp[bidx] = data_thisblock[2*aidx+offset]; + } + } + for (int n = 0;n < P*Q;n++) data_thisblock[2*n+offset] = data_tmp[n]; + } + } + } + + { // Recurse. + ComplexFftRecursive(data, nffts*P, Q, factor_begin+1, factor_end, forward, tmp_vec); + } + + int exp_sign = (forward ? -1 : 1); + Real rootN_re, rootN_im; // Nth root of unity. + ComplexImExp(static_cast(exp_sign * M_2PI / N), &rootN_re, &rootN_im); + + Real rootP_re, rootP_im; // Pth root of unity. + ComplexImExp(static_cast(exp_sign * M_2PI / P), &rootP_re, &rootP_im); + + { // Do the multiplication + // could avoid a bunch of complex multiplies by moving the loop over data_thisblock + // inside. + if (tmp_vec->Dim() < (MatrixIndexT)(P*2)) tmp_vec->Resize(P*2); + Real *temp_a = tmp_vec->Data(); + + Real *data_thisblock = data, *data_end = data+(N*2*nffts); + for (; data_thisblock != data_end; data_thisblock += N*2) { // for each separate fft. + Real qd_re = 1.0, qd_im = 0.0; // 1^(q'/N) + for (int qd = 0; qd < Q; qd++) { + Real pdQ_qd_re = qd_re, pdQ_qd_im = qd_im; // 1^((p'Q+q') / N) == 1^((p'/P) + (q'/N)) + // Initialize to q'/N, corresponding to p' == 0. + for (int pd = 0; pd < P; pd++) { // pd == p' + { // This is the p = 0 case of the loop below [an optimization]. + temp_a[pd*2] = data_thisblock[qd*2]; + temp_a[pd*2 + 1] = data_thisblock[qd*2 + 1]; + } + { // This is the p = 1 case of the loop below [an optimization] + // **** MOST OF THE TIME (>60% I think) gets spent here. 
*** + ComplexAddProduct(pdQ_qd_re, pdQ_qd_im, + data_thisblock[(qd+Q)*2], data_thisblock[(qd+Q)*2 + 1], + &(temp_a[pd*2]), &(temp_a[pd*2 + 1])); + } + if (P > 2) { + Real p_pdQ_qd_re = pdQ_qd_re, p_pdQ_qd_im = pdQ_qd_im; // 1^(p(p'Q+q')/N) + for (int p = 2; p < P; p++) { + ComplexMul(pdQ_qd_re, pdQ_qd_im, &p_pdQ_qd_re, &p_pdQ_qd_im); // p_pdQ_qd *= pdQ_qd. + int data_idx = p*Q + qd; + ComplexAddProduct(p_pdQ_qd_re, p_pdQ_qd_im, + data_thisblock[data_idx*2], data_thisblock[data_idx*2 + 1], + &(temp_a[pd*2]), &(temp_a[pd*2 + 1])); + } + } + if (pd != P-1) + ComplexMul(rootP_re, rootP_im, &pdQ_qd_re, &pdQ_qd_im); // pdQ_qd *= (rootP == 1^{1/P}) + // (using 1/P == Q/N) + } + for (int pd = 0; pd < P; pd++) { + data_thisblock[(pd*Q + qd)*2] = temp_a[pd*2]; + data_thisblock[(pd*Q + qd)*2 + 1] = temp_a[pd*2 + 1]; + } + ComplexMul(rootN_re, rootN_im, &qd_re, &qd_im); // qd *= rootN. + } + } + } +} + +/* Equations for ComplexFftRecursive. + We consider here one of the "nffts" separate ffts; it's just a question of + doing them all in parallel. We also write all equations in terms of + complex math (the conversion to real arithmetic is not hard, and anyway + takes place inside function calls). + + + Let the input (i.e. "data" at start) be a_n, n = 0..N-1, and + the output (Fourier transform) be d_k, k = 0..N-1. We use these letters because + there will be two intermediate variables b and c. + We want to compute: + + d_k = \sum_n a_n 1^(kn/N) (1) + + where we use 1^x as shorthand for exp(-2pi x) for the forward algorithm + and exp(2pi x) for the backward one. + + We factorize N = P Q (P small, Q usually large). + With p = 0..P-1 and q = 0..Q-1, and also p'=0..P-1 and q'=0..P-1, we let: + + k == p'Q + q' (2) + n == qP + p (3) + + That is, we let p, q, p', q' range over these indices and observe that this way we + can cover all n, k. 
Expanding (1) using (2) and (3), we can write: + + d_k = \sum_{p, q} a_n 1^((p'Q+q')(qP+p)/N) + = \sum_{p, q} a_n 1^(p'pQ/N) 1^(q'qP/N) 1^(q'p/N) (4) + + using 1^(PQ/N) = 1 to get rid of the terms with PQ in them. Rearranging (4), + + d_k = \sum_p 1^(p'pQ/N) 1^(q'p/N) \sum_q 1^(q'qP/N) a_n (5) + + The point here is to separate the index q. Now we can expand out the remaining + instances of k and n using (2) and (3): + + d_(p'Q+q') = \sum_p 1^(p'pQ/N) 1^(q'p/N) \sum_q 1^(q'qP/N) a_(qP+p) (6) + + The expression \sum_q varies with the indices p and q'. Let us define + + C_{p, q'} = \sum_q 1^(q'qP/N) a_(qP+p) (7) + + Here, C_{p, q'}, viewed as a sequence in q', is just the DFT of the points + a_(qP+p) for q = 0..Q-1. These points are not consecutive in memory though, + they jump by P each time. Let us define b as a rearranged version of a, + so that + + b_(pQ+q) = a_(qP+p) (8) + + This rearrangement is not done in place; the code uses a temporary array. + + We can rearrange (7) to be written in terms of the b's, using (8), so that + + C_{p, q'} = \sum_q 1^(q'q (P/N)) b_(pQ+q) (9) + + Here, the sequence of C_{p, q'} over q'=0..Q-1, is just the DFT of the sequence + of b_(pQ) .. b_((p+1)Q-1). Let's arrange the C_{p, q'} in a single array in + memory in the same way as the b's, i.e. we define + c_(pQ+q') == C_{p, q'}. (10) + Note that we could have written (10) with q in place of q', as there is only + one index of type q present, but q' is just a more natural variable name to use + since we use q' elsewhere to subscript c and C. + + Rewriting (9), we have: + c_(pQ+q') = \sum_q 1^(q'q (P/N)) b_(pQ+q) (11) + which is the DFT computed by the recursive call to this function [after computing + the b's by rearranging the a's]. From the c's we want to compute the d's. + Taking (6), substituting in the sum (7), and using (10) to write it as an array, + we have: + d_(p'Q+q') = \sum_p 1^(p'pQ/N) 1^(q'p/N) c_(pQ+q') (12) + This sum is independent for different values of q'. Note that d overwrites c + in memory.
We compute this in a direct way, using a little array of size P to + store the computed d values for one value of q' (we reuse the array for each value + of q'). + + So the overall picture is this: + We get a call to compute DFT on size N. + + - If N == 1 we return (nothing to do). + - We factor N = P Q (typically, P is small). + - Using (8), we rearrange the data in memory so that we have b not a in memory + (this is the block "do the rearrangement"). + The pseudocode for this is as follows. For simplicity we use a temporary array. + + for p = 0..P-1 + for q = 0..Q-1 + bidx = pQ + q + aidx = qP + p + tmp[bidx] = data[aidx]. + end + end + data <-- tmp + else + + endif + + + The reason this accomplishes (8) is that we want pQ+q and qP+p to be swapped + over for each p, q, and the "if m > n" is a convenient way of ensuring that + this swapping happens only once (otherwise it would happen twice, since pQ+q + and qP+p both range over the entire set of numbers 0..N-1). + + - We do the DFT on the smaller block size to compute c from b (this eq eq. (11)). + Note that this is actually multiple DFTs, one for each value of p, but this + goes to the "nffts" argument of the function call, which we have ignored up to now. + + -We compute eq. (12) via a loop, as follows + allocate temporary array e of size P. + For q' = 0..Q-1: + for p' = 0..P-1: + set sum to zero [this will go in e[p']] + for p = p..P-1: + sum += 1^(p'pQ/N) 1^(q'p/N) c_(pQ+q') + end + e[p'] = sum + end + for p' = 0..P-1: + d_(p'Q+q') = e[p'] + end + end + delete temporary array e + +*/ + +// This is the outer-layer calling code for ComplexFftRecursive. +// It factorizes the dimension and then calls the FFT routine. +template void ComplexFft(VectorBase *v, bool forward, Vector *tmp_in) { + KALDI_ASSERT(v != NULL); + + if (v->Dim()<=1) return; + KALDI_ASSERT(v->Dim() % 2 == 0); // complex input. 
+ int N = v->Dim() / 2; + std::vector factors; + Factorize(N, &factors); + int *factor_beg = NULL; + if (factors.size() > 0) + factor_beg = &(factors[0]); + Vector tmp; // allocated in ComplexFftRecursive. + ComplexFftRecursive(v->Data(), 1, N, factor_beg, factor_beg+factors.size(), forward, (tmp_in?tmp_in:&tmp)); +} + +//! Inefficient version of Fourier transform, for testing purposes. +template void RealFftInefficient (VectorBase *v, bool forward) { + KALDI_ASSERT(v != NULL); + MatrixIndexT N = v->Dim(); + KALDI_ASSERT(N%2 == 0); + if (N == 0) return; + Vector vtmp(N*2); // store as complex. + if (forward) { + for (MatrixIndexT i = 0; i < N; i++) vtmp(i*2) = (*v)(i); + ComplexFft(&vtmp, forward); // this is already tested so we can use this. + v->CopyFromVec( vtmp.Range(0, N) ); + (*v)(1) = vtmp(N); // Copy the N/2'th fourier component, which is real, + // to the imaginary part of the 1st complex output. + } else { + // reverse the transformation above to get the complex spectrum. + vtmp(0) = (*v)(0); // copy F_0 which is real + vtmp(N) = (*v)(1); // copy F_{N/2} which is real + for (MatrixIndexT i = 1; i < N/2; i++) { + // Copy i'th to i'th fourier component + vtmp(2*i) = (*v)(2*i); + vtmp(2*i+1) = (*v)(2*i+1); + // Copy i'th to N-i'th, conjugated. + vtmp(2*(N-i)) = (*v)(2*i); + vtmp(2*(N-i)+1) = -(*v)(2*i+1); + } + ComplexFft(&vtmp, forward); // actually backward since forward == false + // Copy back real part. Complex part should be zero. + for (MatrixIndexT i = 0; i < N; i++) + (*v)(i) = vtmp(i*2); + } +} + +template void RealFftInefficient (VectorBase *v, bool forward); +template void RealFftInefficient (VectorBase *v, bool forward); + +template +void ComplexFft(VectorBase *v, bool forward, Vector *tmp_in); +template +void ComplexFft(VectorBase *v, bool forward, Vector *tmp_in); + + +// See the long comment below for the math behind this. 
+template void RealFft (VectorBase *v, bool forward) { + KALDI_ASSERT(v != NULL); + MatrixIndexT N = v->Dim(), N2 = N/2; + KALDI_ASSERT(N%2 == 0); + if (N == 0) return; + + if (forward) ComplexFft(v, true); + + Real *data = v->Data(); + Real rootN_re, rootN_im; // exp(-2pi/N), forward; exp(2pi/N), backward + int forward_sign = forward ? -1 : 1; + ComplexImExp(static_cast(M_2PI/N *forward_sign), &rootN_re, &rootN_im); + Real kN_re = -forward_sign, kN_im = 0.0; // exp(-2pik/N), forward; exp(-2pik/N), backward + // kN starts out as 1.0 for forward algorithm but -1.0 for backward. + for (MatrixIndexT k = 1; 2*k <= N2; k++) { + ComplexMul(rootN_re, rootN_im, &kN_re, &kN_im); + + Real Ck_re, Ck_im, Dk_re, Dk_im; + // C_k = 1/2 (B_k + B_{N/2 - k}^*) : + Ck_re = 0.5 * (data[2*k] + data[N - 2*k]); + Ck_im = 0.5 * (data[2*k + 1] - data[N - 2*k + 1]); + // re(D_k)= 1/2 (im(B_k) + im(B_{N/2-k})): + Dk_re = 0.5 * (data[2*k + 1] + data[N - 2*k + 1]); + // im(D_k) = -1/2 (re(B_k) - re(B_{N/2-k})) + Dk_im =-0.5 * (data[2*k] - data[N - 2*k]); + // A_k = C_k + 1^(k/N) D_k: + data[2*k] = Ck_re; // A_k <-- C_k + data[2*k+1] = Ck_im; + // now A_k += D_k 1^(k/N) + ComplexAddProduct(Dk_re, Dk_im, kN_re, kN_im, &(data[2*k]), &(data[2*k+1])); + + MatrixIndexT kdash = N2 - k; + if (kdash != k) { + // Next we handle the index k' = N/2 - k. This is necessary + // to do now, to avoid invalidating data that we will later need. + // The quantities C_{k'} and D_{k'} are just the conjugates of C_k + // and D_k, so the equations are simple modifications of the above, + // replacing Ck_im and Dk_im with their negatives. + data[2*kdash] = Ck_re; // A_k' <-- C_k' + data[2*kdash+1] = -Ck_im; + // now A_k' += D_k' 1^(k'/N) + // We use 1^(k'/N) = 1^((N/2 - k) / N) = 1^(1/2) 1^(-k/N) = -1 * (1^(k/N))^* + // so it's the same as 1^(k/N) but with the real part negated. + ComplexAddProduct(Dk_re, -Dk_im, -kN_re, kN_im, &(data[2*kdash]), &(data[2*kdash+1])); + } + } + + { // Now handle k = 0. 
+ // In simple terms: after the complex fft, data[0] becomes the sum of real + // parts input[0], input[2]... and data[1] becomes the sum of imaginary + // pats input[1], input[3]... + // "zeroth" [A_0] is just the sum of input[0]+input[1]+input[2].. + // and "n2th" [A_{N/2}] is input[0]-input[1]+input[2]... . + Real zeroth = data[0] + data[1], + n2th = data[0] - data[1]; + data[0] = zeroth; + data[1] = n2th; + if (!forward) { + data[0] /= 2; + data[1] /= 2; + } + } + + if (!forward) { + ComplexFft(v, false); + v->Scale(2.0); // This is so we get a factor of N increase, rather than N/2 which we would + // otherwise get from [ComplexFft, forward] + [ComplexFft, backward] in dimension N/2. + // It's for consistency with our normal FFT convensions. + } +} + +template void RealFft (VectorBase *v, bool forward); +template void RealFft (VectorBase *v, bool forward); + +/* Notes for real FFTs. + We are using the same convention as above, 1^x to mean exp(-2\pi x) for the forward transform. + Actually, in a slight abuse of notation, we use this meaning for 1^x in both the forward and + backward cases because it's more convenient in this section. + + Suppose we have real data a[0...N-1], with N even, and want to compute its Fourier transform. + We can make do with the first N/2 points of the transform, since the remaining ones are complex + conjugates of the first. We want to compute: + for k = 0...N/2-1, + A_k = \sum_{n = 0}^{N-1} a_n 1^(kn/N) (1) + + We treat a[0..N-1] as a complex sequence of length N/2, i.e. a sequence b[0..N/2 - 1]. + Viewed as sequences of length N/2, we have: + b = c + i d, + where c = a_0, a_2 ... and d = a_1, a_3 ... + + We can recover the length-N/2 Fourier transforms of c and d by doing FT on b and + then doing the equations below. Derivation is marked by (*) in a comment below (search + for it). Let B, C, D be the FTs. 
+ We have + C_k = 1/2 (B_k + B_{N/2 - k}^*) (z0) + D_k =-1/2i (B_k - B_{N/2 - k}^*) (z1) +so: re(D_k)= 1/2 (im(B_k) + im(B_{N/2-k})) (z2) + im(D_k) = -1/2 (re(B_k) - re(B_{N/2-k})) (z3) + + To recover the FT A from C and D, we write, rearranging (1): + + A_k = \sum_{n = 0, 2, ..., N-2} a_n 1^(kn/N) + +\sum_{n = 1, 3, ..., N-1} a_n 1^(kn/N) + = \sum_{n = 0, 1, ..., N/2-1} a_{2n} 1^(2kn/N) + a_{2n+1} 1^(2kn/N) 1^(k/N) + = \sum_{n = 0, 1, ..., N/2-1} c_n 1^(2kn/N) + d_n 1^(2kn/N) 1^(k/N) + A_k = C_k + 1^(k/N) D_k (a0) + + This equation is valid for k = 0...N/2-1, which is the range of the sequences B_k and + C_k. We don't use it for k = 0, which is a special case considered below. For + 1 <= k < N/2, it's convenient to consider the pair k, k', where k' = N/2 - k. + Remember that C_k' = C_k^* and D_k' = D_k^* [where * is conjugation]. Also, + 1^(N/2 / N) = -1. So we have: + A_k' = C_k^* - 1^(k/N) D_k^* (a0b) + We do (a0) and (a0b) together. + + + + By symmetry this gives us the Fourier components for N/2+1, ... N-1, if we want + them. However, it doesn't give us the value for exactly k = N/2. For k = 0 and k = N/2, it + is easiest to argue directly about the meaning of the A_k, C_k and D_k in terms of + sums of points. + A_0 and A_{N/2} are both real, with A_0=\sum_n a_n, and A_{N/2} an alternating sum + A_{N/2} = a_0 - a_1 + a_2 ... + It's easy to show that + A_0 = C_0 + D_0 (a1) + A_{N/2} = C_0 - D_0. (a2) + Since C_0 and D_0 are both real, C_0 is the real coefficient of B_0 and D_0 is the + imaginary coefficient. + + *REVERSING THE PROCESS* + + Next we want to reverse this process. We just need to work out C_k and D_k from the + sequence A_k. Then we do the inverse complex fft and we get back where we started. + For 0 and N/2, working from (a1) and (a2) above, we can see that: + C_0 = 1/2 (A_0 + A_{N/2}) (y0) + D_0 = 1/2 (A_0 - A_{N/2}) (y1) + and we use + B_0 = C_0 + i D_0 + to get the 1st complex coefficient of B.
This is exactly the same as the forward process + except with an extra factor of 1/2. + + Consider equations (a0) and (a0b). We want to work out C_k and D_k from A_k and A_k'. Remember + k' = N/2 - k. + + Write down + A_k = C_k + 1^(k/N) D_k (copying a0) + A_k'^* = C_k - 1^(k/N) D_k (conjugate of a0b) + So + C_k = 0.5 (A_k + A_k'^*) (p0) + D_k = 1^(-k/N) . 0.5 (A_k - A_k'^*) (p1) + Next, we want to compute B_k and B_k' from C_k and D_k. C.f. (z0)..(z3), and remember + that k' = N/2-k. We can see + that + B_k = C_k + i D_k (p2) + B_k' = C_k - i D_k (p3) + + We would like to make the equations (p0) ... (p3) look like the forward equations (z0), (z1), + (a0) and (a0b) so we can reuse the code. Define E_k = -i 1^(k/N) D_k. Then write down (p0)..(p3). + We have + C_k = 0.5 (A_k + A_k'^*) (p0') + E_k = -0.5 i (A_k - A_k'^*) (p1') + B_k = C_k - 1^(-k/N) E_k (p2') + B_k' = C_k + 1^(-k/N) E_k (p3') + So these are exactly the same as (z0), (z1), (a0), (a0b) except replacing 1^(k/N) with + -1^(-k/N) . Remember that we defined 1^x above to be exp(-2pi x), so the signs here + might be opposite to what you see in the code. + + MODIFICATION: we need to take care of a factor of two. The complex FFT we implemented + does not divide by N in the reverse case. So upon inversion we get larger by N/2. + However, this is not consistent with normal FFT conventions where you get a factor of N. + For this reason we multiply by two after the process described above. + +*/ + + +/* + (*) [this token is referred to in a comment above]. + + Notes for separating 2 real transforms from one complex one. Note that the + letters here (A, B, C and N) are all distinct from the same letters used in the + place where this comment is used. + Suppose we + have two sequences a_n and b_n, n = 0..N-1. We combine them into a complex + number, + c_n = a_n + i b_n. + Then we take the Fourier transform to get + C_k = \sum_{n = 0}^{N-1} c_n 1^(kn/N) . + Then we use symmetry.
Define A_k and B_k as the DFTs of a and b. + We use A_k = A_{N-k}^*, and B_k = B_{N-k}^*, since a and b are real. Using + C_k = A_k + i B_k, + C_{N-k} = A_k^* + i B_k^* + = A_k^* - (i B_k)^* + So: + A_k = 1/2 (C_k + C_{N-k}^*) + i B_k = 1/2 (C_k - C_{N-k}^*) +-> B_k =-1/2i (C_k - C_{N-k}^*) +-> re(B_k) = 1/2 (im(C_k) + im(C_{N-k})) + im(B_k) =-1/2 (re(C_k) - re(C_{N-k})) + + */ + +template void ComputeDctMatrix(Matrix *M) { + //KALDI_ASSERT(M->NumRows() == M->NumCols()); + MatrixIndexT K = M->NumRows(); + MatrixIndexT N = M->NumCols(); + + KALDI_ASSERT(K > 0); + KALDI_ASSERT(N > 0); + Real normalizer = std::sqrt(1.0 / static_cast(N)); // normalizer for + // X_0. + for (MatrixIndexT j = 0; j < N; j++) (*M)(0, j) = normalizer; + normalizer = std::sqrt(2.0 / static_cast(N)); // normalizer for other + // elements. + for (MatrixIndexT k = 1; k < K; k++) + for (MatrixIndexT n = 0; n < N; n++) + (*M)(k, n) = normalizer + * std::cos( static_cast(M_PI)/N * (n + 0.5) * k ); +} + + +template void ComputeDctMatrix(Matrix *M); +template void ComputeDctMatrix(Matrix *M); + + +template +void ComputePca(const MatrixBase &X, + MatrixBase *U, + MatrixBase *A, + bool print_eigs, + bool exact) { + // Note that some of these matrices may be transposed w.r.t. the + // way it's most natural to describe them in math... it's the rows + // of X and U that correspond to the (data-points, basis elements). + MatrixIndexT N = X.NumRows(), D = X.NumCols(); + // N = #points, D = feature dim. + KALDI_ASSERT(U != NULL && U->NumCols() == D); + MatrixIndexT G = U->NumRows(); // # of retained basis elements. + KALDI_ASSERT(A == NULL || (A->NumRows() == N && A->NumCols() == G)); + KALDI_ASSERT(G <= N && G <= D); + if (D < N) { // Do conventional PCA. + SpMatrix Msp(D); // Matrix of outer products. 
+ Msp.AddMat2(1.0, X, kTrans, 0.0); // M <-- X^T X + Matrix Utmp; + Vector l; + if (exact) { + Utmp.Resize(D, D); + l.Resize(D); + //Matrix M(Msp); + //M.DestructiveSvd(&l, &Utmp, NULL); + Msp.Eig(&l, &Utmp); + } else { + Utmp.Resize(D, G); + l.Resize(G); + Msp.TopEigs(&l, &Utmp); + } + SortSvd(&l, &Utmp); + + for (MatrixIndexT g = 0; g < G; g++) + U->Row(g).CopyColFromMat(Utmp, g); + if (print_eigs) + KALDI_LOG << (exact ? "" : "Retained ") + << "PCA eigenvalues are " << l; + if (A != NULL) + A->AddMatMat(1.0, X, kNoTrans, *U, kTrans, 0.0); + } else { // Do inner-product PCA. + SpMatrix Nsp(N); // Matrix of inner products. + Nsp.AddMat2(1.0, X, kNoTrans, 0.0); // M <-- X X^T + + Matrix Vtmp; + Vector l; + if (exact) { + Vtmp.Resize(N, N); + l.Resize(N); + Matrix Nmat(Nsp); + Nmat.DestructiveSvd(&l, &Vtmp, NULL); + } else { + Vtmp.Resize(N, G); + l.Resize(G); + Nsp.TopEigs(&l, &Vtmp); + } + + MatrixIndexT num_zeroed = 0; + for (MatrixIndexT g = 0; g < G; g++) { + if (l(g) < 0.0) { + KALDI_WARN << "In PCA, setting element " << l(g) << " to zero."; + l(g) = 0.0; + num_zeroed++; + } + } + SortSvd(&l, &Vtmp); // Make sure zero elements are last, this + // is necessary for Orthogonalize() to work properly later. + + Vtmp.Transpose(); // So eigenvalues are the rows. + + for (MatrixIndexT g = 0; g < G; g++) { + Real sqrtlg = sqrt(l(g)); + if (l(g) != 0.0) { + U->Row(g).AddMatVec(1.0 / sqrtlg, X, kTrans, Vtmp.Row(g), 0.0); + } else { + U->Row(g).SetZero(); + (*U)(g, g) = 1.0; // arbitrary direction. Will later orthogonalize. + } + if (A != NULL) + for (MatrixIndexT n = 0; n < N; n++) + (*A)(n, g) = sqrtlg * Vtmp(g, n); + } + // Now orthogonalize. This is mainly useful in + // case there were zero eigenvalues, but we do it + // for all of them. 
+ U->OrthogonalizeRows(); + if (print_eigs) + KALDI_LOG << "(inner-product) PCA eigenvalues are " << l; + } +} + + +template +void ComputePca(const MatrixBase &X, + MatrixBase *U, + MatrixBase *A, + bool print_eigs, + bool exact); + +template +void ComputePca(const MatrixBase &X, + MatrixBase *U, + MatrixBase *A, + bool print_eigs, + bool exact); + + +// Added by Dan, Feb. 13 2012. +// This function does: *plus += max(0, a b^T), +// *minus += max(0, -(a b^T)). +template +void AddOuterProductPlusMinus(Real alpha, + const VectorBase &a, + const VectorBase &b, + MatrixBase *plus, + MatrixBase *minus) { + KALDI_ASSERT(a.Dim() == plus->NumRows() && b.Dim() == plus->NumCols() + && a.Dim() == minus->NumRows() && b.Dim() == minus->NumCols()); + int32 nrows = a.Dim(), ncols = b.Dim(), pskip = plus->Stride() - ncols, + mskip = minus->Stride() - ncols; + const Real *adata = a.Data(), *bdata = b.Data(); + Real *plusdata = plus->Data(), *minusdata = minus->Data(); + + for (int32 i = 0; i < nrows; i++) { + const Real *btmp = bdata; + Real multiple = alpha * *adata; + if (multiple > 0.0) { + for (int32 j = 0; j < ncols; j++, plusdata++, minusdata++, btmp++) { + if (*btmp > 0.0) *plusdata += multiple * *btmp; + else *minusdata -= multiple * *btmp; + } + } else { + for (int32 j = 0; j < ncols; j++, plusdata++, minusdata++, btmp++) { + if (*btmp < 0.0) *plusdata += multiple * *btmp; + else *minusdata -= multiple * *btmp; + } + } + plusdata += pskip; + minusdata += mskip; + adata++; + } +} + +// Instantiate template +template +void AddOuterProductPlusMinus(float alpha, + const VectorBase &a, + const VectorBase &b, + MatrixBase *plus, + MatrixBase *minus); +template +void AddOuterProductPlusMinus(double alpha, + const VectorBase &a, + const VectorBase &b, + MatrixBase *plus, + MatrixBase *minus); + + +} // end namespace kaldi diff --git a/speechx/speechx/kaldi/matrix/matrix-functions.h b/speechx/speechx/kaldi/matrix/matrix-functions.h new file mode 100644 index 00000000..ca50ddda --- 
/dev/null +++ b/speechx/speechx/kaldi/matrix/matrix-functions.h @@ -0,0 +1,174 @@ +// matrix/matrix-functions.h + +// Copyright 2009-2011 Microsoft Corporation; Go Vivace Inc.; Jan Silovsky; +// Yanmin Qian; 1991 Henrique (Rico) Malvar (*) +// +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. +// +// (*) incorporates, with permission, FFT code from his book +// "Signal Processing with Lapped Transforms", Artech, 1992. + + + +#ifndef KALDI_MATRIX_MATRIX_FUNCTIONS_H_ +#define KALDI_MATRIX_MATRIX_FUNCTIONS_H_ + +#include "matrix/kaldi-vector.h" +#include "matrix/kaldi-matrix.h" + +namespace kaldi { + +/// @addtogroup matrix_funcs_misc +/// @{ + +/** The function ComplexFft does an Fft on the vector argument v. + v is a vector of even dimension, interpreted for both input + and output as a vector of complex numbers i.e. + \f[ v = ( re_0, im_0, re_1, im_1, ... ) \f] + + If "forward == true" this routine does the Discrete Fourier Transform + (DFT), i.e.: + \f[ vout[m] \leftarrow \sum_{n = 0}^{N-1} vin[i] exp( -2pi m n / N ) \f] + + If "backward" it does the Inverse Discrete Fourier Transform (IDFT) + *WITHOUT THE FACTOR 1/N*, + i.e.: + \f[ vout[m] <-- \sum_{n = 0}^{N-1} vin[i] exp( 2pi m n / N ) \f] + [note the sign difference on the 2 pi for the backward one.] 
+ + Note that this is the definition of the FT given in most texts, but + it differs from the Numerical Recipes version in which the forward + and backward algorithms are flipped. + + Note that you would have to multiply by 1/N after the IDFT to get + back to where you started from. We don't do this because + in some contexts, the transform is made symmetric by multiplying + by sqrt(N) in both passes. The user can do this by themselves. + + See also SplitRadixComplexFft, declared in srfft.h, which is more efficient + but only works if the length of the input is a power of 2. + */ +template void ComplexFft (VectorBase *v, bool forward, Vector *tmp_work = NULL); + +/// ComplexFt is the same as ComplexFft but it implements the Fourier +/// transform in an inefficient way. It is mainly included for testing purposes. +/// See comment for ComplexFft to describe the input and outputs and what it does. +template void ComplexFt (const VectorBase &in, + VectorBase *out, bool forward); + +/// RealFft is a fourier transform of real inputs. Internally it uses +/// ComplexFft. The input dimension N must be even. If forward == true, +/// it transforms from a sequence of N real points to its complex fourier +/// transform; otherwise it goes in the reverse direction. If you call it +/// in the forward and then reverse direction and multiply by 1.0/N, you +/// will get back the original data. +/// The interpretation of the complex-FFT data is as follows: the array +/// is a sequence of complex numbers C_n of length N/2 with (real, im) format, +/// i.e. [real0, real_{N/2}, real1, im1, real2, im2, real3, im3, ...]. +/// See also SplitRadixRealFft, declared in srfft.h, which is more efficient +/// but only works if the length of the input is a power of 2. + +template void RealFft (VectorBase *v, bool forward); + + +/// RealFt has the same input and output format as RealFft above, but it is +/// an inefficient implementation included for testing purposes. 
+template void RealFftInefficient (VectorBase *v, bool forward); + +/// ComputeDctMatrix computes a matrix corresponding to the DCT, such that +/// M * v equals the DCT of vector v. M may be non-square (K x N). +/// This is the type = III DCT with normalization, corresponding to the +/// following equations, where x is the signal and X is the DCT: +/// X_0 = sqrt(1/N) \sum_{n = 0}^{N-1} x_n +/// X_k = sqrt(2/N) \sum_{n = 0}^{N-1} x_n cos( \pi/N (n + 1/2) k ) +/// For square M, this matrix's transpose is its inverse, so transposing +/// it will give the inverse DCT. +/// Caution: the type III DCT is generally known as the "inverse DCT" (with the +/// type II being the actual DCT), so this function is somewhat mis-named. It +/// was probably done this way for HTK compatibility. We don't change it +/// because it was this way from the start and changing it would affect the +/// feature generation. + +template void ComputeDctMatrix(Matrix *M); + + +/// ComplexMul implements, inline, the complex multiplication b *= a. +template inline void ComplexMul(const Real &a_re, const Real &a_im, + Real *b_re, Real *b_im); + +/// ComplexAddProduct implements, inline, the complex operation c += (a * b). +template inline void ComplexAddProduct(const Real &a_re, const Real &a_im, + const Real &b_re, const Real &b_im, + Real *c_re, Real *c_im); + + +/// ComplexImExp implements a <-- exp(i x), inline. +template inline void ComplexImExp(Real x, Real *a_re, Real *a_im); + + + +/** + ComputePCA does a PCA computation, using either outer products + or inner products, whichever is more efficient. Let D be + the dimension of the data points, N be the number of data + points, and G be the PCA dimension we want to retain. We assume + G <= N and G <= D. + + @param X [in] An N x D matrix. Each row of X is a point x_i. + @param U [out] A G x D matrix. Each row of U is a basis element u_i. + @param A [out] An N x G matrix, or NULL.
Each row of A is a set of coefficients + in the basis for a point x_i, so A(i, g) is the coefficient of u_i + in x_i. + @param print_eigs [in] If true, prints out diagnostic information about the + eigenvalues. + @param exact [in] If true, does the exact computation; if false, does + a much faster (but almost exact) computation based on the Lanczos + method. +*/ + +template +void ComputePca(const MatrixBase &X, + MatrixBase *U, + MatrixBase *A, + bool print_eigs = false, + bool exact = true); + + + +// This function does: *plus += max(0, a b^T), +// *minus += max(0, -(a b^T)). +template +void AddOuterProductPlusMinus(Real alpha, + const VectorBase &a, + const VectorBase &b, + MatrixBase *plus, + MatrixBase *minus); + +template +inline void AssertSameDim(const MatrixBase &mat1, const MatrixBase &mat2) { + KALDI_ASSERT(mat1.NumRows() == mat2.NumRows() + && mat1.NumCols() == mat2.NumCols()); +} + + +/// @} end of "addtogroup matrix_funcs_misc" + +} // end namespace kaldi + +#include "matrix/matrix-functions-inl.h" + + +#endif diff --git a/speechx/speechx/kaldi/matrix/matrix-lib.h b/speechx/speechx/kaldi/matrix/matrix-lib.h new file mode 100644 index 00000000..2a5ebad7 --- /dev/null +++ b/speechx/speechx/kaldi/matrix/matrix-lib.h @@ -0,0 +1,37 @@ +// matrix/matrix-lib.h + +// Copyright 2009-2011 Ondrej Glembek; Microsoft Corporation; Haihua Xu + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. 
+// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +// Include everything from this directory. +// These files include other stuff that we need. +#ifndef KALDI_MATRIX_MATRIX_LIB_H_ +#define KALDI_MATRIX_MATRIX_LIB_H_ + +#include "base/kaldi-common.h" +#include "matrix/kaldi-vector.h" +#include "matrix/kaldi-matrix.h" +#include "matrix/sp-matrix.h" +#include "matrix/tp-matrix.h" +#include "matrix/matrix-functions.h" +#include "matrix/srfft.h" +#include "matrix/compressed-matrix.h" +#include "matrix/sparse-matrix.h" +#include "matrix/optimization.h" + +#endif + diff --git a/speechx/speechx/kaldi/matrix/optimization.cc b/speechx/speechx/kaldi/matrix/optimization.cc new file mode 100644 index 00000000..c17b5b94 --- /dev/null +++ b/speechx/speechx/kaldi/matrix/optimization.cc @@ -0,0 +1,577 @@ +// matrix/optimization.cc + +// Copyright 2012 Johns Hopkins University (author: Daniel Povey) + + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. +// +// (*) incorporates, with permission, FFT code from his book +// "Signal Processing with Lapped Transforms", Artech, 1992. + +#include + +#include "matrix/optimization.h" +#include "matrix/sp-matrix.h" + +namespace kaldi { + + +// Below, N&W refers to Nocedal and Wright, "Numerical Optimization", 2nd Ed. 
+ +template +OptimizeLbfgs::OptimizeLbfgs(const VectorBase &x, + const LbfgsOptions &opts): + opts_(opts), k_(0), computation_state_(kBeforeStep), H_was_set_(false) { + KALDI_ASSERT(opts.m > 0); // dimension. + MatrixIndexT dim = x.Dim(); + KALDI_ASSERT(dim > 0); + x_ = x; // this is the value of x_k + new_x_ = x; // this is where we'll evaluate the function next. + deriv_.Resize(dim); + temp_.Resize(dim); + data_.Resize(2 * opts.m, dim); + rho_.Resize(opts.m); + // Just set f_ to some invalid value, as we haven't yet set it. + f_ = (opts.minimize ? 1 : -1 ) * std::numeric_limits::infinity(); + best_f_ = f_; + best_x_ = x_; +} + + +template +Real OptimizeLbfgs::RecentStepLength() const { + size_t n = step_lengths_.size(); + if (n == 0) return std::numeric_limits::infinity(); + else { + if (n >= 2 && step_lengths_[n-1] == 0.0 && step_lengths_[n-2] == 0.0) + return 0.0; // two zeros in a row means repeated restarts, which is + // a loop. Short-circuit this by returning zero. + Real avg = 0.0; + for (size_t i = 0; i < n; i++) + avg += step_lengths_[i] / n; + return avg; + } +} + +template +void OptimizeLbfgs::ComputeHifNeeded(const VectorBase &gradient) { + if (k_ == 0) { + if (H_.Dim() == 0) { + // H was never set up. Set it up for the first time. + Real learning_rate; + if (opts_.first_step_length > 0.0) { // this takes + // precedence over first_step_learning_rate, if set. + // We are setting up H for the first time. + Real gradient_length = gradient.Norm(2.0); + learning_rate = (gradient_length > 0.0 ? + opts_.first_step_length / gradient_length : + 1.0); + } else if (opts_.first_step_impr > 0.0) { + Real gradient_length = gradient.Norm(2.0); + learning_rate = (gradient_length > 0.0 ? + opts_.first_step_impr / (gradient_length * gradient_length) : + 1.0); + } else { + learning_rate = opts_.first_step_learning_rate; + } + H_.Resize(x_.Dim()); + KALDI_ASSERT(learning_rate > 0.0); + H_.Set(opts_.minimize ? 
learning_rate : -learning_rate); + } + } else { // k_ > 0 + if (!H_was_set_) { // The user never specified an approximate + // diagonal inverse Hessian. + // Set it using formula 7.20: H_k^{(0)} = \gamma_k I, where + // \gamma_k = s_{k-1}^T y_{k-1} / y_{k-1}^T y_{k-1} + SubVector y_km1 = Y(k_-1); + double gamma_k = VecVec(S(k_-1), y_km1) / VecVec(y_km1, y_km1); + if (KALDI_ISNAN(gamma_k) || KALDI_ISINF(gamma_k)) { + KALDI_WARN << "NaN encountered in L-BFGS (already converged?)"; + gamma_k = (opts_.minimize ? 1.0 : -1.0); + } + H_.Set(gamma_k); + } + } +} + +// This represents the first 2 lines of Algorithm 7.5 (N&W), which +// in fact is mostly a call to Algorithm 7.4. +// Note: this is valid whether we are minimizing or maximizing. +template +void OptimizeLbfgs::ComputeNewDirection(Real function_value, + const VectorBase &gradient) { + KALDI_ASSERT(computation_state_ == kBeforeStep); + SignedMatrixIndexT m = M(), k = k_; + ComputeHifNeeded(gradient); + // The rest of this is computing p_k <-- - H_k \nabla f_k using Algorithm + // 7.4 of N&W. + Vector &q(deriv_), &r(new_x_); // Use deriv_ as a temporary place to put + // q, and new_x_ as a temporay place to put r. + // The if-statement below is just to get rid of spurious warnings from + // valgrind about memcpy source and destination overlap, since sometimes q and + // gradient are the same variable. + if (&q != &gradient) + q.CopyFromVec(gradient); // q <-- \nabla f_k. + Vector alpha(m); + // for i = k - 1, k - 2, ... k - m + for (SignedMatrixIndexT i = k - 1; + i >= std::max(k - m, static_cast(0)); + i--) { + alpha(i % m) = rho_(i % m) * VecVec(S(i), q); // \alpha_i <-- \rho_i s_i^T q. + q.AddVec(-alpha(i % m), Y(i)); // q <-- q - \alpha_i y_i + } + r.SetZero(); + r.AddVecVec(1.0, H_, q, 0.0); // r <-- H_k^{(0)} q. + // for k = k - m, k - m + 1, ... 
, k - 1 + for (SignedMatrixIndexT i = std::max(k - m, static_cast(0)); + i < k; + i++) { + Real beta = rho_(i % m) * VecVec(Y(i), r); // \beta <-- \rho_i y_i^T r + r.AddVec(alpha(i % m) - beta, S(i)); // r <-- r + s_i (\alpha_i - \beta) + } + + { // TEST. Note, -r will be the direction. + Real dot = VecVec(gradient, r); + if ((opts_.minimize && dot < 0) || (!opts_.minimize && dot > 0)) + KALDI_WARN << "Step direction has the wrong sign! Routine will fail."; + } + + // Now we're out of Alg. 7.4 and back into Alg. 7.5. + // Alg. 7.4 returned r (using new_x_ as the location), and with \alpha_k = 1 + // as the initial guess, we're setting x_{k+1} = x_k + \alpha_k p_k, with + // p_k = -r [hence the statement new_x_.Scale(-1.0)]., and \alpha_k = 1. + // This is the first place we'll get the user to evaluate the function; + // any backtracking (or acceptance of that step) occurs inside StepSizeIteration. + // We're still within iteration k; we haven't yet finalized the step size. + new_x_.Scale(-1.0); + new_x_.AddVec(1.0, x_); + if (&deriv_ != &gradient) + deriv_.CopyFromVec(gradient); + f_ = function_value; + d_ = opts_.d; + num_wolfe_i_failures_ = 0; + num_wolfe_ii_failures_ = 0; + last_failure_type_ = kNone; + computation_state_ = kWithinStep; +} + + +template +bool OptimizeLbfgs::AcceptStep(Real function_value, + const VectorBase &gradient) { + // Save s_k = x_{k+1} - x_{k}, and y_k = \nabla f_{k+1} - \nabla f_k. + SubVector s = S(k_), y = Y(k_); + s.CopyFromVec(new_x_); + s.AddVec(-1.0, x_); // s = new_x_ - x_. + y.CopyFromVec(gradient); + y.AddVec(-1.0, deriv_); // y = gradient - deriv_. + + // Warning: there is a division in the next line. This could + // generate inf or nan, but this wouldn't necessarily be an error + // at this point because for zero step size or derivative we should + // terminate the iterations. But this is up to the calling code. 
+ Real prod = VecVec(y, s); + rho_(k_ % opts_.m) = 1.0 / prod; + Real len = s.Norm(2.0); + + if ((opts_.minimize && prod <= 1.0e-20) || (!opts_.minimize && prod >= -1.0e-20) + || len == 0.0) + return false; // This will force restart. + + KALDI_VLOG(3) << "Accepted step; length was " << len + << ", prod was " << prod; + RecordStepLength(len); + + // store x_{k+1} and the function value f_{k+1}. + x_.CopyFromVec(new_x_); + f_ = function_value; + k_++; + + return true; // We successfully accepted the step. +} + +template +void OptimizeLbfgs::RecordStepLength(Real s) { + step_lengths_.push_back(s); + if (step_lengths_.size() > static_cast(opts_.avg_step_length)) + step_lengths_.erase(step_lengths_.begin(), step_lengths_.begin() + 1); +} + + +template +void OptimizeLbfgs::Restart(const VectorBase &x, + Real f, + const VectorBase &gradient) { + // Note: we will consider restarting (the transition of x_ -> x) + // as a step, even if it has zero step size. This is necessary in + // order for convergence to be detected. + { + Vector &diff(temp_); + diff.CopyFromVec(x); + diff.AddVec(-1.0, x_); + RecordStepLength(diff.Norm(2.0)); + } + k_ = 0; // Restart the iterations! [But note that the Hessian, + // whatever it was, stays as before.] + if (&x_ != &x) + x_.CopyFromVec(x); + new_x_.CopyFromVec(x); + f_ = f; + computation_state_ = kBeforeStep; + ComputeNewDirection(f, gradient); +} + +template +void OptimizeLbfgs::StepSizeIteration(Real function_value, + const VectorBase &gradient) { + KALDI_VLOG(3) << "In step size iteration, function value changed " + << f_ << " to " << function_value; + + // We're in some part of the backtracking, and the user is providing + // the objective function value and gradient. + // We're checking two conditions: Wolfe i) [the Armijo rule] and + // Wolfe ii). + + // The Armijo rule (when minimizing) is: + // f(k_k + \alpha_k p_k) <= f(x_k) + c_1 \alpha_k p_k^T \nabla f(x_k), where + // \nabla means the derivative. 
+ // Below, "temp" is the RHS of this equation, where (\alpha_k p_k) equals + // (new_x_ - x_); we don't store \alpha or p_k separately, they are implicit + // as the difference new_x_ - x_. + + // Below, pf is \alpha_k p_k^T \nabla f(x_k). + Real pf = VecVec(new_x_, deriv_) - VecVec(x_, deriv_); + Real temp = f_ + opts_.c1 * pf; + + bool wolfe_i_ok; + if (opts_.minimize) wolfe_i_ok = (function_value <= temp); + else wolfe_i_ok = (function_value >= temp); + + // Wolfe condition ii) can be written as: + // p_k^T \nabla f(x_k + \alpha_k p_k) >= c_2 p_k^T \nabla f(x_k) + // p2f equals \alpha_k p_k^T \nabla f(x_k + \alpha_k p_k), where + // (\alpha_k p_k^T) is (new_x_ - x_). + // Note that in our version of Wolfe condition (ii) we have an extra + // factor alpha, which doesn't affect anything. + Real p2f = VecVec(new_x_, gradient) - VecVec(x_, gradient); + //eps = (sizeof(Real) == 4 ? 1.0e-05 : 1.0e-10) * + //(std::abs(p2f) + std::abs(pf)); + bool wolfe_ii_ok; + if (opts_.minimize) wolfe_ii_ok = (p2f >= opts_.c2 * pf); + else wolfe_ii_ok = (p2f <= opts_.c2 * pf); + + enum { kDecrease, kNoChange } d_action; // What do do with d_: leave it alone, + // or take the square root. + enum { kAccept, kDecreaseStep, kIncreaseStep, kRestart } iteration_action; + // What we'll do in the overall iteration: accept this value, DecreaseStep + // (reduce the step size), IncreaseStep (increase the step size), or kRestart + // (set k back to zero). Generally when we can't get both conditions to be + // true with a reasonable period of time, it makes sense to restart, because + // probably we've almost converged and got into numerical issues; from here + // we'll just produced NaN's. Restarting is a safe thing to do and the outer + // code will quickly detect convergence. + + d_action = kNoChange; // the default. + + if (wolfe_i_ok && wolfe_ii_ok) { + iteration_action = kAccept; + d_action = kNoChange; // actually doesn't matter, it'll get reset. 
+ } else if (!wolfe_i_ok) { + // If wolfe i) [the Armijo rule] failed then we went too far (or are + // meeting numerical problems). + if (last_failure_type_ == kWolfeII) { // Last time we failed it was Wolfe ii). + // When we switch between them we decrease d. + d_action = kDecrease; + } + iteration_action = kDecreaseStep; + last_failure_type_ = kWolfeI; + num_wolfe_i_failures_++; + } else if (!wolfe_ii_ok) { + // Curvature condition failed -> we did not go far enough. + if (last_failure_type_ == kWolfeI) // switching between wolfe i and ii failures-> + d_action = kDecrease; // decrease value of d. + iteration_action = kIncreaseStep; + last_failure_type_ = kWolfeII; + num_wolfe_ii_failures_++; + } + + // Test whether we've been switching too many times betwen wolfe i) and ii) + // failures, or overall have an excessive number of failures. We just give up + // and restart L-BFGS. Probably we've almost converged. + if (num_wolfe_i_failures_ + num_wolfe_ii_failures_ > + opts_.max_line_search_iters) { + KALDI_VLOG(2) << "Too many steps in line search -> restarting."; + iteration_action = kRestart; + } + + if (d_action == kDecrease) + d_ = std::sqrt(d_); + + KALDI_VLOG(3) << "d = " << d_ << ", iter = " << k_ << ", action = " + << (iteration_action == kAccept ? "accept" : + (iteration_action == kDecreaseStep ? "decrease" : + (iteration_action == kIncreaseStep ? "increase" : + "reject"))); + + // Note: even if iteration_action != Restart at this point, + // some code below may set it to Restart. + if (iteration_action == kAccept) { + if (AcceptStep(function_value, gradient)) { // If we did + // not detect a problem while accepting the step.. + computation_state_ = kBeforeStep; + ComputeNewDirection(function_value, gradient); + } else { + KALDI_VLOG(2) << "Restarting L-BFGS computation; problem found while " + << "accepting step."; + iteration_action = kRestart; // We'll have to restart now. 
+ } + } + if (iteration_action == kDecreaseStep || iteration_action == kIncreaseStep) { + Real scale = (iteration_action == kDecreaseStep ? 1.0 / d_ : d_); + temp_.CopyFromVec(new_x_); + new_x_.Scale(scale); + new_x_.AddVec(1.0 - scale, x_); + if (new_x_.ApproxEqual(temp_, 0.0)) { + // Value of new_x_ did not change at all --> we must restart. + KALDI_VLOG(3) << "Value of x did not change, when taking step; " + << "will restart computation."; + iteration_action = kRestart; + } + if (new_x_.ApproxEqual(temp_, 1.0e-08) && + std::abs(f_ - function_value) < 1.0e-08 * + std::abs(f_) && iteration_action == kDecreaseStep) { + // This is common and due to roundoff. + KALDI_VLOG(3) << "We appear to be backtracking while we are extremely " + << "close to the old value; restarting."; + iteration_action = kRestart; + } + + if (iteration_action == kDecreaseStep) { + num_wolfe_i_failures_++; + last_failure_type_ = kWolfeI; + } else { + num_wolfe_ii_failures_++; + last_failure_type_ = kWolfeII; + } + } + if (iteration_action == kRestart) { + // We want to restart the computation. If the objf at new_x_ is + // better than it was at x_, we'll start at new_x_, else at x_. + bool use_newx; + if (opts_.minimize) use_newx = (function_value < f_); + else use_newx = (function_value > f_); + KALDI_VLOG(3) << "Restarting computation."; + if (use_newx) Restart(new_x_, function_value, gradient); + else Restart(x_, f_, deriv_); + } +} + +template +void OptimizeLbfgs::DoStep(Real function_value, + const VectorBase &gradient) { + if (opts_.minimize ? function_value < best_f_ : function_value > best_f_) { + best_f_ = function_value; + best_x_.CopyFromVec(new_x_); + } + if (computation_state_ == kBeforeStep) + ComputeNewDirection(function_value, gradient); + else // kWithinStep{1,2,3} + StepSizeIteration(function_value, gradient); +} + +template +void OptimizeLbfgs::DoStep(Real function_value, + const VectorBase &gradient, + const VectorBase &diag_approx_2nd_deriv) { + if (opts_.minimize ? 
function_value < best_f_ : function_value > best_f_) { + best_f_ = function_value; + best_x_.CopyFromVec(new_x_); + } + if (opts_.minimize) { + KALDI_ASSERT(diag_approx_2nd_deriv.Min() > 0.0); + } else { + KALDI_ASSERT(diag_approx_2nd_deriv.Max() < 0.0); + } + H_was_set_ = true; + H_.CopyFromVec(diag_approx_2nd_deriv); + H_.InvertElements(); + DoStep(function_value, gradient); +} + +template +const VectorBase& +OptimizeLbfgs::GetValue(Real *objf_value) const { + if (objf_value != NULL) *objf_value = best_f_; + return best_x_; +} + +// to compute the alpha, we are minimizing f(x) = x^T b - 0.5 x_k^T A x_k along +// direction p_k... consider alpha +// d/dx of f(x) = b - A x_k = r. + +// Notation based on Sec. 5.1 of Nocedal and Wright +// Computation based on Alg. 5.2 of Nocedal and Wright (Pg. 112) +// Notation (replicated for convenience): +// To solve Ax=b for x +// k : current iteration +// x_k : estimate of x (at iteration k) +// r_k : residual ( r_k \eqdef A x_k - b ) +// \alpha_k : step size +// p_k : A-conjugate direction +// \beta_k : coefficient used in A-conjugate direction computation for next +// iteration +// +// Algo. LinearCG(A,b,x_0) +// ======================== +// r_0 = Ax_0 - b +// p_0 = -r_0 +// k = 0 +// +// while r_k != 0 +// \alpha_k = (r_k^T r_k) / (p_k^T A p_k) +// x_{k+1} = x_k + \alpha_k p_k; +// r_{k+1} = r_k + \alpha_k A p_k +// \beta_{k+1} = \frac{r_{k+1}^T r_{k+1}}{r_k^T r_K} +// p_{k+1} = -r_{k+1} + \beta_{k+1} p_k +// k = k + 1 +// end + +template +int32 LinearCgd(const LinearCgdOptions &opts, + const SpMatrix &A, + const VectorBase &b, + VectorBase *x) { + // Initialize the variables + // + int32 M = A.NumCols(); + + Matrix storage(4, M); + SubVector r(storage, 0), p(storage, 1), Ap(storage, 2), x_orig(storage, 3); + p.CopyFromVec(b); + p.AddSpVec(-1.0, A, *x, 1.0); // p_0 = b - A x_0 + r.AddVec(-1.0, p); // r_0 = - p_0 + x_orig.CopyFromVec(*x); // in case of failure. 
+ + Real r_cur_norm_sq = VecVec(r, r), + r_initial_norm_sq = r_cur_norm_sq, + r_recompute_norm_sq = r_cur_norm_sq; + + KALDI_VLOG(5) << "In linear CG: initial norm-square of residual = " + << r_initial_norm_sq; + + KALDI_ASSERT(opts.recompute_residual_factor <= 1.0); + Real max_error_sq = std::max(opts.max_error * opts.max_error, + std::numeric_limits::min()), + residual_factor = opts.recompute_residual_factor * + opts.recompute_residual_factor, + inv_residual_factor = 1.0 / residual_factor; + + // Note: although from a mathematical point of view the method should converge + // after M iterations, in practice (due to roundoff) it does not always + // converge to good precision after that many iterations so we let the maximum + // be M + 5 instead. + int32 k = 0; + for (; k < M + 5 && k != opts.max_iters; k++) { + // Note: we'll break from this loop if we converge sooner due to + // max_error. + Ap.AddSpVec(1.0, A, p, 0.0); // Ap = A p + + // Below is how the code used to look. + // // next line: \alpha_k = (r_k^T r_k) / (p_k^T A p_k) + // Real alpha = r_cur_norm_sq / VecVec(p, Ap); + // + // We changed r_cur_norm_sq below to -VecVec(p, r). Although this is + // slightly less efficient, it seems to make the algorithm dramatically more + // robust. Note that -p^T r is the mathematically more natural quantity to + // use here, that corresponds to minimizing along that direction... r^T r is + // recommended in Nocedal and Wright only as a kind of optimization as it is + // supposed to be the same as -p^T r and we already have it computed. 
+ Real alpha = -VecVec(p, r) / VecVec(p, Ap); + + // next line: x_{k+1} = x_k + \alpha_k p_k; + x->AddVec(alpha, p); + // next line: r_{k+1} = r_k + \alpha_k A p_k + r.AddVec(alpha, Ap); + Real r_next_norm_sq = VecVec(r, r); + + if (r_next_norm_sq < residual_factor * r_recompute_norm_sq || + r_next_norm_sq > inv_residual_factor * r_recompute_norm_sq) { + + // Recompute the residual from scratch if the residual norm has decreased + // a lot; this costs an extra matrix-vector multiply, but helps keep the + // residual accurate. + // Also do the same if the residual norm has increased a lot since + // the last time we recomputed... this shouldn't happen often, but + // it can indicate bad stuff is happening. + + // r_{k+1} = A x_{k+1} - b + r.AddSpVec(1.0, A, *x, 0.0); + r.AddVec(-1.0, b); + r_next_norm_sq = VecVec(r, r); + r_recompute_norm_sq = r_next_norm_sq; + + KALDI_VLOG(5) << "In linear CG: recomputing residual."; + } + KALDI_VLOG(5) << "In linear CG: k = " << k + << ", r_next_norm_sq = " << r_next_norm_sq; + // Check if converged. + if (r_next_norm_sq <= max_error_sq) + break; + + // next line: \beta_{k+1} = \frac{r_{k+1}^T r_{k+1}}{r_k^T r_K} + Real beta_next = r_next_norm_sq / r_cur_norm_sq; + // next lines: p_{k+1} = -r_{k+1} + \beta_{k+1} p_k + Vector p_old(p); + p.Scale(beta_next); + p.AddVec(-1.0, r); + r_cur_norm_sq = r_next_norm_sq; + } + + // note: the first element of the && is only there to save compute. + // the residual r is A x - b, and r_cur_norm_sq and r_initial_norm_sq are + // of the form r * r, so it's clear that b * b has the right dimension to + // compare with the residual. + if (r_cur_norm_sq > r_initial_norm_sq && + r_cur_norm_sq > r_initial_norm_sq + 1.0e-10 * VecVec(b, b)) { + KALDI_WARN << "Doing linear CGD in dimension " << A.NumRows() << ", after " << k + << " iterations the squared residual has got worse, " + << r_cur_norm_sq << " > " << r_initial_norm_sq + << ". 
Will do an exact optimization."; + SolverOptions opts("called-from-linearCGD"); + x->CopyFromVec(x_orig); + SolveQuadraticProblem(A, b, opts, x); + } + return k; +} + +// Instantiate the class for float and double. +template +class OptimizeLbfgs; +template +class OptimizeLbfgs; + + +template +int32 LinearCgd(const LinearCgdOptions &opts, + const SpMatrix &A, const VectorBase &b, + VectorBase *x); + +template +int32 LinearCgd(const LinearCgdOptions &opts, + const SpMatrix &A, const VectorBase &b, + VectorBase *x); + +} // end namespace kaldi diff --git a/speechx/speechx/kaldi/matrix/optimization.h b/speechx/speechx/kaldi/matrix/optimization.h new file mode 100644 index 00000000..66309aca --- /dev/null +++ b/speechx/speechx/kaldi/matrix/optimization.h @@ -0,0 +1,248 @@ +// matrix/optimization.h + +// Copyright 2012 Johns Hopkins University (author: Daniel Povey) +// +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. +// +// (*) incorporates, with permission, FFT code from his book +// "Signal Processing with Lapped Transforms", Artech, 1992. 
+ + + +#ifndef KALDI_MATRIX_OPTIMIZATION_H_ +#define KALDI_MATRIX_OPTIMIZATION_H_ + +#include "matrix/kaldi-vector.h" +#include "matrix/kaldi-matrix.h" + +namespace kaldi { + + +/// @addtogroup matrix_optimization +/// @{ + +struct LinearCgdOptions { + int32 max_iters; // Maximum number of iters (if >= 0). + BaseFloat max_error; // Maximum 2-norm of the residual A x - b (convergence + // test) + // Every time the residual 2-norm decreases by this recompute_residual_factor + // since the last time it was computed from scratch, recompute it from + // scratch. This helps to keep the computed residual accurate even in the + // presence of roundoff. + BaseFloat recompute_residual_factor; + + LinearCgdOptions(): max_iters(-1), + max_error(0.0), + recompute_residual_factor(0.01) { } +}; + +/* + This function uses linear conjugate gradient descent to approximately solve + the system A x = b. The value of x at entry corresponds to the initial guess + of x. The algorithm continues until the number of iterations equals b.Dim(), + or until the 2-norm of (A x - b) is <= max_error, or until the number of + iterations equals max_iter, whichever happens sooner. It is a requirement + that A be positive definite. + It returns the number of iterations that were actually executed (this is + useful for testing purposes). +*/ +template +int32 LinearCgd(const LinearCgdOptions &opts, + const SpMatrix &A, const VectorBase &b, + VectorBase *x); + + + + + + +/** + This is an implementation of L-BFGS. It pushes responsibility for + determining when to stop, onto the user. There is no call-back here: + everything is done via calls to the class itself (see the example in + matrix-lib-test.cc). This does not implement constrained L-BFGS, but it will + handle constrained problems correctly as long as the function approaches + +infinity (or -infinity for maximization problems) when it gets close to the + bound of the constraint. 
In these types of problems, you just let the + function value be +infinity for minimization problems, or -infinity for + maximization problems, outside these bounds). +*/ + +struct LbfgsOptions { + bool minimize; // if true, we're minimizing, else maximizing. + int m; // m is the number of stored vectors L-BFGS keeps. + float first_step_learning_rate; // The very first step of L-BFGS is + // like gradient descent. If you want to configure the size of that step, + // you can do it using this variable. + float first_step_length; // If this variable is >0.0, it overrides + // first_step_learning_rate; on the first step we choose an approximate + // Hessian that is the multiple of the identity that would generate this + // step-length, or 1.0 if the gradient is zero. + float first_step_impr; // If this variable is >0.0, it overrides + // first_step_learning_rate; on the first step we choose an approximate + // Hessian that is the multiple of the identity that would generate this + // amount of objective function improvement (assuming the "real" objf + // was linear). + float c1; // A constant in Armijo rule = Wolfe condition i) + float c2; // A constant in Wolfe condition ii) + float d; // An amount > 1.0 (default 2.0) that we initially multiply or + // divide the step length by, in the line search. + int max_line_search_iters; // after this many iters we restart L-BFGS. + int avg_step_length; // number of iters to avg step length over, in + // RecentStepLength(). + + LbfgsOptions (bool minimize = true): + minimize(minimize), + m(10), + first_step_learning_rate(1.0), + first_step_length(0.0), + first_step_impr(0.0), + c1(1.0e-04), + c2(0.9), + d(2.0), + max_line_search_iters(50), + avg_step_length(4) { } +}; + +template +class OptimizeLbfgs { + public: + /// Initializer takes the starting value of x. 
+ OptimizeLbfgs(const VectorBase &x, + const LbfgsOptions &opts); + + /// This returns the value of the variable x that has the best objective + /// function so far, and the corresponding objective function value if + /// requested. This would typically be called only at the end. + const VectorBase& GetValue(Real *objf_value = NULL) const; + + /// This returns the value at which the function wants us + /// to compute the objective function and gradient. + const VectorBase& GetProposedValue() const { return new_x_; } + + /// Returns the average magnitude of the last n steps (but not + /// more than the number we have stored). Before we have taken + /// any steps, returns +infinity. Note: if the most recent + /// step length was 0, it returns 0, regardless of the other + /// step lengths. This makes it suitable as a convergence test + /// (else we'd generate NaN's). + Real RecentStepLength() const; + + /// The user calls this function to provide the class with the + /// function and gradient info at the point GetProposedValue(). + /// If this point is outside the constraints you can set function_value + /// to {+infinity,-infinity} for {minimization,maximization} problems. + /// In this case the gradient, and also the second derivative (if you call + /// the second overloaded version of this function) will be ignored. + void DoStep(Real function_value, + const VectorBase &gradient); + + /// The user can call this version of DoStep() if it is desired to set some + /// kind of approximate Hessian on this iteration. Note: it is a prerequisite + /// that diag_approx_2nd_deriv must be strictly positive (minimizing), or + /// negative (maximizing). + void DoStep(Real function_value, + const VectorBase &gradient, + const VectorBase &diag_approx_2nd_deriv); + + private: + KALDI_DISALLOW_COPY_AND_ASSIGN(OptimizeLbfgs); + + + // The following variable says what stage of the computation we're at. 
+ // Refer to Algorithm 7.5 (L-BFGS) of Nodecdal & Wright, "Numerical + // Optimization", 2nd edition. + // kBeforeStep means we're about to do + /// "compute p_k <-- - H_k \delta f_k" (i.e. Algorithm 7.4). + // kWithinStep means we're at some point within line search; note + // that line search is iterative so we can stay in this state more + // than one time on each iteration. + enum ComputationState { + kBeforeStep, + kWithinStep, // This means we're within the step-size computation, and + // have not yet done the 1st function evaluation. + }; + + inline MatrixIndexT Dim() { return x_.Dim(); } + inline MatrixIndexT M() { return opts_.m; } + SubVector Y(MatrixIndexT i) { + return SubVector(data_, (i % M()) * 2); // vector y_i + } + SubVector S(MatrixIndexT i) { + return SubVector(data_, (i % M()) * 2 + 1); // vector s_i + } + // The following are subroutines within DoStep(): + bool AcceptStep(Real function_value, + const VectorBase &gradient); + void Restart(const VectorBase &x, + Real function_value, + const VectorBase &gradient); + void ComputeNewDirection(Real function_value, + const VectorBase &gradient); + void ComputeHifNeeded(const VectorBase &gradient); + void StepSizeIteration(Real function_value, + const VectorBase &gradient); + void RecordStepLength(Real s); + + + LbfgsOptions opts_; + SignedMatrixIndexT k_; // Iteration number, starts from zero. Gets set back to zero + // when we restart. + + ComputationState computation_state_; + bool H_was_set_; // True if the user specified H_; if false, + // we'll use a heuristic to estimate it. + + + Vector x_; // current x. + Vector new_x_; // the x proposed in the line search. + Vector best_x_; // the x with the best objective function so far + // (either the same as x_ or something in the current line search.) + Vector deriv_; // The most recently evaluated derivative-- at x_k. + Vector temp_; + Real f_; // The function evaluated at x_k. + Real best_f_; // the best objective function so far. 
+ Real d_; // a number d > 1.0, but during an iteration we may decrease this, when + // we switch between armijo and wolfe failures. + + int num_wolfe_i_failures_; // the num times we decreased step size. + int num_wolfe_ii_failures_; // the num times we increased step size. + enum { kWolfeI, kWolfeII, kNone } last_failure_type_; // last type of step-search + // failure on this iter. + + Vector H_; // Current inverse-Hessian estimate. May be computed by this class itself, + // or provided by user using 2nd form of SetGradientInfo(). + Matrix data_; // dimension (m*2) x dim. Even rows store + // gradients y_i, odd rows store steps s_i. + Vector rho_; // dimension m; rho_(m) = 1/(y_m^T s_m), Eq. 7.17. + + std::vector step_lengths_; // The step sizes we took on the last + // (up to m) iterations; these are not stored in a rotating buffer but + // are shifted by one each time (this is more convenient when we + // restart, as we keep this info past restarting). + + +}; + +/// @} + + +} // end namespace kaldi + + + +#endif + diff --git a/speechx/speechx/kaldi/matrix/packed-matrix.cc b/speechx/speechx/kaldi/matrix/packed-matrix.cc new file mode 100644 index 00000000..80bf5891 --- /dev/null +++ b/speechx/speechx/kaldi/matrix/packed-matrix.cc @@ -0,0 +1,438 @@ +// matrix/packed-matrix.cc + +// Copyright 2009-2012 Microsoft Corporation Saarland University +// Johns Hopkins University (Author: Daniel Povey); +// Haihua Xu + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. +/** + * @file packed-matrix.cc + * + * Implementation of specialized PackedMatrix template methods + */ +#include "matrix/cblas-wrappers.h" +#include "matrix/packed-matrix.h" +#include "matrix/kaldi-vector.h" + +namespace kaldi { + +template +void PackedMatrix::Scale(Real alpha) { + size_t nr = num_rows_, + sz = (nr * (nr + 1)) / 2; + cblas_Xscal(sz, alpha, data_, 1); +} + +template +void PackedMatrix::AddPacked(const Real alpha, const PackedMatrix &rMa) { + KALDI_ASSERT(num_rows_ == rMa.NumRows()); + size_t nr = num_rows_, + sz = (nr * (nr + 1)) / 2; + cblas_Xaxpy(sz, alpha, rMa.Data(), 1, data_, 1); +} + +template +void PackedMatrix::SetRandn() { + Real *data = data_; + size_t dim = num_rows_, size = ((dim*(dim+1))/2); + for (size_t i = 0; i < size; i++) + data[i] = RandGauss(); +} + +template +inline void PackedMatrix::Init(MatrixIndexT r) { + if (r == 0) { + num_rows_ = 0; + data_ = 0; + return; + } + size_t size = ((static_cast(r) * static_cast(r + 1)) / 2); + + if (static_cast(static_cast(size)) != size) { + KALDI_WARN << "Allocating packed matrix whose full dimension does not fit " + << "in MatrixIndexT: not all code is tested for this case."; + } + + void *data; // aligned memory block + void *temp; + + if ((data = KALDI_MEMALIGN(16, size * sizeof(Real), &temp)) != NULL) { + this->data_ = static_cast (data); + this->num_rows_ = r; + } else { + throw std::bad_alloc(); + } +} + +template +void PackedMatrix::Swap(PackedMatrix *other) { + std::swap(data_, other->data_); + 
std::swap(num_rows_, other->num_rows_); +} + +template +void PackedMatrix::Swap(Matrix *other) { + std::swap(data_, other->data_); + std::swap(num_rows_, other->num_rows_); +} + + +template +void PackedMatrix::Resize(MatrixIndexT r, MatrixResizeType resize_type) { + // the next block uses recursion to handle what we have to do if + // resize_type == kCopyData. + if (resize_type == kCopyData) { + if (this->data_ == NULL || r == 0) resize_type = kSetZero; // nothing to copy. + else if (this->num_rows_ == r) { return; } // nothing to do. + else { + // set tmp to a packed matrix of the desired size. + PackedMatrix tmp(r, kUndefined); + size_t r_min = std::min(r, num_rows_); + size_t mem_size_min = sizeof(Real) * (r_min*(r_min+1))/2, + mem_size_full = sizeof(Real) * (r*(r+1))/2; + // Copy the contents to tmp. + memcpy(tmp.data_, data_, mem_size_min); + char *ptr = static_cast(static_cast(tmp.data_)); + // Set the rest of the contents of tmp to zero. + memset(static_cast(ptr + mem_size_min), 0, mem_size_full-mem_size_min); + tmp.Swap(this); + return; + } + } + if (data_ != NULL) Destroy(); + Init(r); + if (resize_type == kSetZero) SetZero(); +} + + + +template +void PackedMatrix::AddToDiag(Real r) { + Real *ptr = data_; + for (MatrixIndexT i = 2; i <= num_rows_+1; i++) { + *ptr += r; + ptr += i; + } +} + +template +void PackedMatrix::ScaleDiag(Real alpha) { + Real *ptr = data_; + for (MatrixIndexT i = 2; i <= num_rows_+1; i++) { + *ptr *= alpha; + ptr += i; + } +} + +template +void PackedMatrix::SetDiag(Real alpha) { + Real *ptr = data_; + for (MatrixIndexT i = 2; i <= num_rows_+1; i++) { + *ptr = alpha; + ptr += i; + } +} + + + +template +template +void PackedMatrix::CopyFromPacked(const PackedMatrix &orig) { + KALDI_ASSERT(NumRows() == orig.NumRows()); + if (sizeof(Real) == sizeof(OtherReal)) { + memcpy(data_, orig.Data(), SizeInBytes()); + } else { + Real *dst = data_; + const OtherReal *src = orig.Data(); + size_t nr = NumRows(), + size = (nr * (nr + 1)) / 2; + for 
(size_t i = 0; i < size; i++, dst++, src++) + *dst = *src; + } +} + +// template instantiations. +template +void PackedMatrix::CopyFromPacked(const PackedMatrix &orig); +template +void PackedMatrix::CopyFromPacked(const PackedMatrix &orig); +template +void PackedMatrix::CopyFromPacked(const PackedMatrix &orig); +template +void PackedMatrix::CopyFromPacked(const PackedMatrix &orig); + + + +template +template +void PackedMatrix::CopyFromVec(const SubVector &vec) { + MatrixIndexT size = (NumRows()*(NumRows()+1)) / 2; + KALDI_ASSERT(vec.Dim() == size); + if (sizeof(Real) == sizeof(OtherReal)) { + memcpy(data_, vec.Data(), size * sizeof(Real)); + } else { + Real *dst = data_; + const OtherReal *src = vec.Data(); + for (MatrixIndexT i = 0; i < size; i++, dst++, src++) + *dst = *src; + } +} + +// template instantiations. +template +void PackedMatrix::CopyFromVec(const SubVector &orig); +template +void PackedMatrix::CopyFromVec(const SubVector &orig); +template +void PackedMatrix::CopyFromVec(const SubVector &orig); +template +void PackedMatrix::CopyFromVec(const SubVector &orig); + + + +template +void PackedMatrix::SetZero() { + memset(data_, 0, SizeInBytes()); +} + +template +void PackedMatrix::SetUnit() { + memset(data_, 0, SizeInBytes()); + for (MatrixIndexT row = 0;row < num_rows_;row++) + (*this)(row, row) = 1.0; +} + +template +Real PackedMatrix::Trace() const { + Real ans = 0.0; + for (MatrixIndexT row = 0;row < num_rows_;row++) + ans += (*this)(row, row); + return ans; +} + +template +void PackedMatrix::Destroy() { + // we need to free the data block if it was defined + if (data_ != NULL) KALDI_MEMALIGN_FREE(data_); + data_ = NULL; + num_rows_ = 0; +} + + +template +void PackedMatrix::Write(std::ostream &os, bool binary) const { + if (!os.good()) { + KALDI_ERR << "Failed to write vector to stream: stream not good"; + } + + int32 size = this->NumRows(); // make the size 32-bit on disk. 
+ KALDI_ASSERT(this->NumRows() == (MatrixIndexT) size); + MatrixIndexT num_elems = ((size+1)*(MatrixIndexT)size)/2; + + if(binary) { + std::string my_token = (sizeof(Real) == 4 ? "FP" : "DP"); + WriteToken(os, binary, my_token); + WriteBasicType(os, binary, size); + // We don't use the built-in Kaldi write routines for the floats, as they are + // not efficient enough. + os.write((const char*) data_, sizeof(Real) * num_elems); + } + else { + if(size == 0) + os<<"[ ]\n"; + else { + os<<"[\n"; + MatrixIndexT i = 0; + for (int32 j = 0; j < size; j++) { + for (int32 k = 0; k < j + 1; k++) { + WriteBasicType(os, binary, data_[i++]); + } + os << ( (j==size-1)? "]\n" : "\n"); + } + KALDI_ASSERT(i == num_elems); + } + } + if (os.fail()) { + KALDI_ERR << "Failed to write packed matrix to stream"; + } +} + +// template +// void Save (std::ostream & os, const PackedMatrix& rM) +// { +// const Real* p_elem = rM.data(); +// for (MatrixIndexT i = 0; i < rM.NumRows(); i++) { +// for (MatrixIndexT j = 0; j <= i ; j++) { +// os << *p_elem; +// p_elem++; +// if (j == i) { +// os << '\n'; +// } +// else { +// os << ' '; +// } +// } +// } +// if (os.fail()) +// KALDI_ERR("Failed to write packed matrix to stream"); +// } + + + + + +template +void PackedMatrix::Read(std::istream& is, bool binary, bool add) { + if (add) { + PackedMatrix tmp; + tmp.Read(is, binary, false); // read without adding. + if (this->NumRows() == 0) this->Resize(tmp.NumRows()); + else { + if (this->NumRows() != tmp.NumRows()) { + if (tmp.NumRows() == 0) return; // do nothing in this case. + else KALDI_ERR << "PackedMatrix::Read, size mismatch " << this->NumRows() + << " vs. " << tmp.NumRows(); + } + } + this->AddPacked(1.0, tmp); + return; + } // now assume add == false. + + std::ostringstream specific_error; + MatrixIndexT pos_at_start = is.tellg(); + int peekval = Peek(is, binary); + const char *my_token = (sizeof(Real) == 4 ? 
"FP" : "DP"); + const char *new_format_token = "["; + bool is_new_format = false;//added by hxu + char other_token_start = (sizeof(Real) == 4 ? 'D' : 'F'); + int32 size; + MatrixIndexT num_elems; + + if (peekval == other_token_start) { // need to instantiate the other type to read it. + typedef typename OtherReal::Real OtherType; // if Real == float, OtherType == double, and vice versa. + PackedMatrix other(this->NumRows()); + other.Read(is, binary, false); // add is false at this point. + this->Resize(other.NumRows()); + this->CopyFromPacked(other); + return; + } + std::string token; + ReadToken(is, binary, &token); + if (token != my_token) { + if(token != new_format_token) { + specific_error << ": Expected token " << my_token << ", got " << token; + goto bad; + } + //new format it is + is_new_format = true; + } + if(!is_new_format) { + ReadBasicType(is, binary, &size); // throws on error. + if ((MatrixIndexT)size != this->NumRows()) { + KALDI_ASSERT(size>=0); + this->Resize(size); + } + num_elems = ((size+1)*(MatrixIndexT)size)/2; + if (!binary) { + for (MatrixIndexT i = 0; i < num_elems; i++) { + ReadBasicType(is, false, data_+i); // will throw on error. + } + } else { + if (num_elems) + is.read(reinterpret_cast(data_), sizeof(Real)*num_elems); + } + if (is.fail()) goto bad; + return; + } + else { + std::vector data; + while(1) { + int32 num_lines = 0; + int i = is.peek(); + if (i == -1) { specific_error << "Got EOF while reading matrix data"; goto bad; } + else if (static_cast(i) == ']') { // Finished reading matrix. + is.get(); // eat the "]". + i = is.peek(); + if (static_cast(i) == '\r') { + is.get(); + is.get(); // get \r\n (must eat what we wrote) + }// I don't actually understand what it's doing here + else if (static_cast(i) == '\n') { is.get(); } // get \n (must eat what we wrote) + + if (is.fail()) { + KALDI_WARN << "After end of matrix data, read error."; + // we got the data we needed, so just warn for this error. 
+ } + //now process the data: + num_lines = int32(sqrt(data.size()*2)); + + KALDI_ASSERT(data.size() == num_lines*(num_lines+1)/2); + + this->Resize(num_lines); + + //std::cout<= '0' && i <= '9') || i == '-' ) { // A number... + Real r; + is >> r; + if (is.fail()) { + specific_error << "Stream failure/EOF while reading matrix data."; + goto bad; + } + data.push_back(r); + } + else if (isspace(i)) { + is.get(); // eat the space and do nothing. + } else { // NaN or inf or error. + std::string str; + is >> str; + if (!KALDI_STRCASECMP(str.c_str(), "inf") || + !KALDI_STRCASECMP(str.c_str(), "infinity")) { + data.push_back(std::numeric_limits::infinity()); + KALDI_WARN << "Reading infinite value into matrix."; + } else if (!KALDI_STRCASECMP(str.c_str(), "nan")) { + data.push_back(std::numeric_limits::quiet_NaN()); + KALDI_WARN << "Reading NaN value into matrix."; + } else { + specific_error << "Expecting numeric matrix data, got " << str; + goto bad; + } + } + } + } +bad: + KALDI_ERR << "Failed to read packed matrix from stream. " << specific_error.str() + << " File position at start is " + << pos_at_start << ", currently " << is.tellg(); +} + + +// Instantiate PackedMatrix for float and double. +template +class PackedMatrix; + +template +class PackedMatrix; + + +} // namespace kaldi + diff --git a/speechx/speechx/kaldi/matrix/packed-matrix.h b/speechx/speechx/kaldi/matrix/packed-matrix.h new file mode 100644 index 00000000..722d932b --- /dev/null +++ b/speechx/speechx/kaldi/matrix/packed-matrix.h @@ -0,0 +1,197 @@ +// matrix/packed-matrix.h + +// Copyright 2009-2013 Ondrej Glembek; Lukas Burget; Microsoft Corporation; +// Saarland University; Yanmin Qian; +// Johns Hopkins University (Author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_MATRIX_PACKED_MATRIX_H_ +#define KALDI_MATRIX_PACKED_MATRIX_H_ + +#include "matrix/matrix-common.h" +#include + +namespace kaldi { + +/// \addtogroup matrix_funcs_io +// we need to declare the friend << operator here +template +std::ostream & operator <<(std::ostream & out, const PackedMatrix& M); + + +/// \addtogroup matrix_group +/// @{ + +/// @brief Packed matrix: base class for triangular and symmetric matrices. +template class PackedMatrix { + friend class CuPackedMatrix; + public: + //friend class CuPackedMatrix; + + PackedMatrix() : data_(NULL), num_rows_(0) {} + + explicit PackedMatrix(MatrixIndexT r, MatrixResizeType resize_type = kSetZero): + data_(NULL) { Resize(r, resize_type); } + + explicit PackedMatrix(const PackedMatrix &orig) : data_(NULL) { + Resize(orig.num_rows_, kUndefined); + CopyFromPacked(orig); + } + + template + explicit PackedMatrix(const PackedMatrix &orig) : data_(NULL) { + Resize(orig.NumRows(), kUndefined); + CopyFromPacked(orig); + } + + void SetZero(); /// < Set to zero + void SetUnit(); /// < Set to unit matrix. + void SetRandn(); /// < Set to random values of a normal distribution + + Real Trace() const; + + // Needed for inclusion in std::vector + PackedMatrix & operator =(const PackedMatrix &other) { + Resize(other.NumRows()); + CopyFromPacked(other); + return *this; + } + + ~PackedMatrix() { + Destroy(); + } + + /// Set packed matrix to a specified size (can be zero). 
+ /// The value of the new data depends on resize_type: + /// -if kSetZero, the new data will be zero + /// -if kUndefined, the new data will be undefined + /// -if kCopyData, the new data will be the same as the old data in any + /// shared positions, and zero elsewhere. + /// This function takes time proportional to the number of data elements. + void Resize(MatrixIndexT nRows, MatrixResizeType resize_type = kSetZero); + + void AddToDiag(const Real r); // Adds r to diaginal + + void ScaleDiag(const Real alpha); // Scales diagonal by alpha. + + void SetDiag(const Real alpha); // Sets diagonal to this value. + + template + void CopyFromPacked(const PackedMatrix &orig); + + /// CopyFromVec just interprets the vector as having the same layout + /// as the packed matrix. Must have the same dimension, i.e. + /// orig.Dim() == (NumRows()*(NumRows()+1)) / 2; + template + void CopyFromVec(const SubVector &orig); + + Real* Data() { return data_; } + const Real* Data() const { return data_; } + inline MatrixIndexT NumRows() const { return num_rows_; } + inline MatrixIndexT NumCols() const { return num_rows_; } + size_t SizeInBytes() const { + size_t nr = static_cast(num_rows_); + return ((nr * (nr+1)) / 2) * sizeof(Real); + } + + //MatrixIndexT Stride() const { return stride_; } + + // This code is duplicated in child classes to avoid extra levels of calls. + Real operator() (MatrixIndexT r, MatrixIndexT c) const { + KALDI_ASSERT(static_cast(r) < + static_cast(num_rows_) && + static_cast(c) < + static_cast(num_rows_) + && c <= r); + return *(data_ + (r * (r + 1)) / 2 + c); + } + + // This code is duplicated in child classes to avoid extra levels of calls. 
+ Real &operator() (MatrixIndexT r, MatrixIndexT c) { + KALDI_ASSERT(static_cast(r) < + static_cast(num_rows_) && + static_cast(c) < + static_cast(num_rows_) + && c <= r); + return *(data_ + (r * (r + 1)) / 2 + c); + } + + Real Max() const { + KALDI_ASSERT(num_rows_ > 0); + return * (std::max_element(data_, data_ + ((num_rows_*(num_rows_+1))/2) )); + } + + Real Min() const { + KALDI_ASSERT(num_rows_ > 0); + return * (std::min_element(data_, data_ + ((num_rows_*(num_rows_+1))/2) )); + } + + void Scale(Real c); + + friend std::ostream & operator << <> (std::ostream & out, + const PackedMatrix &m); + // Use instead of stream<<*this, if you want to add to existing contents. + // Will throw exception on failure. + void Read(std::istream &in, bool binary, bool add = false); + + void Write(std::ostream &out, bool binary) const; + + void Destroy(); + + /// Swaps the contents of *this and *other. Shallow swap. + void Swap(PackedMatrix *other); + void Swap(Matrix *other); + + + protected: + // Will only be called from this class or derived classes. + void AddPacked(const Real alpha, const PackedMatrix& M); + Real *data_; + MatrixIndexT num_rows_; + //MatrixIndexT stride_; + private: + /// Init assumes the current contents of the class are is invalid (i.e. junk or + /// has already been freed), and it sets the matrixd to newly allocated memory + /// with the specified dimension. dim == 0 is acceptable. The memory contents + /// pointed to by data_ will be undefined. 
+ void Init(MatrixIndexT dim); + +}; +/// @} end "addtogroup matrix_group" + + +/// \addtogroup matrix_funcs_io +/// @{ + +template +std::ostream & operator << (std::ostream & os, const PackedMatrix& M) { + M.Write(os, false); + return os; +} + +template +std::istream & operator >> (std::istream &is, PackedMatrix &M) { + M.Read(is, false); + return is; +} + +/// @} + +} // namespace kaldi + +#endif + diff --git a/speechx/speechx/kaldi/matrix/qr.cc b/speechx/speechx/kaldi/matrix/qr.cc new file mode 100644 index 00000000..861dead0 --- /dev/null +++ b/speechx/speechx/kaldi/matrix/qr.cc @@ -0,0 +1,580 @@ +// matrix/qr.cc + +// Copyright 2012 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "matrix/sp-matrix.h" +#include "matrix/kaldi-vector.h" +#include "matrix/kaldi-matrix.h" +#include "matrix/matrix-functions.h" +#include "matrix/cblas-wrappers.h" + +// This file contains an implementation of the Symmetric QR Algorithm +// for the symmetric eigenvalue problem. See Golub and Van Loan, +// 3rd ed., Algorithm 8.3.3. + +namespace kaldi { + + +/* This is from Golub and Van Loan 3rd ed., sec. 5.1.3, + p210. + x is the input of dimenson 'dim', v is the output of dimension + dim, and beta is a scalar. Note: we use zero-based + not one-based indexing. 
*/ +/* +// We are commenting out the function below ("House") because it's not +// needed, but we keep it just to show how we came up with HouseBackward. +template +void House(MatrixIndexT dim, const Real *x, Real *v, Real *beta) { + KALDI_ASSERT(dim > 0); + // To avoid overflow, we first compute the max of x_ (or + // one if that's zero, and we'll replace "x" by x/max(x_i) + // below. The householder vector is anyway invariant to + // the magnitude of x. We could actually avoid this extra loop + // over x if we wanted to be a bit smarter, but anyway this + // doesn't dominate the O(N) performance of the algorithm. + Real s; // s is a scale on x. + { + Real max_x = std::numeric_limits::min(); + for (MatrixIndexT i = 0; i < dim; i++) + max_x = std::max(max_x, (x[i] < 0 ? -x[i] : x[i])); + if (max_x == 0.0) max_x = 1.0; + s = 1.0 / max_x; + } + + Real sigma = 0.0; + v[0] = 1.0; + for (MatrixIndexT i = 1; i < dim; i++) { + sigma += (x[i]*s) * (x[i]*s); + v[i] = x[i]*s; + } + if (sigma == 0.0) *beta = 0.0; + else { + // When we say x1 = x[0], we reference the one-based indexing + // in Golub and Van Loan. + Real x1 = x[0] * s, mu = std::sqrt(x1*x1 + sigma); + if (x1 <= 0) { + v[0] = x1 - mu; + } else { + v[0] = -sigma / (x1 + mu); + KALDI_ASSERT(KALDI_ISFINITE(v[dim-1])); + } + Real v1 = v[0]; + Real v1sq = v1 * v1; + *beta = 2 * v1sq / (sigma + v1sq); + Real inv_v1 = 1.0 / v1; + if (KALDI_ISINF(inv_v1)) { + // can happen if v1 is denormal. + KALDI_ASSERT(v1 == v1 && v1 != 0.0); + for (MatrixIndexT i = 0; i < dim; i++) v[i] /= v1; + } else { + cblas_Xscal(dim, inv_v1, v, 1); + } + if (KALDI_ISNAN(inv_v1)) { + KALDI_ERR << "NaN encountered in HouseBackward"; + } + } +} +*/ + +// This is a backward version of the "House" routine above: +// backward because it's the last index, not the first index of +// the vector that is "special". 
This is convenient in +// the Tridiagonalize routine that uses reversed indexes for +// compatibility with the packed lower triangular format. +template +void HouseBackward(MatrixIndexT dim, const Real *x, Real *v, Real *beta) { + KALDI_ASSERT(dim > 0); + // To avoid overflow, we first compute the max of x_ (or + // one if that's zero, and we'll replace "x" by x/max(x_i) + // below. The householder vector is anyway invariant to + // the magnitude of x. We could actually avoid this extra loop + // over x if we wanted to be a bit smarter, but anyway this + // doesn't dominate the O(N) performance of the algorithm. + Real s; // s is a scale on x. + { + Real max_x = std::numeric_limits::min(); + for (MatrixIndexT i = 0; i < dim; i++) + max_x = std::max(max_x, (x[i] < 0 ? -x[i] : x[i])); + s = 1.0 / max_x; + } + Real sigma = 0.0; + v[dim-1] = 1.0; + for (MatrixIndexT i = 0; i + 1 < dim; i++) { + sigma += (x[i] * s) * (x[i] * s); + v[i] = x[i] * s; + } + KALDI_ASSERT(KALDI_ISFINITE(sigma) && + "Tridiagonalizing matrix that is too large or has NaNs."); + if (sigma == 0.0) *beta = 0.0; + else { + Real x1 = x[dim-1] * s, mu = std::sqrt(x1 * x1 + sigma); + if (x1 <= 0) { + v[dim-1] = x1 - mu; + } else { + v[dim-1] = -sigma / (x1 + mu); + KALDI_ASSERT(KALDI_ISFINITE(v[dim-1])); + } + Real v1 = v[dim-1]; + Real v1sq = v1 * v1; + *beta = 2 * v1sq / (sigma + v1sq); + Real inv_v1 = 1.0 / v1; + if (KALDI_ISINF(inv_v1)) { + // can happen if v1 is denormal. + KALDI_ASSERT(v1 == v1 && v1 != 0.0); + for (MatrixIndexT i = 0; i < dim; i++) v[i] /= v1; + } else { + cblas_Xscal(dim, inv_v1, v, 1); + } + if (KALDI_ISNAN(inv_v1)) { + KALDI_ERR << "NaN encountered in HouseBackward"; + } + } +} + + +/** + This routine tridiagonalizes *this. C.f. Golub and Van Loan 3rd ed., sec. + 8.3.1 (p415). We reverse the order of the indices as it's more natural + with packed lower-triangular matrices to do it this way. 
There's also + a shift from one-based to zero-based indexing, so the index + k is transformed k -> n - k, and a corresponding transpose... + + Let the original *this be A. This algorithms replaces *this with + a tridiagonal matrix T such that T = Q A Q^T for an orthogonal Q. + Caution: Q is transposed vs. Golub and Van Loan. + If Q != NULL it outputs Q. +*/ +template +void SpMatrix::Tridiagonalize(MatrixBase *Q) { + MatrixIndexT n = this->NumRows(); + KALDI_ASSERT(Q == NULL || (Q->NumRows() == n && + Q->NumCols() == n)); + if (Q != NULL) Q->SetUnit(); + Real *data = this->Data(); + Real *qdata = (Q == NULL ? NULL : Q->Data()); + MatrixIndexT qstride = (Q == NULL ? 0 : Q->Stride()); + Vector tmp_v(n-1), tmp_p(n); + Real beta, *v = tmp_v.Data(), *p = tmp_p.Data(), *w = p, *x = p; + for (MatrixIndexT k = n-1; k >= 2; k--) { + MatrixIndexT ksize = ((k+1)*k)/2; + // ksize is the packed size of the lower-triangular matrix of size k, + // which is the size of "all rows previous to this one." + Real *Arow = data + ksize; // In Golub+Van Loan it was A(k+1:n, k), we + // have Arow = A(k, 0:k-1). + HouseBackward(k, Arow, v, &beta); // sets v and beta. + cblas_Xspmv(k, beta, data, v, 1, 0.0, p, 1); // p = beta * A(0:k-1,0:k-1) v + Real minus_half_beta_pv = -0.5 * beta * cblas_Xdot(k, p, 1, v, 1); + cblas_Xaxpy(k, minus_half_beta_pv, v, 1, w, 1); // w = p - (beta p^T v/2) v; + // this relies on the fact that w and p are the same pointer. + // We're doing A(k, k-1) = ||Arow||. It happens that this element + // is indexed at ksize + k - 1 in the packed lower-triangular format. + data[ksize + k - 1] = std::sqrt(cblas_Xdot(k, Arow, 1, Arow, 1)); + for (MatrixIndexT i = 0; i + 1 < k; i++) + data[ksize + i] = 0; // This is not in Golub and Van Loan but is + // necessary if we're not using parts of A to store the Householder + // vectors. + // We're doing A(0:k-1,0:k-1) -= (v w' + w v') + cblas_Xspr2(k, -1.0, v, 1, w, 1, data); + if (Q != NULL) { // C.f. Golub, Q is H_1 .. H_n-2... 
in this + // case we apply them in the opposite order so it's H_n-1 .. H_1, + // but also Q is transposed so we really have Q = H_1 .. H_n-1. + // It's a double negative. + // Anyway, we left-multiply Q by each one. The H_n would each be + // diag(I + beta v v', I) but we don't ever touch the last dims. + // We do (in Matlab notation): + // Q(0:k-1,:) = (I - beta v v') * Q, i.e.: + // Q(:,0:i-1) += -beta v (v' Q(:,0:k-1)v .. let x = -beta Q(0:k-1,:)^T v. + cblas_Xgemv(kTrans, k, n, -beta, qdata, qstride, v, 1, 0.0, x, 1); + // now x = -beta Q(:,0:k-1) v. + // The next line does: Q(:,0:k-1) += v x'. + cblas_Xger(k, n, 1.0, v, 1, x, 1, qdata, qstride); + } + } +} + +// Instantiate these functions, as it wasn't implemented in sp-matrix.cc +// where we instantiated the whole class. +template +void SpMatrix::Tridiagonalize(MatrixBase *Q); +template +void SpMatrix::Tridiagonalize(MatrixBase *Q); + +/// Create Givens rotations, as in Golub and Van Loan 3rd ed., page 216. +template +inline void Givens(Real a, Real b, Real *c, Real *s) { + if (b == 0) { + *c = 1; + *s = 0; + } else { + if (std::abs(b) > std::abs(a)) { + Real tau = -a / b; + *s = 1 / std::sqrt(1 + tau*tau); + *c = *s * tau; + } else { + Real tau = -b / a; + *c = 1 / std::sqrt(1 + tau*tau); + *s = *c * tau; + } + } +} + + +// Some internal code for the QR algorithm: one "QR step". +// This is Golub and Van Loan 3rd ed., Algorithm 8.3.2 "Implicit Symmetric QR step +// with Wilkinson shift." A couple of differences: this code is +// in zero based arithmetic, and we represent Q transposed from +// their Q for memory locality with row-major-indexed matrices. +template +void QrStep(MatrixIndexT n, + Real *diag, + Real *off_diag, + MatrixBase *Q) { + KALDI_ASSERT(n >= 2); + // below, "scale" could be any number; we introduce it to keep the + // floating point quantities within a good range. 
+ Real d = (diag[n-2] - diag[n-1]) / 2.0, + t = off_diag[n-2], + inv_scale = std::max(std::max(std::abs(d), std::abs(t)), + std::numeric_limits::min()), + scale = 1.0 / inv_scale, + d_scaled = d * scale, + off_diag_n2_scaled = off_diag[n-2] * scale, + t2_n_n1_scaled = off_diag_n2_scaled * off_diag_n2_scaled, + sgn_d = (d > 0.0 ? 1.0 : -1.0), + mu = diag[n-1] - inv_scale * t2_n_n1_scaled / + (d_scaled + sgn_d * std::sqrt(d_scaled * d_scaled + t2_n_n1_scaled)), + x = diag[0] - mu, + z = off_diag[0]; + KALDI_ASSERT(KALDI_ISFINITE(x)); + Real *Qdata = (Q == NULL ? NULL : Q->Data()); + MatrixIndexT Qstride = (Q == NULL ? 0 : Q->Stride()), + Qcols = (Q == NULL ? 0 : Q->NumCols()); + for (MatrixIndexT k = 0; k < n-1; k++) { + Real c, s; + Givens(x, z, &c, &s); + // Rotate dimensions k and k+1 with the Givens matrix G, as + // T <== G^T T G. + // In 2d, a Givens matrix is [ c s; -s c ]. Forget about + // the dimension-indexing issues and assume we have a 2x2 + // symmetric matrix [ p q ; q r ] + // We ask our friends at Wolfram Alpha about + // { { c, -s}, {s, c} } * { {p, q}, {q, r} } * { { c, s}, {-s, c} } + // Interpreting the result as [ p', q' ; q', r ] + // p' = c (c p - s q) - s (c q - s r) + // q' = s (c p - s q) + c (c q - s r) + // r' = s (s p + c q) + c (s q + c r) + Real p = diag[k], q = off_diag[k], r = diag[k+1]; + // p is element k,k; r is element k+1,k+1; q is element k,k+1 or k+1,k. + // We'll let the compiler optimize this. + diag[k] = c * (c*p - s*q) - s * (c*q - s*r); + off_diag[k] = s * (c*p - s*q) + c * (c*q - s*r); + diag[k+1] = s * (s*p + c*q) + c * (s*q + c*r); + + // We also have some other elements to think of that + // got rotated in a simpler way: if k>0, + // then element (k, k-1) and (k+1, k-1) get rotated. Here, + // element k+1, k-1 will be present as z; it's the out-of-band + // element that we remembered from last time. This is + // on the left as it's the row indexes that differ, so think of + // this as being premultiplied by G^T. 
In fact we're multiplying + // T by in some sense the opposite/transpose of the Givens rotation. + if (k > 0) { // Note, in rotations, going backward, (x,y) -> ((cx - sy), (sx + cy)) + Real &elem_k_km1 = off_diag[k-1], + elem_kp1_km1 = z; // , tmp = elem_k_km1; + elem_k_km1 = c*elem_k_km1 - s*elem_kp1_km1; + // The next line will set elem_kp1_km1 to zero and we'll never access this + // value, so we comment it out. + // elem_kp1_km1 = s*tmp + c*elem_kp1_km1; + } + if (Q != NULL) + cblas_Xrot(Qcols, Qdata + k*Qstride, 1, + Qdata + (k+1)*Qstride, 1, c, -s); + if (k < n-2) { + // Next is the elements (k+2, k) and (k+2, k-1), to be rotated, again + // backwards. + Real &elem_kp2_k = z, + &elem_kp2_kp1 = off_diag[k+1]; + // Note: elem_kp2_k == z would start off as zero because it's + // two off the diagonal, and not been touched yet. Therefore + // we eliminate it in expressions below, commenting it out. + // If we didn't do this we should set it to zero first. + elem_kp2_k = - s * elem_kp2_kp1; // + c*elem_kp2_k + elem_kp2_kp1 = c * elem_kp2_kp1; // + s*elem_kp2_k (original value). + // The next part is from the algorithm they describe: x = t_{k+1,k} + x = off_diag[k]; + } + } +} + + +// Internal code for the QR algorithm, where the diagonal +// and off-diagonal of the symmetric matrix are represented as +// vectors of length n and n-1. +template +void QrInternal(MatrixIndexT n, + Real *diag, + Real *off_diag, + MatrixBase *Q) { + KALDI_ASSERT(Q == NULL || Q->NumCols() == n); // We may + // later relax the condition that Q->NumCols() == n. + + MatrixIndexT counter = 0, max_iters = 500 + 4*n, // Should never take this many iters. + large_iters = 100 + 2*n; + Real epsilon = (pow(2.0, sizeof(Real) == 4 ? -23.0 : -52.0)); + + for (; counter < max_iters; counter++) { // this takes the place of "until + // q=n"... we'll break out of the + // loop when we converge. 
+ if (counter == large_iters || + (counter > large_iters && (counter - large_iters) % 50 == 0)) { + KALDI_WARN << "Took " << counter + << " iterations in QR (dim is " << n << "), doubling epsilon."; + SubVector d(diag, n), o(off_diag, n-1); + KALDI_WARN << "Diag, off-diag are " << d << " and " << o; + epsilon *= 2.0; + } + for (MatrixIndexT i = 0; i+1 < n; i++) { + if (std::abs(off_diag[i]) <= epsilon * + (std::abs(diag[i]) + std::abs(diag[i+1]))) + off_diag[i] = 0.0; + } + // The next code works out p, q, and npq which is n - p - q. + // For the definitions of q and p, see Golub and Van Loan; we + // partition the n dims into pieces of size (p, n-p-q, q) where + // the part of size q is diagonal and the part of size n-p-p is + // "unreduced", i.e. has no zero off-diagonal elements. + MatrixIndexT q = 0; + // Note: below, "n-q < 2" should more clearly be "n-2-q < 0", but that + // causes problems if MatrixIndexT is unsigned. + while (q < n && (n-q < 2 || off_diag[n-2-q] == 0.0)) + q++; + if (q == n) break; // we're done. It's diagonal. + KALDI_ASSERT(n - q >= 2); + MatrixIndexT npq = 2; // Value of n - p - q, where n - p - q must be + // unreduced. This is the size of "middle" band of elements. If q != n, + // we must have hit a nonzero off-diag element, so the size of this + // band must be at least two. + while (npq + q < n && (n-q-npq-1 < 0 || off_diag[n-q-npq-1] != 0.0)) + npq++; + MatrixIndexT p = n - q - npq; + { // Checks. + for (MatrixIndexT i = 0; i+1 < npq; i++) + KALDI_ASSERT(off_diag[p + i] != 0.0); + for (MatrixIndexT i = 0; i+1 < q; i++) + KALDI_ASSERT(off_diag[p + npq - 1 + i] == 0.0); + if (p > 1) // Something must have stopped npq from growing further.. + KALDI_ASSERT(off_diag[p-1] == 0.0); // so last off-diag elem in + // group of size p must be zero. + } + + if (Q != NULL) { + // Do one QR step on the middle part of Q only. + // Qpart will be a subset of the rows of Q. 
+ SubMatrix Qpart(*Q, p, npq, 0, Q->NumCols()); + QrStep(npq, diag + p, off_diag + p, &Qpart); + } else { + QrStep(npq, diag + p, off_diag + p, + static_cast*>(NULL)); + } + } + if (counter == max_iters) { + KALDI_WARN << "Failure to converge in QR algorithm. " + << "Exiting with partial output."; + } +} + + +/** + This is the symmetric QR algorithm, from Golub and Van Loan 3rd ed., Algorithm + 8.3.3. Q is transposed w.r.t. there, though. +*/ +template +void SpMatrix::Qr(MatrixBase *Q) { + KALDI_ASSERT(this->IsTridiagonal()); + // We envisage that Q would be square but we don't check for this, + // as there are situations where you might not want this. + KALDI_ASSERT(Q == NULL || Q->NumRows() == this->NumRows()); + // Note: the first couple of lines of the algorithm they give would be done + // outside of this function, by calling Tridiagonalize(). + + MatrixIndexT n = this->NumRows(); + Vector diag(n), off_diag(n-1); + for (MatrixIndexT i = 0; i < n; i++) { + diag(i) = (*this)(i, i); + if (i > 0) off_diag(i-1) = (*this)(i, i-1); + } + QrInternal(n, diag.Data(), off_diag.Data(), Q); + // Now set *this to the value represented by diag and off_diag. + this->SetZero(); + for (MatrixIndexT i = 0; i < n; i++) { + (*this)(i, i) = diag(i); + if (i > 0) (*this)(i, i-1) = off_diag(i-1); + } +} + +template +void SpMatrix::Eig(VectorBase *s, MatrixBase *P) const { + MatrixIndexT dim = this->NumRows(); + KALDI_ASSERT(s->Dim() == dim); + KALDI_ASSERT(P == NULL || (P->NumRows() == dim && P->NumCols() == dim)); + + SpMatrix A(*this); // Copy *this, since the tridiagonalization + // and QR decomposition are destructive. + // Note: for efficiency of memory access, the tridiagonalization + // algorithm makes the *rows* of P the eigenvectors, not the columns. + // We'll transpose P before we exit. + // Also note: P may be null if you don't want the eigenvectors. This + // will make this function more efficient. + + A.Tridiagonalize(P); // Tridiagonalizes. + A.Qr(P); // Diagonalizes. 
+ if(P) P->Transpose(); + s->CopyDiagFromPacked(A); +} + + +template +void SpMatrix::TopEigs(VectorBase *s, MatrixBase *P, + MatrixIndexT lanczos_dim) const { + const SpMatrix &S(*this); // call this "S" for easy notation. + MatrixIndexT eig_dim = s->Dim(); // Space of dim we want to retain. + if (lanczos_dim <= 0) + lanczos_dim = std::max(eig_dim + 50, eig_dim + eig_dim/2); + MatrixIndexT dim = this->NumRows(); + if (lanczos_dim >= dim) { + // There would be no speed advantage in using this method, so just + // use the regular approach. + Vector s_tmp(dim); + Matrix P_tmp(dim, dim); + this->Eig(&s_tmp, &P_tmp); + SortSvd(&s_tmp, &P_tmp); + s->CopyFromVec(s_tmp.Range(0, eig_dim)); + P->CopyFromMat(P_tmp.Range(0, dim, 0, eig_dim)); + return; + } + KALDI_ASSERT(eig_dim <= dim && eig_dim > 0); + KALDI_ASSERT(P->NumRows() == dim && P->NumCols() == eig_dim); // each column + // is one eigenvector. + + Matrix Q(lanczos_dim, dim); // The rows of Q will be the + // orthogonal vectors of the Krylov subspace. + + SpMatrix T(lanczos_dim); // This will be equal to Q S Q^T, + // i.e. *this projected into the Krylov subspace. Note: only the + // diagonal and off-diagonal fo T are nonzero, i.e. it's tridiagonal, + // but we don't have access to the low-level algorithms that work + // on that type of matrix (since we want to use ATLAS). So we just + // do normal SVD, on a full matrix; it won't typically dominate. + + Q.Row(0).SetRandn(); + Q.Row(0).Scale(1.0 / Q.Row(0).Norm(2)); + for (MatrixIndexT d = 0; d < lanczos_dim; d++) { + Vector r(dim); + r.AddSpVec(1.0, S, Q.Row(d), 0.0); + // r = S * q_d + MatrixIndexT counter = 0; + Real end_prod; + while (1) { // Normally we'll do this loop only once: + // we repeat to handle cases where r gets very much smaller + // and we want to orthogonalize again. + // We do "full orthogonalization" to preserve stability, + // even though this is usually a waste of time. 
+ Real start_prod = VecVec(r, r); + for (SignedMatrixIndexT e = d; e >= 0; e--) { // e must be signed! + SubVector q_e(Q, e); + Real prod = VecVec(r, q_e); + if (counter == 0 && static_cast(e) + 1 >= d) // Keep T tridiagonal, which + T(d, e) = prod; // mathematically speaking, it is. + r.AddVec(-prod, q_e); // Subtract component in q_e. + } + if (d+1 == lanczos_dim) break; + end_prod = VecVec(r, r); + if (end_prod <= 0.1 * start_prod) { + // also handles case where both are 0. + // We're not confident any more that it's completely + // orthogonal to the rest so we want to re-do. + if (end_prod == 0.0) + r.SetRandn(); // "Restarting". + counter++; + if (counter > 100) + KALDI_ERR << "Loop detected in Lanczos iteration."; + } else { + break; + } + } + if (d+1 != lanczos_dim) { + // OK, at this point we're satisfied that r is orthogonal + // to all previous rows. + KALDI_ASSERT(end_prod != 0.0); // should have looped. + r.Scale(1.0 / std::sqrt(end_prod)); // make it unit. + Q.Row(d+1).CopyFromVec(r); + } + } + + Matrix R(lanczos_dim, lanczos_dim); + R.SetUnit(); + T.Qr(&R); // Diagonalizes T. + Vector s_tmp(lanczos_dim); + s_tmp.CopyDiagFromSp(T); + + // Now T = R * diag(s_tmp) * R^T. + // The next call sorts the elements of s from greatest to least absolute value, + // and moves around the rows of R in the corresponding way. This picks out + // the largest (absolute) eigenvalues. + SortSvd(&s_tmp, static_cast*>(NULL), &R); + // Keep only the initial rows of R, those corresponding to greatest (absolute) + // eigenvalues. + SubMatrix Rsub(R, 0, eig_dim, 0, lanczos_dim); + SubVector s_sub(s_tmp, 0, eig_dim); + s->CopyFromVec(s_sub); + + // For working out what to do now, just assume the other eigenvalues were + // zero. This is just for purposes of knowing how to get the result, and + // not getting things wrongly transposed. + // We have T = Rsub^T * diag(s_sub) * Rsub. + // Now, T = Q S Q^T, with Q orthogonal, so S = Q^T T Q = Q^T Rsub^T * diag(s) * Rsub * Q. 
+ // The output is P and we want S = P * diag(s) * P^T, so we need P = Q^T Rsub^T. + P->AddMatMat(1.0, Q, kTrans, Rsub, kTrans, 0.0); +} + + +// Instantiate the templates for Eig and TopEig. +template +void SpMatrix::Eig(VectorBase*, MatrixBase*) const; +template +void SpMatrix::Eig(VectorBase*, MatrixBase*) const; + +template +void SpMatrix::TopEigs(VectorBase*, MatrixBase*, MatrixIndexT) const; +template +void SpMatrix::TopEigs(VectorBase*, MatrixBase*, MatrixIndexT) const; + +// Someone had a problem with the Intel compiler with -O3, with Qr not being +// defined for some strange reason (should automatically happen when +// we instantiate Eig and TopEigs), so we explicitly instantiate it here. +template +void SpMatrix::Qr(MatrixBase *Q); +template +void SpMatrix::Qr(MatrixBase *Q); + + + +} +// namespace kaldi diff --git a/speechx/speechx/kaldi/matrix/sp-matrix-inl.h b/speechx/speechx/kaldi/matrix/sp-matrix-inl.h new file mode 100644 index 00000000..15795923 --- /dev/null +++ b/speechx/speechx/kaldi/matrix/sp-matrix-inl.h @@ -0,0 +1,42 @@ +// matrix/sp-matrix-inl.h + +// Copyright 2009-2011 Ondrej Glembek; Microsoft Corporation; Haihua Xu + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef KALDI_MATRIX_SP_MATRIX_INL_H_ +#define KALDI_MATRIX_SP_MATRIX_INL_H_ + +#include "matrix/tp-matrix.h" + +namespace kaldi { + +// All the lines in this file seem to be declaring template specializations. +// These tell the compiler that we'll implement the templated function +// separately for the different template arguments (float, double). + +template<> +double SolveQuadraticProblem(const SpMatrix &H, const VectorBase &g, + const SolverOptions &opts, VectorBase *x); + +template<> +float SolveQuadraticProblem(const SpMatrix &H, const VectorBase &g, + const SolverOptions &opts, VectorBase *x); + +} // namespace kaldi + + +#endif // KALDI_MATRIX_SP_MATRIX_INL_H_ diff --git a/speechx/speechx/kaldi/matrix/sp-matrix.cc b/speechx/speechx/kaldi/matrix/sp-matrix.cc new file mode 100644 index 00000000..224ef39f --- /dev/null +++ b/speechx/speechx/kaldi/matrix/sp-matrix.cc @@ -0,0 +1,1216 @@ +// matrix/sp-matrix.cc + +// Copyright 2009-2011 Lukas Burget; Ondrej Glembek; Microsoft Corporation +// Saarland University; Petr Schwarz; Yanmin Qian; +// Haihua Xu + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include "matrix/sp-matrix.h" +#include "matrix/kaldi-vector.h" +#include "matrix/kaldi-matrix.h" +#include "matrix/matrix-functions.h" +#include "matrix/cblas-wrappers.h" + +namespace kaldi { + +// **************************************************************************** +// Returns the log-determinant if +ve definite, else KALDI_ERR. +// **************************************************************************** +template +Real SpMatrix::LogPosDefDet() const { + TpMatrix chol(this->NumRows()); + double det = 0.0; + double diag; + chol.Cholesky(*this); // Will throw exception if not +ve definite! + + for (MatrixIndexT i = 0; i < this->NumRows(); i++) { + diag = static_cast(chol(i, i)); + det += kaldi::Log(diag); + } + return static_cast(2*det); +} + + +template +void SpMatrix::Swap(SpMatrix *other) { + std::swap(this->data_, other->data_); + std::swap(this->num_rows_, other->num_rows_); +} + +template +void SpMatrix::SymPosSemiDefEig(VectorBase *s, + MatrixBase *P, + Real tolerance) const { + Eig(s, P); + Real max = s->Max(), min = s->Min(); + KALDI_ASSERT(-min <= tolerance * max); + s->ApplyFloor(0.0); +} + +template +Real SpMatrix::MaxAbsEig() const { + Vector s(this->NumRows()); + this->Eig(&s, static_cast*>(NULL)); + return std::max(s.Max(), -s.Min()); +} + +// returns true if positive definite--uses cholesky. +template +bool SpMatrix::IsPosDef() const { + MatrixIndexT D = (*this).NumRows(); + KALDI_ASSERT(D > 0); + try { + TpMatrix C(D); + C.Cholesky(*this); + for (MatrixIndexT r = 0; r < D; r++) + if (C(r, r) == 0.0) return false; + return true; + } + catch(...) { // not positive semidefinite. + return false; + } +} + +template +void SpMatrix::ApplyPow(Real power) { + if (power == 1) return; // can do nothing. + MatrixIndexT D = this->NumRows(); + KALDI_ASSERT(D > 0); + Matrix U(D, D); + Vector l(D); + (*this).SymPosSemiDefEig(&l, &U); + + Vector l_copy(l); + try { + l.ApplyPow(power * 0.5); + } + catch(...) 
{ + KALDI_ERR << "Error taking power " << (power * 0.5) << " of vector " + << l_copy; + } + U.MulColsVec(l); + (*this).AddMat2(1.0, U, kNoTrans, 0.0); +} + +template +void SpMatrix::CopyFromMat(const MatrixBase &M, + SpCopyType copy_type) { + KALDI_ASSERT(this->NumRows() == M.NumRows() && M.NumRows() == M.NumCols()); + MatrixIndexT D = this->NumRows(); + + switch (copy_type) { + case kTakeMeanAndCheck: + { + Real good_sum = 0.0, bad_sum = 0.0; + for (MatrixIndexT i = 0; i < D; i++) { + for (MatrixIndexT j = 0; j < i; j++) { + Real a = M(i, j), b = M(j, i), avg = 0.5*(a+b), diff = 0.5*(a-b); + (*this)(i, j) = avg; + good_sum += std::abs(avg); + bad_sum += std::abs(diff); + } + good_sum += std::abs(M(i, i)); + (*this)(i, i) = M(i, i); + } + if (bad_sum > 0.01 * good_sum) { + KALDI_ERR << "SpMatrix::Copy(), source matrix is not symmetric: " + << bad_sum << ">" << good_sum; + } + break; + } + case kTakeMean: + { + for (MatrixIndexT i = 0; i < D; i++) { + for (MatrixIndexT j = 0; j < i; j++) { + (*this)(i, j) = 0.5*(M(i, j) + M(j, i)); + } + (*this)(i, i) = M(i, i); + } + break; + } + case kTakeLower: + { // making this one a bit more efficient. 
+ const Real *src = M.Data(); + Real *dest = this->data_; + MatrixIndexT stride = M.Stride(); + for (MatrixIndexT i = 0; i < D; i++) { + for (MatrixIndexT j = 0; j <= i; j++) + dest[j] = src[j]; + dest += i + 1; + src += stride; + } + } + break; + case kTakeUpper: + for (MatrixIndexT i = 0; i < D; i++) + for (MatrixIndexT j = 0; j <= i; j++) + (*this)(i, j) = M(j, i); + break; + default: + KALDI_ASSERT("Invalid argument to SpMatrix::CopyFromMat"); + } +} + +template +Real SpMatrix::Trace() const { + const Real *data = this->data_; + MatrixIndexT num_rows = this->num_rows_; + Real ans = 0.0; + for (int32 i = 1; i <= num_rows; i++, data += i) + ans += *data; + return ans; +} + +// diagonal update, this <-- this + diag(v) +template +template +void SpMatrix::AddDiagVec(const Real alpha, const VectorBase &v) { + int32 num_rows = this->num_rows_; + KALDI_ASSERT(num_rows == v.Dim() && num_rows > 0); + const OtherReal *src = v.Data(); + Real *dst = this->data_; + if (alpha == 1.0) + for (int32 i = 1; i <= num_rows; i++, src++, dst += i) + *dst += *src; + else + for (int32 i = 1; i <= num_rows; i++, src++, dst += i) + *dst += alpha * *src; +} + +// instantiate the template above. 
+template +void SpMatrix::AddDiagVec(const float alpha, + const VectorBase &v); + +template +void SpMatrix::AddDiagVec(const double alpha, + const VectorBase &v); + +template +void SpMatrix::AddDiagVec(const float alpha, + const VectorBase &v); + +template +void SpMatrix::AddDiagVec(const double alpha, + const VectorBase &v); + +template<> +template<> +void SpMatrix::AddVec2(const double alpha, const VectorBase &v); + +#ifndef HAVE_ATLAS +template +void SpMatrix::Invert(Real *logdet, Real *det_sign, bool need_inverse) { + // these are CLAPACK types + KaldiBlasInt result; + KaldiBlasInt rows = static_cast(this->num_rows_); + KaldiBlasInt* p_ipiv = new KaldiBlasInt[rows]; + Real *p_work; // workspace for the lapack function + void *temp; + if ((p_work = static_cast( + KALDI_MEMALIGN(16, sizeof(Real) * rows, &temp))) == NULL) { + delete[] p_ipiv; + throw std::bad_alloc(); + } +#ifdef HAVE_OPENBLAS + memset(p_work, 0, sizeof(Real) * rows); // gets rid of a probably + // spurious Valgrind warning about jumps depending upon uninitialized values. +#endif + + + // NOTE: Even though "U" is for upper, lapack assumes column-wise storage + // of the data. We have a row-wise storage, therefore, we need to "invert" + clapack_Xsptrf(&rows, this->data_, p_ipiv, &result); + + + KALDI_ASSERT(result >= 0 && "Call to CLAPACK ssptrf_ called with wrong arguments"); + + if (result > 0) { // Singular... + if (det_sign) *det_sign = 0; + if (logdet) *logdet = -std::numeric_limits::infinity(); + if (need_inverse) KALDI_ERR << "CLAPACK stptrf_ : factorization failed"; + } else { // Not singular.. compute log-determinant if needed. + if (logdet != NULL || det_sign != NULL) { + Real prod = 1.0, log_prod = 0.0; + int sign = 1; + for (int i = 0; i < (int)this->num_rows_; i++) { + if (p_ipiv[i] > 0) { // not a 2x2 block... + // if (p_ipiv[i] != i+1) sign *= -1; // row swap. + Real diag = (*this)(i, i); + prod *= diag; + } else { // negative: 2x2 block. [we are in first of the two]. 
+ i++; // skip over the first of the pair. + // each 2x2 block... + Real diag1 = (*this)(i, i), diag2 = (*this)(i-1, i-1), + offdiag = (*this)(i, i-1); + Real thisdet = diag1*diag2 - offdiag*offdiag; + // thisdet == determinant of 2x2 block. + // The following line is more complex than it looks: there are 2 offsets of + // 1 that cancel. + prod *= thisdet; + } + if (i == (int)(this->num_rows_-1) || fabs(prod) < 1.0e-10 || fabs(prod) > 1.0e+10) { + if (prod < 0) { prod = -prod; sign *= -1; } + log_prod += kaldi::Log(std::abs(prod)); + prod = 1.0; + } + } + if (logdet != NULL) *logdet = log_prod; + if (det_sign != NULL) *det_sign = sign; + } + } + if (!need_inverse) { + delete [] p_ipiv; + KALDI_MEMALIGN_FREE(p_work); + return; // Don't need what is computed next. + } + // NOTE: Even though "U" is for upper, lapack assumes column-wise storage + // of the data. We have a row-wise storage, therefore, we need to "invert" + clapack_Xsptri(&rows, this->data_, p_ipiv, p_work, &result); + + KALDI_ASSERT(result >=0 && + "Call to CLAPACK ssptri_ called with wrong arguments"); + + if (result != 0) { + KALDI_ERR << "CLAPACK ssptrf_ : Matrix is singular"; + } + + delete [] p_ipiv; + KALDI_MEMALIGN_FREE(p_work); +} +#else +// in the ATLAS case, these are not implemented using a library and we back off to something else. +template +void SpMatrix::Invert(Real *logdet, Real *det_sign, bool need_inverse) { + Matrix M(this->NumRows(), this->NumCols()); + M.CopyFromSp(*this); + M.Invert(logdet, det_sign, need_inverse); + if (need_inverse) + for (MatrixIndexT i = 0; i < this->NumRows(); i++) + for (MatrixIndexT j = 0; j <= i; j++) + (*this)(i, j) = M(i, j); +} +#endif + +template +void SpMatrix::InvertDouble(Real *logdet, Real *det_sign, + bool inverse_needed) { + SpMatrix dmat(*this); + double logdet_tmp, det_sign_tmp; + dmat.Invert(logdet ? &logdet_tmp : NULL, + det_sign ? 
&det_sign_tmp : NULL, + inverse_needed); + if (logdet) *logdet = logdet_tmp; + if (det_sign) *det_sign = det_sign_tmp; + (*this).CopyFromSp(dmat); +} + + + +double TraceSpSp(const SpMatrix &A, const SpMatrix &B) { + KALDI_ASSERT(A.NumRows() == B.NumRows()); + const double *Aptr = A.Data(); + const double *Bptr = B.Data(); + MatrixIndexT R = A.NumRows(); + MatrixIndexT RR = (R * (R + 1)) / 2; + double all_twice = 2.0 * cblas_Xdot(RR, Aptr, 1, Bptr, 1); + // "all_twice" contains twice the vector-wise dot-product... this is + // what we want except the diagonal elements are represented + // twice. + double diag_once = 0.0; + for (MatrixIndexT row_plus_two = 2; row_plus_two <= R + 1; row_plus_two++) { + diag_once += *Aptr * *Bptr; + Aptr += row_plus_two; + Bptr += row_plus_two; + } + return all_twice - diag_once; +} + + +float TraceSpSp(const SpMatrix &A, const SpMatrix &B) { + KALDI_ASSERT(A.NumRows() == B.NumRows()); + const float *Aptr = A.Data(); + const float *Bptr = B.Data(); + MatrixIndexT R = A.NumRows(); + MatrixIndexT RR = (R * (R + 1)) / 2; + float all_twice = 2.0 * cblas_Xdot(RR, Aptr, 1, Bptr, 1); + // "all_twice" contains twice the vector-wise dot-product... this is + // what we want except the diagonal elements are represented + // twice. + float diag_once = 0.0; + for (MatrixIndexT row_plus_two = 2; row_plus_two <= R + 1; row_plus_two++) { + diag_once += *Aptr * *Bptr; + Aptr += row_plus_two; + Bptr += row_plus_two; + } + return all_twice - diag_once; +} + + +template +Real TraceSpSp(const SpMatrix &A, const SpMatrix &B) { + KALDI_ASSERT(A.NumRows() == B.NumRows()); + Real ans = 0.0; + const Real *Aptr = A.Data(); + const OtherReal *Bptr = B.Data(); + MatrixIndexT row, col, R = A.NumRows(); + for (row = 0; row < R; row++) { + for (col = 0; col < row; col++) + ans += 2.0 * *(Aptr++) * *(Bptr++); + ans += *(Aptr++) * *(Bptr++); // Diagonal. 
+ } + return ans; +} + +template +float TraceSpSp(const SpMatrix &A, const SpMatrix &B); + +template +double TraceSpSp(const SpMatrix &A, const SpMatrix &B); + + +template +Real TraceSpMat(const SpMatrix &A, const MatrixBase &B) { + KALDI_ASSERT(A.NumRows() == B.NumRows() && A.NumCols() == B.NumCols() && + "KALDI_ERR: TraceSpMat: arguments have mismatched dimension"); + MatrixIndexT R = A.NumRows(); + Real ans = (Real)0.0; + const Real *Aptr = A.Data(), *Bptr = B.Data(); + MatrixIndexT bStride = B.Stride(); + for (MatrixIndexT r = 0;r < R;r++) { + for (MatrixIndexT c = 0;c < r;c++) { + // ans += A(r, c) * (B(r, c) + B(c, r)); + ans += *(Aptr++) * (Bptr[r*bStride + c] + Bptr[c*bStride + r]); + } + // ans += A(r, r) * B(r, r); + ans += *(Aptr++) * Bptr[r*bStride + r]; + } + return ans; +} + +template +float TraceSpMat(const SpMatrix &A, const MatrixBase &B); + +template +double TraceSpMat(const SpMatrix &A, const MatrixBase &B); + + +template +Real TraceMatSpMat(const MatrixBase &A, MatrixTransposeType transA, + const SpMatrix &B, const MatrixBase &C, + MatrixTransposeType transC) { + KALDI_ASSERT((transA == kTrans?A.NumCols():A.NumRows()) == + (transC == kTrans?C.NumRows():C.NumCols()) && + (transA == kTrans?A.NumRows():A.NumCols()) == B.NumRows() && + (transC == kTrans?C.NumCols():C.NumRows()) == B.NumRows() && + "TraceMatSpMat: arguments have wrong dimension."); + Matrix tmp(B.NumRows(), B.NumRows()); + tmp.AddMatMat(1.0, C, transC, A, transA, 0.0); // tmp = C * A. 
+ return TraceSpMat(B, tmp); +} + +template +float TraceMatSpMat(const MatrixBase &A, MatrixTransposeType transA, + const SpMatrix &B, const MatrixBase &C, + MatrixTransposeType transC); +template +double TraceMatSpMat(const MatrixBase &A, MatrixTransposeType transA, + const SpMatrix &B, const MatrixBase &C, + MatrixTransposeType transC); + +template +Real TraceMatSpMatSp(const MatrixBase &A, MatrixTransposeType transA, + const SpMatrix &B, const MatrixBase &C, + MatrixTransposeType transC, const SpMatrix &D) { + KALDI_ASSERT((transA == kTrans ?A.NumCols():A.NumRows() == D.NumCols()) && + (transA == kTrans ? A.NumRows():A.NumCols() == B.NumRows()) && + (transC == kTrans ? A.NumCols():A.NumRows() == B.NumCols()) && + (transC == kTrans ? A.NumRows():A.NumCols() == D.NumRows()) && + "KALDI_ERR: TraceMatSpMatSp: arguments have mismatched dimension."); + // Could perhaps optimize this more depending on dimensions of quantities. + Matrix tmpAB(transA == kTrans ? A.NumCols():A.NumRows(), B.NumCols()); + tmpAB.AddMatSp(1.0, A, transA, B, 0.0); + Matrix tmpCD(transC == kTrans ? 
C.NumCols():C.NumRows(), D.NumCols()); + tmpCD.AddMatSp(1.0, C, transC, D, 0.0); + return TraceMatMat(tmpAB, tmpCD, kNoTrans); +} + +template +float TraceMatSpMatSp(const MatrixBase &A, MatrixTransposeType transA, + const SpMatrix &B, const MatrixBase &C, + MatrixTransposeType transC, const SpMatrix &D); +template +double TraceMatSpMatSp(const MatrixBase &A, MatrixTransposeType transA, + const SpMatrix &B, const MatrixBase &C, + MatrixTransposeType transC, const SpMatrix &D); + + +template +bool SpMatrix::IsDiagonal(Real cutoff) const { + MatrixIndexT R = this->NumRows(); + Real bad_sum = 0.0, good_sum = 0.0; + for (MatrixIndexT i = 0; i < R; i++) { + for (MatrixIndexT j = 0; j <= i; j++) { + if (i == j) + good_sum += std::abs((*this)(i, j)); + else + bad_sum += std::abs((*this)(i, j)); + } + } + return (!(bad_sum > good_sum * cutoff)); +} + +template +bool SpMatrix::IsUnit(Real cutoff) const { + MatrixIndexT R = this->NumRows(); + Real max = 0.0; // max error + for (MatrixIndexT i = 0; i < R; i++) + for (MatrixIndexT j = 0; j <= i; j++) + max = std::max(max, static_cast(std::abs((*this)(i, j) - + (i == j ? 
1.0 : 0.0)))); + return (max <= cutoff); +} + +template +bool SpMatrix::IsTridiagonal(Real cutoff) const { + MatrixIndexT R = this->NumRows(); + Real max_abs_2diag = 0.0, max_abs_offdiag = 0.0; + for (MatrixIndexT i = 0; i < R; i++) + for (MatrixIndexT j = 0; j <= i; j++) { + if (j+1 < i) + max_abs_offdiag = std::max(max_abs_offdiag, + std::abs((*this)(i, j))); + else + max_abs_2diag = std::max(max_abs_2diag, + std::abs((*this)(i, j))); + } + return (max_abs_offdiag <= cutoff * max_abs_2diag); +} + +template +bool SpMatrix::IsZero(Real cutoff) const { + if (this->num_rows_ == 0) return true; + return (this->Max() <= cutoff && this->Min() >= -cutoff); +} + +template +Real SpMatrix::FrobeniusNorm() const { + Real sum = 0.0; + MatrixIndexT R = this->NumRows(); + for (MatrixIndexT i = 0; i < R; i++) { + for (MatrixIndexT j = 0; j < i; j++) + sum += (*this)(i, j) * (*this)(i, j) * 2; + sum += (*this)(i, i) * (*this)(i, i); + } + return std::sqrt(sum); +} + +template +bool SpMatrix::ApproxEqual(const SpMatrix &other, float tol) const { + if (this->NumRows() != other.NumRows()) + KALDI_ERR << "SpMatrix::AproxEqual, size mismatch, " + << this->NumRows() << " vs. " << other.NumRows(); + SpMatrix tmp(*this); + tmp.AddSp(-1.0, other); + return (tmp.FrobeniusNorm() <= tol * std::max(this->FrobeniusNorm(), other.FrobeniusNorm())); +} + +// function Floor: A = Floor(B, alpha * C) ... see tutorial document. +template +int SpMatrix::ApplyFloor(const SpMatrix &C, Real alpha, + bool verbose) { + MatrixIndexT dim = this->NumRows(); + int nfloored = 0; + KALDI_ASSERT(C.NumRows() == dim); + KALDI_ASSERT(alpha > 0); + TpMatrix L(dim); + L.Cholesky(C); + L.Scale(std::sqrt(alpha)); // equivalent to scaling C by alpha. 
+ TpMatrix LInv(L); + LInv.Invert(); + + SpMatrix D(dim); + { // D = L^{-1} * (*this) * L^{-T} + Matrix LInvFull(LInv); + D.AddMat2Sp(1.0, LInvFull, kNoTrans, (*this), 0.0); + } + + Vector l(dim); + Matrix U(dim, dim); + + D.Eig(&l, &U); + + if (verbose) { + KALDI_LOG << "ApplyFloor: flooring following diagonal to 1: " << l; + } + for (MatrixIndexT i = 0; i < l.Dim(); i++) { + if (l(i) < 1.0) { + nfloored++; + l(i) = 1.0; + } + } + l.ApplyPow(0.5); + U.MulColsVec(l); + D.AddMat2(1.0, U, kNoTrans, 0.0); + { // D' := U * diag(l') * U^T ... l'=floor(l, 1) + Matrix LFull(L); + (*this).AddMat2Sp(1.0, LFull, kNoTrans, D, 0.0); // A := L * D' * L^T + } + return nfloored; +} + +template +Real SpMatrix::LogDet(Real *det_sign) const { + Real log_det; + SpMatrix tmp(*this); + // false== output not needed (saves some computation). + tmp.Invert(&log_det, det_sign, false); + return log_det; +} + + +template +int SpMatrix::ApplyFloor(Real floor) { + MatrixIndexT Dim = this->NumRows(); + int nfloored = 0; + Vector s(Dim); + Matrix P(Dim, Dim); + (*this).Eig(&s, &P); + for (MatrixIndexT i = 0; i < Dim; i++) { + if (s(i) < floor) { + nfloored++; + s(i) = floor; + } + } + (*this).AddMat2Vec(1.0, P, kNoTrans, s, 0.0); + return nfloored; +} + +template +MatrixIndexT SpMatrix::LimitCond(Real maxCond, bool invert) { // e.g. maxCond = 1.0e+05. + MatrixIndexT Dim = this->NumRows(); + Vector s(Dim); + Matrix P(Dim, Dim); + (*this).SymPosSemiDefEig(&s, &P); + KALDI_ASSERT(maxCond > 1); + Real floor = s.Max() / maxCond; + if (floor < 0) floor = 0; + if (floor < 1.0e-40) { + KALDI_WARN << "LimitCond: limiting " << floor << " to 1.0e-40"; + floor = 1.0e-40; + } + MatrixIndexT nfloored = 0; + for (MatrixIndexT i = 0; i < Dim; i++) { + if (s(i) <= floor) nfloored++; + if (invert) + s(i) = 1.0 / std::sqrt(std::max(s(i), floor)); + else + s(i) = std::sqrt(std::max(s(i), floor)); + } + P.MulColsVec(s); + (*this).AddMat2(1.0, P, kNoTrans, 0.0); // (*this) = P*P^T. ... (*this) = P * floor(s) * P^T ... 
if P was original P. + return nfloored; +} + +void SolverOptions::Check() const { + KALDI_ASSERT(K>10 && eps<1.0e-10); +} + +template<> double SolveQuadraticProblem(const SpMatrix &H, + const VectorBase &g, + const SolverOptions &opts, + VectorBase *x) { + KALDI_ASSERT(H.NumRows() == g.Dim() && g.Dim() == x->Dim() && x->Dim() != 0); + opts.Check(); + MatrixIndexT dim = x->Dim(); + if (H.IsZero(0.0)) { + KALDI_WARN << "Zero quadratic term in quadratic vector problem for " + << opts.name << ": leaving it unchanged."; + return 0.0; + } + if (opts.diagonal_precondition) { + // We can re-cast the problem with a diagonal preconditioner to + // make H better-conditioned. + Vector H_diag(dim); + H_diag.CopyDiagFromSp(H); + H_diag.ApplyFloor(std::numeric_limits::min() * 1.0E+3); + Vector H_diag_sqrt(H_diag); + H_diag_sqrt.ApplyPow(0.5); + Vector H_diag_inv_sqrt(H_diag_sqrt); + H_diag_inv_sqrt.InvertElements(); + Vector x_scaled(*x); + x_scaled.MulElements(H_diag_sqrt); + Vector g_scaled(g); + g_scaled.MulElements(H_diag_inv_sqrt); + SpMatrix H_scaled(dim); + H_scaled.AddVec2Sp(1.0, H_diag_inv_sqrt, H, 0.0); + double ans; + SolverOptions new_opts(opts); + new_opts.diagonal_precondition = false; + ans = SolveQuadraticProblem(H_scaled, g_scaled, new_opts, &x_scaled); + x->CopyFromVec(x_scaled); + x->MulElements(H_diag_inv_sqrt); + return ans; + } + Vector gbar(g); + if (opts.optimize_delta) gbar.AddSpVec(-1.0, H, *x, 1.0); // gbar = g - H x + Matrix U(dim, dim); + Vector l(dim); + H.SymPosSemiDefEig(&l, &U); // does svd H = U L V^T and checks that H == U L U^T to within a tolerance. + // floor l. + double f = std::max(static_cast(opts.eps), l.Max() / opts.K); + MatrixIndexT nfloored = 0; + for (MatrixIndexT i = 0; i < dim; i++) { // floor l. + if (l(i) < f) { + nfloored++; + l(i) = f; + } + } + if (nfloored != 0 && opts.print_debug_output) { + KALDI_LOG << "Solving quadratic problem for " << opts.name + << ": floored " << nfloored<< " eigenvalues. 
"; + } + Vector tmp(dim); + tmp.AddMatVec(1.0, U, kTrans, gbar, 0.0); // tmp = U^T \bar{g} + tmp.DivElements(l); // divide each element of tmp by l: tmp = \tilde{L}^{-1} U^T \bar{g} + Vector delta(dim); + delta.AddMatVec(1.0, U, kNoTrans, tmp, 0.0); // delta = U tmp = U \tilde{L}^{-1} U^T \bar{g} + Vector &xhat(tmp); + xhat.CopyFromVec(delta); + if (opts.optimize_delta) xhat.AddVec(1.0, *x); // xhat = x + delta. + double auxf_before = VecVec(g, *x) - 0.5 * VecSpVec(*x, H, *x), + auxf_after = VecVec(g, xhat) - 0.5 * VecSpVec(xhat, H, xhat); + if (auxf_after < auxf_before) { // Reject change. + if (auxf_after < auxf_before - 1.0e-10 && opts.print_debug_output) + KALDI_WARN << "Optimizing vector auxiliary function for " + << opts.name<< ": auxf decreased " << auxf_before + << " to " << auxf_after << ", change is " + << (auxf_after-auxf_before); + return 0.0; + } else { + x->CopyFromVec(xhat); + return auxf_after - auxf_before; + } +} + +template<> float SolveQuadraticProblem(const SpMatrix &H, + const VectorBase &g, + const SolverOptions &opts, + VectorBase *x) { + KALDI_ASSERT(H.NumRows() == g.Dim() && g.Dim() == x->Dim() && x->Dim() != 0); + SpMatrix Hd(H); + Vector gd(g); + Vector xd(*x); + float ans = static_cast(SolveQuadraticProblem(Hd, gd, opts, &xd)); + x->CopyFromVec(xd); + return ans; +} + +// Maximizes the auxiliary function Q(x) = tr(M^T SigmaInv Y) - 0.5 tr(SigmaInv M Q M^T). +// Like a numerically stable version of M := Y Q^{-1}. 
+template +Real +SolveQuadraticMatrixProblem(const SpMatrix &Q, + const MatrixBase &Y, + const SpMatrix &SigmaInv, + const SolverOptions &opts, + MatrixBase *M) { + KALDI_ASSERT(Q.NumRows() == M->NumCols() && + SigmaInv.NumRows() == M->NumRows() && Y.NumRows() == M->NumRows() + && Y.NumCols() == M->NumCols() && M->NumCols() != 0); + opts.Check(); + MatrixIndexT rows = M->NumRows(), cols = M->NumCols(); + if (Q.IsZero(0.0)) { + KALDI_WARN << "Zero quadratic term in quadratic matrix problem for " + << opts.name << ": leaving it unchanged."; + return 0.0; + } + + if (opts.diagonal_precondition) { + // We can re-cast the problem with a diagonal preconditioner in the space + // of Q (columns of M). Helps to improve the condition of Q. + Vector Q_diag(cols); + Q_diag.CopyDiagFromSp(Q); + Q_diag.ApplyFloor(std::numeric_limits::min() * 1.0E+3); + Vector Q_diag_sqrt(Q_diag); + Q_diag_sqrt.ApplyPow(0.5); + Vector Q_diag_inv_sqrt(Q_diag_sqrt); + Q_diag_inv_sqrt.InvertElements(); + Matrix M_scaled(*M); + M_scaled.MulColsVec(Q_diag_sqrt); + Matrix Y_scaled(Y); + Y_scaled.MulColsVec(Q_diag_inv_sqrt); + SpMatrix Q_scaled(cols); + Q_scaled.AddVec2Sp(1.0, Q_diag_inv_sqrt, Q, 0.0); + Real ans; + SolverOptions new_opts(opts); + new_opts.diagonal_precondition = false; + ans = SolveQuadraticMatrixProblem(Q_scaled, Y_scaled, SigmaInv, + new_opts, &M_scaled); + M->CopyFromMat(M_scaled); + M->MulColsVec(Q_diag_inv_sqrt); + return ans; + } + + Matrix Ybar(Y); + if (opts.optimize_delta) { + Matrix Qfull(Q); + Ybar.AddMatMat(-1.0, *M, kNoTrans, Qfull, kNoTrans, 1.0); + } // Ybar = Y - M Q. + Matrix U(cols, cols); + Vector l(cols); + Q.SymPosSemiDefEig(&l, &U); // does svd Q = U L V^T and checks that Q == U L U^T to within a tolerance. + // floor l. + Real f = std::max(static_cast(opts.eps), l.Max() / opts.K); + MatrixIndexT nfloored = 0; + for (MatrixIndexT i = 0; i < cols; i++) { // floor l. 
+ if (l(i) < f) { nfloored++; l(i) = f; } + } + if (nfloored != 0 && opts.print_debug_output) + KALDI_LOG << "Solving matrix problem for " << opts.name + << ": floored " << nfloored << " eigenvalues. "; + Matrix tmpDelta(rows, cols); + tmpDelta.AddMatMat(1.0, Ybar, kNoTrans, U, kNoTrans, 0.0); // tmpDelta = Ybar * U. + l.InvertElements(); KALDI_ASSERT(1.0/l.Max() != 0); // check not infinite. eps should take care of this. + tmpDelta.MulColsVec(l); // tmpDelta = Ybar * U * \tilde{L}^{-1} + + Matrix Delta(rows, cols); + Delta.AddMatMat(1.0, tmpDelta, kNoTrans, U, kTrans, 0.0); // Delta = Ybar * U * \tilde{L}^{-1} * U^T + + Real auxf_before, auxf_after; + SpMatrix MQM(rows); + Matrix &SigmaInvY(tmpDelta); + { Matrix SigmaInvFull(SigmaInv); SigmaInvY.AddMatMat(1.0, SigmaInvFull, kNoTrans, Y, kNoTrans, 0.0); } + { // get auxf_before. Q(x) = tr(M^T SigmaInv Y) - 0.5 tr(SigmaInv M Q M^T). + MQM.AddMat2Sp(1.0, *M, kNoTrans, Q, 0.0); + auxf_before = TraceMatMat(*M, SigmaInvY, kaldi::kTrans) - 0.5*TraceSpSp(SigmaInv, MQM); + } + + Matrix Mhat(Delta); + if (opts.optimize_delta) Mhat.AddMat(1.0, *M); // Mhat = Delta + M. + + { // get auxf_after. 
+ MQM.AddMat2Sp(1.0, Mhat, kNoTrans, Q, 0.0); + auxf_after = TraceMatMat(Mhat, SigmaInvY, kaldi::kTrans) - 0.5*TraceSpSp(SigmaInv, MQM); + } + + if (auxf_after < auxf_before) { + if (auxf_after < auxf_before - 1.0e-10) + KALDI_WARN << "Optimizing matrix auxiliary function for " + << opts.name << ", auxf decreased " + << auxf_before << " to " << auxf_after << ", change is " + << (auxf_after-auxf_before); + return 0.0; + } else { + M->CopyFromMat(Mhat); + return auxf_after - auxf_before; + } +} + +template +Real SolveDoubleQuadraticMatrixProblem(const MatrixBase &G, + const SpMatrix &P1, + const SpMatrix &P2, + const SpMatrix &Q1, + const SpMatrix &Q2, + const SolverOptions &opts, + MatrixBase *M) { + KALDI_ASSERT(Q1.NumRows() == M->NumCols() && P1.NumRows() == M->NumRows() && + G.NumRows() == M->NumRows() && G.NumCols() == M->NumCols() && + M->NumCols() != 0 && Q2.NumRows() == M->NumCols() && + P2.NumRows() == M->NumRows()); + MatrixIndexT rows = M->NumRows(), cols = M->NumCols(); + // The following check should not fail as we stipulate P1, P2 and one of Q1 + // or Q2 must be +ve def and other Q1 or Q2 must be +ve semidef. + TpMatrix LInv(rows); + LInv.Cholesky(P1); + LInv.Invert(); // Will throw exception if fails. + SpMatrix S(rows); + Matrix LInvFull(LInv); + S.AddMat2Sp(1.0, LInvFull, kNoTrans, P2, 0.0); // S := L^{-1} P_2 L^{-T} + Matrix U(rows, rows); + Vector d(rows); + S.SymPosSemiDefEig(&d, &U); + Matrix T(rows, rows); + T.AddMatMat(1.0, U, kTrans, LInvFull, kNoTrans, 0.0); // T := U^T * L^{-1} + +#ifdef KALDI_PARANOID // checking mainly for errors in the code or math. 
+ { + SpMatrix P1Trans(rows); + P1Trans.AddMat2Sp(1.0, T, kNoTrans, P1, 0.0); + KALDI_ASSERT(P1Trans.IsUnit(0.01)); + } + { + SpMatrix P2Trans(rows); + P2Trans.AddMat2Sp(1.0, T, kNoTrans, P2, 0.0); + KALDI_ASSERT(P2Trans.IsDiagonal(0.01)); + } +#endif + + Matrix TInv(T); + TInv.Invert(); + Matrix Gdash(rows, cols); + Gdash.AddMatMat(1.0, T, kNoTrans, G, kNoTrans, 0.0); // G' = T G + Matrix MdashOld(rows, cols); + MdashOld.AddMatMat(1.0, TInv, kTrans, *M, kNoTrans, 0.0); // M' = T^{-T} M + Matrix MdashNew(MdashOld); + Real objf_impr = 0.0; + for (MatrixIndexT n = 0; n < rows; n++) { + SpMatrix Qsum(Q1); + Qsum.AddSp(d(n), Q2); + SubVector mdash_n = MdashNew.Row(n); + SubVector gdash_n = Gdash.Row(n); + + Matrix QsumInv(Qsum); + try { + QsumInv.Invert(); + Real old_objf = VecVec(mdash_n, gdash_n) + - 0.5 * VecSpVec(mdash_n, Qsum, mdash_n); + mdash_n.AddMatVec(1.0, QsumInv, kNoTrans, gdash_n, 0.0); // m'_n := g'_n * (Q_1 + d_n Q_2)^{-1} + Real new_objf = VecVec(mdash_n, gdash_n) + - 0.5 * VecSpVec(mdash_n, Qsum, mdash_n); + if (new_objf < old_objf) { + if (new_objf < old_objf - 1.0e-05) { + KALDI_WARN << "In double quadratic matrix problem: objective " + "function decreasing during optimization of " << opts.name + << ", " << old_objf << "->" << new_objf << ", change is " + << (new_objf - old_objf); + KALDI_ERR << "Auxiliary function decreasing."; // Will be caught. + } else { // Reset to old value, didn't improve (very close to optimum). + MdashNew.Row(n).CopyFromVec(MdashOld.Row(n)); + } + } + objf_impr += new_objf - old_objf; + } + catch (...) { + KALDI_WARN << "Matrix inversion or optimization failed during double " + "quadratic problem, solving for" << opts.name + << ": trying more stable approach."; + objf_impr += SolveQuadraticProblem(Qsum, gdash_n, opts, &mdash_n); + } + } + M->AddMatMat(1.0, T, kTrans, MdashNew, kNoTrans, 0.0); // M := T^T M'. 
+ return objf_impr; +} + +// rank-one update, this <-- this + alpha V V' +template<> +template<> +void SpMatrix::AddVec2(const float alpha, const VectorBase &v) { + KALDI_ASSERT(v.Dim() == this->NumRows()); + cblas_Xspr(v.Dim(), alpha, v.Data(), 1, + this->data_); +} + +template +void SpMatrix::AddVec2Sp(const Real alpha, const VectorBase &v, + const SpMatrix &S, const Real beta) { + KALDI_ASSERT(v.Dim() == this->NumRows() && S.NumRows() == this->NumRows()); + const Real *Sdata = S.Data(); + const Real *vdata = v.Data(); + Real *data = this->data_; + MatrixIndexT dim = this->num_rows_; + for (MatrixIndexT r = 0; r < dim; r++) + for (MatrixIndexT c = 0; c <= r; c++, Sdata++, data++) + *data = beta * *data + alpha * vdata[r] * vdata[c] * *Sdata; +} + + +// rank-one update, this <-- this + alpha V V' +template<> +template<> +void SpMatrix::AddVec2(const double alpha, const VectorBase &v) { + KALDI_ASSERT(v.Dim() == num_rows_); + cblas_Xspr(v.Dim(), alpha, v.Data(), 1, data_); +} + + +template +template +void SpMatrix::AddVec2(const Real alpha, const VectorBase &v) { + KALDI_ASSERT(v.Dim() == this->NumRows()); + Real *data = this->data_; + const OtherReal *v_data = v.Data(); + MatrixIndexT nr = this->num_rows_; + for (MatrixIndexT i = 0; i < nr; i++) + for (MatrixIndexT j = 0; j <= i; j++, data++) + *data += alpha * v_data[i] * v_data[j]; +} + +// instantiate the template above. 
+template +void SpMatrix::AddVec2(const float alpha, const VectorBase &v); +template +void SpMatrix::AddVec2(const double alpha, const VectorBase &v); + + +template +Real VecSpVec(const VectorBase &v1, const SpMatrix &M, + const VectorBase &v2) { + MatrixIndexT D = M.NumRows(); + KALDI_ASSERT(v1.Dim() == D && v1.Dim() == v2.Dim()); + Vector tmp_vec(D); + cblas_Xspmv(D, 1.0, M.Data(), v1.Data(), 1, 0.0, tmp_vec.Data(), 1); + return VecVec(tmp_vec, v2); +} + +template +float VecSpVec(const VectorBase &v1, const SpMatrix &M, + const VectorBase &v2); +template +double VecSpVec(const VectorBase &v1, const SpMatrix &M, + const VectorBase &v2); + + +template +void SpMatrix::AddMat2Sp( + const Real alpha, const MatrixBase &M, + MatrixTransposeType transM, const SpMatrix &A, const Real beta) { + if (transM == kNoTrans) { + KALDI_ASSERT(M.NumCols() == A.NumRows() && M.NumRows() == this->num_rows_); + } else { + KALDI_ASSERT(M.NumRows() == A.NumRows() && M.NumCols() == this->num_rows_); + } + Vector tmp_vec(A.NumRows()); + Real *tmp_vec_data = tmp_vec.Data(); + SpMatrix tmp_A; + const Real *p_A_data = A.Data(); + Real *p_row_data = this->Data(); + MatrixIndexT M_other_dim = (transM == kNoTrans ? M.NumCols() : M.NumRows()), + M_same_dim = (transM == kNoTrans ? M.NumRows() : M.NumCols()), + M_stride = M.Stride(), dim = this->NumRows(); + KALDI_ASSERT(M_same_dim == dim); + + const Real *M_data = M.Data(); + + if (this->Data() <= A.Data() + A.SizeInBytes() && + this->Data() + this->SizeInBytes() >= A.Data()) { + // Matrices A and *this overlap. 
Make copy of A + tmp_A.Resize(A.NumRows()); + tmp_A.CopyFromSp(A); + p_A_data = tmp_A.Data(); + } + + if (transM == kNoTrans) { + for (MatrixIndexT r = 0; r < dim; r++, p_row_data += r) { + cblas_Xspmv(A.NumRows(), 1.0, p_A_data, M.RowData(r), 1, 0.0, tmp_vec_data, 1); + cblas_Xgemv(transM, r+1, M_other_dim, alpha, M_data, M_stride, + tmp_vec_data, 1, beta, p_row_data, 1); + } + } else { + for (MatrixIndexT r = 0; r < dim; r++, p_row_data += r) { + cblas_Xspmv(A.NumRows(), 1.0, p_A_data, M.Data() + r, M.Stride(), 0.0, tmp_vec_data, 1); + cblas_Xgemv(transM, M_other_dim, r+1, alpha, M_data, M_stride, + tmp_vec_data, 1, beta, p_row_data, 1); + } + } +} + +template +void SpMatrix::AddSmat2Sp( + const Real alpha, const MatrixBase &M, + MatrixTransposeType transM, const SpMatrix &A, + const Real beta) { + KALDI_ASSERT((transM == kNoTrans && M.NumCols() == A.NumRows()) || + (transM == kTrans && M.NumRows() == A.NumRows())); + if (transM == kNoTrans) { + KALDI_ASSERT(M.NumCols() == A.NumRows() && M.NumRows() == this->num_rows_); + } else { + KALDI_ASSERT(M.NumRows() == A.NumRows() && M.NumCols() == this->num_rows_); + } + MatrixIndexT Adim = A.NumRows(), dim = this->num_rows_; + + Matrix temp_A(A); // represent A as full matrix. + Matrix temp_MA(dim, Adim); + temp_MA.AddSmatMat(1.0, M, transM, temp_A, kNoTrans, 0.0); + + // Next-- we want to do *this = alpha * temp_MA * M^T + beta * *this. + // To make it sparse vector multiplies, since M is sparse, we'd like + // to do: for each column c, (*this column c) += temp_MA * (M^T's column c.) + // [ignoring the alpha and beta here.] + // It's not convenient to process columns in the symmetric + // packed format because they don't have a constant stride. However, + // we can use the fact that temp_MA * M is symmetric, to just assign + // each row of *this instead of each column. + // So the final iteration is: + // for i = 0... 
dim-1, + // [the i'th row of *this] = beta * [the i'th row of *this] + alpha * + // temp_MA * [the i'th column of M]. + // Of course, we only process the first 0 ... i elements of this row, + // as that's all that are kept in the symmetric packed format. + + Matrix temp_this(*this); + Real *data = this->data_; + const Real *Mdata = M.Data(), *MAdata = temp_MA.Data(); + MatrixIndexT temp_MA_stride = temp_MA.Stride(), Mstride = M.Stride(); + + if (transM == kNoTrans) { + // The column of M^T corresponds to the rows of the supplied matrix. + for (MatrixIndexT i = 0; i < dim; i++, data += i) { + MatrixIndexT num_rows = i + 1, num_cols = Adim; + Xgemv_sparsevec(kNoTrans, num_rows, num_cols, alpha, MAdata, + temp_MA_stride, Mdata + (i * Mstride), 1, beta, data, 1); + } + } else { + // The column of M^T corresponds to the columns of the supplied matrix. + for (MatrixIndexT i = 0; i < dim; i++, data += i) { + MatrixIndexT num_rows = i + 1, num_cols = Adim; + Xgemv_sparsevec(kNoTrans, num_rows, num_cols, alpha, MAdata, + temp_MA_stride, Mdata + i, Mstride, beta, data, 1); + } + } +} + +template +void SpMatrix::AddMat2Vec(const Real alpha, + const MatrixBase &M, + MatrixTransposeType transM, + const VectorBase &v, + const Real beta) { + this->Scale(beta); + KALDI_ASSERT((transM == kNoTrans && this->NumRows() == M.NumRows() && + M.NumCols() == v.Dim()) || + (transM == kTrans && this->NumRows() == M.NumCols() && + M.NumRows() == v.Dim())); + + if (transM == kNoTrans) { + const Real *Mdata = M.Data(), *vdata = v.Data(); + Real *data = this->data_; + MatrixIndexT dim = this->NumRows(), mcols = M.NumCols(), + mstride = M.Stride(); + for (MatrixIndexT col = 0; col < mcols; col++, vdata++, Mdata += 1) + cblas_Xspr(dim, *vdata*alpha, Mdata, mstride, data); + } else { + const Real *Mdata = M.Data(), *vdata = v.Data(); + Real *data = this->data_; + MatrixIndexT dim = this->NumRows(), mrows = M.NumRows(), + mstride = M.Stride(); + for (MatrixIndexT row = 0; row < mrows; row++, vdata++, 
Mdata += mstride) + cblas_Xspr(dim, *vdata*alpha, Mdata, 1, data); + } +} + +template +void SpMatrix::AddMat2(const Real alpha, const MatrixBase &M, + MatrixTransposeType transM, const Real beta) { + KALDI_ASSERT((transM == kNoTrans && this->NumRows() == M.NumRows()) + || (transM == kTrans && this->NumRows() == M.NumCols())); + + // Cblas has no function *sprk (i.e. symmetric packed rank-k update), so we + // use as temporary storage a regular matrix of which we only access its lower + // triangle + + MatrixIndexT this_dim = this->NumRows(), + m_other_dim = (transM == kNoTrans ? M.NumCols() : M.NumRows()); + + if (this_dim == 0) return; + if (alpha == 0.0) { + if (beta != 1.0) this->Scale(beta); + return; + } + + Matrix temp_mat(*this); // wastefully copies upper triangle too, but this + // doesn't dominate O(N) time. + + // This function call is hard-coded to update the lower triangle. + cblas_Xsyrk(transM, this_dim, m_other_dim, alpha, M.Data(), + M.Stride(), beta, temp_mat.Data(), temp_mat.Stride()); + + this->CopyFromMat(temp_mat, kTakeLower); +} + +template +void SpMatrix::AddTp2Sp(const Real alpha, const TpMatrix &T, + MatrixTransposeType transM, const SpMatrix &A, + const Real beta) { + Matrix Tmat(T); + AddMat2Sp(alpha, Tmat, transM, A, beta); +} + +template +void SpMatrix::AddVecVec(const Real alpha, const VectorBase &v, + const VectorBase &w) { + int32 dim = this->NumRows(); + KALDI_ASSERT(dim == v.Dim() && dim == w.Dim() && dim > 0); + cblas_Xspr2(dim, alpha, v.Data(), 1, w.Data(), 1, this->data_); +} + + +template +void SpMatrix::AddTp2(const Real alpha, const TpMatrix &T, + MatrixTransposeType transM, const Real beta) { + Matrix Tmat(T); + AddMat2(alpha, Tmat, transM, beta); +} + + +// Explicit instantiation of the class. +// This needs to be after the definition of all the class member functions. 
+ +template class SpMatrix; +template class SpMatrix; + + +template +Real TraceSpSpLower(const SpMatrix &A, const SpMatrix &B) { + MatrixIndexT adim = A.NumRows(); + KALDI_ASSERT(adim == B.NumRows()); + MatrixIndexT dim = (adim*(adim+1))/2; + return cblas_Xdot(dim, A.Data(), 1, B.Data(), 1); +} +// Instantiate the template above. +template +double TraceSpSpLower(const SpMatrix &A, const SpMatrix &B); +template +float TraceSpSpLower(const SpMatrix &A, const SpMatrix &B); + +// Instantiate the template above. +template float SolveQuadraticMatrixProblem(const SpMatrix &Q, + const MatrixBase &Y, + const SpMatrix &SigmaInv, + const SolverOptions &opts, + MatrixBase *M); +template double SolveQuadraticMatrixProblem(const SpMatrix &Q, + const MatrixBase &Y, + const SpMatrix &SigmaInv, + const SolverOptions &opts, + MatrixBase *M); + +// Instantiate the template above. +template float SolveDoubleQuadraticMatrixProblem( + const MatrixBase &G, + const SpMatrix &P1, + const SpMatrix &P2, + const SpMatrix &Q1, + const SpMatrix &Q2, + const SolverOptions &opts, + MatrixBase *M); + +template double SolveDoubleQuadraticMatrixProblem( + const MatrixBase &G, + const SpMatrix &P1, + const SpMatrix &P2, + const SpMatrix &Q1, + const SpMatrix &Q2, + const SolverOptions &opts, + MatrixBase *M); + + + +} // namespace kaldi diff --git a/speechx/speechx/kaldi/matrix/sp-matrix.h b/speechx/speechx/kaldi/matrix/sp-matrix.h new file mode 100644 index 00000000..26d9ad6f --- /dev/null +++ b/speechx/speechx/kaldi/matrix/sp-matrix.h @@ -0,0 +1,517 @@ +// matrix/sp-matrix.h + +// Copyright 2009-2011 Ondrej Glembek; Microsoft Corporation; Lukas Burget; +// Saarland University; Ariya Rastrow; Yanmin Qian; +// Jan Silovsky + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. +#ifndef KALDI_MATRIX_SP_MATRIX_H_ +#define KALDI_MATRIX_SP_MATRIX_H_ + +#include +#include + +#include "matrix/packed-matrix.h" + +namespace kaldi { + + +/// \addtogroup matrix_group +/// @{ +template class SpMatrix; + + +/** + * @brief Packed symetric matrix class +*/ +template +class SpMatrix : public PackedMatrix { + friend class CuSpMatrix; + public: + // so it can use our assignment operator. + friend class std::vector >; + + SpMatrix(): PackedMatrix() {} + + /// Copy constructor from CUDA version of SpMatrix + /// This is defined in ../cudamatrix/cu-sp-matrix.h + + explicit SpMatrix(const CuSpMatrix &cu); + + explicit SpMatrix(MatrixIndexT r, MatrixResizeType resize_type = kSetZero) + : PackedMatrix(r, resize_type) {} + + SpMatrix(const SpMatrix &orig) + : PackedMatrix(orig) {} + + template + explicit SpMatrix(const SpMatrix &orig) + : PackedMatrix(orig) {} + +#ifdef KALDI_PARANOID + explicit SpMatrix(const MatrixBase & orig, + SpCopyType copy_type = kTakeMeanAndCheck) + : PackedMatrix(orig.NumRows(), kUndefined) { + CopyFromMat(orig, copy_type); + } +#else + explicit SpMatrix(const MatrixBase & orig, + SpCopyType copy_type = kTakeMean) + : PackedMatrix(orig.NumRows(), kUndefined) { + CopyFromMat(orig, copy_type); + } +#endif + + /// Shallow swap. 
+ void Swap(SpMatrix *other); + + inline void Resize(MatrixIndexT nRows, MatrixResizeType resize_type = kSetZero) { + PackedMatrix::Resize(nRows, resize_type); + } + + void CopyFromSp(const SpMatrix &other) { + PackedMatrix::CopyFromPacked(other); + } + + template + void CopyFromSp(const SpMatrix &other) { + PackedMatrix::CopyFromPacked(other); + } + +#ifdef KALDI_PARANOID + void CopyFromMat(const MatrixBase &orig, + SpCopyType copy_type = kTakeMeanAndCheck); +#else // different default arg if non-paranoid mode. + void CopyFromMat(const MatrixBase &orig, + SpCopyType copy_type = kTakeMean); +#endif + + inline Real operator() (MatrixIndexT r, MatrixIndexT c) const { + // if column is greater than row, then swap these, as only the lower + // triangle is stored, packed row by row: element (r, c) with c <= r + // lives at index r*(r+1)/2 + c. + if (static_cast(c) > + static_cast(r)) + std::swap(c, r); + // c<=r now so don't have to check c. + KALDI_ASSERT(static_cast(r) < + static_cast(this->num_rows_)); + return *(this->data_ + (r*(r+1)) / 2 + c); + // Duplicating code from PackedMatrix.h + } + + inline Real &operator() (MatrixIndexT r, MatrixIndexT c) { + if (static_cast(c) > + static_cast(r)) + std::swap(c, r); + // c<=r now so don't have to check c. + KALDI_ASSERT(static_cast(r) < + static_cast(this->num_rows_)); + return *(this->data_ + (r * (r + 1)) / 2 + c); + // Duplicating code from PackedMatrix.h + } + + SpMatrix& operator=(const SpMatrix &other) { + PackedMatrix::operator=(other); + return *this; + } + + using PackedMatrix::Scale; + + /// matrix inverse. + /// if inverse_needed = false, will fill matrix with garbage. + /// (only useful if logdet wanted). + void Invert(Real *logdet = NULL, Real *det_sign= NULL, + bool inverse_needed = true); + + // Below routine does inversion in double precision, + // even for single-precision object. + void InvertDouble(Real *logdet = NULL, Real *det_sign = NULL, + bool inverse_needed = true); + + /// Returns maximum ratio of singular values.
+ inline Real Cond() const { + Matrix tmp(*this); + return tmp.Cond(); + } + + /// Takes matrix to a fraction power via Svd. + /// Will throw exception if matrix is not positive semidefinite + /// (to within a tolerance) + void ApplyPow(Real exponent); + + /// This is the version of SVD that we implement for symmetric positive + /// definite matrices. This exists for historical reasons; right now its + /// internal implementation is the same as Eig(). It computes the eigenvalue + /// decomposition (*this) = P * diag(s) * P^T with P orthogonal. Will throw + /// exception if input is not positive semidefinite to within a tolerance. + void SymPosSemiDefEig(VectorBase *s, MatrixBase *P, + Real tolerance = 0.001) const; + + /// Solves the symmetric eigenvalue problem: at end we should have (*this) = P + /// * diag(s) * P^T. We solve the problem using the symmetric QR method. + /// P may be NULL. + /// Implemented in qr.cc. + /// If you need the eigenvalues sorted, the function SortSvd declared in + /// kaldi-matrix is suitable. + void Eig(VectorBase *s, MatrixBase *P = NULL) const; + + /// This function gives you, approximately, the largest eigenvalues of the + /// symmetric matrix and the corresponding eigenvectors. (largest meaning, + /// further from zero). It does this by doing a SVD within the Krylov + /// subspace generated by this matrix and a random vector. This is + /// a form of the Lanczos method with complete reorthogonalization, followed + /// by SVD within a smaller dimension ("lanczos_dim"). + /// + /// If *this is m by m, s should be of dimension n and P should be of + /// dimension m by n, with n <= m. The *columns* of P are the approximate + /// eigenvectors; P * diag(s) * P^T would be a low-rank reconstruction of + /// *this. The columns of P will be orthogonal, and the elements of s will be + /// the eigenvalues of *this projected into that subspace, but beyond that + /// there are no exact guarantees. 
(This is because the convergence of this + /// method is statistical). Note: it only makes sense to use this + /// method if you are in very high dimension and n is substantially smaller + /// than m: for example, if you want the 100 top eigenvalues of a 10k by 10k + /// matrix. This function calls Rand() to initialize the lanczos + /// iterations and also for restarting. + /// If lanczos_dim is zero, it will default to the greater of: + /// s->Dim() + 50 or s->Dim() + s->Dim()/2, but not more than this->Dim(). + /// If lanczos_dim == this->Dim(), you might as well just call the function + /// Eig() since the result will be the same, and Eig() would be faster; the + /// whole point of this function is to reduce the dimension of the SVD + /// computation. + void TopEigs(VectorBase *s, MatrixBase *P, + MatrixIndexT lanczos_dim = 0) const; + + + /// Returns the maximum of the absolute values of any of the + /// eigenvalues. + Real MaxAbsEig() const; + + void PrintEigs(const char *name) { + Vector s((*this).NumRows()); + Matrix P((*this).NumRows(), (*this).NumCols()); + SymPosSemiDefEig(&s, &P); + KALDI_LOG << "PrintEigs: " << name << ": " << s; + } + + bool IsPosDef() const; // returns true if Cholesky succeeds. + void AddSp(const Real alpha, const SpMatrix &Ma) { + this->AddPacked(alpha, Ma); + } + + /// Computes log determinant but only for +ve-def matrices + /// (it uses Cholesky). + /// If matrix is not +ve-def, it will throw an exception + /// was LogPDDeterminant() + Real LogPosDefDet() const; + + Real LogDet(Real *det_sign = NULL) const; + + /// rank-one update, this <-- this + alpha v v' + template + void AddVec2(const Real alpha, const VectorBase &v); + + /// rank-two update, this <-- this + alpha (v w' + w v'). 
+ void AddVecVec(const Real alpha, const VectorBase &v, + const VectorBase &w); + + /// Does *this = beta * *this + alpha * diag(v) * S * diag(v) + void AddVec2Sp(const Real alpha, const VectorBase &v, + const SpMatrix &S, const Real beta); + + /// diagonal update, this <-- this + diag(v) + /// NOTE(review): alpha is presumably a scale on diag(v); confirm against + /// the definition. + template + void AddDiagVec(const Real alpha, const VectorBase &v); + + /// rank-N update: + /// if (transM == kNoTrans) + /// (*this) = beta*(*this) + alpha * M * M^T, + /// or (if transM == kTrans) + /// (*this) = beta*(*this) + alpha * M^T * M + /// Note: beta used to default to 0.0. + void AddMat2(const Real alpha, const MatrixBase &M, + MatrixTransposeType transM, const Real beta); + + /// Extension of rank-N update: + /// this <-- beta*this + alpha * M * A * M^T. + /// (*this) and A are allowed to be the same. + /// If transM == kTrans, then we do it as M^T * A * M. + void AddMat2Sp(const Real alpha, const MatrixBase &M, + MatrixTransposeType transM, const SpMatrix &A, + const Real beta = 0.0); + + /// This is a version of AddMat2Sp specialized for when M is fairly sparse. + /// This was required for making the raw-fMLLR code efficient. + void AddSmat2Sp(const Real alpha, const MatrixBase &M, + MatrixTransposeType transM, const SpMatrix &A, + const Real beta = 0.0); + + /// The following function does: + /// this <-- beta*this + alpha * T * A * T^T. + /// (*this) and A are allowed to be the same. + /// If transM == kTrans, then we do it as alpha * T^T * A * T. + /// Currently it just calls AddMat2Sp, but if needed we + /// can implement it more efficiently. + void AddTp2Sp(const Real alpha, const TpMatrix &T, + MatrixTransposeType transM, const SpMatrix &A, + const Real beta = 0.0); + + /// The following function does: + /// this <-- beta*this + alpha * T * T^T. + /// If transM == kTrans, then we do it as alpha * T^T * T + /// Currently it just calls AddMat2, but if needed we + /// can implement it more efficiently.
+ void AddTp2(const Real alpha, const TpMatrix &T, + MatrixTransposeType transM, const Real beta = 0.0); + + /// Extension of rank-N update: + /// this <-- beta*this + alpha * M * diag(v) * M^T. + /// if transM == kTrans, then + /// this <-- beta*this + alpha * M^T * diag(v) * M. + void AddMat2Vec(const Real alpha, const MatrixBase &M, + MatrixTransposeType transM, const VectorBase &v, + const Real beta = 0.0); + + + /// Floors this symmetric matrix to the matrix + /// alpha * Floor, where the matrix Floor is positive + /// definite. + /// It is floored in the sense that after flooring, + /// x^T (*this) x >= x^T (alpha*Floor) x. + /// This is accomplished using an Svd. It will crash + /// if Floor is not positive definite. Returns the number of + /// elements that were floored. + int ApplyFloor(const SpMatrix &Floor, Real alpha = 1.0, + bool verbose = false); + + /// Floor: Given a positive semidefinite matrix, floors the eigenvalues + /// to the specified quantity. A previous version of this function had + /// a tolerance which is now no longer needed since we have code to + /// do the symmetric eigenvalue decomposition and no longer use the SVD + /// code for that purpose. + int ApplyFloor(Real floor); + + bool IsDiagonal(Real cutoff = 1.0e-05) const; + bool IsUnit(Real cutoff = 1.0e-05) const; + bool IsZero(Real cutoff = 1.0e-05) const; + bool IsTridiagonal(Real cutoff = 1.0e-05) const; + + /// sqrt of sum of square elements. + Real FrobeniusNorm() const; + + /// Returns true if ((*this)-other).FrobeniusNorm() <= + /// tol*(*this).FrobeniusNorm() + bool ApproxEqual(const SpMatrix &other, float tol = 0.01) const; + + // LimitCond: + // Limits the condition of symmetric positive semidefinite matrix to + // a specified value + // by flooring all eigenvalues to a positive number which is some multiple + // of the largest one (or zero if there are no positive eigenvalues).
+ // Takes the condition number we are willing to accept, and floors + // eigenvalues to the largest eigenvalue divided by this. + // Returns #eigs floored or already equal to the floor. + // Throws exception if input is not positive definite. + // returns #floored. + MatrixIndexT LimitCond(Real maxCond = 1.0e+5, bool invert = false); + + // as LimitCond but all done in double precision. // returns #floored. + MatrixIndexT LimitCondDouble(Real maxCond = 1.0e+5, bool invert = false) { + SpMatrix dmat(*this); + MatrixIndexT ans = dmat.LimitCond(maxCond, invert); + (*this).CopyFromSp(dmat); + return ans; + } + Real Trace() const; + + /// Tridiagonalize the matrix with an orthogonal transformation. If + /// *this starts as S, produce T (and Q, if non-NULL) such that + /// T = Q A Q^T, i.e. S = Q^T T Q. Caution: this is the other way + /// round from most authors (it's more efficient in row-major indexing). + void Tridiagonalize(MatrixBase *Q); + + /// The symmetric QR algorithm. This will mostly be useful in internal code. + /// Typically, you will call this after Tridiagonalize(), on the same object. + /// When called, *this (call it A at this point) must be tridiagonal; at exit, + /// *this will be a diagonal matrix D that is similar to A via orthogonal + /// transformations. This algorithm right-multiplies Q by orthogonal + /// transformations. It turns *this from a tridiagonal into a diagonal matrix + /// while maintaining that (Q *this Q^T) has the same value at entry and exit. + /// At entry Q should probably be either NULL or orthogonal, but we don't check + /// this. + void Qr(MatrixBase *Q); + + private: + void EigInternal(VectorBase *s, MatrixBase *P, + Real tolerance, int recurse) const; +}; + +/// @} end of "addtogroup matrix_group" + +/// \addtogroup matrix_funcs_scalar +/// @{ + + +/// Returns tr(A B). 
+float TraceSpSp(const SpMatrix &A, const SpMatrix &B); +double TraceSpSp(const SpMatrix &A, const SpMatrix &B); + + +template +inline bool ApproxEqual(const SpMatrix &A, + const SpMatrix &B, Real tol = 0.01) { + return A.ApproxEqual(B, tol); +} + +template +inline void AssertEqual(const SpMatrix &A, + const SpMatrix &B, Real tol = 0.01) { + KALDI_ASSERT(ApproxEqual(A, B, tol)); +} + + + +/// Returns tr(A B). +template +Real TraceSpSp(const SpMatrix &A, const SpMatrix &B); + + + +// TraceSpSpLower is the same as Trace(A B) except the lower-diagonal elements +// are counted only once not twice as they should be. It is useful in certain +// optimizations. +template +Real TraceSpSpLower(const SpMatrix &A, const SpMatrix &B); + + +/// Returns tr(A B). +/// No option to transpose B because would make no difference. +template +Real TraceSpMat(const SpMatrix &A, const MatrixBase &B); + +/// Returns tr(A B C) +/// (A and C may be transposed as specified by transA and transC). +template +Real TraceMatSpMat(const MatrixBase &A, MatrixTransposeType transA, + const SpMatrix &B, const MatrixBase &C, + MatrixTransposeType transC); + +/// Returns tr (A B C D) +/// (A and C may be transposed as specified by transA and transB). +template +Real TraceMatSpMatSp(const MatrixBase &A, MatrixTransposeType transA, + const SpMatrix &B, const MatrixBase &C, + MatrixTransposeType transC, const SpMatrix &D); + +/** Computes v1^T * M * v2. Not as efficient as it could be where v1 == v2 + * (but no suitable blas routines available). + */ + +/// Returns \f$ v_1^T M v_2 \f$ +/// Not as efficient as it could be where v1 == v2. +template +Real VecSpVec(const VectorBase &v1, const SpMatrix &M, + const VectorBase &v2); + + +/// @} \addtogroup matrix_funcs_scalar + +/// \addtogroup matrix_funcs_misc +/// @{ + + +/// This class describes the options for maximizing various quadratic objective +/// functions. 
It's mostly as described in the SGMM paper "the subspace +/// Gaussian mixture model -- a structured model for speech recognition", but +/// the diagonal_precondition option is newly added, to handle problems where +/// different dimensions have very different scaling (we recommend to use the +/// option but it's set false for back compatibility). +struct SolverOptions { + BaseFloat K; // maximum condition number + BaseFloat eps; + std::string name; + bool optimize_delta; + bool diagonal_precondition; + bool print_debug_output; + explicit SolverOptions(const std::string &name): + K(1.0e+4), eps(1.0e-40), name(name), + optimize_delta(true), diagonal_precondition(false), + print_debug_output(true) { } + SolverOptions(): K(1.0e+4), eps(1.0e-40), name("[unknown]"), + optimize_delta(true), diagonal_precondition(false), + print_debug_output(true) { } + void Check() const; +}; + + +/// Maximizes the auxiliary function +/// \f[ Q(x) = x.g - 0.5 x^T H x \f] +/// using a numerically stable method. Like a numerically stable version of +/// \f$ x := Q^{-1} g. \f$ +/// Assumes H positive semidefinite. +/// Returns the objective-function change. + +template +Real SolveQuadraticProblem(const SpMatrix &H, + const VectorBase &g, + const SolverOptions &opts, + VectorBase *x); + + + +/// Maximizes the auxiliary function : +/// \f[ Q(x) = tr(M^T P Y) - 0.5 tr(P M Q M^T) \f] +/// Like a numerically stable version of \f$ M := Y Q^{-1} \f$. +/// Assumes Q and P positive semidefinite, and matrix dimensions match +/// enough to make expressions meaningful. +/// This is mostly as described in the SGMM paper "the subspace Gaussian mixture +/// model -- a structured model for speech recognition", but the +/// diagonal_precondition option is newly added, to handle problems +/// where different dimensions have very different scaling (we recommend to use +/// the option but it's set false for back compatibility). 
+template +Real SolveQuadraticMatrixProblem(const SpMatrix &Q, + const MatrixBase &Y, + const SpMatrix &P, + const SolverOptions &opts, + MatrixBase *M); + +/// Maximizes the auxiliary function : +/// \f[ Q(M) = tr(M^T G) -0.5 tr(P_1 M Q_1 M^T) -0.5 tr(P_2 M Q_2 M^T). \f] +/// Encountered in matrix update with a prior. We also apply a limit on the +/// condition but it should be less frequently necessary, and can be set larger. +template +Real SolveDoubleQuadraticMatrixProblem(const MatrixBase &G, + const SpMatrix &P1, + const SpMatrix &P2, + const SpMatrix &Q1, + const SpMatrix &Q2, + const SolverOptions &opts, + MatrixBase *M); + + +/// @} End of "addtogroup matrix_funcs_misc" + +} // namespace kaldi + + +// Including the implementation (now actually just includes some +// template specializations). +#include "matrix/sp-matrix-inl.h" + + +#endif // KALDI_MATRIX_SP_MATRIX_H_ diff --git a/speechx/speechx/kaldi/matrix/sparse-matrix.cc b/speechx/speechx/kaldi/matrix/sparse-matrix.cc new file mode 100644 index 00000000..68a61e17 --- /dev/null +++ b/speechx/speechx/kaldi/matrix/sparse-matrix.cc @@ -0,0 +1,1296 @@ +// matrix/sparse-matrix.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) +// 2015 Guoguo Chen +// 2017 Shiyin Kang + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include + +#include "matrix/sparse-matrix.h" +#include "matrix/kaldi-matrix.h" + +namespace kaldi { + +template +std::pair* SparseVector::Data() { + if (pairs_.empty()) + return NULL; + else + return &(pairs_[0]); +} + +template +const std::pair* SparseVector::Data() const { + if (pairs_.empty()) + return NULL; + else + return &(pairs_[0]); +} + +template +Real SparseVector::Sum() const { + Real sum = 0; + for (int32 i = 0; i < pairs_.size(); ++i) { + sum += pairs_[i].second; + } + return sum; +} + +template +void SparseVector::Scale(Real alpha) { + for (int32 i = 0; i < pairs_.size(); ++i) + pairs_[i].second *= alpha; +} + +template +template +void SparseVector::CopyElementsToVec(VectorBase *vec) const { + KALDI_ASSERT(vec->Dim() == this->dim_); + vec->SetZero(); + OtherReal *other_data = vec->Data(); + typename std::vector >::const_iterator + iter = pairs_.begin(), end = pairs_.end(); + for (; iter != end; ++iter) + other_data[iter->first] = iter->second; +} +template +void SparseVector::CopyElementsToVec(VectorBase *vec) const; +template +void SparseVector::CopyElementsToVec(VectorBase *vec) const; +template +void SparseVector::CopyElementsToVec(VectorBase *vec) const; +template +void SparseVector::CopyElementsToVec(VectorBase *vec) const; + +template +template +void SparseVector::AddToVec(Real alpha, + VectorBase *vec) const { + KALDI_ASSERT(vec->Dim() == dim_); + OtherReal *other_data = vec->Data(); + typename std::vector >::const_iterator + iter = pairs_.begin(), end = pairs_.end(); + if (alpha == 1.0) { // treat alpha==1.0 case specially. 
+ for (; iter != end; ++iter) + other_data[iter->first] += iter->second; + } else { + for (; iter != end; ++iter) + other_data[iter->first] += alpha * iter->second; + } +} + +template +void SparseVector::AddToVec(float alpha, VectorBase *vec) const; +template +void SparseVector::AddToVec(float alpha, VectorBase *vec) const; +template +void SparseVector::AddToVec(double alpha, VectorBase *vec) const; +template +void SparseVector::AddToVec(double alpha, + VectorBase *vec) const; + +template +template +void SparseVector::CopyFromSvec(const SparseVector &other) { + dim_ = other.Dim(); + pairs_.clear(); + if (dim_ == 0) return; + for (int32 i = 0; i < other.NumElements(); ++i) { + pairs_.push_back(std::make_pair( + other.GetElement(i).first, + static_cast(other.GetElement(i).second))); + } +} +template +void SparseVector::CopyFromSvec(const SparseVector &svec); +template +void SparseVector::CopyFromSvec(const SparseVector &svec); +template +void SparseVector::CopyFromSvec(const SparseVector &svec); +template +void SparseVector::CopyFromSvec(const SparseVector &svec); + + +// Copy-assignment: member-wise copy of the dimension and the (index, value) +// pairs. The former extra CopyFromSvec(other) call cleared pairs_ before +// reading from `other`, so self-assignment (x = x) read back an +// already-cleared vector, and every other assignment copied the data twice. +template +SparseVector& SparseVector::operator = ( + const SparseVector &other) { + dim_ = other.dim_; + pairs_ = other.pairs_; + return *this; +} + +template +void SparseVector::Swap(SparseVector *other) { + pairs_.swap(other->pairs_); + std::swap(dim_, other->dim_); +} + +template +void SparseVector::Write(std::ostream &os, bool binary) const { + if (binary) { + WriteToken(os, binary, "SV"); + WriteBasicType(os, binary, dim_); + MatrixIndexT num_elems = pairs_.size(); + WriteBasicType(os, binary, num_elems); + typename std::vector >::const_iterator + iter = pairs_.begin(), end = pairs_.end(); + for (; iter != end; ++iter) { + WriteBasicType(os, binary, iter->first); + WriteBasicType(os, binary, iter->second); + } + } else { + // In text-mode, use a human-friendly, script-friendly format; + // format is "dim=5 [ 0 0.2 3 0.9 ] " + os << "dim=" << dim_ << " [ "; + typename
std::vector >::const_iterator + iter = pairs_.begin(), end = pairs_.end(); + for (; iter != end; ++iter) + os << iter->first << ' ' << iter->second << ' '; + os << "] "; + } +} + + +template +void SparseVector::Read(std::istream &is, bool binary) { + if (binary) { + ExpectToken(is, binary, "SV"); + ReadBasicType(is, binary, &dim_); + KALDI_ASSERT(dim_ >= 0); + int32 num_elems; + ReadBasicType(is, binary, &num_elems); + KALDI_ASSERT(num_elems >= 0 && num_elems <= dim_); + pairs_.resize(num_elems); + typename std::vector >::iterator + iter = pairs_.begin(), end = pairs_.end(); + for (; iter != end; ++iter) { + ReadBasicType(is, binary, &(iter->first)); + ReadBasicType(is, binary, &(iter->second)); + } + } else { + // In text-mode, format is "dim=5 [ 0 0.2 3 0.9 ] + std::string str; + is >> str; + if (str.substr(0, 4) != "dim=") + KALDI_ERR << "Reading sparse vector, expected 'dim=xxx', got " << str; + std::string dim_str = str.substr(4, std::string::npos); + std::istringstream dim_istr(dim_str); + int32 dim = -1; + dim_istr >> dim; + if (dim < 0 || dim_istr.fail()) { + KALDI_ERR << "Reading sparse vector, expected 'dim=[int]', got " << str; + } + dim_ = dim; + is >> std::ws; + is >> str; + if (str != "[") + KALDI_ERR << "Reading sparse vector, expected '[', got " << str; + pairs_.clear(); + while (1) { + is >> std::ws; + if (is.peek() == ']') { + is.get(); + break; + } + MatrixIndexT i; + BaseFloat p; + is >> i >> p; + if (is.fail()) + KALDI_ERR << "Error reading sparse vector, expecting numbers."; + KALDI_ASSERT(i >= 0 && i < dim + && (pairs_.empty() || i > pairs_.back().first)); + pairs_.push_back(std::pair(i, p)); + } + } +} + + +namespace sparse_vector_utils { +template +struct CompareFirst { + inline bool operator() (const std::pair &p1, + const std::pair &p2) const { + return p1.first < p2.first; + } +}; +} + +template +SparseVector::SparseVector( + MatrixIndexT dim, const std::vector > &pairs): + dim_(dim), + pairs_(pairs) { + std::sort(pairs_.begin(), 
pairs_.end(), + sparse_vector_utils::CompareFirst()); + typename std::vector >::iterator + out = pairs_.begin(), in = out, end = pairs_.end(); + // special case: while there is nothing to be changed, skip over + // initial input (avoids unnecessary copying). + while (in + 1 < end && in[0].first != in[1].first && in[0].second != 0.0) { + in++; + out++; + } + while (in < end) { + // We reach this point only at the first element of + // each stretch of identical .first elements. + *out = *in; + ++in; + while (in < end && in->first == out->first) { + out->second += in->second; // this is the merge operation. + ++in; + } + if (out->second != Real(0.0)) // Don't keep zero elements. + out++; + } + pairs_.erase(out, end); + if (!pairs_.empty()) { + // range check. + KALDI_ASSERT(pairs_.front().first >= 0 && pairs_.back().first < dim_); + } +} + +template +void SparseVector::SetRandn(BaseFloat zero_prob) { + pairs_.clear(); + KALDI_ASSERT(zero_prob >= 0 && zero_prob <= 1.0); + for (MatrixIndexT i = 0; i < dim_; i++) + if (WithProb(1.0 - zero_prob)) + pairs_.push_back(std::pair(i, RandGauss())); +} + +template +void SparseVector::Resize(MatrixIndexT dim, + MatrixResizeType resize_type) { + if (resize_type != kCopyData || dim == 0) + pairs_.clear(); + KALDI_ASSERT(dim >= 0); + if (dim < dim_ && resize_type == kCopyData) + while (!pairs_.empty() && pairs_.back().first >= dim) + pairs_.pop_back(); + dim_ = dim; +} + +template +MatrixIndexT SparseMatrix::NumRows() const { + return rows_.size(); +} + +// Number of columns, taken from the dimension of the first row; a matrix +// with no rows reports zero columns. +template +MatrixIndexT SparseMatrix::NumCols() const { + if (rows_.empty()) + return 0; // integer literal rather than the floating-point 0.0. + else + return rows_[0].Dim(); +} + +template +MatrixIndexT SparseMatrix::NumElements() const { + int32 num_elements = 0; + for (int32 i = 0; i < rows_.size(); ++i) { + num_elements += rows_[i].NumElements(); + } + return num_elements; +} + +template +SparseVector* SparseMatrix::Data() { + if (rows_.empty()) + return NULL; + else + return rows_.data(); +} + +template +const SparseVector*
SparseMatrix::Data() const { + if (rows_.empty()) + return NULL; + else + return rows_.data(); +} + +template +Real SparseMatrix::Sum() const { + Real sum = 0; + for (int32 i = 0; i < rows_.size(); ++i) { + sum += rows_[i].Sum(); + } + return sum; +} + +template +Real SparseMatrix::FrobeniusNorm() const { + Real squared_sum = 0; + for (int32 i = 0; i < rows_.size(); ++i) { + const std::pair *row_data = rows_[i].Data(); + for (int32 j = 0; j < rows_[i].NumElements(); ++j) { + squared_sum += row_data[j].second * row_data[j].second; + } + } + return std::sqrt(squared_sum); +} + +template +template +void SparseMatrix::CopyToMat(MatrixBase *other, + MatrixTransposeType trans) const { + if (trans == kNoTrans) { + MatrixIndexT num_rows = rows_.size(); + KALDI_ASSERT(other->NumRows() == num_rows); + for (MatrixIndexT i = 0; i < num_rows; i++) { + SubVector vec(*other, i); + rows_[i].CopyElementsToVec(&vec); + } + } else { + OtherReal *other_col_data = other->Data(); + MatrixIndexT other_stride = other->Stride(), + num_rows = NumRows(), num_cols = NumCols(); + KALDI_ASSERT(num_rows == other->NumCols() && num_cols == other->NumRows()); + other->SetZero(); + for (MatrixIndexT row = 0; row < num_rows; row++, other_col_data++) { + const SparseVector &svec = rows_[row]; + MatrixIndexT num_elems = svec.NumElements(); + const std::pair *sdata = svec.Data(); + for (MatrixIndexT e = 0; e < num_elems; e++) + other_col_data[sdata[e].first * other_stride] = sdata[e].second; + } + } +} + +template +void SparseMatrix::CopyToMat(MatrixBase *other, + MatrixTransposeType trans) const; +template +void SparseMatrix::CopyToMat(MatrixBase *other, + MatrixTransposeType trans) const; +template +void SparseMatrix::CopyToMat(MatrixBase *other, + MatrixTransposeType trans) const; +template +void SparseMatrix::CopyToMat(MatrixBase *other, + MatrixTransposeType trans) const; + +template +void SparseMatrix::CopyElementsToVec(VectorBase *other) const { + KALDI_ASSERT(other->Dim() == NumElements()); + 
Real *dst_data = other->Data(); + int32 dst_index = 0; + for (int32 i = 0; i < rows_.size(); ++i) { + for (int32 j = 0; j < rows_[i].NumElements(); ++j) { + dst_data[dst_index] = + static_cast(rows_[i].GetElement(j).second); + dst_index++; + } + } +} + +template +template +void SparseMatrix::CopyFromSmat(const SparseMatrix &other, + MatrixTransposeType trans) { + if (trans == kNoTrans) { + rows_.resize(other.NumRows()); + if (rows_.size() == 0) + return; + for (int32 r = 0; r < rows_.size(); ++r) { + rows_[r].CopyFromSvec(other.Row(r)); + } + } else { + std::vector > > pairs( + other.NumCols()); + for (MatrixIndexT i = 0; i < other.NumRows(); ++i) { + for (int id = 0; id < other.Row(i).NumElements(); ++id) { + MatrixIndexT j = other.Row(i).GetElement(id).first; + Real v = static_cast(other.Row(i).GetElement(id).second); + pairs[j].push_back( { i, v }); + } + } + SparseMatrix temp(other.NumRows(), pairs); + Swap(&temp); + } +} +template +void SparseMatrix::CopyFromSmat(const SparseMatrix &other, + MatrixTransposeType trans); +template +void SparseMatrix::CopyFromSmat(const SparseMatrix &other, + MatrixTransposeType trans); +template +void SparseMatrix::CopyFromSmat(const SparseMatrix &other, + MatrixTransposeType trans); +template +void SparseMatrix::CopyFromSmat(const SparseMatrix &other, + MatrixTransposeType trans); + +template +void SparseMatrix::Write(std::ostream &os, bool binary) const { + if (binary) { + // Note: we can use the same marker for float and double SparseMatrix, + // because internally we use WriteBasicType and ReadBasicType to read the + // floats and doubles, and this will automatically take care of type + // conversion. + WriteToken(os, binary, "SM"); + int32 num_rows = rows_.size(); + WriteBasicType(os, binary, num_rows); + for (int32 row = 0; row < num_rows; row++) + rows_[row].Write(os, binary); + } else { + // The format is "rows=10 dim=20 [ 1 0.4 9 1.2 ] dim=20 [ 3 1.7 19 0.6 ] .. 
+ // not 100% efficient, but easy to work with, and we can re-use the + // read/write code from SparseVector. + int32 num_rows = rows_.size(); + os << "rows=" << num_rows << " "; + for (int32 row = 0; row < num_rows; row++) + rows_[row].Write(os, binary); + os << "\n"; // Might make it a little more readable. + } +} + +// Reads what Write() produces: in binary mode the "SM" token, the row count +// and then each row; in text mode "rows=N" followed by N sparse vectors. +template +void SparseMatrix::Read(std::istream &is, bool binary) { + if (binary) { + ExpectToken(is, binary, "SM"); + int32 num_rows; + ReadBasicType(is, binary, &num_rows); + KALDI_ASSERT(num_rows >= 0 && num_rows < 10000000); + rows_.resize(num_rows); + for (int32 row = 0; row < num_rows; row++) + rows_[row].Read(is, binary); + } else { + std::string str; + is >> str; + if (str.substr(0, 5) != "rows=") + KALDI_ERR << "Reading sparse matrix, expected 'rows=xxx', got " << str; + std::string rows_str = str.substr(5, std::string::npos); + std::istringstream rows_istr(rows_str); + int32 num_rows = -1; + rows_istr >> num_rows; + if (num_rows < 0 || rows_istr.fail()) { + KALDI_ERR << "Reading sparse matrix, expected 'rows=[int]', got " << str; + } + rows_.resize(num_rows); + for (int32 row = 0; row < num_rows; row++) + rows_[row].Read(is, binary); + } +} + + +template +void SparseMatrix::AddToMat(BaseFloat alpha, + MatrixBase *other, + MatrixTransposeType trans) const { + if (trans == kNoTrans) { + MatrixIndexT num_rows = rows_.size(); + KALDI_ASSERT(other->NumRows() == num_rows); + for (MatrixIndexT i = 0; i < num_rows; i++) { + SubVector vec(*other, i); + rows_[i].AddToVec(alpha, &vec); + } + } else { + Real *other_col_data = other->Data(); + MatrixIndexT other_stride = other->Stride(), + num_rows = NumRows(), num_cols = NumCols(); + KALDI_ASSERT(num_rows == other->NumCols() && num_cols == other->NumRows()); + for (MatrixIndexT row = 0; row < num_rows; row++, other_col_data++) { + const SparseVector &svec = rows_[row]; + MatrixIndexT num_elems = svec.NumElements(); + const std::pair *sdata = svec.Data(); + for (MatrixIndexT e = 0; e < num_elems; e++) +
other_col_data[sdata[e].first * other_stride] += + alpha * sdata[e].second; + } + } +} + +template +Real VecSvec(const VectorBase &vec, + const SparseVector &svec) { + KALDI_ASSERT(vec.Dim() == svec.Dim()); + MatrixIndexT n = svec.NumElements(); + const std::pair *sdata = svec.Data(); + const Real *data = vec.Data(); + Real ans = 0.0; + for (MatrixIndexT i = 0; i < n; i++) + ans += data[sdata[i].first] * sdata[i].second; + return ans; +} + +template +float VecSvec(const VectorBase &vec, + const SparseVector &svec); +template +double VecSvec(const VectorBase &vec, + const SparseVector &svec); + +template +const SparseVector &SparseMatrix::Row(MatrixIndexT r) const { + KALDI_ASSERT(static_cast(r) < rows_.size()); + return rows_[r]; +} + +template +void SparseMatrix::SetRow(int32 r, const SparseVector &vec) { + KALDI_ASSERT(static_cast(r) < rows_.size() && + vec.Dim() == rows_[0].Dim()); + rows_[r] = vec; +} + + +template +void SparseMatrix::SelectRows(const std::vector &row_indexes, + const SparseMatrix &smat_other) { + Resize(row_indexes.size(), smat_other.NumCols()); + for (int i = 0; i < row_indexes.size(); ++i) { + SetRow(i, smat_other.Row(row_indexes[i])); + } +} + +template +SparseMatrix::SparseMatrix(const std::vector &indexes, int32 dim, + MatrixTransposeType trans) { + const std::vector& idx = indexes; + std::vector > > pair(idx.size()); + for (int i = 0; i < idx.size(); ++i) { + if (idx[i] >= 0) { + pair[i].push_back( { idx[i], Real(1) }); + } + } + SparseMatrix smat_cpu(dim, pair); + if (trans == kNoTrans) { + this->Swap(&smat_cpu); + } else { + SparseMatrix tmp(smat_cpu, kTrans); + this->Swap(&tmp); + } +} + +template +SparseMatrix::SparseMatrix(const std::vector &indexes, + const VectorBase &weights, int32 dim, + MatrixTransposeType trans) { + const std::vector& idx = indexes; + const VectorBase& w = weights; + std::vector > > pair(idx.size()); + for (int i = 0; i < idx.size(); ++i) { + if (idx[i] >= 0) { + pair[i].push_back( { idx[i], w(i) }); + } + } + 
SparseMatrix smat_cpu(dim, pair); + if (trans == kNoTrans) { + this->Swap(&smat_cpu); + } else { + SparseMatrix tmp(smat_cpu, kTrans); + this->Swap(&tmp); + } +} + +template +SparseMatrix& SparseMatrix::operator = ( + const SparseMatrix &other) { + rows_ = other.rows_; + return *this; +} + +template +void SparseMatrix::Swap(SparseMatrix *other) { + rows_.swap(other->rows_); +} + +template +SparseMatrix::SparseMatrix( + MatrixIndexT dim, + const std::vector > > &pairs): + rows_(pairs.size()) { + MatrixIndexT num_rows = pairs.size(); + for (MatrixIndexT row = 0; row < num_rows; row++) { + SparseVector svec(dim, pairs[row]); + rows_[row].Swap(&svec); + } +} + +template +void SparseMatrix::SetRandn(BaseFloat zero_prob) { + MatrixIndexT num_rows = rows_.size(); + for (MatrixIndexT row = 0; row < num_rows; row++) + rows_[row].SetRandn(zero_prob); +} + +template +void SparseMatrix::Resize(MatrixIndexT num_rows, + MatrixIndexT num_cols, + MatrixResizeType resize_type) { + KALDI_ASSERT(num_rows >= 0 && num_cols >= 0); + if (resize_type == kSetZero || resize_type == kUndefined) { + rows_.clear(); + Resize(num_rows, num_cols, kCopyData); + } else { + // Assume resize_type == kCopyData from here. 
+ int32 old_num_rows = rows_.size(), old_num_cols = NumCols(); + SparseVector initializer(num_cols); + rows_.resize(num_rows, initializer); + if (num_cols != old_num_cols) + for (int32 row = 0; row < old_num_rows; row++) + rows_[row].Resize(num_cols, kCopyData); + } +} + +template +void SparseMatrix::AppendSparseMatrixRows( + std::vector > *inputs) { + rows_.clear(); + size_t num_rows = 0; + typename std::vector >::iterator + input_iter = inputs->begin(), + input_end = inputs->end(); + for (; input_iter != input_end; ++input_iter) + num_rows += input_iter->rows_.size(); + rows_.resize(num_rows); + typename std::vector >::iterator + row_iter = rows_.begin(), + row_end = rows_.end(); + for (input_iter = inputs->begin(); input_iter != input_end; ++input_iter) { + typename std::vector >::iterator + input_row_iter = input_iter->rows_.begin(), + input_row_end = input_iter->rows_.end(); + for (; input_row_iter != input_row_end; ++input_row_iter, ++row_iter) + row_iter->Swap(&(*input_row_iter)); + } + KALDI_ASSERT(row_iter == row_end); + int32 num_cols = NumCols(); + for (row_iter = rows_.begin(); row_iter != row_end; ++row_iter) { + if (row_iter->Dim() != num_cols) + KALDI_ERR << "Appending rows with inconsistent dimensions, " + << row_iter->Dim() << " vs. 
" << num_cols; + } + inputs->clear(); +} + +template +void SparseMatrix::Scale(Real alpha) { + MatrixIndexT num_rows = rows_.size(); + for (MatrixIndexT row = 0; row < num_rows; row++) + rows_[row].Scale(alpha); +} + +template +SparseMatrix::SparseMatrix(const MatrixBase &mat) { + MatrixIndexT num_rows = mat.NumRows(); + rows_.resize(num_rows); + for (int32 row = 0; row < num_rows; row++) { + SparseVector this_row(mat.Row(row)); + rows_[row].Swap(&this_row); + } +} + +template +Real TraceMatSmat(const MatrixBase &A, + const SparseMatrix &B, + MatrixTransposeType trans) { + Real sum = 0.0; + if (trans == kTrans) { + MatrixIndexT num_rows = A.NumRows(); + KALDI_ASSERT(B.NumRows() == num_rows); + for (MatrixIndexT r = 0; r < num_rows; r++) + sum += VecSvec(A.Row(r), B.Row(r)); + } else { + const Real *A_col_data = A.Data(); + MatrixIndexT Astride = A.Stride(), Acols = A.NumCols(), Arows = A.NumRows(); + KALDI_ASSERT(Arows == B.NumCols() && Acols == B.NumRows()); + sum = 0.0; + for (MatrixIndexT i = 0; i < Acols; i++, A_col_data++) { + Real col_sum = 0.0; + const SparseVector &svec = B.Row(i); + MatrixIndexT num_elems = svec.NumElements(); + const std::pair *sdata = svec.Data(); + for (MatrixIndexT e = 0; e < num_elems; e++) + col_sum += A_col_data[Astride * sdata[e].first] * sdata[e].second; + sum += col_sum; + } + } + return sum; +} + +template +float TraceMatSmat(const MatrixBase &A, + const SparseMatrix &B, + MatrixTransposeType trans); +template +double TraceMatSmat(const MatrixBase &A, + const SparseMatrix &B, + MatrixTransposeType trans); + +void GeneralMatrix::Clear() { + mat_.Resize(0, 0); + cmat_.Clear(); + smat_.Resize(0, 0); +} + +GeneralMatrix& GeneralMatrix::operator= (const MatrixBase &mat) { + Clear(); + mat_ = mat; + return *this; +} + +GeneralMatrix& GeneralMatrix::operator= (const CompressedMatrix &cmat) { + Clear(); + cmat_ = cmat; + return *this; +} + +GeneralMatrix& GeneralMatrix::operator= (const SparseMatrix &smat) { + Clear(); + smat_ = smat; + 
return *this; +} + +GeneralMatrix& GeneralMatrix::operator= (const GeneralMatrix &gmat) { + mat_ = gmat.mat_; + smat_ = gmat.smat_; + cmat_ = gmat.cmat_; + return *this; +} + + +GeneralMatrixType GeneralMatrix::Type() const { + if (smat_.NumRows() != 0) + return kSparseMatrix; + else if (cmat_.NumRows() != 0) + return kCompressedMatrix; + else + return kFullMatrix; +} + +MatrixIndexT GeneralMatrix::NumRows() const { + MatrixIndexT r = smat_.NumRows(); + if (r != 0) + return r; + r = cmat_.NumRows(); + if (r != 0) + return r; + return mat_.NumRows(); +} + +MatrixIndexT GeneralMatrix::NumCols() const { + MatrixIndexT r = smat_.NumCols(); + if (r != 0) + return r; + r = cmat_.NumCols(); + if (r != 0) + return r; + return mat_.NumCols(); +} + + +void GeneralMatrix::Compress() { + if (mat_.NumRows() != 0) { + cmat_.CopyFromMat(mat_); + mat_.Resize(0, 0); + } +} + +void GeneralMatrix::Uncompress() { + if (cmat_.NumRows() != 0) { + mat_.Resize(cmat_.NumRows(), cmat_.NumCols(), kUndefined); + cmat_.CopyToMat(&mat_); + cmat_.Clear(); + } +} + +void GeneralMatrix::GetMatrix(Matrix *mat) const { + if (mat_.NumRows() !=0) { + *mat = mat_; + } else if (cmat_.NumRows() != 0) { + mat->Resize(cmat_.NumRows(), cmat_.NumCols(), kUndefined); + cmat_.CopyToMat(mat); + } else if (smat_.NumRows() != 0) { + mat->Resize(smat_.NumRows(), smat_.NumCols(), kUndefined); + smat_.CopyToMat(mat); + } else { + mat->Resize(0, 0); + } +} + +void GeneralMatrix::CopyToMat(MatrixBase *mat, + MatrixTransposeType trans) const { + if (mat_.NumRows() !=0) { + mat->CopyFromMat(mat_, trans); + } else if (cmat_.NumRows() != 0) { + cmat_.CopyToMat(mat, trans); + } else if (smat_.NumRows() != 0) { + smat_.CopyToMat(mat, trans); + } else { + KALDI_ASSERT(mat->NumRows() == 0); + } +} + +void GeneralMatrix::Scale(BaseFloat alpha) { + if (mat_.NumRows() != 0) { + mat_.Scale(alpha); + } else if (cmat_.NumRows() != 0) { + cmat_.Scale(alpha); + } else if (smat_.NumRows() != 0) { + smat_.Scale(alpha); + } + +} +const 
SparseMatrix& GeneralMatrix::GetSparseMatrix() const { + if (mat_.NumRows() != 0 || cmat_.NumRows() != 0) + KALDI_ERR << "GetSparseMatrix called on GeneralMatrix of wrong type."; + return smat_; +} + +void GeneralMatrix::SwapSparseMatrix(SparseMatrix *smat) { + if (mat_.NumRows() != 0 || cmat_.NumRows() != 0) + KALDI_ERR << "GetSparseMatrix called on GeneralMatrix of wrong type."; + smat->Swap(&smat_); +} + +void GeneralMatrix::SwapCompressedMatrix(CompressedMatrix *cmat) { + if (mat_.NumRows() != 0 || smat_.NumRows() != 0) + KALDI_ERR << "GetSparseMatrix called on GeneralMatrix of wrong type."; + cmat->Swap(&cmat_); +} + +const CompressedMatrix &GeneralMatrix::GetCompressedMatrix() const { + if (mat_.NumRows() != 0 || smat_.NumRows() != 0) + KALDI_ERR << "GetCompressedMatrix called on GeneralMatrix of wrong type."; + return cmat_; +} + +const Matrix &GeneralMatrix::GetFullMatrix() const { + if (smat_.NumRows() != 0 || cmat_.NumRows() != 0) + KALDI_ERR << "GetFullMatrix called on GeneralMatrix of wrong type."; + return mat_; +} + + +void GeneralMatrix::SwapFullMatrix(Matrix *mat) { + if (cmat_.NumRows() != 0 || smat_.NumRows() != 0) + KALDI_ERR << "SwapMatrix called on GeneralMatrix of wrong type."; + mat->Swap(&mat_); +} + +void GeneralMatrix::Write(std::ostream &os, bool binary) const { + if (smat_.NumRows() != 0) { + smat_.Write(os, binary); + } else if (cmat_.NumRows() != 0) { + cmat_.Write(os, binary); + } else { + mat_.Write(os, binary); + } +} + +void GeneralMatrix::Read(std::istream &is, bool binary) { + Clear(); + if (binary) { + int peekval = is.peek(); + if (peekval == 'C') { + // Token CM for compressed matrix + cmat_.Read(is, binary); + } else if (peekval == 'S') { + // Token SM for sparse matrix + smat_.Read(is, binary); + } else { + mat_.Read(is, binary); + } + } else { + // note: in text mode we will only ever read regular + // or sparse matrices, because the compressed-matrix format just + // gets written as a regular matrix in text mode. 
+ is >> std::ws; // Eat up white space. + int peekval = is.peek(); + if (peekval == 'r') { // sparse format starts rows=[int]. + smat_.Read(is, binary); + } else { + mat_.Read(is, binary); + } + } +} + + +void AppendGeneralMatrixRows(const std::vector &src, + GeneralMatrix *mat) { + mat->Clear(); + int32 size = src.size(); + if (size == 0) + return; + bool all_sparse = true; + for (int32 i = 0; i < size; i++) { + if (src[i]->Type() != kSparseMatrix && src[i]->NumRows() != 0) { + all_sparse = false; + break; + } + } + if (all_sparse) { + std::vector > sparse_mats(size); + for (int32 i = 0; i < size; i++) + sparse_mats[i] = src[i]->GetSparseMatrix(); + SparseMatrix appended_mat; + appended_mat.AppendSparseMatrixRows(&sparse_mats); + mat->SwapSparseMatrix(&appended_mat); + } else { + int32 tot_rows = 0, num_cols = -1; + for (int32 i = 0; i < size; i++) { + const GeneralMatrix &src_mat = *(src[i]); + int32 src_rows = src_mat.NumRows(), src_cols = src_mat.NumCols(); + if (src_rows != 0) { + tot_rows += src_rows; + if (num_cols == -1) num_cols = src_cols; + else if (num_cols != src_cols) + KALDI_ERR << "Appending rows of matrices with inconsistent num-cols: " + << num_cols << " vs. 
" << src_cols; + } + } + Matrix appended_mat(tot_rows, num_cols, kUndefined); + int32 row_offset = 0; + for (int32 i = 0; i < size; i++) { + const GeneralMatrix &src_mat = *(src[i]); + int32 src_rows = src_mat.NumRows(); + if (src_rows != 0) { + SubMatrix dest_submat(appended_mat, row_offset, src_rows, + 0, num_cols); + src_mat.CopyToMat(&dest_submat); + row_offset += src_rows; + } + } + KALDI_ASSERT(row_offset == tot_rows); + mat->SwapFullMatrix(&appended_mat); + } +} + +void FilterCompressedMatrixRows(const CompressedMatrix &in, + const std::vector &keep_rows, + Matrix *out) { + KALDI_ASSERT(keep_rows.size() == static_cast(in.NumRows())); + int32 num_kept_rows = 0; + std::vector::const_iterator iter = keep_rows.begin(), + end = keep_rows.end(); + for (; iter != end; ++iter) + if (*iter) + num_kept_rows++; + if (num_kept_rows == 0) + KALDI_ERR << "No kept rows"; + if (num_kept_rows == static_cast(keep_rows.size())) { + out->Resize(in.NumRows(), in.NumCols(), kUndefined); + in.CopyToMat(out); + return; + } + const BaseFloat heuristic = 0.33; + // should be > 0 and < 1.0. represents the performance hit we get from + // iterating row-wise versus column-wise in compressed-matrix uncompression. + + if (num_kept_rows > heuristic * in.NumRows()) { + // if quite a few of the the rows are kept, it may be more efficient + // to uncompress the entire compressed matrix, since per-column operation + // is more efficient. 
+ Matrix full_mat(in); + FilterMatrixRows(full_mat, keep_rows, out); + } else { + out->Resize(num_kept_rows, in.NumCols(), kUndefined); + + iter = keep_rows.begin(); + int32 out_row = 0; + for (int32 in_row = 0; iter != end; ++iter, ++in_row) { + if (*iter) { + SubVector dest(*out, out_row); + in.CopyRowToVec(in_row, &dest); + out_row++; + } + } + KALDI_ASSERT(out_row == num_kept_rows); + } +} + +template +void FilterMatrixRows(const Matrix &in, + const std::vector &keep_rows, + Matrix *out) { + KALDI_ASSERT(keep_rows.size() == static_cast(in.NumRows())); + int32 num_kept_rows = 0; + std::vector::const_iterator iter = keep_rows.begin(), + end = keep_rows.end(); + for (; iter != end; ++iter) + if (*iter) + num_kept_rows++; + if (num_kept_rows == 0) + KALDI_ERR << "No kept rows"; + if (num_kept_rows == static_cast(keep_rows.size())) { + *out = in; + return; + } + out->Resize(num_kept_rows, in.NumCols(), kUndefined); + iter = keep_rows.begin(); + int32 out_row = 0; + for (int32 in_row = 0; iter != end; ++iter, ++in_row) { + if (*iter) { + SubVector src(in, in_row); + SubVector dest(*out, out_row); + dest.CopyFromVec(src); + out_row++; + } + } + KALDI_ASSERT(out_row == num_kept_rows); +} + +template +void FilterMatrixRows(const Matrix &in, + const std::vector &keep_rows, + Matrix *out); +template +void FilterMatrixRows(const Matrix &in, + const std::vector &keep_rows, + Matrix *out); + +template +void FilterSparseMatrixRows(const SparseMatrix &in, + const std::vector &keep_rows, + SparseMatrix *out) { + KALDI_ASSERT(keep_rows.size() == static_cast(in.NumRows())); + int32 num_kept_rows = 0; + std::vector::const_iterator iter = keep_rows.begin(), + end = keep_rows.end(); + for (; iter != end; ++iter) + if (*iter) + num_kept_rows++; + if (num_kept_rows == 0) + KALDI_ERR << "No kept rows"; + if (num_kept_rows == static_cast(keep_rows.size())) { + *out = in; + return; + } + out->Resize(num_kept_rows, in.NumCols(), kUndefined); + iter = keep_rows.begin(); + int32 out_row = 
0; + for (int32 in_row = 0; iter != end; ++iter, ++in_row) { + if (*iter) { + out->SetRow(out_row, in.Row(in_row)); + out_row++; + } + } + KALDI_ASSERT(out_row == num_kept_rows); +} + +template +void FilterSparseMatrixRows(const SparseMatrix &in, + const std::vector &keep_rows, + SparseMatrix *out); +template +void FilterSparseMatrixRows(const SparseMatrix &in, + const std::vector &keep_rows, + SparseMatrix *out); + + +void FilterGeneralMatrixRows(const GeneralMatrix &in, + const std::vector &keep_rows, + GeneralMatrix *out) { + out->Clear(); + KALDI_ASSERT(keep_rows.size() == static_cast(in.NumRows())); + int32 num_kept_rows = 0; + std::vector::const_iterator iter = keep_rows.begin(), + end = keep_rows.end(); + for (; iter != end; ++iter) + if (*iter) + num_kept_rows++; + if (num_kept_rows == 0) + KALDI_ERR << "No kept rows"; + if (num_kept_rows == static_cast(keep_rows.size())) { + *out = in; + return; + } + switch (in.Type()) { + case kCompressedMatrix: { + const CompressedMatrix &cmat = in.GetCompressedMatrix(); + Matrix full_mat; + FilterCompressedMatrixRows(cmat, keep_rows, &full_mat); + out->SwapFullMatrix(&full_mat); + return; + } + case kSparseMatrix: { + const SparseMatrix &smat = in.GetSparseMatrix(); + SparseMatrix smat_out; + FilterSparseMatrixRows(smat, keep_rows, &smat_out); + out->SwapSparseMatrix(&smat_out); + return; + } + case kFullMatrix: { + const Matrix &full_mat = in.GetFullMatrix(); + Matrix full_mat_out; + FilterMatrixRows(full_mat, keep_rows, &full_mat_out); + out->SwapFullMatrix(&full_mat_out); + return; + } + default: + KALDI_ERR << "Invalid general-matrix type."; + } +} + +void GeneralMatrix::AddToMat(BaseFloat alpha, MatrixBase *mat, + MatrixTransposeType trans) const { + switch (this->Type()) { + case kFullMatrix: { + mat->AddMat(alpha, mat_, trans); + break; + } + case kSparseMatrix: { + smat_.AddToMat(alpha, mat, trans); + break; + } + case kCompressedMatrix: { + Matrix temp_mat(cmat_); + mat->AddMat(alpha, temp_mat, trans); + 
break; + } + default: + KALDI_ERR << "Invalid general-matrix type."; + } +} + +template +Real SparseVector::Max(int32 *index_out) const { + KALDI_ASSERT(dim_ > 0 && pairs_.size() <= static_cast(dim_)); + Real ans = -std::numeric_limits::infinity(); + int32 index = 0; + typename std::vector >::const_iterator + iter = pairs_.begin(), end = pairs_.end(); + for (; iter != end; ++iter) { + if (iter->second > ans) { + ans = iter->second; + index = iter->first; + } + } + if (ans >= 0 || pairs_.size() == dim_) { + // ans >= 0 will be the normal case. + // if pairs_.size() == dim_ then we need to return + // even a negative answer as there are no spaces (hence no unlisted zeros). + *index_out = index; + return ans; + } + // all the stored elements are < 0, but there are unlisted + // elements -> pick the first unlisted element. + // Note that this class requires that the indexes are sorted + // and unique. + index = 0; // "index" will always be the next index, that + // we haven't seen listed yet. + iter = pairs_.begin(); + for (; iter != end; ++iter) { + if (iter->first > index) { // index "index" is not listed. + *index_out = index; + return 0.0; + } else { + // index is the next potential gap in the indexes. + index = iter->first + 1; + } + } + // we can reach here if either pairs_.empty(), or + // pairs_ is nonempty but contains a sequence (0, 1, 2,...). 
+ if (!pairs_.empty()) + index = pairs_.back().first + 1; + // else leave index at zero + KALDI_ASSERT(index < dim_); + *index_out = index; + return 0.0; +} + +template +SparseVector::SparseVector(const VectorBase &vec) { + MatrixIndexT dim = vec.Dim(); + dim_ = dim; + if (dim == 0) + return; + const Real *ptr = vec.Data(); + for (MatrixIndexT i = 0; i < dim; i++) { + Real val = ptr[i]; + if (val != 0.0) + pairs_.push_back(std::pair(i,val)); + } +} + +void GeneralMatrix::Swap(GeneralMatrix *other) { + mat_.Swap(&(other->mat_)); + cmat_.Swap(&(other->cmat_)); + smat_.Swap(&(other->smat_)); +} + + +void ExtractRowRangeWithPadding( + const GeneralMatrix &in, + int32 row_offset, + int32 num_rows, + GeneralMatrix *out) { + // make sure 'out' is empty to start with. + Matrix empty_mat; + *out = empty_mat; + if (num_rows == 0) return; + switch (in.Type()) { + case kFullMatrix: { + const Matrix &mat_in = in.GetFullMatrix(); + int32 num_rows_in = mat_in.NumRows(), num_cols = mat_in.NumCols(); + KALDI_ASSERT(num_rows_in > 0); // we can't extract >0 rows from an empty + // matrix. + Matrix mat_out(num_rows, num_cols, kUndefined); + for (int32 row = 0; row < num_rows; row++) { + int32 row_in = row + row_offset; + if (row_in < 0) row_in = 0; + else if (row_in >= num_rows_in) row_in = num_rows_in - 1; + SubVector vec_in(mat_in, row_in), + vec_out(mat_out, row); + vec_out.CopyFromVec(vec_in); + } + out->SwapFullMatrix(&mat_out); + break; + } + case kSparseMatrix: { + const SparseMatrix &smat_in = in.GetSparseMatrix(); + int32 num_rows_in = smat_in.NumRows(), + num_cols = smat_in.NumCols(); + KALDI_ASSERT(num_rows_in > 0); // we can't extract >0 rows from an empty + // matrix. 
+ SparseMatrix smat_out(num_rows, num_cols); + for (int32 row = 0; row < num_rows; row++) { + int32 row_in = row + row_offset; + if (row_in < 0) row_in = 0; + else if (row_in >= num_rows_in) row_in = num_rows_in - 1; + smat_out.SetRow(row, smat_in.Row(row_in)); + } + out->SwapSparseMatrix(&smat_out); + break; + } + case kCompressedMatrix: { + const CompressedMatrix &cmat_in = in.GetCompressedMatrix(); + bool allow_padding = true; + CompressedMatrix cmat_out(cmat_in, row_offset, num_rows, + 0, cmat_in.NumCols(), allow_padding); + out->SwapCompressedMatrix(&cmat_out); + break; + } + default: + KALDI_ERR << "Bad matrix type."; + } +} + + + +template class SparseVector; +template class SparseVector; +template class SparseMatrix; +template class SparseMatrix; + +} // namespace kaldi diff --git a/speechx/speechx/kaldi/matrix/sparse-matrix.h b/speechx/speechx/kaldi/matrix/sparse-matrix.h new file mode 100644 index 00000000..76f77f53 --- /dev/null +++ b/speechx/speechx/kaldi/matrix/sparse-matrix.h @@ -0,0 +1,452 @@ +// matrix/sparse-matrix.h + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) +// 2015 Guoguo Chen +// 2017 Shiyin Kang + + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef KALDI_MATRIX_SPARSE_MATRIX_H_ +#define KALDI_MATRIX_SPARSE_MATRIX_H_ 1 + +#include +#include + +#include "matrix/matrix-common.h" +#include "matrix/kaldi-matrix.h" +#include "matrix/kaldi-vector.h" +#include "matrix/compressed-matrix.h" + +namespace kaldi { + + +/// \addtogroup matrix_group +/// @{ + +template +class SparseVector { + public: + MatrixIndexT Dim() const { return dim_; } + + Real Sum() const; + + template + void CopyElementsToVec(VectorBase *vec) const; + + // *vec += alpha * *this. + template + void AddToVec(Real alpha, + VectorBase *vec) const; + + template + void CopyFromSvec(const SparseVector &other); + + SparseVector &operator = (const SparseVector &other); + + SparseVector(const SparseVector &other) { *this = other; } + + void Swap(SparseVector *other); + + // Returns the maximum value in this row and outputs the index associated with + // it. This is not the index into the Data() pointer, it is the index into + // the vector it represents, i.e. the .first value in the pair. + // If this vector's Dim() is zero it is an error to call this function. + // If all the elements stored were negative and there underlying vector had + // zero indexes not listed in the elements, or if no elements are stored, it + // will return the first un-listed index, whose value (implicitly) is zero. + Real Max(int32 *index) const; + + /// Returns the number of nonzero elements. + MatrixIndexT NumElements() const { return pairs_.size(); } + + /// get an indexed element (0 <= i < NumElements()). + const std::pair &GetElement(MatrixIndexT i) const { + return pairs_[i]; + } + + // returns pointer to element data, or NULL if empty (use with NumElements()). + std::pair *Data(); + + // returns pointer to element data, or NULL if empty (use with NumElements()); + // const version + const std::pair *Data() const; + + /// Sets elements to zero with probability zero_prob, else normally + /// distributed. Useful in testing. 
+ void SetRandn(BaseFloat zero_prob); + + SparseVector(): dim_(0) { } + + explicit SparseVector(MatrixIndexT dim): dim_(dim) { KALDI_ASSERT(dim >= 0); } + + // constructor from pairs; does not assume input pairs are sorted and uniq + SparseVector(MatrixIndexT dim, + const std::vector > &pairs); + + // constructor from a VectorBase that keeps only the nonzero elements of 'vec'. + explicit SparseVector(const VectorBase &vec); + + /// Resizes to this dimension. resize_type == kUndefined + /// behaves the same as kSetZero. + void Resize(MatrixIndexT dim, MatrixResizeType resize_type = kSetZero); + + void Write(std::ostream &os, bool binary) const; + + void Read(std::istream &os, bool binary); + + /// Scale all elements of sparse vector. + void Scale(Real alpha); + + private: + MatrixIndexT dim_; + // pairs of (row-index, value). Stored in sorted order with no duplicates. + // For now we use std::vector, but we could change this. + std::vector > pairs_; +}; + + +template +Real VecSvec(const VectorBase &vec, + const SparseVector &svec); + + + +template +class SparseMatrix { + public: + MatrixIndexT NumRows() const; + + MatrixIndexT NumCols() const; + + MatrixIndexT NumElements() const; + + Real Sum() const; + + Real FrobeniusNorm() const; + + + /// This constructor creates a SparseMatrix that just contains the nonzero + /// elements of 'mat'. + explicit SparseMatrix(const MatrixBase &mat); + + /// Copy to matrix. It must already have the correct size. + template + void CopyToMat(MatrixBase *other, + MatrixTransposeType t = kNoTrans) const; + + /// Copies the values of all the elements in SparseMatrix into a VectorBase + /// object. + void CopyElementsToVec(VectorBase *other) const; + + /// Copies data from another sparse matrix. + template + void CopyFromSmat(const SparseMatrix &other, + MatrixTransposeType trans = kNoTrans); + + /// Does *other = *other + alpha * *this. 
+ void AddToMat(BaseFloat alpha, MatrixBase *other, + MatrixTransposeType t = kNoTrans) const; + + SparseMatrix &operator = (const SparseMatrix &other); + + SparseMatrix(const SparseMatrix &other, MatrixTransposeType trans = + kNoTrans) { + this->CopyFromSmat(other, trans); + } + + void Swap(SparseMatrix *other); + + // returns pointer to element data, or NULL if empty (use with NumElements()). + SparseVector *Data(); + + // returns pointer to element data, or NULL if empty (use with NumElements()); + // const version + const SparseVector *Data() const; + + // initializer from the type that elsewhere in Kaldi is referred to as type + // Posterior. indexed first by row-index; the pairs are (column-index, value), + // and the constructor does not require them to be sorted and uniq. + SparseMatrix( + int32 dim, + const std::vector > > &pairs); + + /// Sets up to a pseudo-randomly initialized matrix, with each element zero + /// with probability zero_prob and else normally distributed- mostly for + /// purposes of testing. + void SetRandn(BaseFloat zero_prob); + + void Write(std::ostream &os, bool binary) const; + + void Read(std::istream &os, bool binary); + + const SparseVector &Row(MatrixIndexT r) const; + + /// Sets row r to "vec"; makes sure it has the correct dimension. + void SetRow(int32 r, const SparseVector &vec); + + /// Select a subset of the rows of a SparseMatrix. + /// Sets *this to only the rows of 'smat_other' that are listed + /// in 'row_indexes'. + /// 'row_indexes' must satisfy 0 <= row_indexes[i] < smat_other.NumRows(). + void SelectRows(const std::vector &row_indexes, + const SparseMatrix &smat_other); + + + /// Sets *this to all the rows of *inputs appended together; this + /// function is destructive of the inputs. Requires, obviously, + /// that the inputs all have the same dimension (although some may be + /// empty). 
+ void AppendSparseMatrixRows(std::vector > *inputs); + + SparseMatrix() { } + + SparseMatrix(int32 num_rows, int32 num_cols) { Resize(num_rows, num_cols); } + + /// Constructor from an array of indexes. + /// If trans == kNoTrans, construct a sparse matrix + /// with num-rows == indexes.Dim() and num-cols = 'dim'. + /// 'indexes' is expected to contain elements in the + /// range [0, dim - 1]. Each row 'i' of *this after + /// calling the constructor will contain a single + /// element at column-index indexes[i] with value 1.0. + /// + /// If trans == kTrans, the result will be the transpose + /// of the sparse matrix described above. + SparseMatrix(const std::vector &indexes, int32 dim, + MatrixTransposeType trans = kNoTrans); + + /// Constructor from an array of indexes and an array of + /// weights; requires indexes.Dim() == weights.Dim(). + /// If trans == kNoTrans, construct a sparse matrix + /// with num-rows == indexes.Dim() and num-cols = 'dim'. + /// 'indexes' is expected to contain elements in the + /// range [0, dim - 1]. Each row 'i' of *this after + /// calling the constructor will contain a single + /// element at column-index indexes[i] with value weights[i]. + /// If trans == kTrans, the result will be the transpose + /// of the sparse matrix described above. + SparseMatrix(const std::vector &indexes, + const VectorBase &weights, int32 dim, + MatrixTransposeType trans = kNoTrans); + + /// Resizes the matrix; analogous to Matrix::Resize(). resize_type == + /// kUndefined behaves the same as kSetZero. + void Resize(MatrixIndexT rows, MatrixIndexT cols, + MatrixResizeType resize_type = kSetZero); + + /// Scale all elements in sparse matrix. + void Scale(Real alpha); + + // Use the Matrix::CopyFromSmat() function to copy from this to Matrix. Also + // see Matrix::AddSmat(). There is not very extensive functionality for + // SparseMat just yet (e.g. no matrix multiply); we will add things as needed + // and as it seems necessary. 
+ private: + // vector of SparseVectors, all of same dime (use an stl vector for now; this + // could change). + std::vector > rows_; +}; + + +template +Real TraceMatSmat(const MatrixBase &A, + const SparseMatrix &B, + MatrixTransposeType trans = kNoTrans); + + +enum GeneralMatrixType { + kFullMatrix, + kCompressedMatrix, + kSparseMatrix +}; + +/// This class is a wrapper that enables you to store a matrix +/// in one of three forms: either as a Matrix, or a CompressedMatrix, +/// or a SparseMatrix. It handles the I/O for you, i.e. you read +/// and write a single object type. It is useful for neural-net training +/// targets which might be sparse or not, and might be compressed or not. +class GeneralMatrix { + public: + /// Returns the type of the matrix: kSparseMatrix, kCompressedMatrix or + /// kFullMatrix. If this matrix is empty, returns kFullMatrix. + GeneralMatrixType Type() const; + + void Compress(); // If it was a full matrix, compresses, changing Type() to + // kCompressedMatrix; otherwise does nothing. + + void Uncompress(); // If it was a compressed matrix, uncompresses, changing + // Type() to kFullMatrix; otherwise does nothing. + + void Write(std::ostream &os, bool binary) const; + + + /// Note: if you write a compressed matrix in text form, it will be read as + /// a regular full matrix. + void Read(std::istream &is, bool binary); + + /// Returns the contents as a SparseMatrix. This will only work if + /// Type() returns kSparseMatrix, or NumRows() == 0; otherwise it will crash. + const SparseMatrix &GetSparseMatrix() const; + + /// Swaps the with the given SparseMatrix. This will only work if + /// Type() returns kSparseMatrix, or NumRows() == 0. + void SwapSparseMatrix(SparseMatrix *smat); + + /// Returns the contents as a compressed matrix. This will only work if + /// Type() returns kCompressedMatrix, or NumRows() == 0; otherwise it will + /// crash. 
+ const CompressedMatrix &GetCompressedMatrix() const; + + /// Swaps the with the given CompressedMatrix. This will only work if + /// Type() returns kCompressedMatrix, or NumRows() == 0. + void SwapCompressedMatrix(CompressedMatrix *cmat); + + /// Returns the contents as a Matrix. This will only work if + /// Type() returns kFullMatrix, or NumRows() == 0; otherwise it will crash. + const Matrix& GetFullMatrix() const; + + /// Outputs the contents as a matrix. This will work regardless of + /// Type(). Sizes its output, unlike CopyToMat(). + void GetMatrix(Matrix *mat) const; + + /// Swaps the with the given Matrix. This will only work if + /// Type() returns kFullMatrix, or NumRows() == 0. + void SwapFullMatrix(Matrix *mat); + + /// Copies contents, regardless of type, to "mat", which must be correctly + /// sized. See also GetMatrix(), which will size its output for you. + void CopyToMat(MatrixBase *mat, + MatrixTransposeType trans = kNoTrans) const; + + /// Copies contents, regardless of type, to "cu_mat", which must be + /// correctly sized. Implemented in ../cudamatrix/cu-sparse-matrix.cc + void CopyToMat(CuMatrixBase *cu_mat, + MatrixTransposeType trans = kNoTrans) const; + + /// Adds alpha times *this to mat. + void AddToMat(BaseFloat alpha, MatrixBase *mat, + MatrixTransposeType trans = kNoTrans) const; + + /// Adds alpha times *this to cu_mat. + /// Implemented in ../cudamatrix/cu-sparse-matrix.cc + void AddToMat(BaseFloat alpha, CuMatrixBase *cu_mat, + MatrixTransposeType trans = kNoTrans) const; + + /// Scale each element of matrix by alpha. + void Scale(BaseFloat alpha); + + /// Assignment from regular matrix. + GeneralMatrix &operator= (const MatrixBase &mat); + + /// Assignment from compressed matrix. 
+ GeneralMatrix &operator= (const CompressedMatrix &mat); + + /// Assignment from SparseMatrix + GeneralMatrix &operator= (const SparseMatrix &smat); + + MatrixIndexT NumRows() const; + + MatrixIndexT NumCols() const; + + explicit GeneralMatrix(const MatrixBase &mat) { *this = mat; } + + explicit GeneralMatrix(const CompressedMatrix &cmat) { *this = cmat; } + + explicit GeneralMatrix(const SparseMatrix &smat) { *this = smat; } + + GeneralMatrix() { } + // Assignment operator. + GeneralMatrix &operator =(const GeneralMatrix &other); + // Copy constructor + GeneralMatrix(const GeneralMatrix &other) { *this = other; } + // Sets to the empty matrix. + void Clear(); + // shallow swap + void Swap(GeneralMatrix *other); + private: + // We don't explicitly store the type of the matrix. Rather, we make + // sure that only one of the matrices is ever nonempty, and the Type() + // returns that one, or kFullMatrix if all are empty. + Matrix mat_; + CompressedMatrix cmat_; + SparseMatrix smat_; +}; + + +/// Appends all the matrix rows of a list of GeneralMatrixes, to get a single +/// GeneralMatrix. Preserves sparsity if all inputs were sparse (or empty). +/// Does not preserve compression, if inputs were compressed; you have to +/// re-compress manually, if that's what you need. +void AppendGeneralMatrixRows(const std::vector &src, + GeneralMatrix *mat); + + +/// Outputs a SparseMatrix containing only the rows r of "in" such that +/// keep_rows[r] == true. keep_rows.size() must equal in.NumRows(), and rows +/// must contain at least one "true" element. +template +void FilterSparseMatrixRows(const SparseMatrix &in, + const std::vector &keep_rows, + SparseMatrix *out); + +/// Outputs a Matrix containing only the rows r of "in" such that +/// keep_keep_rows[r] == true. keep_rows.size() must equal in.NumRows(), and +/// keep_rows must contain at least one "true" element. 
+template +void FilterMatrixRows(const Matrix &in, + const std::vector &keep_rows, + Matrix *out); + +/// Outputs a Matrix containing only the rows r of "in" such that +/// keep_rows[r] == true. keep_rows.size() must equal in.NumRows(), and rows +/// must contain at least one "true" element. +void FilterCompressedMatrixRows(const CompressedMatrix &in, + const std::vector &keep_rows, + Matrix *out); + + +/// Outputs a GeneralMatrix containing only the rows r of "in" such that +/// keep_rows[r] == true. keep_rows.size() must equal in.NumRows(), and +/// keep_rows must contain at least one "true" element. If in.Type() is +/// kCompressedMatrix, the result will not be compressed; otherwise, the type +/// is preserved. +void FilterGeneralMatrixRows(const GeneralMatrix &in, + const std::vector &keep_rows, + GeneralMatrix *out); + +/// This function extracts a row-range of a GeneralMatrix and writes +/// as a GeneralMatrix containing the same type of underlying +/// matrix. If the row-range is partly outside the row-range of 'in' +/// (i.e. if row_offset < 0 or row_offset + num_rows > in.NumRows()) +/// then it will pad with copies of the first and last row as +/// needed. +/// This is more efficient than un-compressing and +/// re-compressing the underlying CompressedMatrix, and causes +/// less accuracy loss due to re-compression (no loss in most cases). +void ExtractRowRangeWithPadding( + const GeneralMatrix &in, + int32 row_offset, + int32 num_rows, + GeneralMatrix *out); + + +/// @} end of \addtogroup matrix_group + + +} // namespace kaldi + +#endif // KALDI_MATRIX_SPARSE_MATRIX_H_ diff --git a/speechx/speechx/kaldi/matrix/srfft.cc b/speechx/speechx/kaldi/matrix/srfft.cc new file mode 100644 index 00000000..f6189496 --- /dev/null +++ b/speechx/speechx/kaldi/matrix/srfft.cc @@ -0,0 +1,440 @@ +// matrix/srfft.cc + +// Copyright 2009-2011 Microsoft Corporation; Go Vivace Inc. 
+ +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. +// + +// This file includes a modified version of code originally published in Malvar, +// H., "Signal processing with lapped transforms, " Artech House, Inc., 1992. The +// current copyright holder of the original code, Henrique S. Malvar, has given +// his permission for the release of this modified version under the Apache +// License v2.0. + + +#include "matrix/srfft.h" +#include "matrix/matrix-functions.h" + +namespace kaldi { + + +template +SplitRadixComplexFft::SplitRadixComplexFft(MatrixIndexT N) { + if ( (N & (N-1)) != 0 || N <= 1) + KALDI_ERR << "SplitRadixComplexFft called with invalid number of points " + << N; + N_ = N; + logn_ = 0; + while (N > 1) { + N >>= 1; + logn_ ++; + } + ComputeTables(); +} + +template +SplitRadixComplexFft::SplitRadixComplexFft( + const SplitRadixComplexFft &other): + N_(other.N_), logn_(other.logn_) { + // This code duplicates tables from a previously computed object. + // Compare with the code in ComputeTables(). 
+ MatrixIndexT lg2 = logn_ >> 1; + if (logn_ & 1) lg2++; + MatrixIndexT brseed_size = 1 << lg2; + brseed_ = new MatrixIndexT[brseed_size]; + std::memcpy(brseed_, other.brseed_, sizeof(MatrixIndexT) * brseed_size); + + if (logn_ < 4) { + tab_ = NULL; + } else { + tab_ = new Real*[logn_ - 3]; + for (MatrixIndexT i = logn_; i >= 4 ; i--) { + MatrixIndexT m = 1 << i, m2 = m / 2, m4 = m2 / 2; + MatrixIndexT this_array_size = 6 * (m4 - 2); + tab_[i-4] = new Real[this_array_size]; + std::memcpy(tab_[i-4], other.tab_[i-4], + sizeof(Real) * this_array_size); + } + } +} + +template +void SplitRadixComplexFft::ComputeTables() { + MatrixIndexT imax, lg2, i, j; + MatrixIndexT m, m2, m4, m8, nel, n; + Real *cn, *spcn, *smcn, *c3n, *spc3n, *smc3n; + Real ang, c, s; + + lg2 = logn_ >> 1; + if (logn_ & 1) lg2++; + brseed_ = new MatrixIndexT[1 << lg2]; + brseed_[0] = 0; + brseed_[1] = 1; + for (j = 2; j <= lg2; j++) { + imax = 1 << (j - 1); + for (i = 0; i < imax; i++) { + brseed_[i] <<= 1; + brseed_[i + imax] = brseed_[i] + 1; + } + } + + if (logn_ < 4) { + tab_ = NULL; + } else { + tab_ = new Real* [logn_-3]; + for (i = logn_; i>=4 ; i--) { + /* Compute a few constants */ + m = 1 << i; m2 = m / 2; m4 = m2 / 2; m8 = m4 /2; + + /* Allocate memory for tables */ + nel = m4 - 2; + + tab_[i-4] = new Real[6*nel]; + + /* Initialize pointers */ + cn = tab_[i-4]; spcn = cn + nel; smcn = spcn + nel; + c3n = smcn + nel; spc3n = c3n + nel; smc3n = spc3n + nel; + + /* Compute tables */ + for (n = 1; n < m4; n++) { + if (n == m8) continue; + ang = n * M_2PI / m; + c = std::cos(ang); s = std::sin(ang); + *cn++ = c; *spcn++ = - (s + c); *smcn++ = s - c; + ang = 3 * n * M_2PI / m; + c = std::cos(ang); s = std::sin(ang); + *c3n++ = c; *spc3n++ = - (s + c); *smc3n++ = s - c; + } + } + } +} + +template +SplitRadixComplexFft::~SplitRadixComplexFft() { + delete [] brseed_; + if (tab_ != NULL) { + for (MatrixIndexT i = 0; i < logn_-3; i++) + delete [] tab_[i]; + delete [] tab_; + } +} + +template +void 
SplitRadixComplexFft::Compute(Real *xr, Real *xi, bool forward) const { + if (!forward) { // reverse real and imaginary parts for complex FFT. + Real *tmp = xr; + xr = xi; + xi = tmp; + } + ComputeRecursive(xr, xi, logn_); + if (logn_ > 1) { + BitReversePermute(xr, logn_); + BitReversePermute(xi, logn_); + } +} + +template +void SplitRadixComplexFft::Compute(Real *x, bool forward, + std::vector *temp_buffer) const { + KALDI_ASSERT(temp_buffer != NULL); + if (temp_buffer->size() != N_) + temp_buffer->resize(N_); + Real *temp_ptr = &((*temp_buffer)[0]); + for (MatrixIndexT i = 0; i < N_; i++) { + x[i] = x[i * 2]; // put the real part in the first half of x. + temp_ptr[i] = x[i * 2 + 1]; // put the imaginary part in temp_buffer. + } + // copy the imaginary part back to the second half of x. + memcpy(static_cast(x + N_), + static_cast(temp_ptr), + sizeof(Real) * N_); + + Compute(x, x + N_, forward); + // Now change the format back to interleaved. + memcpy(static_cast(temp_ptr), + static_cast(x + N_), + sizeof(Real) * N_); + for (MatrixIndexT i = N_-1; i > 0; i--) { // don't include 0, + // in case MatrixIndexT is unsigned, the loop would not terminate. + // Treat it as a special case. + x[i*2] = x[i]; + x[i*2 + 1] = temp_ptr[i]; + } + x[1] = temp_ptr[0]; // special case of i = 0. 
+} + +template +void SplitRadixComplexFft::Compute(Real *x, bool forward) { + this->Compute(x, forward, &temp_buffer_); +} + +template +void SplitRadixComplexFft::BitReversePermute(Real *x, MatrixIndexT logn) const { + MatrixIndexT i, j, lg2, n; + MatrixIndexT off, fj, gno, *brp; + Real tmp, *xp, *xq; + + lg2 = logn >> 1; + n = 1 << lg2; + if (logn & 1) lg2++; + + /* Unshuffling loop */ + for (off = 1; off < n; off++) { + fj = n * brseed_[off]; i = off; j = fj; + tmp = x[i]; x[i] = x[j]; x[j] = tmp; + xp = &x[i]; + brp = &(brseed_[1]); + for (gno = 1; gno < brseed_[off]; gno++) { + xp += n; + j = fj + *brp++; + xq = x + j; + tmp = *xp; *xp = *xq; *xq = tmp; + } + } +} + + +template +void SplitRadixComplexFft::ComputeRecursive(Real *xr, Real *xi, MatrixIndexT logn) const { + + MatrixIndexT m, m2, m4, m8, nel, n; + Real *xr1, *xr2, *xi1, *xi2; + Real *cn = nullptr, *spcn = nullptr, *smcn = nullptr, *c3n = nullptr, + *spc3n = nullptr, *smc3n = nullptr; + Real tmp1, tmp2; + Real sqhalf = M_SQRT1_2; + + /* Check range of logn */ + if (logn < 0) + KALDI_ERR << "Error: logn is out of bounds in SRFFT"; + + /* Compute trivial cases */ + if (logn < 3) { + if (logn == 2) { /* length m = 4 */ + xr2 = xr + 2; + xi2 = xi + 2; + tmp1 = *xr + *xr2; + *xr2 = *xr - *xr2; + *xr = tmp1; + tmp1 = *xi + *xi2; + *xi2 = *xi - *xi2; + *xi = tmp1; + xr1 = xr + 1; + xi1 = xi + 1; + xr2++; + xi2++; + tmp1 = *xr1 + *xr2; + *xr2 = *xr1 - *xr2; + *xr1 = tmp1; + tmp1 = *xi1 + *xi2; + *xi2 = *xi1 - *xi2; + *xi1 = tmp1; + xr2 = xr + 1; + xi2 = xi + 1; + tmp1 = *xr + *xr2; + *xr2 = *xr - *xr2; + *xr = tmp1; + tmp1 = *xi + *xi2; + *xi2 = *xi - *xi2; + *xi = tmp1; + xr1 = xr + 2; + xi1 = xi + 2; + xr2 = xr + 3; + xi2 = xi + 3; + tmp1 = *xr1 + *xi2; + tmp2 = *xi1 + *xr2; + *xi1 = *xi1 - *xr2; + *xr2 = *xr1 - *xi2; + *xr1 = tmp1; + *xi2 = tmp2; + return; + } + else if (logn == 1) { /* length m = 2 */ + xr2 = xr + 1; + xi2 = xi + 1; + tmp1 = *xr + *xr2; + *xr2 = *xr - *xr2; + *xr = tmp1; + tmp1 = *xi + 
*xi2; + *xi2 = *xi - *xi2; + *xi = tmp1; + return; + } + else if (logn == 0) return; /* length m = 1 */ + } + + /* Compute a few constants */ + m = 1 << logn; m2 = m / 2; m4 = m2 / 2; m8 = m4 /2; + + + /* Step 1 */ + xr1 = xr; xr2 = xr1 + m2; + xi1 = xi; xi2 = xi1 + m2; + for (n = 0; n < m2; n++) { + tmp1 = *xr1 + *xr2; + *xr2 = *xr1 - *xr2; + xr2++; + *xr1++ = tmp1; + tmp2 = *xi1 + *xi2; + *xi2 = *xi1 - *xi2; + xi2++; + *xi1++ = tmp2; + } + + /* Step 2 */ + xr1 = xr + m2; xr2 = xr1 + m4; + xi1 = xi + m2; xi2 = xi1 + m4; + for (n = 0; n < m4; n++) { + tmp1 = *xr1 + *xi2; + tmp2 = *xi1 + *xr2; + *xi1 = *xi1 - *xr2; + xi1++; + *xr2++ = *xr1 - *xi2; + *xr1++ = tmp1; + *xi2++ = tmp2; + // xr1++; xr2++; xi1++; xi2++; + } + + /* Steps 3 & 4 */ + xr1 = xr + m2; xr2 = xr1 + m4; + xi1 = xi + m2; xi2 = xi1 + m4; + if (logn >= 4) { + nel = m4 - 2; + cn = tab_[logn-4]; spcn = cn + nel; smcn = spcn + nel; + c3n = smcn + nel; spc3n = c3n + nel; smc3n = spc3n + nel; + } + xr1++; xr2++; xi1++; xi2++; + // xr1++; xi1++; + for (n = 1; n < m4; n++) { + if (n == m8) { + tmp1 = sqhalf * (*xr1 + *xi1); + *xi1 = sqhalf * (*xi1 - *xr1); + *xr1 = tmp1; + tmp2 = sqhalf * (*xi2 - *xr2); + *xi2 = -sqhalf * (*xr2 + *xi2); + *xr2 = tmp2; + } else { + tmp2 = *cn++ * (*xr1 + *xi1); + tmp1 = *spcn++ * *xr1 + tmp2; + *xr1 = *smcn++ * *xi1 + tmp2; + *xi1 = tmp1; + tmp2 = *c3n++ * (*xr2 + *xi2); + tmp1 = *spc3n++ * *xr2 + tmp2; + *xr2 = *smc3n++ * *xi2 + tmp2; + *xi2 = tmp1; + } + xr1++; xr2++; xi1++; xi2++; + } + + /* Call ssrec again with half DFT length */ + ComputeRecursive(xr, xi, logn-1); + + /* Call ssrec again twice with one quarter DFT length. + Constants have to be recomputed, because they are static! 
*/ + // m = 1 << logn; m2 = m / 2; + ComputeRecursive(xr + m2, xi + m2, logn - 2); + // m = 1 << logn; + m4 = 3 * (m / 4); + ComputeRecursive(xr + m4, xi + m4, logn - 2); +} + + +template +void SplitRadixRealFft::Compute(Real *data, bool forward) { + Compute(data, forward, &this->temp_buffer_); +} + + +// This code is mostly the same as the RealFft function. It would be +// possible to replace it with more efficient code from Rico's book. +template +void SplitRadixRealFft::Compute(Real *data, bool forward, + std::vector *temp_buffer) const { + MatrixIndexT N = N_, N2 = N/2; + KALDI_ASSERT(N%2 == 0); + if (forward) // call to base class + SplitRadixComplexFft::Compute(data, true, temp_buffer); + + Real rootN_re, rootN_im; // exp(-2pi/N), forward; exp(2pi/N), backward + int forward_sign = forward ? -1 : 1; + ComplexImExp(static_cast(M_2PI/N *forward_sign), &rootN_re, &rootN_im); + Real kN_re = -forward_sign, kN_im = 0.0; // exp(-2pik/N), forward; exp(-2pik/N), backward + // kN starts out as 1.0 for forward algorithm but -1.0 for backward. + for (MatrixIndexT k = 1; 2*k <= N2; k++) { + ComplexMul(rootN_re, rootN_im, &kN_re, &kN_im); + + Real Ck_re, Ck_im, Dk_re, Dk_im; + // C_k = 1/2 (B_k + B_{N/2 - k}^*) : + Ck_re = 0.5 * (data[2*k] + data[N - 2*k]); + Ck_im = 0.5 * (data[2*k + 1] - data[N - 2*k + 1]); + // re(D_k)= 1/2 (im(B_k) + im(B_{N/2-k})): + Dk_re = 0.5 * (data[2*k + 1] + data[N - 2*k + 1]); + // im(D_k) = -1/2 (re(B_k) - re(B_{N/2-k})) + Dk_im =-0.5 * (data[2*k] - data[N - 2*k]); + // A_k = C_k + 1^(k/N) D_k: + data[2*k] = Ck_re; // A_k <-- C_k + data[2*k+1] = Ck_im; + // now A_k += D_k 1^(k/N) + ComplexAddProduct(Dk_re, Dk_im, kN_re, kN_im, &(data[2*k]), &(data[2*k+1])); + + MatrixIndexT kdash = N2 - k; + if (kdash != k) { + // Next we handle the index k' = N/2 - k. This is necessary + // to do now, to avoid invalidating data that we will later need. 
+ // The quantities C_{k'} and D_{k'} are just the conjugates of C_k + // and D_k, so the equations are simple modifications of the above, + // replacing Ck_im and Dk_im with their negatives. + data[2*kdash] = Ck_re; // A_k' <-- C_k' + data[2*kdash+1] = -Ck_im; + // now A_k' += D_k' 1^(k'/N) + // We use 1^(k'/N) = 1^((N/2 - k) / N) = 1^(1/2) 1^(-k/N) = -1 * (1^(k/N))^* + // so it's the same as 1^(k/N) but with the real part negated. + ComplexAddProduct(Dk_re, -Dk_im, -kN_re, kN_im, &(data[2*kdash]), &(data[2*kdash+1])); + } + } + + { // Now handle k = 0. + // In simple terms: after the complex fft, data[0] becomes the sum of real + // parts input[0], input[2]... and data[1] becomes the sum of imaginary + // pats input[1], input[3]... + // "zeroth" [A_0] is just the sum of input[0]+input[1]+input[2].. + // and "n2th" [A_{N/2}] is input[0]-input[1]+input[2]... . + Real zeroth = data[0] + data[1], + n2th = data[0] - data[1]; + data[0] = zeroth; + data[1] = n2th; + if (!forward) { + data[0] /= 2; + data[1] /= 2; + } + } + if (!forward) { // call to base class + SplitRadixComplexFft::Compute(data, false, temp_buffer); + for (MatrixIndexT i = 0; i < N; i++) + data[i] *= 2.0; + // This is so we get a factor of N increase, rather than N/2 which we would + // otherwise get from [ComplexFft, forward] + [ComplexFft, backward] in dimension N/2. + // It's for consistency with our normal FFT convensions. + } +} + +template class SplitRadixComplexFft; +template class SplitRadixComplexFft; +template class SplitRadixRealFft; +template class SplitRadixRealFft; + + +} // end namespace kaldi diff --git a/speechx/speechx/kaldi/matrix/srfft.h b/speechx/speechx/kaldi/matrix/srfft.h new file mode 100644 index 00000000..98ff782a --- /dev/null +++ b/speechx/speechx/kaldi/matrix/srfft.h @@ -0,0 +1,141 @@ +// matrix/srfft.h + +// Copyright 2009-2011 Microsoft Corporation; Go Vivace Inc. 
+//                2014  Daniel Povey
+//
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+//
+// This file includes a modified version of code originally published in Malvar,
+// H., "Signal processing with lapped transforms, " Artech House, Inc., 1992. The
+// current copyright holder of the original code, Henrique S. Malvar, has given
+// his permission for the release of this modified version under the Apache
+// License v2.0.
+
+#ifndef KALDI_MATRIX_SRFFT_H_
+#define KALDI_MATRIX_SRFFT_H_
+
+#include "matrix/kaldi-vector.h"
+#include "matrix/kaldi-matrix.h"
+
+namespace kaldi {
+
+/// @addtogroup matrix_funcs_misc
+/// @{
+
+
+// This class is based on code by Henrique (Rico) Malvar, from his book
+// "Signal Processing with Lapped Transforms" (1992). Copied with
+// permission, optimized by Go Vivace Inc., and converted into C++ by
+// Microsoft Corporation
+// This is a more efficient way of doing the complex FFT than ComplexFft
+// (declared in matrix-functions.h), but it only works for powers of 2.
+// Note: in multi-threaded code, you would need to have one of these objects per
+// thread, because multiple calls to Compute in parallel would not work.
+template<typename Real>
+class SplitRadixComplexFft {
+ public:
+  typedef MatrixIndexT Integer;
+
+  // N is the number of complex points (must be a power of two, or this
+  // will crash). Note that the constructor does some work so it's best to
+  // initialize the object once and do the computation many times.
+  SplitRadixComplexFft(Integer N);
+
+  // Copy constructor
+  SplitRadixComplexFft(const SplitRadixComplexFft &other);
+
+  // Does the FFT computation, given pointers to the real and
+  // imaginary parts. If "forward", do the forward FFT; else
+  // do the inverse FFT (without the 1/N factor).
+  // xr and xi are pointers to zero-based arrays of size N,
+  // containing the real and imaginary parts
+  // respectively.
+  void Compute(Real *xr, Real *xi, bool forward) const;
+
+  // This version of Compute takes a single array of size N*2,
+  // containing [ r0 im0 r1 im1 ... ]. Otherwise its behavior is the
+  // same as the version above.
+  void Compute(Real *x, bool forward);
+
+
+  // This version of Compute is const; it operates on an array of size N*2
+  // containing [ r0 im0 r1 im1 ... ], but it uses the argument "temp_buffer" as
+  // temporary storage instead of a class-member variable. It will allocate it if
+  // needed.
+  void Compute(Real *x, bool forward, std::vector<Real> *temp_buffer) const;
+
+  ~SplitRadixComplexFft();
+
+ protected:
+  // temp_buffer_ is allocated only if someone calls Compute with only one Real*
+  // argument and we need a temporary buffer while creating interleaved data.
+  std::vector<Real> temp_buffer_;
+ private:
+  void ComputeTables();
+  void ComputeRecursive(Real *xr, Real *xi, Integer logn) const;
+  void BitReversePermute(Real *x, Integer logn) const;
+
+  Integer N_;
+  Integer logn_;  // log(N)
+
+  Integer *brseed_;
+  // brseed is Evans' seed table, ref: (Ref: D. M. W.
+  // Evans, "An improved digit-reversal permutation algorithm ...",
+  // IEEE Trans. ASSP, Aug. 1987, pp. 1120-1125).
+  Real **tab_;  // Tables of butterfly coefficients.
+
+  // Disallow assignment.
+  SplitRadixComplexFft &operator =(const SplitRadixComplexFft &other);
+};
+
+template<typename Real>
+class SplitRadixRealFft: private SplitRadixComplexFft<Real> {
+ public:
+  SplitRadixRealFft(MatrixIndexT N):  // will fail unless N>=4 and N is a power of 2.
+      SplitRadixComplexFft<Real> (N/2), N_(N) { }
+
+  // Copy constructor
+  SplitRadixRealFft(const SplitRadixRealFft &other):
+      SplitRadixComplexFft<Real>(other), N_(other.N_) { }
+
+  /// If forward == true, this function transforms from a sequence of N real points to its complex fourier
+  /// transform; otherwise it goes in the reverse direction. If you call it
+  /// in the forward and then reverse direction and multiply by 1.0/N, you
+  /// will get back the original data.
+  /// The interpretation of the complex-FFT data is as follows: the array
+  /// is a sequence of complex numbers C_n of length N/2 with (real, im) format,
+  /// i.e. [real0, real_{N/2}, real1, im1, real2, im2, real3, im3, ...].
+  void Compute(Real *x, bool forward);
+
+
+  /// This is as the other Compute() function, but it is a const version that
+  /// uses a user-supplied buffer.
+  void Compute(Real *x, bool forward, std::vector<Real> *temp_buffer) const;
+
+ private:
+  // Disallow assignment.
+  SplitRadixRealFft &operator =(const SplitRadixRealFft &other);
+  int N_;
+};
+
+
+/// @} end of "addtogroup matrix_funcs_misc"
+
+}  // end namespace kaldi
+
+
+#endif
+
diff --git a/speechx/speechx/kaldi/matrix/tp-matrix.cc b/speechx/speechx/kaldi/matrix/tp-matrix.cc
new file mode 100644
index 00000000..6e34dc64
--- /dev/null
+++ b/speechx/speechx/kaldi/matrix/tp-matrix.cc
@@ -0,0 +1,145 @@
+// matrix/tp-matrix.cc
+
+// Copyright 2009-2011 Ondrej Glembek; Lukas Burget; Microsoft Corporation
+// Saarland University; Yanmin Qian; Haihua Xu
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "matrix/tp-matrix.h"
+#include "matrix/sp-matrix.h"
+#include "matrix/kaldi-matrix.h"
+#include "matrix/cblas-wrappers.h"
+
+
+namespace kaldi {
+
+#ifndef HAVE_ATLAS
+template<typename Real>
+void TpMatrix<Real>::Invert() {
+  // these are CLAPACK types
+  KaldiBlasInt result;
+  KaldiBlasInt rows = static_cast<KaldiBlasInt>(this->num_rows_);
+
+  // clapack call
+  // NOTE: Even though "U" is for upper, lapack assumes column-wise storage
+  // of the data. We have a row-wise storage, therefore, we need to "invert"
+  clapack_Xtptri(&rows, this->data_, &result);
+
+  if (result < 0) {
+    KALDI_ERR << "Call to CLAPACK stptri_ function failed";
+  } else if (result > 0) {
+    KALDI_ERR << "Matrix is singular";
+  }
+}
+#else
+template<typename Real>
+void TpMatrix<Real>::Invert() {
+  // ATLAS doesn't implement triangular matrix inversion in packed
+  // format, so we temporarily put in non-packed format.
+  Matrix<Real> tmp(*this);
+  int rows = static_cast<int>(this->num_rows_);
+
+  // ATLAS call. It's really row-major ordering and a lower triangular matrix,
+  // but there is some weirdness with Fortran-style indexing that we need to
+  // take account of, so everything gets swapped.
+  int result = clapack_Xtrtri( rows, tmp.Data(), tmp.Stride());
+  // Let's hope ATLAS has the same return value conventions as clapack.
+  // I couldn't find any documentation online.
+  if (result < 0) {
+    KALDI_ERR << "Call to ATLAS strtri function failed";
+  } else if (result > 0) {
+    KALDI_ERR << "Matrix is singular";
+  }
+  (*this).CopyFromMat(tmp);
+}
+#endif
+
+template<typename Real>
+Real TpMatrix<Real>::Determinant() {
+  double det = 1.0;
+  for (MatrixIndexT i = 0; i < this->NumRows(); i++) {
+    det *= (*this)(i, i);
+  }
+  return static_cast<Real>(det);
+}
+
+
+template<typename Real>
+void TpMatrix<Real>::Swap(TpMatrix<Real> *other) {
+  std::swap(this->data_, other->data_);
+  std::swap(this->num_rows_, other->num_rows_);
+}
+
+
+template<typename Real>
+void TpMatrix<Real>::Cholesky(const SpMatrix<Real> &orig) {
+  KALDI_ASSERT(orig.NumRows() == this->NumRows());
+  MatrixIndexT n = this->NumRows();
+  this->SetZero();
+  Real *data = this->data_, *jdata = data;  // start of j'th row of matrix.
+  const Real *orig_jdata = orig.Data();  // start of j'th row of matrix.
+  for (MatrixIndexT j = 0; j < n; j++, jdata += j, orig_jdata += j) {
+    Real *kdata = data;  // start of k'th row of matrix.
+    Real d(0.0);
+    for (MatrixIndexT k = 0; k < j; k++, kdata += k) {
+      Real s = cblas_Xdot(k, kdata, 1, jdata, 1);
+      // (*this)(j, k) = s = (orig(j, k) - s)/(*this)(k, k);
+      jdata[k] = s = (orig_jdata[k] - s)/kdata[k];
+      d = d + s*s;
+    }
+    // d = orig(j, j) - d;
+    d = orig_jdata[j] - d;
+
+    if (d >= 0.0) {
+      // (*this)(j, j) = std::sqrt(d);
+      jdata[j] = std::sqrt(d);
+    } else {
+      KALDI_ERR << "Cholesky decomposition failed. Maybe matrix "
+          "is not positive definite.";
+    }
+  }
+}
+
+template<typename Real>
+void TpMatrix<Real>::CopyFromMat(const MatrixBase<Real> &M,
+                                 MatrixTransposeType Trans) {
+  if (Trans == kNoTrans) {
+    KALDI_ASSERT(this->NumRows() == M.NumRows() && M.NumRows() == M.NumCols());
+    MatrixIndexT D = this->NumRows();
+    const Real *in_i = M.Data();
+    MatrixIndexT stride = M.Stride();
+    Real *out_i = this->data_;
+    for (MatrixIndexT i = 0; i < D; i++, in_i += stride, out_i += i)
+      for (MatrixIndexT j = 0; j <= i; j++)
+        out_i[j] = in_i[j];
+  } else {
+    KALDI_ASSERT(this->NumRows() == M.NumRows() && M.NumRows() == M.NumCols());
+    MatrixIndexT D = this->NumRows();
+    const Real *in_i = M.Data();
+    MatrixIndexT stride = M.Stride();
+    Real *out_i = this->data_;
+    for (MatrixIndexT i = 0; i < D; i++, in_i++, out_i += i) {
+      for (MatrixIndexT j = 0; j <= i; j++)
+        out_i[j] = in_i[stride*j];
+    }
+  }
+}
+
+
+template class TpMatrix<float>;
+template class TpMatrix<double>;
+
+}  // namespace kaldi
diff --git a/speechx/speechx/kaldi/matrix/tp-matrix.h b/speechx/speechx/kaldi/matrix/tp-matrix.h
new file mode 100644
index 00000000..e3b08701
--- /dev/null
+++ b/speechx/speechx/kaldi/matrix/tp-matrix.h
@@ -0,0 +1,134 @@
+// matrix/tp-matrix.h
+
+// Copyright 2009-2011 Ondrej Glembek; Lukas Burget; Microsoft Corporation;
+// Saarland University; Yanmin Qian; Haihua Xu
+// 2013 Johns Hopkins Universith (author: Daniel Povey)
+
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+#ifndef KALDI_MATRIX_TP_MATRIX_H_
+#define KALDI_MATRIX_TP_MATRIX_H_
+
+
+#include "matrix/packed-matrix.h"
+
+namespace kaldi {
+/// \addtogroup matrix_group
+/// @{
+
+template<typename Real> class TpMatrix;
+
+/// @brief Packed lower-triangular matrix class (upper triangle is implicitly zero)
+template<typename Real>
+class TpMatrix : public PackedMatrix<Real> {
+  friend class CuTpMatrix<double>;
+  friend class CuTpMatrix<float>;
+ public:
+  TpMatrix() : PackedMatrix<Real>() {}
+  explicit TpMatrix(MatrixIndexT r, MatrixResizeType resize_type = kSetZero)
+      : PackedMatrix<Real>(r, resize_type) {}
+  TpMatrix(const TpMatrix<Real>& orig) : PackedMatrix<Real>(orig) {}
+
+  /// Copy constructor from CUDA TpMatrix
+  /// This is defined in ../cudamatrix/cu-tp-matrix.cc
+  explicit TpMatrix(const CuTpMatrix<Real> &cu);
+
+
+  template<typename OtherReal> explicit TpMatrix(const TpMatrix<OtherReal>& orig)
+      : PackedMatrix<Real>(orig) {}
+
+  Real operator() (MatrixIndexT r, MatrixIndexT c) const {
+    if (static_cast<UnsignedMatrixIndexT>(c) >
+        static_cast<UnsignedMatrixIndexT>(r)) {
+      KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(c) <
+                   static_cast<UnsignedMatrixIndexT>(this->num_rows_));
+      return 0;
+    }
+    KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(r) <
+                 static_cast<UnsignedMatrixIndexT>(this->num_rows_));
+    // c<=r now so don't have to check c.
+    return *(this->data_ + (r*(r+1)) / 2 + c);
+    // Duplicating code from PackedMatrix.h
+  }
+
+  Real &operator() (MatrixIndexT r, MatrixIndexT c) {
+    KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(r) <
+                 static_cast<UnsignedMatrixIndexT>(this->num_rows_));
+    KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(c) <=
+                 static_cast<UnsignedMatrixIndexT>(r) &&
+                 "you cannot access the upper triangle of TpMatrix using "
+                 "a non-const matrix object.");
+    return *(this->data_ + (r*(r+1)) / 2 + c);
+    // Duplicating code from PackedMatrix.h
+  }
+  // Note: Cholesky may throw KaldiFatalError.
+  void Cholesky(const SpMatrix<Real>& orig);
+
+  void Invert();
+
+  // Inverts in double precision.
+  void InvertDouble() {
+    TpMatrix<double> dmat(*this);
+    dmat.Invert();
+    (*this).CopyFromTp(dmat);
+  }
+
+  /// Shallow swap
+  void Swap(TpMatrix<Real> *other);
+
+  /// Returns the determinant of the matrix (product of diagonals)
+  Real Determinant();
+
+  /// CopyFromMat copies the lower triangle of M into *this
+  /// (or the upper triangle, if Trans == kTrans).
+  void CopyFromMat(const MatrixBase<Real> &M,
+                   MatrixTransposeType Trans = kNoTrans);
+
+  /// This is implemented in ../cudamatrix/cu-tp-matrix.cc
+  void CopyFromMat(const CuTpMatrix<Real> &other);
+
+  /// CopyFromTp copies another triangular matrix into this one.
+  void CopyFromTp(const TpMatrix<Real> &other) {
+    PackedMatrix<Real>::CopyFromPacked(other);
+  }
+
+  template<typename OtherReal> void CopyFromTp(const TpMatrix<OtherReal> &other) {
+    PackedMatrix<Real>::CopyFromPacked(other);
+  }
+
+  /// AddTp does *this += alpha * M.
+  void AddTp(const Real alpha, const TpMatrix<Real> &M) {
+    this->AddPacked(alpha, M);
+  }
+
+  TpMatrix<Real>& operator=(const TpMatrix<Real> &other) {
+    PackedMatrix<Real>::operator=(other);
+    return *this;
+  }
+
+  using PackedMatrix<Real>::Scale;
+
+  void Resize(MatrixIndexT nRows, MatrixResizeType resize_type = kSetZero) {
+    PackedMatrix<Real>::Resize(nRows, resize_type);
+  }
+};
+
+/// @} end of "addtogroup matrix_group".
+ +} // namespace kaldi + + +#endif diff --git a/speechx/speechx/kaldi/util/CMakeLists.txt b/speechx/speechx/kaldi/util/CMakeLists.txt new file mode 100644 index 00000000..1ab26df3 --- /dev/null +++ b/speechx/speechx/kaldi/util/CMakeLists.txt @@ -0,0 +1,12 @@ +add_library(kaldi-util + kaldi-holder.cc + kaldi-io.cc + kaldi-semaphore.cc + kaldi-table.cc + kaldi-thread.cc + parse-options.cc + simple-io-funcs.cc + simple-options.cc + text-utils.cc +) +target_link_libraries(kaldi-util PUBLIC kaldi-base kaldi-matrix) \ No newline at end of file diff --git a/speechx/speechx/kaldi/util/basic-filebuf.h b/speechx/speechx/kaldi/util/basic-filebuf.h new file mode 100644 index 00000000..51cf12f4 --- /dev/null +++ b/speechx/speechx/kaldi/util/basic-filebuf.h @@ -0,0 +1,994 @@ +/////////////////////////////////////////////////////////////////////////////// +// This is a modified version of the std::basic_filebuf from libc++ +// (http://libcxx.llvm.org/). +// It allows one to create basic_filebuf from an existing FILE* handle or file +// descriptor. +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source License licenses. See LICENSE.TXT for details (included at the +// bottom). 
+/////////////////////////////////////////////////////////////////////////////// +#ifndef KALDI_UTIL_BASIC_FILEBUF_H_ +#define KALDI_UTIL_BASIC_FILEBUF_H_ + +/////////////////////////////////////////////////////////////////////////////// +#include +#include +#include +#include +#include + +/////////////////////////////////////////////////////////////////////////////// +namespace kaldi { +/////////////////////////////////////////////////////////////////////////////// +template > +class basic_filebuf : public std::basic_streambuf { + public: + typedef CharT char_type; + typedef Traits traits_type; + typedef typename traits_type::int_type int_type; + typedef typename traits_type::pos_type pos_type; + typedef typename traits_type::off_type off_type; + typedef typename traits_type::state_type state_type; + + basic_filebuf(); + basic_filebuf(basic_filebuf&& rhs); + virtual ~basic_filebuf(); + + basic_filebuf& operator=(basic_filebuf&& rhs); + void swap(basic_filebuf& rhs); + + bool is_open() const; + basic_filebuf* open(const char* s, std::ios_base::openmode mode); + basic_filebuf* open(const std::string& s, std::ios_base::openmode mode); + basic_filebuf* open(int fd, std::ios_base::openmode mode); + basic_filebuf* open(FILE* f, std::ios_base::openmode mode); + basic_filebuf* close(); + + FILE* file() { return this->_M_file; } + int fd() { return fileno(this->_M_file); } + + protected: + int_type underflow() override; + int_type pbackfail(int_type c = traits_type::eof()) override; + int_type overflow(int_type c = traits_type::eof()) override; + std::basic_streambuf* + setbuf(char_type* s, std::streamsize n) override; + pos_type seekoff(off_type off, std::ios_base::seekdir way, + std::ios_base::openmode wch = + std::ios_base::in | std::ios_base::out) override; + pos_type seekpos(pos_type sp, + std::ios_base::openmode wch = + std::ios_base::in | std::ios_base::out) override; + int sync() override; + void imbue(const std::locale& loc) override; + + protected: + char* 
_M_extbuf; + const char* _M_extbufnext; + const char* _M_extbufend; + char _M_extbuf_min[8]; + size_t _M_ebs; + char_type* _M_intbuf; + size_t _M_ibs; + FILE* _M_file; + const std::codecvt* _M_cv; + state_type _M_st; + state_type _M_st_last; + std::ios_base::openmode _M_om; + std::ios_base::openmode _M_cm; + bool _M_owns_eb; + bool _M_owns_ib; + bool _M_always_noconv; + + const char* _M_get_mode(std::ios_base::openmode mode); + bool _M_read_mode(); + void _M_write_mode(); +}; + +/////////////////////////////////////////////////////////////////////////////// +template +basic_filebuf::basic_filebuf() + : _M_extbuf(nullptr), + _M_extbufnext(nullptr), + _M_extbufend(nullptr), + _M_ebs(0), + _M_intbuf(nullptr), + _M_ibs(0), + _M_file(nullptr), + _M_cv(nullptr), + _M_st(), + _M_st_last(), + _M_om(std::ios_base::openmode(0)), + _M_cm(std::ios_base::openmode(0)), + _M_owns_eb(false), + _M_owns_ib(false), + _M_always_noconv(false) { + if (std::has_facet > + (this->getloc())) { + _M_cv = &std::use_facet > + (this->getloc()); + _M_always_noconv = _M_cv->always_noconv(); + } + setbuf(0, 4096); +} + +/////////////////////////////////////////////////////////////////////////////// +template +basic_filebuf::basic_filebuf(basic_filebuf&& rhs) + : std::basic_streambuf(rhs) { + if (rhs._M_extbuf == rhs._M_extbuf_min) { + _M_extbuf = _M_extbuf_min; + _M_extbufnext = _M_extbuf + (rhs._M_extbufnext - rhs._M_extbuf); + _M_extbufend = _M_extbuf + (rhs._M_extbufend - rhs._M_extbuf); + } else { + _M_extbuf = rhs._M_extbuf; + _M_extbufnext = rhs._M_extbufnext; + _M_extbufend = rhs._M_extbufend; + } + _M_ebs = rhs._M_ebs; + _M_intbuf = rhs._M_intbuf; + _M_ibs = rhs._M_ibs; + _M_file = rhs._M_file; + _M_cv = rhs._M_cv; + _M_st = rhs._M_st; + _M_st_last = rhs._M_st_last; + _M_om = rhs._M_om; + _M_cm = rhs._M_cm; + _M_owns_eb = rhs._M_owns_eb; + _M_owns_ib = rhs._M_owns_ib; + _M_always_noconv = rhs._M_always_noconv; + if (rhs.pbase()) { + if (rhs.pbase() == rhs._M_intbuf) + this->setp(_M_intbuf, 
_M_intbuf + (rhs. epptr() - rhs.pbase())); + else + this->setp(reinterpret_cast(_M_extbuf), + reinterpret_cast(_M_extbuf) + + (rhs. epptr() - rhs.pbase())); + this->pbump(rhs. pptr() - rhs.pbase()); + } else if (rhs.eback()) { + if (rhs.eback() == rhs._M_intbuf) + this->setg(_M_intbuf, _M_intbuf + (rhs.gptr() - rhs.eback()), + _M_intbuf + (rhs.egptr() - rhs.eback())); + else + this->setg(reinterpret_cast(_M_extbuf), + reinterpret_cast(_M_extbuf) + + (rhs.gptr() - rhs.eback()), + reinterpret_cast(_M_extbuf) + + (rhs.egptr() - rhs.eback())); + } + rhs._M_extbuf = nullptr; + rhs._M_extbufnext = nullptr; + rhs._M_extbufend = nullptr; + rhs._M_ebs = 0; + rhs._M_intbuf = nullptr; + rhs._M_ibs = 0; + rhs._M_file = nullptr; + rhs._M_st = state_type(); + rhs._M_st_last = state_type(); + rhs._M_om = std::ios_base::openmode(0); + rhs._M_cm = std::ios_base::openmode(0); + rhs._M_owns_eb = false; + rhs._M_owns_ib = false; + rhs.setg(0, 0, 0); + rhs.setp(0, 0); +} + +/////////////////////////////////////////////////////////////////////////////// +template +inline +basic_filebuf& +basic_filebuf::operator=(basic_filebuf&& rhs) { + close(); + swap(rhs); + return *this; +} + +/////////////////////////////////////////////////////////////////////////////// +template +basic_filebuf::~basic_filebuf() { + // try + // { + // close(); + // } + // catch (...) 
+ // { + // } + if (_M_owns_eb) + delete [] _M_extbuf; + if (_M_owns_ib) + delete [] _M_intbuf; +} + +/////////////////////////////////////////////////////////////////////////////// +template +void +basic_filebuf::swap(basic_filebuf& rhs) { + std::basic_streambuf::swap(rhs); + if (_M_extbuf != _M_extbuf_min && rhs._M_extbuf != rhs._M_extbuf_min) { + std::swap(_M_extbuf, rhs._M_extbuf); + std::swap(_M_extbufnext, rhs._M_extbufnext); + std::swap(_M_extbufend, rhs._M_extbufend); + } else { + ptrdiff_t ln = _M_extbufnext - _M_extbuf; + ptrdiff_t le = _M_extbufend - _M_extbuf; + ptrdiff_t rn = rhs._M_extbufnext - rhs._M_extbuf; + ptrdiff_t re = rhs._M_extbufend - rhs._M_extbuf; + if (_M_extbuf == _M_extbuf_min && rhs._M_extbuf != rhs._M_extbuf_min) { + _M_extbuf = rhs._M_extbuf; + rhs._M_extbuf = rhs._M_extbuf_min; + } else if (_M_extbuf != _M_extbuf_min && + rhs._M_extbuf == rhs._M_extbuf_min) { + rhs._M_extbuf = _M_extbuf; + _M_extbuf = _M_extbuf_min; + } + _M_extbufnext = _M_extbuf + rn; + _M_extbufend = _M_extbuf + re; + rhs._M_extbufnext = rhs._M_extbuf + ln; + rhs._M_extbufend = rhs._M_extbuf + le; + } + std::swap(_M_ebs, rhs._M_ebs); + std::swap(_M_intbuf, rhs._M_intbuf); + std::swap(_M_ibs, rhs._M_ibs); + std::swap(_M_file, rhs._M_file); + std::swap(_M_cv, rhs._M_cv); + std::swap(_M_st, rhs._M_st); + std::swap(_M_st_last, rhs._M_st_last); + std::swap(_M_om, rhs._M_om); + std::swap(_M_cm, rhs._M_cm); + std::swap(_M_owns_eb, rhs._M_owns_eb); + std::swap(_M_owns_ib, rhs._M_owns_ib); + std::swap(_M_always_noconv, rhs._M_always_noconv); + if (this->eback() == reinterpret_cast(rhs._M_extbuf_min)) { + ptrdiff_t n = this->gptr() - this->eback(); + ptrdiff_t e = this->egptr() - this->eback(); + this->setg(reinterpret_cast(_M_extbuf_min), + reinterpret_cast(_M_extbuf_min) + n, + reinterpret_cast(_M_extbuf_min) + e); + } else if (this->pbase() == + reinterpret_cast(rhs._M_extbuf_min)) { + ptrdiff_t n = this->pptr() - this->pbase(); + ptrdiff_t e = this->epptr() - 
this->pbase(); + this->setp(reinterpret_cast(_M_extbuf_min), + reinterpret_cast(_M_extbuf_min) + e); + this->pbump(n); + } + if (rhs.eback() == reinterpret_cast(_M_extbuf_min)) { + ptrdiff_t n = rhs.gptr() - rhs.eback(); + ptrdiff_t e = rhs.egptr() - rhs.eback(); + rhs.setg(reinterpret_cast(rhs._M_extbuf_min), + reinterpret_cast(rhs._M_extbuf_min) + n, + reinterpret_cast(rhs._M_extbuf_min) + e); + } else if (rhs.pbase() == reinterpret_cast(_M_extbuf_min)) { + ptrdiff_t n = rhs.pptr() - rhs.pbase(); + ptrdiff_t e = rhs.epptr() - rhs.pbase(); + rhs.setp(reinterpret_cast(rhs._M_extbuf_min), + reinterpret_cast(rhs._M_extbuf_min) + e); + rhs.pbump(n); + } +} + +/////////////////////////////////////////////////////////////////////////////// +template +inline +void +swap(basic_filebuf& x, basic_filebuf& y) { + x.swap(y); +} + +/////////////////////////////////////////////////////////////////////////////// +template +inline +bool +basic_filebuf::is_open() const { + return _M_file != nullptr; +} + +/////////////////////////////////////////////////////////////////////////////// +template +const char* basic_filebuf:: +_M_get_mode(std::ios_base::openmode mode) { + switch ((mode & ~std::ios_base::ate) | 0) { + case std::ios_base::out: + case std::ios_base::out | std::ios_base::trunc: + return "w"; + case std::ios_base::out | std::ios_base::app: + case std::ios_base::app: + return "a"; + break; + case std::ios_base::in: + return "r"; + case std::ios_base::in | std::ios_base::out: + return "r+"; + case std::ios_base::in | std::ios_base::out | std::ios_base::trunc: + return "w+"; + case std::ios_base::in | std::ios_base::out | std::ios_base::app: + case std::ios_base::in | std::ios_base::app: + return "a+"; + case std::ios_base::out | std::ios_base::binary: + case std::ios_base::out | std::ios_base::trunc | std::ios_base::binary: + return "wb"; + case std::ios_base::out | std::ios_base::app | std::ios_base::binary: + case std::ios_base::app | std::ios_base::binary: + return "ab"; 
+ case std::ios_base::in | std::ios_base::binary: + return "rb"; + case std::ios_base::in | std::ios_base::out | std::ios_base::binary: + return "r+b"; + case std::ios_base::in | std::ios_base::out | std::ios_base::trunc | + std::ios_base::binary: + return "w+b"; + case std::ios_base::in | std::ios_base::out | std::ios_base::app | + std::ios_base::binary: + case std::ios_base::in | std::ios_base::app | std::ios_base::binary: + return "a+b"; + default: + return nullptr; + } +} + +/////////////////////////////////////////////////////////////////////////////// +template +basic_filebuf* +basic_filebuf:: +open(const char* s, std::ios_base::openmode mode) { + basic_filebuf* rt = nullptr; + if (_M_file == nullptr) { + const char* md= _M_get_mode(mode); + if (md) { + _M_file = fopen(s, md); + if (_M_file) { + rt = this; + _M_om = mode; + if (mode & std::ios_base::ate) { + if (fseek(_M_file, 0, SEEK_END)) { + fclose(_M_file); + _M_file = nullptr; + rt = nullptr; + } + } + } + } + } + return rt; +} + +/////////////////////////////////////////////////////////////////////////////// +template +inline +basic_filebuf* +basic_filebuf::open(const std::string& s, + std::ios_base::openmode mode) { + return open(s.c_str(), mode); +} + +/////////////////////////////////////////////////////////////////////////////// +template +basic_filebuf* +basic_filebuf::open(int fd, std::ios_base::openmode mode) { + const char* md= this->_M_get_mode(mode); + if (md) { + this->_M_file= fdopen(fd, md); + this->_M_om = mode; + return this; + } else { + return nullptr; + } +} + +/////////////////////////////////////////////////////////////////////////////// +template +basic_filebuf* +basic_filebuf::open(FILE* f, std::ios_base::openmode mode) { + this->_M_file = f; + this->_M_om = mode; + return this; +} + +/////////////////////////////////////////////////////////////////////////////// +template +basic_filebuf* +basic_filebuf::close() { + basic_filebuf* rt = nullptr; + if (_M_file) { + rt = this; + 
std::unique_ptr h(_M_file, fclose); + if (sync()) + rt = nullptr; + if (fclose(h.release()) == 0) + _M_file = nullptr; + else + rt = nullptr; + } + return rt; +} + +/////////////////////////////////////////////////////////////////////////////// +template +typename basic_filebuf::int_type +basic_filebuf::underflow() { + if (_M_file == nullptr) + return traits_type::eof(); + bool initial = _M_read_mode(); + char_type buf; + if (this->gptr() == nullptr) + this->setg(&buf, &buf+1, &buf+1); + const size_t unget_sz = initial ? 0 : std:: + min((this->egptr() - this->eback()) / 2, 4); + int_type c = traits_type::eof(); + if (this->gptr() == this->egptr()) { + memmove(this->eback(), this->egptr() - unget_sz, + unget_sz * sizeof(char_type)); + if (_M_always_noconv) { + size_t nmemb = static_cast + (this->egptr() - this->eback() - unget_sz); + nmemb = fread(this->eback() + unget_sz, 1, nmemb, _M_file); + if (nmemb != 0) { + this->setg(this->eback(), + this->eback() + unget_sz, + this->eback() + unget_sz + nmemb); + c = traits_type::to_int_type(*this->gptr()); + } + } else { + memmove(_M_extbuf, _M_extbufnext, _M_extbufend - _M_extbufnext); + _M_extbufnext = _M_extbuf + (_M_extbufend - _M_extbufnext); + _M_extbufend = _M_extbuf + + (_M_extbuf == _M_extbuf_min ? 
sizeof(_M_extbuf_min) : _M_ebs); + size_t nmemb = std::min(static_cast(_M_ibs - unget_sz), + static_cast + (_M_extbufend - _M_extbufnext)); + std::codecvt_base::result r; + _M_st_last = _M_st; + size_t nr = fread( + reinterpret_cast(const_cast(_M_extbufnext)), + 1, nmemb, _M_file); + if (nr != 0) { + if (!_M_cv) + throw std::bad_cast(); + _M_extbufend = _M_extbufnext + nr; + char_type* inext; + r = _M_cv->in(_M_st, _M_extbuf, _M_extbufend, _M_extbufnext, + this->eback() + unget_sz, + this->eback() + _M_ibs, inext); + if (r == std::codecvt_base::noconv) { + this->setg(reinterpret_cast(_M_extbuf), + reinterpret_cast(_M_extbuf), + const_cast(_M_extbufend)); + c = traits_type::to_int_type(*this->gptr()); + } else if (inext != this->eback() + unget_sz) { + this->setg(this->eback(), this->eback() + unget_sz, inext); + c = traits_type::to_int_type(*this->gptr()); + } + } + } + } else { + c = traits_type::to_int_type(*this->gptr()); + } + if (this->eback() == &buf) + this->setg(0, 0, 0); + return c; +} + +/////////////////////////////////////////////////////////////////////////////// +template +typename basic_filebuf::int_type +basic_filebuf::pbackfail(int_type c) { + if (_M_file && this->eback() < this->gptr()) { + if (traits_type::eq_int_type(c, traits_type::eof())) { + this->gbump(-1); + return traits_type::not_eof(c); + } + if ((_M_om & std::ios_base::out) || + traits_type::eq(traits_type::to_char_type(c), this->gptr()[-1])) { + this->gbump(-1); + *this->gptr() = traits_type::to_char_type(c); + return c; + } + } + return traits_type::eof(); +} + +/////////////////////////////////////////////////////////////////////////////// +template +typename basic_filebuf::int_type +basic_filebuf::overflow(int_type c) { + if (_M_file == nullptr) + return traits_type::eof(); + _M_write_mode(); + char_type buf; + char_type* pb_save = this->pbase(); + char_type* epb_save = this->epptr(); + if (!traits_type::eq_int_type(c, traits_type::eof())) { + if (this->pptr() == nullptr) + 
this->setp(&buf, &buf+1); + *this->pptr() = traits_type::to_char_type(c); + this->pbump(1); + } + if (this->pptr() != this->pbase()) { + if (_M_always_noconv) { + size_t nmemb = static_cast(this->pptr() - this->pbase()); + if (fwrite(this->pbase(), sizeof(char_type), + nmemb, _M_file) != nmemb) + return traits_type::eof(); + } else { + char* extbe = _M_extbuf; + std::codecvt_base::result r; + do { + if (!_M_cv) + throw std::bad_cast(); + const char_type* e; + r = _M_cv->out(_M_st, this->pbase(), this->pptr(), e, + _M_extbuf, _M_extbuf + _M_ebs, extbe); + if (e == this->pbase()) + return traits_type::eof(); + if (r == std::codecvt_base::noconv) { + size_t nmemb = static_cast + (this->pptr() - this->pbase()); + if (fwrite(this->pbase(), 1, nmemb, _M_file) != nmemb) + return traits_type::eof(); + } else if (r == std::codecvt_base::ok || + r == std::codecvt_base::partial) { + size_t nmemb = static_cast(extbe - _M_extbuf); + if (fwrite(_M_extbuf, 1, nmemb, _M_file) != nmemb) + return traits_type::eof(); + if (r == std::codecvt_base::partial) { + this->setp(const_cast(e), + this->pptr()); + this->pbump(this->epptr() - this->pbase()); + } + } else { + return traits_type::eof(); + } + } while (r == std::codecvt_base::partial); + } + this->setp(pb_save, epb_save); + } + return traits_type::not_eof(c); +} + +/////////////////////////////////////////////////////////////////////////////// +template +std::basic_streambuf* +basic_filebuf::setbuf(char_type* s, std::streamsize n) { + this->setg(0, 0, 0); + this->setp(0, 0); + if (_M_owns_eb) + delete [] _M_extbuf; + if (_M_owns_ib) + delete [] _M_intbuf; + _M_ebs = n; + if (_M_ebs > sizeof(_M_extbuf_min)) { + if (_M_always_noconv && s) { + _M_extbuf = reinterpret_cast(s); + _M_owns_eb = false; + } else { + _M_extbuf = new char[_M_ebs]; + _M_owns_eb = true; + } + } else { + _M_extbuf = _M_extbuf_min; + _M_ebs = sizeof(_M_extbuf_min); + _M_owns_eb = false; + } + if (!_M_always_noconv) { + _M_ibs = std::max(n, sizeof(_M_extbuf_min)); 
+ if (s && _M_ibs >= sizeof(_M_extbuf_min)) { + _M_intbuf = s; + _M_owns_ib = false; + } else { + _M_intbuf = new char_type[_M_ibs]; + _M_owns_ib = true; + } + } else { + _M_ibs = 0; + _M_intbuf = 0; + _M_owns_ib = false; + } + return this; +} + +/////////////////////////////////////////////////////////////////////////////// +template +typename basic_filebuf::pos_type +basic_filebuf::seekoff(off_type off, std::ios_base::seekdir way, + std::ios_base::openmode) { + if (!_M_cv) + throw std::bad_cast(); + int width = _M_cv->encoding(); + if (_M_file == nullptr || (width <= 0 && off != 0) || sync()) + return pos_type(off_type(-1)); + // width > 0 || off == 0 + int whence; + switch (way) { + case std::ios_base::beg: + whence = SEEK_SET; + break; + case std::ios_base::cur: + whence = SEEK_CUR; + break; + case std::ios_base::end: + whence = SEEK_END; + break; + default: + return pos_type(off_type(-1)); + } +#if _WIN32 + if (fseek(_M_file, width > 0 ? width * off : 0, whence)) + return pos_type(off_type(-1)); + pos_type r = ftell(_M_file); +#else + if (fseeko(_M_file, width > 0 ? 
width * off : 0, whence)) + return pos_type(off_type(-1)); + pos_type r = ftello(_M_file); +#endif + r.state(_M_st); + return r; +} + +/////////////////////////////////////////////////////////////////////////////// +template +typename basic_filebuf::pos_type +basic_filebuf::seekpos(pos_type sp, std::ios_base::openmode) { + if (_M_file == nullptr || sync()) + return pos_type(off_type(-1)); +#if _WIN32 + if (fseek(_M_file, sp, SEEK_SET)) + return pos_type(off_type(-1)); +#else + if (fseeko(_M_file, sp, SEEK_SET)) + return pos_type(off_type(-1)); +#endif + _M_st = sp.state(); + return sp; +} + +/////////////////////////////////////////////////////////////////////////////// +template +int +basic_filebuf::sync() { + if (_M_file == nullptr) + return 0; + if (!_M_cv) + throw std::bad_cast(); + if (_M_cm & std::ios_base::out) { + if (this->pptr() != this->pbase()) + if (overflow() == traits_type::eof()) + return -1; + std::codecvt_base::result r; + do { + char* extbe; + r = _M_cv->unshift(_M_st, _M_extbuf, _M_extbuf + _M_ebs, extbe); + size_t nmemb = static_cast(extbe - _M_extbuf); + if (fwrite(_M_extbuf, 1, nmemb, _M_file) != nmemb) + return -1; + } while (r == std::codecvt_base::partial); + if (r == std::codecvt_base::error) + return -1; + if (fflush(_M_file)) + return -1; + } else if (_M_cm & std::ios_base::in) { + off_type c; + state_type state = _M_st_last; + bool update_st = false; + if (_M_always_noconv) { + c = this->egptr() - this->gptr(); + } else { + int width = _M_cv->encoding(); + c = _M_extbufend - _M_extbufnext; + if (width > 0) { + c += width * (this->egptr() - this->gptr()); + } else { + if (this->gptr() != this->egptr()) { + const int off = _M_cv->length(state, _M_extbuf, + _M_extbufnext, + this->gptr() - this->eback()); + c += _M_extbufnext - _M_extbuf - off; + update_st = true; + } + } + } +#if _WIN32 + if (fseek(_M_file_, -c, SEEK_CUR)) + return -1; +#else + if (fseeko(_M_file, -c, SEEK_CUR)) + return -1; +#endif + if (update_st) + _M_st = state; + 
_M_extbufnext = _M_extbufend = _M_extbuf; + this->setg(0, 0, 0); + _M_cm = std::ios_base::openmode(0); + } + return 0; +} + +/////////////////////////////////////////////////////////////////////////////// +template +void +basic_filebuf::imbue(const std::locale& loc) { + sync(); + _M_cv = &std::use_facet >(loc); + bool old_anc = _M_always_noconv; + _M_always_noconv = _M_cv->always_noconv(); + if (old_anc != _M_always_noconv) { + this->setg(0, 0, 0); + this->setp(0, 0); + // invariant, char_type is char, else we couldn't get here + // need to dump _M_intbuf + if (_M_always_noconv) { + if (_M_owns_eb) + delete [] _M_extbuf; + _M_owns_eb = _M_owns_ib; + _M_ebs = _M_ibs; + _M_extbuf = reinterpret_cast(_M_intbuf); + _M_ibs = 0; + _M_intbuf = nullptr; + _M_owns_ib = false; + } else { // need to obtain an _M_intbuf. + // If _M_extbuf is user-supplied, use it, else new _M_intbuf + if (!_M_owns_eb && _M_extbuf != _M_extbuf_min) { + _M_ibs = _M_ebs; + _M_intbuf = reinterpret_cast(_M_extbuf); + _M_owns_ib = false; + _M_extbuf = new char[_M_ebs]; + _M_owns_eb = true; + } else { + _M_ibs = _M_ebs; + _M_intbuf = new char_type[_M_ibs]; + _M_owns_ib = true; + } + } + } +} + +/////////////////////////////////////////////////////////////////////////////// +template +bool +basic_filebuf::_M_read_mode() { + if (!(_M_cm & std::ios_base::in)) { + this->setp(0, 0); + if (_M_always_noconv) + this->setg(reinterpret_cast(_M_extbuf), + reinterpret_cast(_M_extbuf) + _M_ebs, + reinterpret_cast(_M_extbuf) + _M_ebs); + else + this->setg(_M_intbuf, _M_intbuf + _M_ibs, _M_intbuf + _M_ibs); + _M_cm = std::ios_base::in; + return true; + } + return false; +} + +/////////////////////////////////////////////////////////////////////////////// +template +void +basic_filebuf::_M_write_mode() { + if (!(_M_cm & std::ios_base::out)) { + this->setg(0, 0, 0); + if (_M_ebs > sizeof(_M_extbuf_min)) { + if (_M_always_noconv) + this->setp(reinterpret_cast(_M_extbuf), + reinterpret_cast(_M_extbuf) + + (_M_ebs - 1)); 
+ else + this->setp(_M_intbuf, _M_intbuf + (_M_ibs - 1)); + } else { + this->setp(0, 0); + } + _M_cm = std::ios_base::out; + } +} + +/////////////////////////////////////////////////////////////////////////////// +} + +/////////////////////////////////////////////////////////////////////////////// +#endif // KALDI_UTIL_BASIC_FILEBUF_H_ + +/////////////////////////////////////////////////////////////////////////////// + +/* + * ============================================================================ + * libc++ License + * ============================================================================ + * + * The libc++ library is dual licensed under both the University of Illinois + * "BSD-Like" license and the MIT license. As a user of this code you may + * choose to use it under either license. As a contributor, you agree to allow + * your code to be used under both. + * + * Full text of the relevant licenses is included below. + * + * ============================================================================ + * + * University of Illinois/NCSA + * Open Source License + * + * Copyright (c) 2009-2014 by the contributors listed in CREDITS.TXT (included below) + * + * All rights reserved. + * + * Developed by: + * + * LLVM Team + * + * University of Illinois at Urbana-Champaign + * + * http://llvm.org + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of + * this software and associated documentation files (the "Software"), to deal with + * the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is furnished to do + * so, subject to the following conditions: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. 
+ * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimers in the + * documentation and/or other materials provided with the distribution. + * + * * Neither the names of the LLVM Team, University of Illinois at + * Urbana-Champaign, nor the names of its contributors may be used to + * endorse or promote products derived from this Software without specific + * prior written permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS + * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE + * SOFTWARE. + * + * ============================================================================== + * + * Copyright (c) 2009-2014 by the contributors listed in CREDITS.TXT (included below) + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * ============================================================================== + * + * This file is a partial list of people who have contributed to the LLVM/libc++ + * project. If you have contributed a patch or made some other contribution to + * LLVM/libc++, please submit a patch to this file to add yourself, and it will be + * done! + * + * The list is sorted by surname and formatted to allow easy grepping and + * beautification by scripts. The fields are: name (N), email (E), web-address + * (W), PGP key ID and fingerprint (P), description (D), and snail-mail address + * (S). + * + * N: Saleem Abdulrasool + * E: compnerd@compnerd.org + * D: Minor patches and Linux fixes. + * + * N: Dimitry Andric + * E: dimitry@andric.com + * D: Visibility fixes, minor FreeBSD portability patches. + * + * N: Holger Arnold + * E: holgerar@gmail.com + * D: Minor fix. + * + * N: Ruben Van Boxem + * E: vanboxem dot ruben at gmail dot com + * D: Initial Windows patches. + * + * N: David Chisnall + * E: theraven at theravensnest dot org + * D: FreeBSD and Solaris ports, libcxxrt support, some atomics work. + * + * N: Marshall Clow + * E: mclow.lists@gmail.com + * E: marshall@idio.com + * D: C++14 support, patches and bug fixes. + * + * N: Bill Fisher + * E: william.w.fisher@gmail.com + * D: Regex bug fixes. + * + * N: Matthew Dempsky + * E: matthew@dempsky.org + * D: Minor patches and bug fixes. + * + * N: Google Inc. + * D: Copyright owner and contributor of the CityHash algorithm + * + * N: Howard Hinnant + * E: hhinnant@apple.com + * D: Architect and primary author of libc++ + * + * N: Hyeon-bin Jeong + * E: tuhertz@gmail.com + * D: Minor patches and bug fixes. 
+ * + * N: Argyrios Kyrtzidis + * E: kyrtzidis@apple.com + * D: Bug fixes. + * + * N: Bruce Mitchener, Jr. + * E: bruce.mitchener@gmail.com + * D: Emscripten-related changes. + * + * N: Michel Morin + * E: mimomorin@gmail.com + * D: Minor patches to is_convertible. + * + * N: Andrew Morrow + * E: andrew.c.morrow@gmail.com + * D: Minor patches and Linux fixes. + * + * N: Arvid Picciani + * E: aep at exys dot org + * D: Minor patches and musl port. + * + * N: Bjorn Reese + * E: breese@users.sourceforge.net + * D: Initial regex prototype + * + * N: Nico Rieck + * E: nico.rieck@gmail.com + * D: Windows fixes + * + * N: Jonathan Sauer + * D: Minor patches, mostly related to constexpr + * + * N: Craig Silverstein + * E: csilvers@google.com + * D: Implemented Cityhash as the string hash function on 64-bit machines + * + * N: Richard Smith + * D: Minor patches. + * + * N: Joerg Sonnenberger + * E: joerg@NetBSD.org + * D: NetBSD port. + * + * N: Stephan Tolksdorf + * E: st@quanttec.com + * D: Minor fix + * + * N: Michael van der Westhuizen + * E: r1mikey at gmail dot com + * + * N: Klaas de Vries + * E: klaas at klaasgaaf dot nl + * D: Minor bug fix. + * + * N: Zhang Xiongpang + * E: zhangxiongpang@gmail.com + * D: Minor patches and bug fixes. + * + * N: Xing Xue + * E: xingxue@ca.ibm.com + * D: AIX port + * + * N: Zhihao Yuan + * E: lichray@gmail.com + * D: Standard compatibility fixes. + * + * N: Jeffrey Yasskin + * E: jyasskin@gmail.com + * E: jyasskin@google.com + * D: Linux fixes. 
+ */ diff --git a/speechx/speechx/kaldi/util/common-utils.h b/speechx/speechx/kaldi/util/common-utils.h new file mode 100644 index 00000000..cfb0c255 --- /dev/null +++ b/speechx/speechx/kaldi/util/common-utils.h @@ -0,0 +1,31 @@ +// util/common-utils.h + +// Copyright 2009-2011 Microsoft Corporation + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. +#ifndef KALDI_UTIL_COMMON_UTILS_H_ +#define KALDI_UTIL_COMMON_UTILS_H_ + +#include "base/kaldi-common.h" +#include "util/parse-options.h" +#include "util/kaldi-io.h" +#include "util/simple-io-funcs.h" +#include "util/kaldi-holder.h" +#include "util/kaldi-table.h" +#include "util/table-types.h" +#include "util/text-utils.h" + +#endif // KALDI_UTIL_COMMON_UTILS_H_ diff --git a/speechx/speechx/kaldi/util/const-integer-set-inl.h b/speechx/speechx/kaldi/util/const-integer-set-inl.h new file mode 100644 index 00000000..32560535 --- /dev/null +++ b/speechx/speechx/kaldi/util/const-integer-set-inl.h @@ -0,0 +1,91 @@ +// util/const-integer-set-inl.h + +// Copyright 2009-2011 Microsoft Corporation + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_UTIL_CONST_INTEGER_SET_INL_H_ +#define KALDI_UTIL_CONST_INTEGER_SET_INL_H_ + +// Do not include this file directly. It is included by const-integer-set.h + + +namespace kaldi { + +template +void ConstIntegerSet::InitInternal() { + KALDI_ASSERT_IS_INTEGER_TYPE(I); + quick_set_.clear(); // just in case we previously had data. + if (slow_set_.size() == 0) { + lowest_member_=(I) 1; + highest_member_=(I) 0; + contiguous_ = false; + quick_ = false; + } else { + lowest_member_ = slow_set_.front(); + highest_member_ = slow_set_.back(); + size_t range = highest_member_ + 1 - lowest_member_; + if (range == slow_set_.size()) { + contiguous_ = true; + quick_= false; + } else { + contiguous_ = false; + // If it would be more compact to store as bool + if (range < slow_set_.size() * 8 * sizeof(I)) { + // (assuming 1 bit per element)... + quick_set_.resize(range, false); + for (size_t i = 0;i < slow_set_.size();i++) + quick_set_[slow_set_[i] - lowest_member_] = true; + quick_ = true; + } else { + quick_ = false; + } + } + } +} + +template +int ConstIntegerSet::count(I i) const { + if (i < lowest_member_ || i > highest_member_) { + return 0; + } else { + if (contiguous_) return true; + if (quick_) { + return (quick_set_[i-lowest_member_] ? 1 : 0); + } else { + bool ans = std::binary_search(slow_set_.begin(), slow_set_.end(), i); + return (ans ? 
1 : 0); + } + } +} + +template +void ConstIntegerSet::Write(std::ostream &os, bool binary) const { + WriteIntegerVector(os, binary, slow_set_); +} + +template +void ConstIntegerSet::Read(std::istream &is, bool binary) { + ReadIntegerVector(is, binary, &slow_set_); + InitInternal(); +} + + + +} // end namespace kaldi + +#endif // KALDI_UTIL_CONST_INTEGER_SET_INL_H_ diff --git a/speechx/speechx/kaldi/util/const-integer-set.h b/speechx/speechx/kaldi/util/const-integer-set.h new file mode 100644 index 00000000..bb10a504 --- /dev/null +++ b/speechx/speechx/kaldi/util/const-integer-set.h @@ -0,0 +1,96 @@ +// util/const-integer-set.h + +// Copyright 2009-2011 Microsoft Corporation + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_UTIL_CONST_INTEGER_SET_H_ +#define KALDI_UTIL_CONST_INTEGER_SET_H_ +#include +#include +#include +#include +#include +#include "util/stl-utils.h" + + /* ConstIntegerSet is a way to efficiently test whether something is in a + supplied set of integers. It can be initialized from a vector or set, but + never changed after that. It either uses a sorted vector or an array of + bool, depending on the input. It behaves like a const version of an STL set, with + only a subset of the functionality, except all the member functions are + upper-case. 
+ + Note that we could get rid of the member slow_set_, but we'd have to + do more work to implement an iterator type. This would save memory. + */ + +namespace kaldi { + +template class ConstIntegerSet { + public: + ConstIntegerSet(): lowest_member_(1), highest_member_(0) { } + + void Init(const std::vector &input) { + slow_set_ = input; + SortAndUniq(&slow_set_); + InitInternal(); + } + + void Init(const std::set &input) { + CopySetToVector(input, &slow_set_); + InitInternal(); + } + + explicit ConstIntegerSet(const std::vector &input): slow_set_(input) { + SortAndUniq(&slow_set_); + InitInternal(); + } + explicit ConstIntegerSet(const std::set &input) { + CopySetToVector(input, &slow_set_); + InitInternal(); + } + explicit ConstIntegerSet(const ConstIntegerSet &other): + slow_set_(other.slow_set_) { + InitInternal(); + } + + int count(I i) const; // returns 1 or 0. + + typedef typename std::vector::const_iterator iterator; + iterator begin() const { return slow_set_.begin(); } + iterator end() const { return slow_set_.end(); } + size_t size() const { return slow_set_.size(); } + bool empty() const { return slow_set_.empty(); } + + void Write(std::ostream &os, bool binary) const; + void Read(std::istream &is, bool binary); + + private: + I lowest_member_; + I highest_member_; + bool contiguous_; + bool quick_; + std::vector quick_set_; + std::vector slow_set_; + void InitInternal(); +}; + +} // end namespace kaldi + +#include "util/const-integer-set-inl.h" + +#endif // KALDI_UTIL_CONST_INTEGER_SET_H_ diff --git a/speechx/speechx/kaldi/util/edit-distance-inl.h b/speechx/speechx/kaldi/util/edit-distance-inl.h new file mode 100644 index 00000000..3304b27d --- /dev/null +++ b/speechx/speechx/kaldi/util/edit-distance-inl.h @@ -0,0 +1,200 @@ +// util/edit-distance-inl.h + +// Copyright 2009-2011 Microsoft Corporation; Haihua Xu; Yanmin Qian + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the 
"License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_UTIL_EDIT_DISTANCE_INL_H_ +#define KALDI_UTIL_EDIT_DISTANCE_INL_H_ +#include +#include +#include +#include "util/stl-utils.h" + +namespace kaldi { + +template +int32 LevenshteinEditDistance(const std::vector &a, + const std::vector &b) { + // Algorithm: + // write A and B for the sequences, with elements a_0 .. + // let |A| = M and |B| = N be the lengths, and have + // elements a_0 ... a_{M-1} and b_0 ... b_{N-1}. + // We are computing the recursion + // E(m, n) = min( E(m-1, n-1) + (1-delta(a_{m-1}, b_{n-1})), + // E(m-1, n) + 1, + // E(m, n-1) + 1). + // where E(m, n) is defined for m = 0..M and n = 0..N and out-of- + // bounds quantities are considered to be infinity (i.e. the + // recursion does not visit them). + + // We do this computation using a vector e of size N+1. + // The outer iterations range over m = 0..M. + + int M = a.size(), N = b.size(); + std::vector e(N+1); + std::vector e_tmp(N+1); + // initialize e. + for (size_t i = 0; i < e.size(); i++) + e[i] = i; + for (int32 m = 1; m <= M; m++) { + // computing E(m, .) from E(m-1, .) + // handle special case n = 0: + e_tmp[0] = e[0] + 1; + + for (int32 n = 1; n <= N; n++) { + int32 term1 = e[n-1] + (a[m-1] == b[n-1] ? 
0 : 1); + int32 term2 = e[n] + 1; + int32 term3 = e_tmp[n-1] + 1; + e_tmp[n] = std::min(term1, std::min(term2, term3)); + } + e = e_tmp; + } + return e.back(); +} +// +struct error_stats { + int32 ins_num; + int32 del_num; + int32 sub_num; + int32 total_cost; // minimum total cost to the current alignment. +}; +// Note that both hyp and ref should not contain noise word in +// the following implementation. + +template +int32 LevenshteinEditDistance(const std::vector &ref, + const std::vector &hyp, + int32 *ins, int32 *del, int32 *sub) { + // temp sequence to remember error type and stats. + std::vector e(ref.size()+1); + std::vector cur_e(ref.size()+1); + // initialize the first hypothesis aligned to the reference at each + // position:[hyp_index =0][ref_index] + for (size_t i =0; i < e.size(); i ++) { + e[i].ins_num = 0; + e[i].sub_num = 0; + e[i].del_num = i; + e[i].total_cost = i; + } + + // for other alignments + for (size_t hyp_index = 1; hyp_index <= hyp.size(); hyp_index ++) { + cur_e[0] = e[0]; + cur_e[0].ins_num++; + cur_e[0].total_cost++; + for (size_t ref_index = 1; ref_index <= ref.size(); ref_index ++) { + int32 ins_err = e[ref_index].total_cost + 1; + int32 del_err = cur_e[ref_index-1].total_cost + 1; + int32 sub_err = e[ref_index-1].total_cost; + if (hyp[hyp_index-1] != ref[ref_index-1]) + sub_err++; + + if (sub_err < ins_err && sub_err < del_err) { + cur_e[ref_index] =e[ref_index-1]; + if (hyp[hyp_index-1] != ref[ref_index-1]) + cur_e[ref_index].sub_num++; // substitution error should be increased + cur_e[ref_index].total_cost = sub_err; + } else if (del_err < ins_err) { + cur_e[ref_index] = cur_e[ref_index-1]; + cur_e[ref_index].total_cost = del_err; + cur_e[ref_index].del_num++; // deletion number is increased. + } else { + cur_e[ref_index] = e[ref_index]; + cur_e[ref_index].total_cost = ins_err; + cur_e[ref_index].ins_num++; // insertion number is increased. + } + } + e = cur_e; // alternate for the next recursion. 
+ } + size_t ref_index = e.size()-1; + *ins = e[ref_index].ins_num, *del = + e[ref_index].del_num, *sub = e[ref_index].sub_num; + return e[ref_index].total_cost; +} + +template +int32 LevenshteinAlignment(const std::vector &a, + const std::vector &b, + T eps_symbol, + std::vector > *output) { + // Check inputs: + { + KALDI_ASSERT(output != NULL); + for (size_t i = 0; i < a.size(); i++) KALDI_ASSERT(a[i] != eps_symbol); + for (size_t i = 0; i < b.size(); i++) KALDI_ASSERT(b[i] != eps_symbol); + } + output->clear(); + // This is very memory-inefficiently implemented using a vector of vectors. + size_t M = a.size(), N = b.size(); + size_t m, n; + std::vector > e(M+1); + for (m = 0; m <=M; m++) e[m].resize(N+1); + for (n = 0; n <= N; n++) + e[0][n] = n; + for (m = 1; m <= M; m++) { + e[m][0] = e[m-1][0] + 1; + for (n = 1; n <= N; n++) { + int32 sub_or_ok = e[m-1][n-1] + (a[m-1] == b[n-1] ? 0 : 1); + int32 del = e[m-1][n] + 1; // assumes a == ref, b == hyp. + int32 ins = e[m][n-1] + 1; + e[m][n] = std::min(sub_or_ok, std::min(del, ins)); + } + } + // get time-reversed output first: trace back. + m = M; + n = N; + while (m != 0 || n != 0) { + size_t last_m, last_n; + if (m == 0) { + last_m = m; + last_n = n-1; + } else if (n == 0) { + last_m = m-1; + last_n = n; + } else { + int32 sub_or_ok = e[m-1][n-1] + (a[m-1] == b[n-1] ? 0 : 1); + int32 del = e[m-1][n] + 1; // assumes a == ref, b == hyp. + int32 ins = e[m][n-1] + 1; + // choose sub_or_ok if all else equal. + if (sub_or_ok <= std::min(del, ins)) { + last_m = m-1; + last_n = n-1; + } else { + if (del <= ins) { // choose del over ins if equal. + last_m = m-1; + last_n = n; + } else { + last_m = m; + last_n = n-1; + } + } + } + T a_sym, b_sym; + a_sym = (last_m == m ? eps_symbol : a[last_m]); + b_sym = (last_n == n ? 
eps_symbol : b[last_n]); + output->push_back(std::make_pair(a_sym, b_sym)); + m = last_m; + n = last_n; + } + ReverseVector(output); + return e[M][N]; +} + + +} // end namespace kaldi + +#endif // KALDI_UTIL_EDIT_DISTANCE_INL_H_ diff --git a/speechx/speechx/kaldi/util/edit-distance.h b/speechx/speechx/kaldi/util/edit-distance.h new file mode 100644 index 00000000..5eac4aea --- /dev/null +++ b/speechx/speechx/kaldi/util/edit-distance.h @@ -0,0 +1,64 @@ +// util/edit-distance.h + +// Copyright 2009-2011 Microsoft Corporation; Haihua Xu + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_UTIL_EDIT_DISTANCE_H_ +#define KALDI_UTIL_EDIT_DISTANCE_H_ +#include +#include +#include +#include +#include +#include +#include "util/edit-distance-inl.h" +#include "base/kaldi-types.h" + +namespace kaldi { + +// Compute the edit-distance between two strings. +template +int32 LevenshteinEditDistance(const std::vector &a, + const std::vector &b); + + +// edit distance calculation with conventional method. +// note: noise word must be filtered out from the hypothesis and +// reference sequence +// before the following procedure conducted. 
+template +int32 LevenshteinEditDistance(const std::vector &ref, + const std::vector &hyp, + int32 *ins, int32 *del, int32 *sub); + +// This version of the edit-distance computation outputs the alignment +// between the two. This is a vector of pairs of (symbol a, symbol b). +// The epsilon symbol (eps_symbol) must not occur in sequences a or b. +// Where one aligned to no symbol in the other (insertion or deletion), +// epsilon will be the corresponding member of the pair. +// It returns the edit-distance between the two strings. + +template +int32 LevenshteinAlignment(const std::vector &a, + const std::vector &b, + T eps_symbol, + std::vector > *output); + +} // end namespace kaldi + +#endif // KALDI_UTIL_EDIT_DISTANCE_H_ diff --git a/speechx/speechx/kaldi/util/hash-list-inl.h b/speechx/speechx/kaldi/util/hash-list-inl.h new file mode 100644 index 00000000..da6165af --- /dev/null +++ b/speechx/speechx/kaldi/util/hash-list-inl.h @@ -0,0 +1,194 @@ +// util/hash-list-inl.h + +// Copyright 2009-2011 Microsoft Corporation +// 2013 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_UTIL_HASH_LIST_INL_H_ +#define KALDI_UTIL_HASH_LIST_INL_H_ + +// Do not include this file directly. 
It is included by fast-hash.h + + +namespace kaldi { + +template HashList::HashList() { + list_head_ = NULL; + bucket_list_tail_ = static_cast(-1); // invalid. + hash_size_ = 0; + freed_head_ = NULL; +} + +template void HashList::SetSize(size_t size) { + hash_size_ = size; + KALDI_ASSERT(list_head_ == NULL && + bucket_list_tail_ == static_cast(-1)); // make sure empty. + if (size > buckets_.size()) + buckets_.resize(size, HashBucket(0, NULL)); +} + +template +typename HashList::Elem* HashList::Clear() { + // Clears the hashtable and gives ownership of the currently contained list + // to the user. + for (size_t cur_bucket = bucket_list_tail_; + cur_bucket != static_cast(-1); + cur_bucket = buckets_[cur_bucket].prev_bucket) { + buckets_[cur_bucket].last_elem = NULL; // this is how we indicate "empty". + } + bucket_list_tail_ = static_cast(-1); + Elem *ans = list_head_; + list_head_ = NULL; + return ans; +} + +template +const typename HashList::Elem* HashList::GetList() const { + return list_head_; +} + +template +inline void HashList::Delete(Elem *e) { + e->tail = freed_head_; + freed_head_ = e; +} + +template +inline typename HashList::Elem* HashList::Find(I key) { + size_t index = (static_cast(key) % hash_size_); + HashBucket &bucket = buckets_[index]; + if (bucket.last_elem == NULL) { + return NULL; // empty bucket. + } else { + Elem *head = (bucket.prev_bucket == static_cast(-1) ? + list_head_ : + buckets_[bucket.prev_bucket].last_elem->tail), + *tail = bucket.last_elem->tail; + for (Elem *e = head; e != tail; e = e->tail) + if (e->key == key) return e; + return NULL; // Not found. 
+ } +} + +template +inline typename HashList::Elem* HashList::New() { + if (freed_head_) { + Elem *ans = freed_head_; + freed_head_ = freed_head_->tail; + return ans; + } else { + Elem *tmp = new Elem[allocate_block_size_]; + for (size_t i = 0; i+1 < allocate_block_size_; i++) + tmp[i].tail = tmp+i+1; + tmp[allocate_block_size_-1].tail = NULL; + freed_head_ = tmp; + allocated_.push_back(tmp); + return this->New(); + } +} + +template +HashList::~HashList() { + // First test whether we had any memory leak within the + // HashList, i.e. things for which the user did not call Delete(). + size_t num_in_list = 0, num_allocated = 0; + for (Elem *e = freed_head_; e != NULL; e = e->tail) + num_in_list++; + for (size_t i = 0; i < allocated_.size(); i++) { + num_allocated += allocate_block_size_; + delete[] allocated_[i]; + } + if (num_in_list != num_allocated) { + KALDI_WARN << "Possible memory leak: " << num_in_list + << " != " << num_allocated + << ": you might have forgotten to call Delete on " + << "some Elems"; + } +} + +template +inline typename HashList::Elem* HashList::Insert(I key, T val) { + size_t index = (static_cast(key) % hash_size_); + HashBucket &bucket = buckets_[index]; + // Check the element is existing or not. + if (bucket.last_elem != NULL) { + Elem *head = (bucket.prev_bucket == static_cast(-1) ? + list_head_ : + buckets_[bucket.prev_bucket].last_elem->tail), + *tail = bucket.last_elem->tail; + for (Elem *e = head; e != tail; e = e->tail) + if (e->key == key) return e; + } + + // This is a new element. Insert it. + Elem *elem = New(); + elem->key = key; + elem->val = val; + if (bucket.last_elem == NULL) { // Unoccupied bucket. Insert at + // head of bucket list (which is tail of regular list, they go in + // opposite directions). + if (bucket_list_tail_ == static_cast(-1)) { + // list was empty so this is the first elem. 
+ KALDI_ASSERT(list_head_ == NULL); + list_head_ = elem; + } else { + // link in to the chain of Elems + buckets_[bucket_list_tail_].last_elem->tail = elem; + } + elem->tail = NULL; + bucket.last_elem = elem; + bucket.prev_bucket = bucket_list_tail_; + bucket_list_tail_ = index; + } else { + // Already-occupied bucket. Insert at tail of list of elements within + // the bucket. + elem->tail = bucket.last_elem->tail; + bucket.last_elem->tail = elem; + bucket.last_elem = elem; + } + return elem; +} + +template +void HashList::InsertMore(I key, T val) { + size_t index = (static_cast(key) % hash_size_); + HashBucket &bucket = buckets_[index]; + Elem *elem = New(); + elem->key = key; + elem->val = val; + + KALDI_ASSERT(bucket.last_elem != NULL); // assume one element is already here + if (bucket.last_elem->key == key) { // standard behavior: add as last element + elem->tail = bucket.last_elem->tail; + bucket.last_elem->tail = elem; + bucket.last_elem = elem; + return; + } + Elem *e = (bucket.prev_bucket == static_cast(-1) ? + list_head_ : buckets_[bucket.prev_bucket].last_elem->tail); + // find place to insert in linked list + while (e != bucket.last_elem->tail && e->key != key) e = e->tail; + KALDI_ASSERT(e->key == key); // not found? - should not happen + elem->tail = e->tail; + e->tail = elem; +} + + +} // end namespace kaldi + +#endif // KALDI_UTIL_HASH_LIST_INL_H_ diff --git a/speechx/speechx/kaldi/util/hash-list.h b/speechx/speechx/kaldi/util/hash-list.h new file mode 100644 index 00000000..9ae0043f --- /dev/null +++ b/speechx/speechx/kaldi/util/hash-list.h @@ -0,0 +1,147 @@ +// util/hash-list.h + +// Copyright 2009-2011 Microsoft Corporation +// 2013 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_UTIL_HASH_LIST_H_ +#define KALDI_UTIL_HASH_LIST_H_ +#include +#include +#include +#include +#include +#include "util/stl-utils.h" + + +/* This header provides utilities for a structure that's used in a decoder (but + is quite generic in nature so we implement and test it separately). + Basically it's a singly-linked list, but implemented in such a way that we + can quickly search for elements in the list. We give it a slightly richer + interface than just a hash and a list. The idea is that we want to separate + the hash part and the list part: basically, in the decoder, we want to have a + single hash for the current frame and the next frame, because by the time we + need to access the hash for the next frame we no longer need the hash for the + previous frame. So we have an operation that clears the hash but leaves the + list structure intact. We also control memory management inside this object, + to avoid repeated new's/deletes. + + See hash-list-test.cc for an example of how to use this object. +*/ + + +namespace kaldi { + +template class HashList { + public: + struct Elem { + I key; + T val; + Elem *tail; + }; + + /// Constructor takes no arguments. + /// Call SetSize to inform it of the likely size. + HashList(); + + /// Clears the hash and gives the head of the current list to the user; + /// ownership is transferred to the user (the user must call Delete() + /// for each element in the list, at his/her leisure). 
+ Elem *Clear(); + + /// Gives the head of the current list to the user. Ownership retained in the + /// class. Caution: in December 2013 the return type was changed to const + /// Elem* and this function was made const. You may need to change some types + /// of local Elem* variables to const if this produces compilation errors. + const Elem *GetList() const; + + /// Think of this like delete(). It is to be called for each Elem in turn + /// after you "obtained ownership" by doing Clear(). This is not the opposite + /// of. Insert, it is the opposite of New. It's really a memory operation. + inline void Delete(Elem *e); + + /// This should probably not be needed to be called directly by the user. + /// Think of it as opposite + /// to Delete(); + inline Elem *New(); + + /// Find tries to find this element in the current list using the hashtable. + /// It returns NULL if not present. The Elem it returns is not owned by the + /// user, it is part of the internal list owned by this object, but the user + /// is free to modify the "val" element. + inline Elem *Find(I key); + + /// Insert inserts a new element into the hashtable/stored list. + /// Because element keys in a hashtable are unique, this operation checks + /// whether each inserted element has a key equivalent to the one of an + /// element already in the hashtable. If so, the element is not inserted, + /// returning an pointer to this existing element. + inline Elem *Insert(I key, T val); + + /// Insert inserts another element with same key into the hashtable/ + /// stored list. + /// By calling this, the user asserts that one element with that key is + /// already present. + /// We insert it that way, that all elements with the same key + /// follow each other. + /// Find() will return the first one of the elements with the same key. 
+ inline void InsertMore(I key, T val); + + /// SetSize tells the object how many hash buckets to allocate (should + /// typically be at least twice the number of objects we expect to go in the + /// structure, for fastest performance). It must be called while the hash + /// is empty (e.g. after Clear() or after initializing the object, but before + /// adding anything to the hash. + void SetSize(size_t sz); + + /// Returns current number of hash buckets. + inline size_t Size() { return hash_size_; } + + ~HashList(); + private: + + struct HashBucket { + size_t prev_bucket; // index to next bucket (-1 if list tail). Note: + // list of buckets goes in opposite direction to list of Elems. + Elem *last_elem; // pointer to last element in this bucket (NULL if empty) + inline HashBucket(size_t i, Elem *e): prev_bucket(i), last_elem(e) {} + }; + + Elem *list_head_; // head of currently stored list. + size_t bucket_list_tail_; // tail of list of active hash buckets. + + size_t hash_size_; // number of hash buckets. + + std::vector buckets_; + + Elem *freed_head_; // head of list of currently freed elements. [ready for + // allocation] + + std::vector allocated_; // list of allocated blocks. + + static const size_t allocate_block_size_ = 1024; // Number of Elements to + // allocate in one block. Must be largish so storing allocated_ doesn't + // become a problem. 
+}; + + +} // end namespace kaldi + +#include "util/hash-list-inl.h" + +#endif // KALDI_UTIL_HASH_LIST_H_ diff --git a/speechx/speechx/kaldi/util/kaldi-cygwin-io-inl.h b/speechx/speechx/kaldi/util/kaldi-cygwin-io-inl.h new file mode 100644 index 00000000..8a3cd91a --- /dev/null +++ b/speechx/speechx/kaldi/util/kaldi-cygwin-io-inl.h @@ -0,0 +1,129 @@ +// util/kaldi-cygwin-io-inl.h + +// Copyright 2015 Smart Action Company LLC (author: Kirill Katsnelson) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. +#ifndef KALDI_UTIL_KALDI_CYGWIN_IO_INL_H_ +#define KALDI_UTIL_KALDI_CYGWIN_IO_INL_H_ + +#ifndef _MSC_VER +#error This is a Windows-compatibility file. Something went wery wrong. +#endif + +#include + +// This file is included only into kaldi-io.cc, and only if +// KALDI_CYGWIN_COMPAT is enabled. +// +// The routines map unix-ey paths passed to Windows programs from shell +// scripts in egs. Since shell scripts run under cygwin, they use cygwin's +// own mount table and a mapping to the file system. It is quite possible to +// create quite an intricate mapping that only own cygwin API would be able +// to untangle. Unfortunately, the API to map between filenames is not +// available to non-cygwin programs. Running cygpath for every file operation +// would as well be cumbersome. 
So this is only a simplistic path resolution, +// assuming that the default cygwin prefix /cygdrive is used, and that all +// resolved unix-style full paths end up prefixed with /cygdrive. This is +// quite a sensible approach. We'll also try to map /dev/null and /tmp/**, +// die on all other /dev/** and warn about all other rooted paths. + +namespace kaldi { + +static bool prefixp(const std::string& pfx, const std::string& str) { + return pfx.length() <= str.length() && + std::equal(pfx.begin(), pfx.end(), str.begin()); +} + +static std::string cygprefix("/cygdrive/"); + +static std::string MapCygwinPathNoTmp(const std::string &filename) { + // UNC(?), relative, native Windows and empty paths are ok already. + if (prefixp("//", filename) || !prefixp("/", filename)) + return filename; + + // /dev/... + if (filename == "/dev/null") + return "\\\\.\\nul"; + if (prefixp("/dev/", filename)) { + KALDI_ERR << "Unable to resolve path '" << filename + << "' - only have /dev/null here."; + return "\\\\.\\invalid"; + } + + // /cygdrive/?[/....] + int preflen = cygprefix.size(); + if (prefixp(cygprefix, filename) + && filename.size() >= preflen + 1 && isalpha(filename[preflen]) + && (filename.size() == preflen + 1 || filename[preflen + 1] == '/')) { + return std::string() + filename[preflen] + ':' + + (filename.size() > preflen + 1 ? filename.substr(preflen + 1) : "/"); + } + + KALDI_WARN << "Unable to resolve path '" << filename + << "' - cannot map unix prefix. " + << "Will go on, but breakage will likely ensue."; + return filename; +} + +// extern for unit testing. +std::string MapCygwinPath(const std::string &filename) { + // /tmp[/....] + if (filename != "/tmp" && !prefixp("/tmp/", filename)) { + return MapCygwinPathNoTmp(filename); + } + char *tmpdir = std::getenv("TMP"); + if (tmpdir == nullptr) + tmpdir = std::getenv("TEMP"); + if (tmpdir == nullptr) { + KALDI_ERR << "Unable to resolve path '" << filename + << "' - unable to find temporary directory. 
Set TMP."; + return filename; + } + // Map the value of tmpdir again, as cygwin environment actually may contain + // unix-style paths. + return MapCygwinPathNoTmp(std::string(tmpdir) + filename.substr(4)); +} + +// A popen implementation that passes the command line through cygwin +// bash.exe. This is necessary since some piped commands are cygwin links +// (e. g. fgrep is a soft link to grep), and some are #!-files, such as +// gunzip which is a shell script that invokes gzip, or kaldi's own run.pl +// which is a perl script. +// +// _popen uses cmd.exe or whatever shell is specified via the COMSPEC +// variable. Unfortunately, it adds a hardcoded " /c " to it, so we cannot +// just substitute the environment variable COMSPEC to point to bash.exe. +// Instead, quote the command and pass it to bash via its -c switch. +static FILE *CygwinCompatPopen(const char* command, const char* mode) { + // To speed up command launch marginally, optionally accept full path + // to bash.exe. This will not work if the path contains spaces, but + // no sane person would install cygwin into a space-ridden path. + const char* bash_exe = std::getenv("BASH_EXE"); + std::string qcmd(bash_exe != nullptr ? 
bash_exe : "bash.exe"); + qcmd += " -c \""; + for (; *command; ++command) { + if (*command == '\"') + qcmd += '\"'; + qcmd += *command; + } + qcmd += '\"'; + + return _popen(qcmd.c_str(), mode); +} + +} // namespace kaldi + +#endif // KALDI_UTIL_KALDI_CYGWIN_IO_INL_H_ diff --git a/speechx/speechx/kaldi/util/kaldi-holder-inl.h b/speechx/speechx/kaldi/util/kaldi-holder-inl.h new file mode 100644 index 00000000..134cdd93 --- /dev/null +++ b/speechx/speechx/kaldi/util/kaldi-holder-inl.h @@ -0,0 +1,922 @@ +// util/kaldi-holder-inl.h + +// Copyright 2009-2011 Microsoft Corporation +// 2016 Xiaohui Zhang + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_UTIL_KALDI_HOLDER_INL_H_ +#define KALDI_UTIL_KALDI_HOLDER_INL_H_ + +#include +#include +#include +#include + +#include "base/kaldi-utils.h" +#include "util/kaldi-io.h" +#include "util/text-utils.h" +#include "matrix/kaldi-matrix.h" + +namespace kaldi { + +/// \addtogroup holders +/// @{ + + +// KaldiObjectHolder is valid only for Kaldi objects with +// copy constructors, default constructors, and "normal" +// Kaldi Write and Read functions. E.g. it works for +// Matrix and Vector. 
+template class KaldiObjectHolder { + public: + typedef KaldiType T; + + KaldiObjectHolder(): t_(NULL) { } + + static bool Write(std::ostream &os, bool binary, const T &t) { + InitKaldiOutputStream(os, binary); // Puts binary header if binary mode. + try { + t.Write(os, binary); + return os.good(); + } catch(const std::exception &e) { + KALDI_WARN << "Exception caught writing Table object. " << e.what(); + return false; // Write failure. + } + } + + void Clear() { + if (t_) { + delete t_; + t_ = NULL; + } + } + + // Reads into the holder. + bool Read(std::istream &is) { + delete t_; + t_ = new T; + // Don't want any existing state to complicate the read function: get new + // object. + bool is_binary; + if (!InitKaldiInputStream(is, &is_binary)) { + KALDI_WARN << "Reading Table object, failed reading binary header\n"; + return false; + } + try { + t_->Read(is, is_binary); + return true; + } catch(const std::exception &e) { + KALDI_WARN << "Exception caught reading Table object. " << e.what(); + delete t_; + t_ = NULL; + return false; + } + } + + // Kaldi objects always have the stream open in binary mode for + // reading. + static bool IsReadInBinary() { return true; } + + T &Value() { + // code error if !t_. + if (!t_) KALDI_ERR << "KaldiObjectHolder::Value() called wrongly."; + return *t_; + } + + void Swap(KaldiObjectHolder *other) { + // the t_ values are pointers so this is a shallow swap. + std::swap(t_, other->t_); + } + + bool ExtractRange(const KaldiObjectHolder &other, + const std::string &range) { + KALDI_ASSERT(other.t_ != NULL); + delete t_; + t_ = new T; + // this call will fail for most object types. + return ExtractObjectRange(*(other.t_), range, t_); + } + + ~KaldiObjectHolder() { delete t_; } + private: + KALDI_DISALLOW_COPY_AND_ASSIGN(KaldiObjectHolder); + T *t_; +}; + + +// BasicHolder is valid for float, double, bool, and integer +// types. 
There will be a compile time error otherwise, because +// we make sure that the {Write, Read}BasicType functions do not +// get instantiated for other types. + +template class BasicHolder { + public: + typedef BasicType T; + + BasicHolder(): t_(static_cast(-1)) { } + + static bool Write(std::ostream &os, bool binary, const T &t) { + InitKaldiOutputStream(os, binary); // Puts binary header if binary mode. + try { + WriteBasicType(os, binary, t); + if (!binary) os << '\n'; // Makes output format more readable and + // easier to manipulate. + return os.good(); + } catch(const std::exception &e) { + KALDI_WARN << "Exception caught writing Table object. " << e.what(); + return false; // Write failure. + } + } + + void Clear() { } + + // Reads into the holder. + bool Read(std::istream &is) { + bool is_binary; + if (!InitKaldiInputStream(is, &is_binary)) { + KALDI_WARN << "Reading Table object [integer type], failed reading binary" + " header\n"; + return false; + } + try { + int c; + if (!is_binary) { // This is to catch errors, the class would work + // without it.. + // Eat up any whitespace and make sure it's not newline. + while (isspace((c = is.peek())) && c != static_cast('\n')) { + is.get(); + } + if (is.peek() == '\n') { + KALDI_WARN << "Found newline but expected basic type."; + return false; // This is just to catch a more- + // likely-than average type of error (empty line before the token), + // since ReadBasicType will eat it up. + } + } + + ReadBasicType(is, is_binary, &t_); + + if (!is_binary) { // This is to catch errors, the class would work + // without it.. + // make sure there is a newline. + while (isspace((c = is.peek())) && c != static_cast('\n')) { + is.get(); + } + if (is.peek() != '\n') { + KALDI_WARN << "BasicHolder::Read, expected newline, got " + << CharToString(is.peek()) << ", position " << is.tellg(); + return false; + } + is.get(); // Consume the newline. 
+ } + return true; + } catch(const std::exception &e) { + KALDI_WARN << "Exception caught reading Table object. " << e.what(); + return false; + } + } + + // Objects read/written with the Kaldi I/O functions always have the stream + // open in binary mode for reading. + static bool IsReadInBinary() { return true; } + + T &Value() { + return t_; + } + + void Swap(BasicHolder *other) { + std::swap(t_, other->t_); + } + + bool ExtractRange(const BasicHolder &other, const std::string &range) { + KALDI_ERR << "ExtractRange is not defined for this type of holder."; + return false; + } + + ~BasicHolder() { } + private: + KALDI_DISALLOW_COPY_AND_ASSIGN(BasicHolder); + + T t_; +}; + + +/// A Holder for a vector of basic types, e.g. +/// std::vector, std::vector, and so on. +/// Note: a basic type is defined as a type for which ReadBasicType +/// and WriteBasicType are implemented, i.e. integer and floating +/// types, and bool. +template class BasicVectorHolder { + public: + typedef std::vector T; + + BasicVectorHolder() { } + + static bool Write(std::ostream &os, bool binary, const T &t) { + InitKaldiOutputStream(os, binary); // Puts binary header if binary mode. + try { + if (binary) { // need to write the size, in binary mode. + KALDI_ASSERT(static_cast(static_cast(t.size())) == + t.size()); + // Or this Write routine cannot handle such a large vector. + // use int32 because it's fixed size regardless of compilation. + // change to int64 (plus in Read function) if this becomes a problem. + WriteBasicType(os, binary, static_cast(t.size())); + for (typename std::vector::const_iterator iter = t.begin(); + iter != t.end(); ++iter) + WriteBasicType(os, binary, *iter); + + } else { + for (typename std::vector::const_iterator iter = t.begin(); + iter != t.end(); ++iter) + WriteBasicType(os, binary, *iter); + os << '\n'; // Makes output format more readable and + // easier to manipulate. In text mode, this function writes something + // like "1 2 3\n". 
+ } + return os.good(); + } catch(const std::exception &e) { + KALDI_WARN << "Exception caught writing Table object (BasicVector). " + << e.what(); + return false; // Write failure. + } + } + + void Clear() { t_.clear(); } + + // Reads into the holder. + bool Read(std::istream &is) { + t_.clear(); + bool is_binary; + if (!InitKaldiInputStream(is, &is_binary)) { + KALDI_WARN << "Reading Table object [integer type], failed reading binary" + " header\n"; + return false; + } + if (!is_binary) { + // In text mode, we terminate with newline. + std::string line; + getline(is, line); // this will discard the \n, if present. + if (is.fail()) { + KALDI_WARN << "BasicVectorHolder::Read, error reading line " << + (is.eof() ? "[eof]" : ""); + return false; // probably eof. fail in any case. + } + std::istringstream line_is(line); + try { + while (1) { + line_is >> std::ws; // eat up whitespace. + if (line_is.eof()) break; + BasicType bt; + ReadBasicType(line_is, false, &bt); + t_.push_back(bt); + } + return true; + } catch(const std::exception &e) { + KALDI_WARN << "BasicVectorHolder::Read, could not interpret line: " + << "'" << line << "'" << "\n" << e.what(); + return false; + } + } else { // binary mode. + size_t filepos = is.tellg(); + try { + int32 size; + ReadBasicType(is, true, &size); + t_.resize(size); + for (typename std::vector::iterator iter = t_.begin(); + iter != t_.end(); + ++iter) { + ReadBasicType(is, true, &(*iter)); + } + return true; + } catch(...) { + KALDI_WARN << "BasicVectorHolder::Read, read error or unexpected data" + " at archive entry beginning at file position " << filepos; + return false; + } + } + } + + // Objects read/written with the Kaldi I/O functions always have the stream + // open in binary mode for reading. 
+ static bool IsReadInBinary() { return true; } + + T &Value() { return t_; } + + void Swap(BasicVectorHolder *other) { + t_.swap(other->t_); + } + + bool ExtractRange(const BasicVectorHolder &other, + const std::string &range) { + KALDI_ERR << "ExtractRange is not defined for this type of holder."; + return false; + } + + ~BasicVectorHolder() { } + private: + KALDI_DISALLOW_COPY_AND_ASSIGN(BasicVectorHolder); + T t_; +}; + + +/// BasicVectorVectorHolder is a Holder for a vector of vector of +/// a basic type, e.g. std::vector >. +/// Note: a basic type is defined as a type for which ReadBasicType +/// and WriteBasicType are implemented, i.e. integer and floating +/// types, and bool. +template class BasicVectorVectorHolder { + public: + typedef std::vector > T; + + BasicVectorVectorHolder() { } + + static bool Write(std::ostream &os, bool binary, const T &t) { + InitKaldiOutputStream(os, binary); // Puts binary header if binary mode. + try { + if (binary) { // need to write the size, in binary mode. + KALDI_ASSERT(static_cast(static_cast(t.size())) == + t.size()); + // Or this Write routine cannot handle such a large vector. + // use int32 because it's fixed size regardless of compilation. + // change to int64 (plus in Read function) if this becomes a problem. + WriteBasicType(os, binary, static_cast(t.size())); + for (typename std::vector >::const_iterator + iter = t.begin(); + iter != t.end(); ++iter) { + KALDI_ASSERT(static_cast(static_cast(iter->size())) + == iter->size()); + WriteBasicType(os, binary, static_cast(iter->size())); + for (typename std::vector::const_iterator + iter2 = iter->begin(); + iter2 != iter->end(); ++iter2) { + WriteBasicType(os, binary, *iter2); + } + } + } else { // text mode... 
+ // In text mode, we write out something like (for integers): + // "1 2 3 ; 4 5 ; 6 ; ; 7 8 9 ;\n" + // where the semicolon is a terminator, not a separator + // (a separator would cause ambiguity between an + // empty list, and a list containing a single empty list). + for (typename std::vector >::const_iterator + iter = t.begin(); + iter != t.end(); + ++iter) { + for (typename std::vector::const_iterator + iter2 = iter->begin(); + iter2 != iter->end(); ++iter2) + WriteBasicType(os, binary, *iter2); + os << "; "; + } + os << '\n'; + } + return os.good(); + } catch(const std::exception &e) { + KALDI_WARN << "Exception caught writing Table object. " << e.what(); + return false; // Write failure. + } + } + + void Clear() { t_.clear(); } + + // Reads into the holder. + bool Read(std::istream &is) { + t_.clear(); + bool is_binary; + if (!InitKaldiInputStream(is, &is_binary)) { + KALDI_WARN << "Failed reading binary header\n"; + return false; + } + if (!is_binary) { + // In text mode, we terminate with newline. + try { // catching errors from ReadBasicType.. + std::vector v; // temporary vector + while (1) { + int i = is.peek(); + if (i == -1) { + KALDI_WARN << "Unexpected EOF"; + return false; + } else if (static_cast(i) == '\n') { + if (!v.empty()) { + KALDI_WARN << "No semicolon before newline (wrong format)"; + return false; + } else { + is.get(); + return true; + } + } else if (std::isspace(i)) { + is.get(); + } else if (static_cast(i) == ';') { + t_.push_back(v); + v.clear(); + is.get(); + } else { // some object we want to read... + BasicType b; + ReadBasicType(is, false, &b); // throws on error. + v.push_back(b); + } + } + } catch(const std::exception &e) { + KALDI_WARN << "BasicVectorVectorHolder::Read, read error. " << e.what(); + return false; + } + } else { // binary mode. 
+ size_t filepos = is.tellg(); + try { + int32 size; + ReadBasicType(is, true, &size); + t_.resize(size); + for (typename std::vector >::iterator + iter = t_.begin(); + iter != t_.end(); + ++iter) { + int32 size2; + ReadBasicType(is, true, &size2); + iter->resize(size2); + for (typename std::vector::iterator iter2 = iter->begin(); + iter2 != iter->end(); + ++iter2) + ReadBasicType(is, true, &(*iter2)); + } + return true; + } catch(...) { + KALDI_WARN << "Read error or unexpected data at archive entry beginning" + " at file position " << filepos; + return false; + } + } + } + + // Objects read/written with the Kaldi I/O functions always have the stream + // open in binary mode for reading. + static bool IsReadInBinary() { return true; } + + T &Value() { return t_; } + + void Swap(BasicVectorVectorHolder *other) { + t_.swap(other->t_); + } + + bool ExtractRange(BasicVectorVectorHolder &other, + const std::string &range) { + KALDI_ERR << "ExtractRange is not defined for this type of holder."; + return false; + } + + ~BasicVectorVectorHolder() { } + private: + KALDI_DISALLOW_COPY_AND_ASSIGN(BasicVectorVectorHolder); + T t_; +}; + + +/// BasicPairVectorHolder is a Holder for a vector of pairs of +/// a basic type, e.g. std::vector >. +/// Note: a basic type is defined as a type for which ReadBasicType +/// and WriteBasicType are implemented, i.e. integer and floating +/// types, and bool. +template class BasicPairVectorHolder { + public: + typedef std::vector > T; + + BasicPairVectorHolder() { } + + static bool Write(std::ostream &os, bool binary, const T &t) { + InitKaldiOutputStream(os, binary); // Puts binary header if binary mode. + try { + if (binary) { // need to write the size, in binary mode. + KALDI_ASSERT(static_cast(static_cast(t.size())) == + t.size()); + // Or this Write routine cannot handle such a large vector. + // use int32 because it's fixed size regardless of compilation. + // change to int64 (plus in Read function) if this becomes a problem. 
+ WriteBasicType(os, binary, static_cast(t.size())); + for (typename T::const_iterator iter = t.begin(); + iter != t.end(); ++iter) { + WriteBasicType(os, binary, iter->first); + WriteBasicType(os, binary, iter->second); + } + } else { // text mode... + // In text mode, we write out something like (for integers): + // "1 2 ; 4 5 ; 6 7 ; 8 9 \n" + // where the semicolon is a separator, not a terminator. + for (typename T::const_iterator iter = t.begin(); + iter != t.end();) { + WriteBasicType(os, binary, iter->first); + WriteBasicType(os, binary, iter->second); + ++iter; + if (iter != t.end()) + os << "; "; + } + os << '\n'; + } + return os.good(); + } catch(const std::exception &e) { + KALDI_WARN << "Exception caught writing Table object. " << e.what(); + return false; // Write failure. + } + } + + void Clear() { t_.clear(); } + + // Reads into the holder. + bool Read(std::istream &is) { + t_.clear(); + bool is_binary; + if (!InitKaldiInputStream(is, &is_binary)) { + KALDI_WARN << "Reading Table object [integer type], failed reading binary" + " header\n"; + return false; + } + if (!is_binary) { + // In text mode, we terminate with newline. + try { // catching errors from ReadBasicType.. 
+ std::vector v; // temporary vector + while (1) { + int i = is.peek(); + if (i == -1) { + KALDI_WARN << "Unexpected EOF"; + return false; + } else if (static_cast(i) == '\n') { + if (t_.empty() && v.empty()) { + is.get(); + return true; + } else if (v.size() == 2) { + t_.push_back(std::make_pair(v[0], v[1])); + is.get(); + return true; + } else { + KALDI_WARN << "Unexpected newline, reading vector >; got " + << v.size() << " elements, expected 2."; + return false; + } + } else if (std::isspace(i)) { + is.get(); + } else if (static_cast(i) == ';') { + if (v.size() != 2) { + KALDI_WARN << "Wrong input format, reading vector >; got " + << v.size() << " elements, expected 2."; + return false; + } + t_.push_back(std::make_pair(v[0], v[1])); + v.clear(); + is.get(); + } else { // some object we want to read... + BasicType b; + ReadBasicType(is, false, &b); // throws on error. + v.push_back(b); + } + } + } catch(const std::exception &e) { + KALDI_WARN << "BasicPairVectorHolder::Read, read error. " << e.what(); + return false; + } + } else { // binary mode. + size_t filepos = is.tellg(); + try { + int32 size; + ReadBasicType(is, true, &size); + t_.resize(size); + for (typename T::iterator iter = t_.begin(); + iter != t_.end(); + ++iter) { + ReadBasicType(is, true, &(iter->first)); + ReadBasicType(is, true, &(iter->second)); + } + return true; + } catch(...) { + KALDI_WARN << "BasicVectorHolder::Read, read error or unexpected data" + " at archive entry beginning at file position " << filepos; + return false; + } + } + } + + // Objects read/written with the Kaldi I/O functions always have the stream + // open in binary mode for reading. 
+ static bool IsReadInBinary() { return true; } + + T &Value() { return t_; } + + void Swap(BasicPairVectorHolder *other) { + t_.swap(other->t_); + } + + bool ExtractRange(const BasicPairVectorHolder &other, + const std::string &range) { + KALDI_ERR << "ExtractRange is not defined for this type of holder."; + return false; + } + + ~BasicPairVectorHolder() { } + private: + KALDI_DISALLOW_COPY_AND_ASSIGN(BasicPairVectorHolder); + T t_; +}; + + + + +// We define a Token as a nonempty, printable, whitespace-free std::string. +// The binary and text formats here are the same (newline-terminated) +// and as such we don't bother with the binary-mode headers. +class TokenHolder { + public: + typedef std::string T; + + TokenHolder() {} + + static bool Write(std::ostream &os, bool, const T &t) { // ignore binary-mode + KALDI_ASSERT(IsToken(t)); + os << t << '\n'; + return os.good(); + } + + void Clear() { t_.clear(); } + + // Reads into the holder. + bool Read(std::istream &is) { + is >> t_; + if (is.fail()) return false; + char c; + while (isspace(c = is.peek()) && c!= '\n') is.get(); + if (is.peek() != '\n') { + KALDI_WARN << "TokenHolder::Read, expected newline, got char " + << CharToString(is.peek()) + << ", at stream pos " << is.tellg(); + return false; + } + is.get(); // get '\n' + return true; + } + + + // Since this is fundamentally a text format, read in text mode (would work + // fine either way, but doing it this way will exercise more of the code). + static bool IsReadInBinary() { return false; } + + T &Value() { return t_; } + + ~TokenHolder() { } + + void Swap(TokenHolder *other) { + t_.swap(other->t_); + } + + bool ExtractRange(const TokenHolder &other, + const std::string &range) { + KALDI_ERR << "ExtractRange is not defined for this type of holder."; + return false; + } + + private: + KALDI_DISALLOW_COPY_AND_ASSIGN(TokenHolder); + T t_; +}; + +// A Token is a nonempty, whitespace-free std::string. 
+// Class TokenVectorHolder is a Holder class for vectors of these. +class TokenVectorHolder { + public: + typedef std::vector T; + + TokenVectorHolder() { } + + static bool Write(std::ostream &os, bool, const T &t) { // ignore binary-mode + for (std::vector::const_iterator iter = t.begin(); + iter != t.end(); + ++iter) { + KALDI_ASSERT(IsToken(*iter)); // make sure it's whitespace-free, + // printable and nonempty. + os << *iter << ' '; + } + os << '\n'; + return os.good(); + } + + void Clear() { t_.clear(); } + + + // Reads into the holder. + bool Read(std::istream &is) { + t_.clear(); + + // there is no binary/non-binary mode. + + std::string line; + getline(is, line); // this will discard the \n, if present. + if (is.fail()) { + KALDI_WARN << "BasicVectorHolder::Read, error reading line " << (is.eof() + ? "[eof]" : ""); + return false; // probably eof. fail in any case. + } + const char *white_chars = " \t\n\r\f\v"; + SplitStringToVector(line, white_chars, true, &t_); // true== omit + // empty strings e.g. between spaces. + return true; + } + + // Read in text format since it's basically a text-mode thing.. doesn't really + // matter, it would work either way since we ignore the extra '\r'. 
+ static bool IsReadInBinary() { return false; } + + T &Value() { return t_; } + + void Swap(TokenVectorHolder *other) { + t_.swap(other->t_); + } + + bool ExtractRange(const TokenVectorHolder &other, + const std::string &range) { + KALDI_ERR << "ExtractRange is not defined for this type of holder."; + return false; + } + + private: + KALDI_DISALLOW_COPY_AND_ASSIGN(TokenVectorHolder); + T t_; +}; + + +class HtkMatrixHolder { + public: + typedef std::pair, HtkHeader> T; + + HtkMatrixHolder() {} + + static bool Write(std::ostream &os, bool binary, const T &t) { + if (!binary) + KALDI_ERR << "Non-binary HTK-format write not supported."; + bool ans = WriteHtk(os, t.first, t.second); + if (!ans) + KALDI_WARN << "Error detected writing HTK-format matrix."; + return ans; + } + + void Clear() { t_.first.Resize(0, 0); } + + // Reads into the holder. + bool Read(std::istream &is) { + bool ans = ReadHtk(is, &t_.first, &t_.second); + if (!ans) { + KALDI_WARN << "Error detected reading HTK-format matrix."; + return false; + } + return ans; + } + + // HTK-format matrices only read in binary. + static bool IsReadInBinary() { return true; } + + T &Value() { return t_; } + + void Swap(HtkMatrixHolder *other) { + t_.first.Swap(&(other->t_.first)); + std::swap(t_.second, other->t_.second); + } + + bool ExtractRange(const HtkMatrixHolder &other, + const std::string &range) { + KALDI_ERR << "ExtractRange is not defined for this type of holder."; + return false; + } + // Default destructor. + private: + KALDI_DISALLOW_COPY_AND_ASSIGN(HtkMatrixHolder); + T t_; +}; + +// SphinxMatrixHolder can be used to read and write feature files in +// CMU Sphinx format. 13-dimensional big-endian features are assumed. +// The ultimate reference is SphinxBase's source code (for example see +// feat_s2mfc_read() in src/libsphinxbase/feat/feat.c). 
+// We can't fully automate the detection of machine/feature file endianess +// mismatch here, because for this Sphinx relies on comparing the feature +// file's size with the number recorded in its header. We are working with +// streams, however(what happens if this is a Kaldi archive?). This should +// be no problem, because the usage help of Sphinx' "wave2feat" for example +// says that Sphinx features are always big endian. +// Note: the kFeatDim defaults to 13, see forward declaration in kaldi-holder.h +template class SphinxMatrixHolder { + public: + typedef Matrix T; + + SphinxMatrixHolder() {} + + void Clear() { feats_.Resize(0, 0); } + + // Writes Sphinx-format features + static bool Write(std::ostream &os, bool binary, const T &m) { + if (!binary) { + KALDI_WARN << "SphinxMatrixHolder can't write Sphinx features in text "; + return false; + } + + int32 size = m.NumRows() * m.NumCols(); + if (MachineIsLittleEndian()) + KALDI_SWAP4(size); + // write the header + os.write(reinterpret_cast (&size), sizeof(size)); + + for (MatrixIndexT i = 0; i < m.NumRows(); i++) { + std::vector tmp(m.NumCols()); + for (MatrixIndexT j = 0; j < m.NumCols(); j++) { + tmp[j] = static_cast(m(i, j)); + if (MachineIsLittleEndian()) + KALDI_SWAP4(tmp[j]); + } + os.write(reinterpret_cast(&(tmp[0])), + tmp.size() * 4); + } + return true; + } + + // Reads the features into a Kaldi Matrix + bool Read(std::istream &is) { + int32 nmfcc; + + is.read(reinterpret_cast (&nmfcc), sizeof(nmfcc)); + if (MachineIsLittleEndian()) + KALDI_SWAP4(nmfcc); + KALDI_VLOG(2) << "#feats: " << nmfcc; + int32 nfvec = nmfcc / kFeatDim; + if ((nmfcc % kFeatDim) != 0) { + KALDI_WARN << "Sphinx feature count is inconsistent with vector length "; + return false; + } + + feats_.Resize(nfvec, kFeatDim); + for (MatrixIndexT i = 0; i < feats_.NumRows(); i++) { + if (sizeof(BaseFloat) == sizeof(float32)) { + is.read(reinterpret_cast (feats_.RowData(i)), + kFeatDim * sizeof(float32)); + if (!is.good()) { + KALDI_WARN 
<< "Unexpected error/EOF while reading Sphinx features "; + return false; + } + if (MachineIsLittleEndian()) { + for (MatrixIndexT j = 0; j < kFeatDim; j++) + KALDI_SWAP4(feats_(i, j)); + } + } else { // KALDI_DOUBLEPRECISION=1 + float32 tmp[kFeatDim]; + is.read(reinterpret_cast (tmp), sizeof(tmp)); + if (!is.good()) { + KALDI_WARN << "Unexpected error/EOF while reading Sphinx features "; + return false; + } + for (MatrixIndexT j = 0; j < kFeatDim; j++) { + if (MachineIsLittleEndian()) + KALDI_SWAP4(tmp[j]); + feats_(i, j) = static_cast(tmp[j]); + } + } + } + + return true; + } + + // Only read in binary + static bool IsReadInBinary() { return true; } + + T &Value() { return feats_; } + + void Swap(SphinxMatrixHolder *other) { + feats_.Swap(&(other->feats_)); + } + + bool ExtractRange(const SphinxMatrixHolder &other, + const std::string &range) { + KALDI_ERR << "ExtractRange is not defined for this type of holder."; + return false; + } + + private: + KALDI_DISALLOW_COPY_AND_ASSIGN(SphinxMatrixHolder); + T feats_; +}; + + +/// @} end "addtogroup holders" + +} // end namespace kaldi + + + +#endif // KALDI_UTIL_KALDI_HOLDER_INL_H_ diff --git a/speechx/speechx/kaldi/util/kaldi-holder.cc b/speechx/speechx/kaldi/util/kaldi-holder.cc new file mode 100644 index 00000000..577679ef --- /dev/null +++ b/speechx/speechx/kaldi/util/kaldi-holder.cc @@ -0,0 +1,229 @@ +// util/kaldi-holder.cc + +// Copyright 2009-2011 Microsoft Corporation +// 2016 Xiaohui Zhang + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "util/kaldi-holder.h" +#include "matrix/kaldi-matrix.h" + +namespace kaldi { + +// Parse matrix range specifier in form r1:r2,c1:c2 +// where any of those four numbers can be missing. In those +// cases, the missing number is set either to 0 (for r1 or c1) +// or the value of parameter rows -1 or columns -1 (which +// represent the dimensions of the original matrix) for missing +// r2 or c2, respectively. +// Examples of valid ranges: 0:39,: or :,:3 or :,5:10 +bool ParseMatrixRangeSpecifier(const std::string &range, + const int rows, const int cols, + std::vector *row_range, + std::vector *col_range) { + if (range.empty()) { + KALDI_ERR << "Empty range specifier."; + return false; + } + std::vector splits; + SplitStringToVector(range, ",", false, &splits); + if (!((splits.size() == 1 && !splits[0].empty()) || + (splits.size() == 2 && !splits[0].empty() && !splits[1].empty()))) { + KALDI_ERR << "Invalid range specifier for matrix: " << range; + return false; + } + + bool status = true; + + if (splits[0] != ":") + status = SplitStringToIntegers(splits[0], ":", false, row_range); + + if (splits.size() == 2 && splits[1] != ":") { + status = status && SplitStringToIntegers(splits[1], ":", false, col_range); + } + if (row_range->size() == 0) { + row_range->push_back(0); + row_range->push_back(rows - 1); + } + if (col_range->size() == 0) { + col_range->push_back(0); + col_range->push_back(cols - 1); + } + + // Length tolerance of 3 -- 2 to account for edge effects when + // 
frame-length is 25ms and frame-shift is 10ms, and 1 for rounding effects + // since segments are usually retained up to 2 decimal places. + int32 length_tolerance = 3; + if (!(status && row_range->size() == 2 && col_range->size() == 2 && + row_range->at(0) >= 0 && row_range->at(0) <= row_range->at(1) && + row_range->at(1) < rows + length_tolerance && + col_range->at(0) >=0 && + col_range->at(0) <= col_range->at(1) && col_range->at(1) < cols)) { + KALDI_ERR << "Invalid range specifier: " << range + << " for matrix of size " << rows + << "x" << cols; + return false; + } + + if (row_range->at(1) >= rows) + KALDI_WARN << "Row range " << row_range->at(0) << ":" << row_range->at(1) + << " goes beyond the number of rows of the " + << "matrix " << rows; + return status; +} + +bool ExtractObjectRange(const GeneralMatrix &input, const std::string &range, + GeneralMatrix *output) { + // We just inspect input's type and forward to the correct implementation + // if available. For kSparseMatrix, we do just fairly inefficient conversion + // to a full matrix. + Matrix output_mat; + if (input.Type() == kFullMatrix) { + const Matrix &in = input.GetFullMatrix(); + ExtractObjectRange(in, range, &output_mat); + } else if (input.Type() == kCompressedMatrix) { + const CompressedMatrix &in = input.GetCompressedMatrix(); + ExtractObjectRange(in, range, &output_mat); + } else { + KALDI_ASSERT(input.Type() == kSparseMatrix); + // NOTE: this is fairly inefficient, so if this happens to be bottleneck + // it should be re-implemented more efficiently. 
+ Matrix input_mat; + input.GetMatrix(&input_mat); + ExtractObjectRange(input_mat, range, &output_mat); + } + output->Clear(); + output->SwapFullMatrix(&output_mat); + return true; +} + +template +bool ExtractObjectRange(const CompressedMatrix &input, const std::string &range, + Matrix *output) { + std::vector row_range, col_range; + + if (!ParseMatrixRangeSpecifier(range, input.NumRows(), input.NumCols(), + &row_range, &col_range)) { + KALDI_ERR << "Could not parse range specifier \"" << range << "\"."; + } + + int32 row_size = std::min(row_range[1], input.NumRows() - 1) + - row_range[0] + 1, + col_size = col_range[1] - col_range[0] + 1; + + output->Resize(row_size, col_size, kUndefined); + input.CopyToMat(row_range[0], col_range[0], output); + return true; +} + +// template instantiation +template bool ExtractObjectRange(const CompressedMatrix &, const std::string &, + Matrix *); +template bool ExtractObjectRange(const CompressedMatrix &, const std::string &, + Matrix *); + +template +bool ExtractObjectRange(const Matrix &input, const std::string &range, + Matrix *output) { + std::vector row_range, col_range; + + if (!ParseMatrixRangeSpecifier(range, input.NumRows(), input.NumCols(), + &row_range, &col_range)) { + KALDI_ERR << "Could not parse range specifier \"" << range << "\"."; + } + + int32 row_size = std::min(row_range[1], input.NumRows() - 1) + - row_range[0] + 1, + col_size = col_range[1] - col_range[0] + 1; + output->Resize(row_size, col_size, kUndefined); + output->CopyFromMat(input.Range(row_range[0], row_size, + col_range[0], col_size)); + return true; +} + +// template instantiation +template bool ExtractObjectRange(const Matrix &, const std::string &, + Matrix *); +template bool ExtractObjectRange(const Matrix &, const std::string &, + Matrix *); + +template +bool ExtractObjectRange(const Vector &input, const std::string &range, + Vector *output) { + if (range.empty()) { + KALDI_ERR << "Empty range specifier."; + return false; + } + std::vector 
splits; + SplitStringToVector(range, ",", false, &splits); + if (!((splits.size() == 1 && !splits[0].empty()))) { + KALDI_ERR << "Invalid range specifier for vector: " << range; + return false; + } + std::vector index_range; + bool status = true; + if (splits[0] != ":") + status = SplitStringToIntegers(splits[0], ":", false, &index_range); + + if (index_range.size() == 0) { + index_range.push_back(0); + index_range.push_back(input.Dim() - 1); + } + + // Length tolerance of 3 -- 2 to account for edge effects when + // frame-length is 25ms and frame-shift is 10ms, and 1 for rounding effects + // since segments are usually retained up to 2 decimal places. + int32 length_tolerance = 3; + if (!(status && index_range.size() == 2 && + index_range[0] >= 0 && index_range[0] <= index_range[1] && + index_range[1] < input.Dim() + length_tolerance)) { + KALDI_ERR << "Invalid range specifier: " << range + << " for vector of size " << input.Dim(); + return false; + } + + if (index_range[1] >= input.Dim()) + KALDI_WARN << "Range " << index_range[0] << ":" << index_range[1] + << " goes beyond the vector dimension " << input.Dim(); + int32 size = std::min(index_range[1], input.Dim() - 1) - index_range[0] + 1; + output->Resize(size, kUndefined); + output->CopyFromVec(input.Range(index_range[0], size)); + return true; +} + +// template instantiation +template bool ExtractObjectRange(const Vector &, const std::string &, + Vector *); +template bool ExtractObjectRange(const Vector &, const std::string &, + Vector *); + +bool ExtractRangeSpecifier(const std::string &rxfilename_with_range, + std::string *data_rxfilename, + std::string *range) { + if (rxfilename_with_range.empty() || + rxfilename_with_range[rxfilename_with_range.size()-1] != ']') + KALDI_ERR << "ExtractRangeRspecifier called wrongly."; + std::vector splits; + SplitStringToVector(rxfilename_with_range, "[", false, &splits); + if (splits.size() == 2 && !splits[0].empty() && splits[1].size() > 1) { + *data_rxfilename = 
splits[0]; + range->assign(splits[1], 0, splits[1].size()-1); + return true; + } + return false; +} + +} // end namespace kaldi diff --git a/speechx/speechx/kaldi/util/kaldi-holder.h b/speechx/speechx/kaldi/util/kaldi-holder.h new file mode 100644 index 00000000..f495f27f --- /dev/null +++ b/speechx/speechx/kaldi/util/kaldi-holder.h @@ -0,0 +1,282 @@ +// util/kaldi-holder.h + +// Copyright 2009-2011 Microsoft Corporation +// 2016 Johns Hopkins University (author: Daniel Povey) +// 2016 Xiaohui Zhang + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_UTIL_KALDI_HOLDER_H_ +#define KALDI_UTIL_KALDI_HOLDER_H_ + +#include +#include "util/kaldi-io.h" +#include "util/text-utils.h" +#include "matrix/kaldi-vector.h" +#include "matrix/sparse-matrix.h" + +namespace kaldi { + + +// The Table class uses a Holder class to wrap objects, and make them behave +// in a "normalized" way w.r.t. reading and writing, so the Table class can +// be template-ized without too much trouble. Look below this +// comment (search for GenericHolder) to see what it looks like. +// +// Requirements of the holder class: +// +// They can only contain objects that can be read/written without external +// information; other objects cannot be stored in this type of archive. +// +// In terms of what functions it should have, see GenericHolder below. 
+// It is just for documentation. +// +// (1) Requirements of the Read and Write functions +// +// The Read and Write functions should have the property that in a longer +// file, if the Read function is started from where the Write function started +// writing, it should go to where the Write function stopped writing, in either +// text or binary mode (but it's OK if it doesn't eat up trailing space). +// +// [Desirable property: when writing in text mode the output should contain +// exactly one newline, at the end of the output; this makes it easier to +// manipulate] +// +// [Desirable property for classes: the output should just be a binary-mode +// header (if in binary mode and it's a Kaldi object, or no header +// othewise), and then the output of Object.Write(). This means that when +// written to individual files with the scp: type of wspecifier, we can +// read the individual files in the "normal" Kaldi way by reading the +// binary header and then the object.] +// +// +// The Write function takes a 'binary' argument. In general, each object will +// have two formats: text and binary. However, it's permitted to throw() if +// asked to read in the text format if there is none. The file will be open, if +// the file system has binary/text modes, in the corresponding mode. However, +// the object should have a file-mode in which it can read either text or binary +// output. It announces this via the static IsReadInBinary() function. This +// will generally be the binary mode and it means that where necessary, in text +// formats, we must ignore \r characters. +// +// Memory requirements: if it allocates memory, the destructor should +// free that memory. Copying and assignment of Holder objects may be +// disallowed as the Table code never does this. + + +/// GenericHolder serves to document the requirements of the Holder interface; +/// it's not intended to be used. 
+template class GenericHolder { + public: + typedef SomeType T; + + /// Must have a constructor that takes no arguments. + GenericHolder() { } + + /// Write() writes this object of type T. Possibly also writes a binary-mode + /// header so that the Read function knows which mode to read in (since the + /// Read function does not get this information). It's a static member so we + /// can write those not inside this class (can use this function with Value() + /// to write from this class). The Write method may throw if it cannot write + /// the object in the given (binary/non-binary) mode. The holder object can + /// assume the stream has been opened in the given mode (where relevant). The + /// object can write the data how it likes. + static bool Write(std::ostream &os, bool binary, const T &t); + + /// Reads into the holder. Must work out from the stream (which will be + /// opened on Windows in binary mode if the IsReadInBinary() function of this + /// class returns true, and text mode otherwise) whether the actual data is + /// binary or not (usually via reading the Kaldi binary-mode header). + /// We put the responsibility for reading the Kaldi binary-mode header in the + /// Read function (rather than making the binary mode an argument to this + /// function), so that for non-Kaldi binary files we don't have to write the + /// header, which would prevent the file being read by non-Kaldi programs + /// (e.g. if we write to individual files using an scp). + /// Read must deallocate any existing data we have here, if applicable (must + /// not assume the object was newly constructed). + /// Returns true on success. + /// If Read() returns false, the contents of this object and hence the value + /// returned by Value() may be undefined. + bool Read(std::istream &is); + + /// IsReadInBinary() will return true if the object wants the file to be + /// opened in binary for reading (if the file system has binary/text modes), + /// and false otherwise. Static function. 
Kaldi objects always return true + /// as they always read in binary mode. Note that we must be able to read, in + /// this mode, objects written in both text and binary mode by Write (which + /// may mean ignoring "\r" characters). I doubt we will ever want this + /// function to return false. + static bool IsReadInBinary() { return true; } + + /// Returns the value of the object held here. Will only + /// ever be called if Read() has been previously called and it returned + /// true (so OK to throw exception if no object was read). + T &Value() { return t_; } // if t is a pointer, would return *t_; + + /// The Clear() function doesn't have to do anything. Its purpose is to + /// allow the object to free resources if they're no longer needed. + void Clear() { } + + /// This swaps the objects held by *this and *other (preferably a shallow + /// swap). Note, this is just an example. The swap is with the *same type* + /// of holder, not with some nonexistent base-class (remember, GenericHolder is + /// an example for documentation, not a base-class). + void Swap(GenericHolder *other) { std::swap(t_, other->t_); } + + /// At the time of writing this will only do something meaningful + /// KaldiObjectHolder holding matrix objects, in order to extract a holder + /// holding a sub-matrix specified by 'range', e.g. [0:3,2:10], like in Matlab + /// but with zero-based indexing. It returns true with successful extraction + /// of the range, false if the range was invalid or outside the bounds of the + /// matrix. For other types of holder it just throws an error. + bool ExtractRange(const GenericHolder &other, const std::string &range) { + KALDI_ERR << "ExtractRange is not defined for this type of holder."; + return false; + } + + /// If the object held pointers, the destructor would free them. + ~GenericHolder() { } + + private: + KALDI_DISALLOW_COPY_AND_ASSIGN(GenericHolder); + T t_; // t_ may alternatively be of type T*. 
+}; + + +// See kaldi-holder-inl.h for examples of some actual Holder +// classes and templates. + + +// The following two typedefs should probably be in their own file, but they're +// here until there are enough of them to warrant their own header. + + +/// \addtogroup holders +/// @{ + +/// KaldiObjectHolder works for Kaldi objects that have the "standard" Read +/// and Write functions, and a copy constructor. +template class KaldiObjectHolder; + +/// BasicHolder is valid for float, double, bool, and integer +/// types. There will be a compile time error otherwise, because +/// we make sure that the {Write, Read}BasicType functions do not +/// get instantiated for other types. +template class BasicHolder; + + +// A Holder for a vector of basic types, e.g. +// std::vector, std::vector, and so on. +// Note: a basic type is defined as a type for which ReadBasicType +// and WriteBasicType are implemented, i.e. integer and floating +// types, and bool. +template class BasicVectorHolder; + + +// A holder for vectors of vectors of basic types, e.g. +// std::vector >, and so on. +// Note: a basic type is defined as a type for which ReadBasicType +// and WriteBasicType are implemented, i.e. integer and floating +// types, and bool. +template class BasicVectorVectorHolder; + +// A holder for vectors of pairs of basic types, e.g. +// std::vector >, and so on. +// Note: a basic type is defined as a type for which ReadBasicType +// and WriteBasicType are implemented, i.e. integer and floating +// types, and bool. Text format is (e.g. for integers), +// "1 12 ; 43 61 ; 17 8 \n" +template class BasicPairVectorHolder; + +/// We define a Token (not a typedef, just a word) as a nonempty, printable, +/// whitespace-free std::string. The binary and text formats here are the same +/// (newline-terminated) and as such we don't bother with the binary-mode +/// headers. +class TokenHolder; + +/// Class TokenVectorHolder is a Holder class for vectors of Tokens +/// (T == std::string). 
+class TokenVectorHolder; + +/// A class for reading/writing HTK-format matrices. +/// T == std::pair, HtkHeader> +class HtkMatrixHolder; + +/// A class for reading/writing Sphinx format matrices. +template class SphinxMatrixHolder; + +/// This templated function exists so that we can write .scp files with +/// 'object ranges' specified: the canonical example is a [first:last] range +/// of rows of a matrix, or [first-row:last-row,first-column,last-column] +/// of a matrix. We can also support [begin-time:end-time] of a wave +/// file. The string 'range' is whatever is in the square brackets; it is +/// parsed inside this function. +/// This function returns true if the partial object was successfully extracted, +/// and false if there was an error such as an invalid range. +/// The generic version of this function just fails; we overload the template +/// whenever we need it for a specific class. +template +bool ExtractObjectRange(const T &input, const std::string &range, T *output) { + KALDI_ERR << "Ranges not supported for objects of this type."; + return false; +} + +/// The template is specialized with a version that actually does something, +/// for types Matrix and Matrix. We can later add versions of +/// this template for other types, such as Vector, which can meaningfully +/// have ranges extracted. +template +bool ExtractObjectRange(const Matrix &input, const std::string &range, + Matrix *output); + +/// The template is specialized types Vector and Vector. 
+template +bool ExtractObjectRange(const Vector &input, const std::string &range, + Vector *output); + +/// GeneralMatrix is always of type BaseFloat +bool ExtractObjectRange(const GeneralMatrix &input, const std::string &range, + GeneralMatrix *output); + +/// CompressedMatrix is always of the type BaseFloat but it is more +/// efficient to provide template as it uses CompressedMatrix's own +/// conversion to Matrix +template +bool ExtractObjectRange(const CompressedMatrix &input, const std::string &range, + Matrix *output); + +// In SequentialTableReaderScriptImpl and RandomAccessTableReaderScriptImpl, for +// cases where the scp contained 'range specifiers' (things in square brackets +// identifying parts of objects like matrices), use this function to separate +// the input string 'rxfilename_with_range' (e.g "1.ark:100[1:2,2:10]") into the data_rxfilename +// (e.g. "1.ark:100") and the optional range specifier which will be everything +// inside the square brackets. It returns true if everything seems OK, and +// false if for example the string contained more than one '['. This function +// should only be called if 'line' ends in ']', otherwise it is an error. +bool ExtractRangeSpecifier(const std::string &rxfilename_with_range, + std::string *data_rxfilename, + std::string *range); + + +/// @} end "addtogroup holders" + + +} // end namespace kaldi + +#include "util/kaldi-holder-inl.h" + +#endif // KALDI_UTIL_KALDI_HOLDER_H_ diff --git a/speechx/speechx/kaldi/util/kaldi-io-inl.h b/speechx/speechx/kaldi/util/kaldi-io-inl.h new file mode 100644 index 00000000..2474f701 --- /dev/null +++ b/speechx/speechx/kaldi/util/kaldi-io-inl.h @@ -0,0 +1,46 @@ +// util/kaldi-io-inl.h + +// Copyright 2009-2011 Microsoft Corporation + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. +#ifndef KALDI_UTIL_KALDI_IO_INL_H_ +#define KALDI_UTIL_KALDI_IO_INL_H_ + +#include + +namespace kaldi { + +bool Input::Open(const std::string &rxfilename, bool *binary) { + return OpenInternal(rxfilename, true, binary); +} + +bool Input::OpenTextMode(const std::string &rxfilename) { + return OpenInternal(rxfilename, false, NULL); +} + +bool Input::IsOpen() { + return impl_ != NULL; +} + +bool Output::IsOpen() { + return impl_ != NULL; +} + + +} // end namespace kaldi. + + +#endif // KALDI_UTIL_KALDI_IO_INL_H_ diff --git a/speechx/speechx/kaldi/util/kaldi-io.cc b/speechx/speechx/kaldi/util/kaldi-io.cc new file mode 100644 index 00000000..96cd8fa1 --- /dev/null +++ b/speechx/speechx/kaldi/util/kaldi-io.cc @@ -0,0 +1,884 @@ +// util/kaldi-io.cc + +// Copyright 2009-2011 Microsoft Corporation; Jan Silovsky +// 2016 Xiaohui Zhang + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. 
+// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. +#include "util/kaldi-io.h" +#include +#include +#include "base/kaldi-math.h" +#include "util/text-utils.h" +#include "util/parse-options.h" +#include "util/kaldi-holder.h" +#include "util/kaldi-pipebuf.h" +#include "util/kaldi-table.h" // for Classify{W,R}specifier +#include +#include + +#ifdef KALDI_CYGWIN_COMPAT +#include "util/kaldi-cygwin-io-inl.h" +#define MapOsPath(x) MapCygwinPath(x) +#else // KALDI_CYGWIN_COMPAT +#define MapOsPath(x) x +#endif // KALDI_CYGWIN_COMPAT + + +#if defined(_MSC_VER) +static FILE *popen(const char* command, const char* mode) { +#ifdef KALDI_CYGWIN_COMPAT + return kaldi::CygwinCompatPopen(command, mode); +#else // KALDI_CYGWIN_COMPAT + return _popen(command, mode); +#endif // KALDI_CYGWIN_COMPAT +} +#endif // _MSC_VER + +namespace kaldi { + +#ifndef _MSC_VER // on VS, we don't need this type. +// could replace basic_pipebuf with stdio_filebuf on some platforms. +// Would mean we could use less of our own code. +typedef basic_pipebuf PipebufType; +#endif +} + +namespace kaldi { + +std::string PrintableRxfilename(const std::string &rxfilename) { + if (rxfilename == "" || rxfilename == "-") { + return "standard input"; + } else { + // If this call to Escape later causes compilation issues, + // just replace it with "return rxfilename"; it's only a + // pretty-printing issue. + return ParseOptions::Escape(rxfilename); + } +} + + +std::string PrintableWxfilename(const std::string &wxfilename) { + if (wxfilename == "" || wxfilename == "-") { + return "standard output"; + } else { + // If this call to Escape later causes compilation issues, + // just replace it with "return wxfilename"; it's only a + // pretty-printing issue. 
+ return ParseOptions::Escape(wxfilename); + } +} + + +OutputType ClassifyWxfilename(const std::string &filename) { + const char *c = filename.c_str(); + size_t length = filename.length(); + char first_char = c[0], + last_char = (length == 0 ? '\0' : c[filename.length()-1]); + + // if 'filename' is "" or "-", return kStandardOutput. + if (length == 0 || (length == 1 && first_char == '-')) + return kStandardOutput; + else if (first_char == '|') return kPipeOutput; // An output pipe like "|blah". + else if (isspace(first_char) || isspace(last_char) || last_char == '|') { + return kNoOutput; // Leading or trailing space: can't interpret this. + // Final '|' would represent an input pipe, not an + // output pipe. + } else if ((first_char == 'a' || first_char == 's') && + strchr(c, ':') != NULL && + (ClassifyWspecifier(filename, NULL, NULL, NULL) != kNoWspecifier || + ClassifyRspecifier(filename, NULL, NULL) != kNoRspecifier)) { + // e.g. ark:something or scp:something... this is almost certainly a + // scripting error, so call it an error rather than treating it as a file. + // In practice in modern kaldi scripts all (r,w)filenames begin with "ark" + // or "scp", even though technically speaking options like "b", "t", "s" or + // "cs" can appear before the ark or scp, like "b,ark". For efficiency, + // and because this code is really just a nicety to catch errors earlier + // than they would otherwise be caught, we only call those extra functions + // for filenames beginning with 'a' or 's'. + return kNoOutput; + } else if (isdigit(last_char)) { + // This could be a file, but we have to see if it's an offset into a file + // (like foo.ark:4314328), which is not allowed for writing (but is + // allowed for reading). This eliminates some things which would be + // valid UNIX filenames but are not allowed by Kaldi. (Even if we allowed + // such filenames for writing, we wouldn't be able to correctly read them).
+ const char *d = c + length - 1; + while (isdigit(*d) && d > c) d--; + if (*d == ':') return kNoOutput; + // else it could still be a filename; continue to the next check. + } + + // At this point it matched no other pattern so we assume a filename, but we + // check for internal '|' as it's a common source of errors to have pipe + // commands without the pipe in the right place. Say that it can't be + // classified. + if (strchr(c, '|') != NULL) { + KALDI_WARN << "Trying to classify wxfilename with pipe symbol in the" + " wrong place (pipe without | at the beginning?): " << + filename; + return kNoOutput; + } + return kFileOutput; // It matched no other pattern: assume it's a filename. +} + + +InputType ClassifyRxfilename(const std::string &filename) { + const char *c = filename.c_str(); + size_t length = filename.length(); + char first_char = c[0], + last_char = (length == 0 ? '\0' : c[filename.length()-1]); + + // if 'filename' is "" or "-", return kStandardInput. + if (length == 0 || (length == 1 && first_char == '-')) { + return kStandardInput; + } else if (first_char == '|') { + return kNoInput; // An output pipe like "|blah": not + // valid for input. + } else if (last_char == '|') { + return kPipeInput; + } else if (isspace(first_char) || isspace(last_char)) { + return kNoInput; // We don't allow leading or trailing space in a filename. + } else if ((first_char == 'a' || first_char == 's') && + strchr(c, ':') != NULL && + (ClassifyWspecifier(filename, NULL, NULL, NULL) != kNoWspecifier || + ClassifyRspecifier(filename, NULL, NULL) != kNoRspecifier)) { + // e.g. ark:something or scp:something... this is almost certainly a + // scripting error, so call it an error rather than treating it as a file. + // In practice in modern kaldi scripts all (r,w)filenames begin with "ark" + // or "scp", even though technically speaking options like "b", "t", "s" or + // "cs" can appear before the ark or scp, like "b,ark". 
For efficiency, + // and because this code is really just a nicety to catch errors earlier + // than they would otherwise be caught, we only call those extra functions + // for filenames beginning with 'a' or 's'. + return kNoInput; + } else if (isdigit(last_char)) { + const char *d = c + length - 1; + while (isdigit(*d) && d > c) d--; + if (*d == ':') return kOffsetFileInput; // Filename is like + // some_file:12345 + // otherwise it could still be a filename; continue to the next check. + } + + + // At this point it matched no other pattern so we assume a filename, but + // we check for '|' as it's a common source of errors to have pipe + // commands without the pipe in the right place. Say that it can't be + // classified in this case. + if (strchr(c, '|') != NULL) { + KALDI_WARN << "Trying to classify rxfilename with pipe symbol in the" + " wrong place (pipe without | at the end?): " << filename; + return kNoInput; + } + return kFileInput; // It matched no other pattern: assume it's a filename. +} + +class OutputImplBase { + public: + // Open will open it as a file (no header), and return true + // on success. It cannot be called on an already open stream. + virtual bool Open(const std::string &filename, bool binary) = 0; + virtual std::ostream &Stream() = 0; + virtual bool Close() = 0; + virtual ~OutputImplBase() { } +}; + + +class FileOutputImpl: public OutputImplBase { + public: + virtual bool Open(const std::string &filename, bool binary) { + if (os_.is_open()) KALDI_ERR << "FileOutputImpl::Open(), " + << "open called on already open file."; + filename_ = filename; + os_.open(MapOsPath(filename_).c_str(), + binary ? std::ios_base::out | std::ios_base::binary + : std::ios_base::out); + return os_.is_open(); + } + + virtual std::ostream &Stream() { + if (!os_.is_open()) + KALDI_ERR << "FileOutputImpl::Stream(), file is not open."; + // I believe this error can only arise from coding error. 
+ return os_; + } + + virtual bool Close() { + if (!os_.is_open()) + KALDI_ERR << "FileOutputImpl::Close(), file is not open."; + // I believe this error can only arise from coding error. + os_.close(); + return !(os_.fail()); + } + virtual ~FileOutputImpl() { + if (os_.is_open()) { + os_.close(); + if (os_.fail()) + KALDI_ERR << "Error closing output file " << filename_; + } + } + private: + std::string filename_; + std::ofstream os_; +}; + +class StandardOutputImpl: public OutputImplBase { + public: + StandardOutputImpl(): is_open_(false) { } + + virtual bool Open(const std::string &filename, bool binary) { + if (is_open_) KALDI_ERR << "StandardOutputImpl::Open(), " + "open called on already open file."; +#ifdef _MSC_VER + _setmode(_fileno(stdout), binary ? _O_BINARY : _O_TEXT); +#endif + is_open_ = std::cout.good(); + return is_open_; + } + + virtual std::ostream &Stream() { + if (!is_open_) + KALDI_ERR << "StandardOutputImpl::Stream(), object not initialized."; + // I believe this error can only arise from coding error. + return std::cout; + } + + virtual bool Close() { + if (!is_open_) + KALDI_ERR << "StandardOutputImpl::Close(), file is not open."; + is_open_ = false; + std::cout << std::flush; + return !(std::cout.fail()); + } + virtual ~StandardOutputImpl() { + if (is_open_) { + std::cout << std::flush; + if (std::cout.fail()) + KALDI_ERR << "Error writing to standard output"; + } + } + private: + bool is_open_; +}; + +class PipeOutputImpl: public OutputImplBase { + public: + PipeOutputImpl(): f_(NULL), os_(NULL) { } + + virtual bool Open(const std::string &wxfilename, bool binary) { + filename_ = wxfilename; + KALDI_ASSERT(f_ == NULL); // Make sure closed. + KALDI_ASSERT(wxfilename.length() != 0 && wxfilename[0] == '|'); // should + // start with '|' + std::string cmd_name(wxfilename, 1); +#if defined(_MSC_VER) || defined(__CYGWIN__) + f_ = popen(cmd_name.c_str(), (binary ? 
"wb" : "w")); +#else + f_ = popen(cmd_name.c_str(), "w"); +#endif + if (!f_) { // Failure. + KALDI_WARN << "Failed opening pipe for writing, command is: " + << cmd_name << ", errno is " << strerror(errno); + return false; + } else { +#ifndef _MSC_VER + fb_ = new PipebufType(f_, // Using this constructor won't make the + // destructor try to close the stream when + // we're done. + (binary ? std::ios_base::out| + std::ios_base::binary + :std::ios_base::out)); + KALDI_ASSERT(fb_ != NULL); // or would be alloc error. + os_ = new std::ostream(fb_); +#else + os_ = new std::ofstream(f_); +#endif + return os_->good(); + } + } + + virtual std::ostream &Stream() { + if (os_ == NULL) KALDI_ERR << "PipeOutputImpl::Stream()," + " object not initialized."; + // I believe this error can only arise from coding error. + return *os_; + } + + virtual bool Close() { + if (os_ == NULL) KALDI_ERR << "PipeOutputImpl::Close(), file is not open."; + bool ok = true; + os_->flush(); + if (os_->fail()) ok = false; + delete os_; + os_ = NULL; + int status; +#ifdef _MSC_VER + status = _pclose(f_); +#else + status = pclose(f_); +#endif + if (status) + KALDI_WARN << "Pipe " << filename_ << " had nonzero return status " + << status; + f_ = NULL; +#ifndef _MSC_VER + delete fb_; + fb_ = NULL; +#endif + return ok; + } + virtual ~PipeOutputImpl() { + if (os_) { + if (!Close()) + KALDI_ERR << "Error writing to pipe " << PrintableWxfilename(filename_); + } + } + private: + std::string filename_; + FILE *f_; +#ifndef _MSC_VER + PipebufType *fb_; +#endif + std::ostream *os_; +}; + + + +class InputImplBase { + public: + // Open will open it as a file, and return true on success. + // May be called twice only for kOffsetFileInput (otherwise, + // if called twice, we just create a new Input object, to avoid + // having to deal with the extra hassle of reopening with the + // same object. 
+ // Note that we will call Open with true (binary) + // for text-mode Kaldi files; the only actual text-mode input + // is for non-Kaldi files. + virtual bool Open(const std::string &filename, bool binary) = 0; + virtual std::istream &Stream() = 0; + virtual int32 Close() = 0; // For input streams we only need to check + // failure on close in the case of + // kPipeInput. + virtual InputType MyType() = 0; // Because if it's kOffsetFileInput, we may + // call Open twice + // (has efficiency benefits). + + virtual ~InputImplBase() { } +}; + +class FileInputImpl: public InputImplBase { + public: + virtual bool Open(const std::string &filename, bool binary) { + if (is_.is_open()) KALDI_ERR << "FileInputImpl::Open(), " + << "open called on already open file."; + is_.open(MapOsPath(filename).c_str(), + binary ? std::ios_base::in | std::ios_base::binary + : std::ios_base::in); + return is_.is_open(); + } + + virtual std::istream &Stream() { + if (!is_.is_open()) + KALDI_ERR << "FileInputImpl::Stream(), file is not open."; + // I believe this error can only arise from coding error. + return is_; + } + + virtual int32 Close() { + if (!is_.is_open()) + KALDI_ERR << "FileInputImpl::Close(), file is not open."; + // I believe this error can only arise from coding error. + is_.close(); + // Don't check status. + return 0; + } + + virtual InputType MyType() { return kFileInput; } + + virtual ~FileInputImpl() { + // Stream will automatically be closed, and we don't care about + // whether it fails. + } + private: + std::ifstream is_; +}; + + +class StandardInputImpl: public InputImplBase { + public: + StandardInputImpl(): is_open_(false) { } + + virtual bool Open(const std::string &filename, bool binary) { + if (is_open_) KALDI_ERR << "StandardInputImpl::Open(), " + "open called on already open file."; + is_open_ = true; +#ifdef _MSC_VER + _setmode(_fileno(stdin), binary ?
_O_BINARY : _O_TEXT); +#endif + return true; // Don't check good() because would be false if + // eof, which may be valid input. + } + + virtual std::istream &Stream() { + if (!is_open_) + KALDI_ERR << "StandardInputImpl::Stream(), object not initialized."; + // I believe this error can only arise from coding error. + return std::cin; + } + + virtual InputType MyType() { return kStandardInput; } + + virtual int32 Close() { + if (!is_open_) KALDI_ERR << "StandardInputImpl::Close(), file is not open."; + is_open_ = false; + return 0; + } + virtual ~StandardInputImpl() { } + private: + bool is_open_; +}; + +class PipeInputImpl: public InputImplBase { + public: + PipeInputImpl(): f_(NULL), is_(NULL) { } + + virtual bool Open(const std::string &rxfilename, bool binary) { + filename_ = rxfilename; + KALDI_ASSERT(f_ == NULL); // Make sure closed. + KALDI_ASSERT(rxfilename.length() != 0 && + rxfilename[rxfilename.length()-1] == '|'); // should end with '|' + std::string cmd_name(rxfilename, 0, rxfilename.length()-1); +#if defined(_MSC_VER) || defined(__CYGWIN__) + f_ = popen(cmd_name.c_str(), (binary ? "rb" : "r")); +#else + f_ = popen(cmd_name.c_str(), "r"); +#endif + + if (!f_) { // Failure. + KALDI_WARN << "Failed opening pipe for reading, command is: " + << cmd_name << ", errno is " << strerror(errno); + return false; + } else { +#ifndef _MSC_VER + fb_ = new PipebufType(f_, // Using this constructor won't lead the + // destructor to close the stream. + (binary ? std::ios_base::in| + std::ios_base::binary + :std::ios_base::in)); + KALDI_ASSERT(fb_ != NULL); // or would be alloc error. + is_ = new std::istream(fb_); +#else + is_ = new std::ifstream(f_); +#endif + if (is_->fail() || is_->bad()) return false; + if (is_->eof()) { + KALDI_WARN << "Pipe opened with command " + << PrintableRxfilename(rxfilename) + << " is empty."; + // don't return false: empty may be valid. 
+ } + return true; + } + } + + virtual std::istream &Stream() { + if (is_ == NULL) + KALDI_ERR << "PipeInputImpl::Stream(), object not initialized."; + // I believe this error can only arise from coding error. + return *is_; + } + + virtual int32 Close() { + if (is_ == NULL) + KALDI_ERR << "PipeInputImpl::Close(), file is not open."; + delete is_; + is_ = NULL; + int32 status; +#ifdef _MSC_VER + status = _pclose(f_); +#else + status = pclose(f_); +#endif + if (status) + KALDI_WARN << "Pipe " << filename_ << " had nonzero return status " + << status; + f_ = NULL; +#ifndef _MSC_VER + delete fb_; + fb_ = NULL; +#endif + return status; + } + virtual ~PipeInputImpl() { + if (is_) + Close(); + } + virtual InputType MyType() { return kPipeInput; } + private: + std::string filename_; + FILE *f_; +#ifndef _MSC_VER + PipebufType *fb_; +#endif + std::istream *is_; +}; + +/* +#else + +// Just have an empty implementation of the pipe input that crashes if +// called. +class PipeInputImpl: public InputImplBase { + public: + PipeInputImpl() { KALDI_ASSERT(0 && "Pipe input not yet supported on this + platform."); } + virtual bool Open(const std::string, bool) { return 0; } + virtual std::istream &Stream() const { return NULL; } + virtual void Close() {} + virtual InputType MyType() { return kPipeInput; } +}; + +#endif +*/ + +class OffsetFileInputImpl: public InputImplBase { + // This class is a bit more complicated than the + + public: + // splits a filename like /my/file:123 into /my/file and the + // number 123. Crashes if not this format. + static void SplitFilename(const std::string &rxfilename, + std::string *filename, + size_t *offset) { + size_t pos = rxfilename.find_last_of(':'); + KALDI_ASSERT(pos != std::string::npos); // would indicate error in calling + // code, as the filename is supposed to be of the correct form at this + // point. 
+ *filename = std::string(rxfilename, 0, pos); + std::string number(rxfilename, pos+1); + bool ans = ConvertStringToInteger(number, offset); + if (!ans) + KALDI_ERR << "Cannot get offset from filename " << rxfilename + << " (possibly you compiled in 32-bit and have a >32-bit" + << " byte offset into a file; you'll have to compile 64-bit."; + } + + bool Seek(size_t offset) { + size_t cur_pos = is_.tellg(); + if (cur_pos == offset) return true; + else if (cur_pos offset) { + // We're close enough that it may be faster to just + // read that data, rather than seek. + for (size_t i = cur_pos; i < offset; i++) + is_.get(); + return (is_.tellg() == std::streampos(offset)); + } + // Try to actually seek. + is_.seekg(offset, std::ios_base::beg); + if (is_.fail()) { // failbit or badbit is set [error happened] + is_.close(); + return false; // failure. + } else { + is_.clear(); // Clear any failure bits (e.g. eof). + return true; // success. + } + } + + // This Open routine is unusual in that it is designed to work even + // if it was already open. This for efficiency when seeking multiple + // times. + virtual bool Open(const std::string &rxfilename, bool binary) { + if (is_.is_open()) { + // We are opening when we have an already-open file. + // We may have to seek within this file, or else close it and + // open a different one. + std::string tmp_filename; + size_t offset; + SplitFilename(rxfilename, &tmp_filename, &offset); + if (tmp_filename == filename_ && binary == binary_) { // Just seek + is_.clear(); // clear fail bit, etc. + return Seek(offset); + } else { + is_.close(); // don't bother checking error status of is_. + filename_ = tmp_filename; + is_.open(MapOsPath(filename_).c_str(), + binary ? 
std::ios_base::in | std::ios_base::binary + : std::ios_base::in); + if (!is_.is_open()) return false; + else + return Seek(offset); + } + } else { + size_t offset; + SplitFilename(rxfilename, &filename_, &offset); + binary_ = binary; + is_.open(MapOsPath(filename_).c_str(), + binary ? std::ios_base::in | std::ios_base::binary + : std::ios_base::in); + if (!is_.is_open()) return false; + else + return Seek(offset); + } + } + + virtual std::istream &Stream() { + if (!is_.is_open()) + KALDI_ERR << "FileInputImpl::Stream(), file is not open."; + // I believe this error can only arise from coding error. + return is_; + } + + virtual int32 Close() { + if (!is_.is_open()) + KALDI_ERR << "FileInputImpl::Close(), file is not open."; + // I believe this error can only arise from coding error. + is_.close(); + // Don't check status. + return 0; + } + + virtual InputType MyType() { return kOffsetFileInput; } + + virtual ~OffsetFileInputImpl() { + // Stream will automatically be closed, and we don't care about + // whether it fails. + } + private: + std::string filename_; // the actual filename + bool binary_; // true if was opened in binary mode. + std::ifstream is_; +}; + + +Output::Output(const std::string &wxfilename, bool binary, + bool write_header):impl_(NULL) { + if (!Open(wxfilename, binary, write_header)) { + if (impl_) { + delete impl_; + impl_ = NULL; + } + KALDI_ERR << "Error opening output stream " << + PrintableWxfilename(wxfilename); + } +} + +bool Output::Close() { + if (!impl_) { + return false; // error to call Close if not open. + } else { + bool ans = impl_->Close(); + delete impl_; + impl_ = NULL; + return ans; + } +} + +Output::~Output() { + if (impl_) { + bool ok = impl_->Close(); + delete impl_; + impl_ = NULL; + if (!ok) + KALDI_ERR << "Error closing output file " + << PrintableWxfilename(filename_) + << (ClassifyWxfilename(filename_) == kFileOutput ? 
+ " (disk full?)" : ""); + } +} + +std::ostream &Output::Stream() { // will throw if not open; else returns + // stream. + if (!impl_) KALDI_ERR << "Output::Stream() called but not open."; + return impl_->Stream(); +} + +bool Output::Open(const std::string &wxfn, bool binary, bool header) { + if (IsOpen()) { + if (!Close()) { // Throw here rather than return status, as it's an error + // about something else: if the user wanted to avoid the exception he/she + // could have called Close(). + KALDI_ERR << "Output::Open(), failed to close output stream: " + << PrintableWxfilename(filename_); + } + } + + filename_ = wxfn; + + OutputType type = ClassifyWxfilename(wxfn); + KALDI_ASSERT(impl_ == NULL); + + if (type == kFileOutput) { + impl_ = new FileOutputImpl(); + } else if (type == kStandardOutput) { + impl_ = new StandardOutputImpl(); + } else if (type == kPipeOutput) { + impl_ = new PipeOutputImpl(); + } else { // type == kNoOutput + KALDI_WARN << "Invalid output filename format "<< + PrintableWxfilename(wxfn); + return false; + } + if (!impl_->Open(wxfn, binary)) { + delete impl_; + impl_ = NULL; + return false; // failed to open. + } else { // successfully opened it. + if (header) { + InitKaldiOutputStream(impl_->Stream(), binary); + bool ok = impl_->Stream().good(); // still OK? + if (!ok) { + delete impl_; + impl_ = NULL; + return false; + } + return true; + } else { + return true; + } + } +} + + +Input::Input(const std::string &rxfilename, bool *binary): impl_(NULL) { + if (!Open(rxfilename, binary)) { + KALDI_ERR << "Error opening input stream " + << PrintableRxfilename(rxfilename); + } +} + +int32 Input::Close() { + if (impl_) { + int32 ans = impl_->Close(); + delete impl_; + impl_ = NULL; + return ans; + } else { + return 0; + } +} + +bool Input::OpenInternal(const std::string &rxfilename, + bool file_binary, + bool *contents_binary) { + InputType type = ClassifyRxfilename(rxfilename); + if (IsOpen()) { + // May have to close the stream first. 
+ if (type == kOffsetFileInput && impl_->MyType() == kOffsetFileInput) { + // We want to use the same object to Open... this is in case + // the files are the same, so we can just seek. + if (!impl_->Open(rxfilename, file_binary)) { // true is binary mode-- + // always open in binary. + delete impl_; + impl_ = NULL; + return false; + } + // read the binary header, if requested. + if (contents_binary != NULL) + return InitKaldiInputStream(impl_->Stream(), contents_binary); + else + return true; + } else { + Close(); + // and fall through to code below which actually opens the file. + } + } + if (type == kFileInput) { + impl_ = new FileInputImpl(); + } else if (type == kStandardInput) { + impl_ = new StandardInputImpl(); + } else if (type == kPipeInput) { + impl_ = new PipeInputImpl(); + } else if (type == kOffsetFileInput) { + impl_ = new OffsetFileInputImpl(); + } else { // type == kNoInput + KALDI_WARN << "Invalid input filename format "<< + PrintableRxfilename(rxfilename); + return false; + } + if (!impl_->Open(rxfilename, file_binary)) { // true is binary mode-- + // always read in binary. + delete impl_; + impl_ = NULL; + return false; + } + if (contents_binary != NULL) + return InitKaldiInputStream(impl_->Stream(), contents_binary); + else + return true; +} + + +Input::~Input() { if (impl_) Close(); } + + +std::istream &Input::Stream() { + if (!IsOpen()) KALDI_ERR << "Input::Stream(), not open."; + return impl_->Stream(); +} + + +template <> void ReadKaldiObject(const std::string &filename, + Matrix *m) { + if (!filename.empty() && filename[filename.size() - 1] == ']') { + // This filename seems to have a 'range'... like foo.ark:4312423[20:30]. + // (the bit in square brackets is the range). 
+ std::string rxfilename, range; + if (!ExtractRangeSpecifier(filename, &rxfilename, &range)) { + KALDI_ERR << "Could not make sense of possible range specifier in filename " + << "while reading matrix: " << filename; + } + Matrix temp; + bool binary_in; + Input ki(rxfilename, &binary_in); + temp.Read(ki.Stream(), binary_in); + if (!ExtractObjectRange(temp, range, m)) { + KALDI_ERR << "Error extracting range of object: " << filename; + } + } else { + // The normal case, there is no range. + bool binary_in; + Input ki(filename, &binary_in); + m->Read(ki.Stream(), binary_in); + } +} + +template <> void ReadKaldiObject(const std::string &filename, + Matrix *m) { + if (!filename.empty() && filename[filename.size() - 1] == ']') { + // This filename seems to have a 'range'... like foo.ark:4312423[20:30]. + // (the bit in square brackets is the range). + std::string rxfilename, range; + if (!ExtractRangeSpecifier(filename, &rxfilename, &range)) { + KALDI_ERR << "Could not make sense of possible range specifier in filename " + << "while reading matrix: " << filename; + } + Matrix temp; + bool binary_in; + Input ki(rxfilename, &binary_in); + temp.Read(ki.Stream(), binary_in); + if (!ExtractObjectRange(temp, range, m)) { + KALDI_ERR << "Error extracting range of object: " << filename; + } + } else { + // The normal case, there is no range. 
+ bool binary_in; + Input ki(filename, &binary_in); + m->Read(ki.Stream(), binary_in); + } +} + + + +} // end namespace kaldi diff --git a/speechx/speechx/kaldi/util/kaldi-io.h b/speechx/speechx/kaldi/util/kaldi-io.h new file mode 100644 index 00000000..c28be8a6 --- /dev/null +++ b/speechx/speechx/kaldi/util/kaldi-io.h @@ -0,0 +1,280 @@ +// util/kaldi-io.h + +// Copyright 2009-2011 Microsoft Corporation; Jan Silovsky +// 2016 Xiaohui Zhang + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. +#ifndef KALDI_UTIL_KALDI_IO_H_ +#define KALDI_UTIL_KALDI_IO_H_ + +#ifdef _MSC_VER +# include +# include +#endif +#include // For isspace. +#include +#include +#include "base/kaldi-common.h" +#include "matrix/kaldi-matrix.h" + + +namespace kaldi { + +class OutputImplBase; // Forward decl; defined in a .cc file +class InputImplBase; // Forward decl; defined in a .cc file + +/// \addtogroup io_group +/// @{ + +// The Output and Input classes handle stream-opening for "extended" filenames +// that include actual files, standard-input/standard-output, pipes, and +// offsets into actual files. They also handle reading and writing the +// binary-mode headers for Kaldi files, where applicable. 
The classes have +// versions of the Open routines that throw and do not throw, depending whether +// the calling code wants to catch the errors or not; there are also versions +// that write (or do not write) the Kaldi binary-mode header that says if it's +// binary mode. Generally files that contain Kaldi objects will have the header +// on, so we know upon reading them whether they have the header. So you would +// use the OpenWithHeader routines for these (or the constructor); but other +// types of objects (e.g. FSTs) would have files without a header so you would +// use OpenNoHeader. + +// We now document the types of extended filenames that we use. +// +// A "wxfilename" is an extended filename for writing. It can take three forms: +// (1) Filename: e.g. "/some/filename", "./a/b/c", "c:\Users\dpovey\My +// Documents\\boo" +// (whatever the actual file-system interprets) +// (2) Standard output: "" or "-" +// (3) A pipe: e.g. "| gzip -c > /tmp/abc.gz" +// +// +// A "rxfilename" is an extended filename for reading. It can take four forms: +// (1) An actual filename, whatever the file-system can read, e.g. "/my/file". +// (2) Standard input: "" or "-" +// (3) A pipe: e.g. "gunzip -c /tmp/abc.gz |" +// (4) An offset into a file, e.g.: "/mnt/blah/data/1.ark:24871" +// [these are created by the Table and TableWriter classes; I may also write +// a program that creates them for arbitrary files] +// + + +// Typical usage: +// ... +// bool binary; +// MyObject.Write(Output(some_filename, binary).Stream(), binary); +// +// ... 
more extensive example: +// { +// Output ko(some_filename, binary); +// MyObject1.Write(ko.Stream(), binary); +// MyObject2.Write(ko.Stream(), binary); +// } + + + +enum OutputType { + kNoOutput, + kFileOutput, + kStandardOutput, + kPipeOutput +}; + +/// ClassifyWxfilename interprets filenames as follows: +/// - kNoOutput: invalid filenames (leading or trailing space, things that look +/// like wspecifiers and rspecifiers, or like pipes to read from, with trailing +/// |). +/// - kFileOutput: Normal filenames +/// - kStandardOutput: The empty string or "-", interpreted as standard output +/// - kPipeOutput: pipes, e.g. "| gzip -c > /tmp/abc.gz" +OutputType ClassifyWxfilename(const std::string &wxfilename); + +enum InputType { + kNoInput, + kFileInput, + kStandardInput, + kOffsetFileInput, + kPipeInput +}; + +/// ClassifyRxfilename interprets filenames for reading as follows: +/// - kNoInput: invalid filenames (leading or trailing space, things that +/// look like wspecifiers and rspecifiers, or pipes to write to, +/// with leading |). +/// - kFileInput: normal filenames +/// - kStandardInput: the empty string or "-" +/// - kPipeInput: e.g. "gunzip -c /tmp/abc.gz |" +/// - kOffsetFileInput: offsets into files, e.g. /some/filename:12970 +InputType ClassifyRxfilename(const std::string &rxfilename); + + +class Output { + public: + // The normal constructor, provided for convenience. + // Equivalent to calling with default constructor then Open() + // with these arguments. + Output(const std::string &filename, bool binary, bool write_header = true); + + Output(): impl_(NULL) {} + + /// This opens the stream, with the given mode (binary or text). It returns + /// true on success and false on failure. However, it will throw if something + /// was already open and could not be closed (to avoid this, call Close() + /// first). If write_header == true and binary == true, it writes the Kaldi + /// binary-mode header ('\0' then 'B').
You may call Open even if it is + /// already open; it will close the existing stream and reopen (however if + /// closing the old stream failed it will throw). + bool Open(const std::string &wxfilename, bool binary, bool write_header); + + inline bool IsOpen(); // return true if we have an open stream. Does not + // imply stream is good for writing. + + std::ostream &Stream(); // will throw if not open; else returns stream. + + // Close closes the stream. Calling Close is never necessary unless you + // want to avoid exceptions being thrown. There are times when calling + // Close will hurt efficiency (basically, when using offsets into files, + // and using the same Input object), + // but most of the time the user won't be doing this directly, it will + // be done in kaldi-table.{h, cc}, so you don't have to worry about it. + bool Close(); + + // This will throw if stream could not be closed (to check error status, + // call Close()). + ~Output(); + + private: + OutputImplBase *impl_; // non-NULL if open. + std::string filename_; + KALDI_DISALLOW_COPY_AND_ASSIGN(Output); +}; + + +// bool binary_in; +// Input ki(some_filename, &binary_in); +// MyObject.Read(ki.Stream(), binary_in); +// +// ... more extensive example: +// +// { +// bool binary_in; +// Input ki(some_filename, &binary_in); +// MyObject1.Read(ki.Stream(), binary_in); +// MyObject2.Read(ki.Stream(), binary_in); +// } +// Note that to catch errors you need to use try.. catch. +// Input communicates errors by throwing exceptions. + + +// Input interprets four kinds of filenames: +// (1) Normal filenames +// (2) The empty string or "-", interpreted as standard input +// (3) A pipe: e.g. "gunzip -c /tmp/abc.gz |" +// (4) Offsets into [real] files, e.g. "/my/filename:12049" +// The last one has no correspondence in Output. + + +class Input { + public: + /// The normal constructor. Opens the stream in binary mode.
+ /// Equivalent to calling the default constructor followed by Open(); then, if + /// contents_binary != NULL, it reads the binary-mode header into + /// *contents_binary; it throws on error. + Input(const std::string &rxfilename, bool *contents_binary = NULL); + + Input(): impl_(NULL) {} + + // Open opens the stream for reading (the mode, where relevant, is binary; use + // OpenTextMode for text-mode, we made this a separate function rather than a + // boolean argument, to avoid confusion with Kaldi's text/binary distinction, + // since reading in the file system's text mode is unusual.) If + // contents_binary != NULL, it reads the binary-mode header and puts it in + // *contents_binary. Returns true on success. If it returns false it will + // not be open. You may call Open even if it is already open; it will close + // the existing stream and reopen (however if closing the old stream failed it + // will throw). + inline bool Open(const std::string &rxfilename, bool *contents_binary = NULL); + + // As Open but (if the file system has text/binary modes) opens in text mode; + // you shouldn't ever have to use this as in Kaldi we read even text files in + // binary mode (and ignore the \r). + inline bool OpenTextMode(const std::string &rxfilename); + + // Return true if currently open for reading and Stream() will + // succeed. Does not guarantee that the stream is good. + inline bool IsOpen(); + + // It is never necessary or helpful to call Close, except if + // you are concerned about too many filehandles being open. + // Close does not throw. It returns the exit code as int32 + // in the case of a pipe [kPipeInput], and always zero otherwise. + int32 Close(); + + // Returns the underlying stream. Throws if !IsOpen() + std::istream &Stream(); + + // Destructor does not throw: input streams may legitimately fail so we + // don't worry about the status when we close them.
+ ~Input(); + private: + bool OpenInternal(const std::string &rxfilename, bool file_binary, + bool *contents_binary); + InputImplBase *impl_; + KALDI_DISALLOW_COPY_AND_ASSIGN(Input); +}; + +template void ReadKaldiObject(const std::string &filename, + C *c) { + bool binary_in; + Input ki(filename, &binary_in); + c->Read(ki.Stream(), binary_in); +} + +// Specialize the template for reading matrices, because we want to be able to +// support reading 'ranges' (row and column ranges), like foo.mat[10:20]. +template <> void ReadKaldiObject(const std::string &filename, + Matrix *m); + + +template <> void ReadKaldiObject(const std::string &filename, + Matrix *m); + + + +template inline void WriteKaldiObject(const C &c, + const std::string &filename, + bool binary) { + Output ko(filename, binary); + c.Write(ko.Stream(), binary); +} + +/// PrintableRxfilename turns the rxfilename into a more human-readable +/// form for error reporting, i.e. it does quoting and escaping and +/// replaces "" or "-" with "standard input". +std::string PrintableRxfilename(const std::string &rxfilename); + +/// PrintableWxfilename turns the wxfilename into a more human-readable +/// form for error reporting, i.e. it does quoting and escaping and +/// replaces "" or "-" with "standard output". +std::string PrintableWxfilename(const std::string &wxfilename); + +/// @} + +} // end namespace kaldi. + +#include "util/kaldi-io-inl.h" + +#endif // KALDI_UTIL_KALDI_IO_H_ diff --git a/speechx/speechx/kaldi/util/kaldi-pipebuf.h b/speechx/speechx/kaldi/util/kaldi-pipebuf.h new file mode 100644 index 00000000..61034ac2 --- /dev/null +++ b/speechx/speechx/kaldi/util/kaldi-pipebuf.h @@ -0,0 +1,87 @@ +// util/kaldi-pipebuf.h + +// Copyright 2009-2011 Ondrej Glembek + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +/** @file kaldi-pipebuf.h + * This is an Kaldi C++ Library header. + */ + +#ifndef KALDI_UTIL_KALDI_PIPEBUF_H_ +#define KALDI_UTIL_KALDI_PIPEBUF_H_ + +#include +#if !defined(_LIBCPP_VERSION) // libc++ +#include +#else +#include "util/basic-filebuf.h" +#endif + +namespace kaldi { +// This class provides a way to initialize a filebuf with a FILE* pointer +// directly; it will not close the file pointer when it is deleted. +// The C++ standard does not allow implementations of C++ to provide +// this constructor within basic_filebuf, which makes it hard to deal +// with pipes using completely native C++. This is a workaround + +#ifdef _MSC_VER +#elif defined(_LIBCPP_VERSION) // libc++ +template > +class basic_pipebuf : public basic_filebuf { + public: + typedef basic_pipebuf ThisType; + + public: + basic_pipebuf(FILE *fptr, std::ios_base::openmode mode) + : basic_filebuf() { + this->open(fptr, mode); + if (!this->is_open()) { + KALDI_WARN << "Error initializing pipebuf"; // probably indicates + // code error, if the fptr was good. + return; + } + } +}; // class basic_pipebuf +#else +template > +class basic_pipebuf : public std::basic_filebuf { + public: + typedef basic_pipebuf ThisType; + + public: + basic_pipebuf(FILE *fptr, std::ios_base::openmode mode) + : std::basic_filebuf() { + this->_M_file.sys_open(fptr, mode); + if (!this->is_open()) { + KALDI_WARN << "Error initializing pipebuf"; // probably indicates + // code error, if the fptr was good. 
+ return; + } + this->_M_mode = mode; + this->_M_buf_size = BUFSIZ; + this->_M_allocate_internal_buffer(); + this->_M_reading = false; + this->_M_writing = false; + this->_M_set_buffer(-1); + } +}; // class basic_pipebuf +#endif // _MSC_VER + +} // namespace kaldi + +#endif // KALDI_UTIL_KALDI_PIPEBUF_H_ diff --git a/speechx/speechx/kaldi/util/kaldi-semaphore.cc b/speechx/speechx/kaldi/util/kaldi-semaphore.cc new file mode 100644 index 00000000..f0829ac0 --- /dev/null +++ b/speechx/speechx/kaldi/util/kaldi-semaphore.cc @@ -0,0 +1,57 @@ +// util/kaldi-semaphore.cc + +// Copyright 2012 Karel Vesely (Brno University of Technology) +// 2017 Dogan Can (University of Southern California) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ + + +#include "base/kaldi-error.h" +#include "util/kaldi-semaphore.h" + +namespace kaldi { + +Semaphore::Semaphore(int32 count) { + KALDI_ASSERT(count >= 0); + count_ = count; +} + +Semaphore::~Semaphore() {} + +bool Semaphore::TryWait() { + std::unique_lock lock(mutex_); + if(count_) { + count_--; + return true; + } + return false; +} + +void Semaphore::Wait() { + std::unique_lock lock(mutex_); + while(!count_) + condition_variable_.wait(lock); + count_--; +} + +void Semaphore::Signal() { + std::unique_lock lock(mutex_); + count_++; + condition_variable_.notify_one(); +} + +} // namespace kaldi diff --git a/speechx/speechx/kaldi/util/kaldi-semaphore.h b/speechx/speechx/kaldi/util/kaldi-semaphore.h new file mode 100644 index 00000000..2562053c --- /dev/null +++ b/speechx/speechx/kaldi/util/kaldi-semaphore.h @@ -0,0 +1,50 @@ +// util/kaldi-semaphore.h + +// Copyright 2012 Karel Vesely (Brno University of Technology) +// 2017 Dogan Can (University of Southern California) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ + +#ifndef KALDI_THREAD_KALDI_SEMAPHORE_H_ +#define KALDI_THREAD_KALDI_SEMAPHORE_H_ 1 + +#include +#include + +namespace kaldi { + +class Semaphore { + public: + Semaphore(int32 count = 0); + + ~Semaphore(); + + bool TryWait(); ///< Returns true if Wait() goes through + void Wait(); ///< decrease the counter + void Signal(); ///< increase the counter + + private: + int32 count_; ///< the semaphore counter, 0 means block on Wait() + + std::mutex mutex_; + std::condition_variable condition_variable_; + KALDI_DISALLOW_COPY_AND_ASSIGN(Semaphore); +}; + +} //namespace + +#endif // KALDI_THREAD_KALDI_SEMAPHORE_H_ diff --git a/speechx/speechx/kaldi/util/kaldi-table-inl.h b/speechx/speechx/kaldi/util/kaldi-table-inl.h new file mode 100644 index 00000000..6aca2f13 --- /dev/null +++ b/speechx/speechx/kaldi/util/kaldi-table-inl.h @@ -0,0 +1,2672 @@ +// util/kaldi-table-inl.h + +// Copyright 2009-2011 Microsoft Corporation +// 2013 Johns Hopkins University (author: Daniel Povey) +// 2016 Xiaohui Zhang + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_UTIL_KALDI_TABLE_INL_H_ +#define KALDI_UTIL_KALDI_TABLE_INL_H_ + +#include +#include +#include +#include +#include +#include +#include "util/kaldi-io.h" +#include "util/kaldi-holder.h" +#include "util/text-utils.h" +#include "util/stl-utils.h" // for StringHasher. 
+#include "util/kaldi-semaphore.h" + + +namespace kaldi { + +/// \addtogroup table_impl_types +/// @{ + +template class SequentialTableReaderImplBase { + public: + typedef typename Holder::T T; + // note that Open takes rxfilename not rspecifier. Open will only be + // called on a just-allocated object. + virtual bool Open(const std::string &rxfilename) = 0; + // Done() should be called on a successfully opened, not-closed object. + // only throws if called at the wrong time (i.e. code error). + virtual bool Done() const = 0; + // Returns true if the reader is open [i.e. Open() succeeded and + // the user has not called Close()] + virtual bool IsOpen() const = 0; + // Returns the current key; it is valid to call this if Done() returned false. + // Only throws on code error (i.e. called at the wrong time). + virtual std::string Key() = 0; + // Returns the value associated with the current key. Valid to call it if + // Done() returned false. It throws if the value could not be read. [However + // if you use the ,p modifier it will never throw, unless you call it at the + // wrong time, i.e. unless there is a code error.] + virtual T &Value() = 0; + virtual void FreeCurrent() = 0; + // move to the next object. This won't throw unless called wrongly (e.g. on + // non-open archive.] + virtual void Next() = 0; + // Close the table. Returns its status as bool so it won't throw, unless + // called wrongly [i.e. on non-open archive.] + virtual bool Close() = 0; + // SwapHolder() is not part of the public interface of SequentialTableReader. + // It should be called when it would be valid to call Value() or FreeCurrent() + // (i.e. when a value is stored), and after this it's not valid to get the + // value any more until you call Next(). It swaps the contents of + // this->holder_ with those of 'other_holder'. It's needed as part of how + // we implement SequentialTableReaderBackgroundImpl. 
+ virtual void SwapHolder(Holder *other_holder) = 0; + SequentialTableReaderImplBase() { } + virtual ~SequentialTableReaderImplBase() { } // throws. + private: + KALDI_DISALLOW_COPY_AND_ASSIGN(SequentialTableReaderImplBase); +}; + +// This is the implementation for SequentialTableReader +// when it's actually a script file. +template class SequentialTableReaderScriptImpl: + public SequentialTableReaderImplBase { + public: + typedef typename Holder::T T; + + SequentialTableReaderScriptImpl(): state_(kUninitialized) { } + + // You may call Open from states kUninitialized and kError. + // It may leave the object in any of the states. + virtual bool Open(const std::string &rspecifier) { + if (state_ != kUninitialized && state_ != kError) + if (!Close()) // call Close() yourself to suppress this exception. + KALDI_ERR << "Error closing previous input: " + << "rspecifier was " << rspecifier_; + bool binary; + rspecifier_ = rspecifier; + RspecifierType rs = ClassifyRspecifier(rspecifier, &script_rxfilename_, + &opts_); + KALDI_ASSERT(rs == kScriptRspecifier); + if (!script_input_.Open(script_rxfilename_, &binary)) { // Failure on Open + KALDI_WARN << "Failed to open script file " + << PrintableRxfilename(script_rxfilename_); + state_ = kUninitialized; + return false; + } else { // Open succeeded. + if (binary) { + KALDI_WARN << "Script file should not be binary file."; + SetErrorState(); + return false; + } else { + state_ = kFileStart; + Next(); + if (state_ == kError) + return false; + // any other status, including kEof, is OK from the point of view of + // the 'open' function (empty scp file is not inherently an error). 
+ return true; + } + } + } + + virtual bool IsOpen() const { + switch (state_) { + case kEof: case kHaveScpLine: case kHaveObject: case kHaveRange: + return true; + case kUninitialized: case kError: + return false; + default: KALDI_ERR << "IsOpen() called on invalid object."; + // note: kFileStart is not a valid state for the user to call a member + // function (we never return from a public function in this state). + return false; + } + } + + virtual bool Done() const { + switch (state_) { + case kHaveScpLine: case kHaveObject: case kHaveRange: return false; + case kEof: case kError: return true; // Error condition, like Eof, counts + // as Done(); the destructor/Close() will inform the user of the error. + default: KALDI_ERR << "Done() called on TableReader object at the wrong" + " time."; + return false; + } + } + + virtual std::string Key() { + // Valid to call this whenever Done() returns false. + switch (state_) { + case kHaveScpLine: case kHaveObject: case kHaveRange: break; + default: + // coding error. + KALDI_ERR << "Key() called on TableReader object at the wrong time."; + } + return key_; + } + + T &Value() { + if (!EnsureObjectLoaded()) + KALDI_ERR << "Failed to load object from " + << PrintableRxfilename(data_rxfilename_) + << " (to suppress this error, add the permissive " + << "(p, ) option to the rspecifier."; + // Because EnsureObjectLoaded() returned with success, we know + // that if range_ is nonempty (i.e. a range was requested), the + // state will be kHaveRange. 
+ if (state_ == kHaveRange) { + return range_holder_.Value(); + } else { + KALDI_ASSERT(state_ == kHaveObject); + return holder_.Value(); + } + } + + void FreeCurrent() { + if (state_ == kHaveObject) { + holder_.Clear(); + state_ = kHaveScpLine; + } else if (state_ == kHaveRange) { + range_holder_.Clear(); + state_ = kHaveObject; + } else { + KALDI_WARN << "FreeCurrent called at the wrong time."; + } + } + + void SwapHolder(Holder *other_holder) { + // call Value() to ensure we have a value, and ignore its return value while + // suppressing compiler warnings by casting to void. It will cause the + // program to die with KALDI_ERR if we couldn't get a value. + (void) Value(); + // At this point we know that we successfully loaded an object, + // and if there was a range specified, it's in range_holder_. + if (state_ == kHaveObject) { + holder_.Swap(other_holder); + state_ = kHaveScpLine; + } else if (state_ == kHaveRange) { + range_holder_.Swap(other_holder); + state_ = kHaveObject; + // This indicates that we still have the base object (but no range). + } else { + KALDI_ERR << "Code error"; + } + // Note: after this call there may be some junk left in range_holder_ or + // holder_, but it won't matter. We avoid calling Clear() on them, as this + // function needs to be lightweight for the 'bg' feature to work well. + } + + // Next goes to the next object. + // It can leave the object in most of the statuses, but + // the only circumstances under which it will return are: + // either: + // - if Done() returned true, i.e. kError or kEof. + // or: + // - in non-permissive mode, status kHaveScpLine or kHaveObjecct + // - in permissive mode, only when we successfully have an object, + // which means either (kHaveObject and range_.empty()), or + // kHaveRange. + void Next() { + while (1) { + NextScpLine(); + if (Done()) return; + if (opts_.permissive) { + // Permissive mode means, when reading scp files, we treat keys whose + // scp entry cannot be read as nonexistent. 
This means trying to read. + if (EnsureObjectLoaded()) return; // Success. + // else try the next scp line. + } else { + return; // We go to the next key; Value() will crash if we can't read the + // object on the scp line. + } + } + } + + // Closes both inputs and clears both holders; reports KALDI_ERR if IsOpen() + // is false on entry. At exit, the object will be in state kUninitialized. + // In non-permissive mode it returns false if the old state was kError, or was + // kEof and script_input_.Close() gave nonzero status; this is so that we can + // catch errors from programs that we invoked via a pipe. + virtual bool Close() { + int32 status = 0; + if (script_input_.IsOpen()) + status = script_input_.Close(); + if (data_input_.IsOpen()) + data_input_.Close(); + range_holder_.Clear(); + holder_.Clear(); + if (!this->IsOpen()) + KALDI_ERR << "Close() called on input that was not open."; + StateType old_state = state_; + state_ = kUninitialized; + if (old_state == kError || (old_state == kEof && status != 0)) { + if (opts_.permissive) { + KALDI_WARN << "Close() called on scp file with read error, ignoring the" + " error because permissive mode specified."; + return true; + } else { + return false; // User will do something with the error status.
+ } + } else { + return true; + } + // Possible states Return value + // kHaveScpLine/kHaveObject/kHaveRange true + // kError (if opts_.permissive) true + // kError (if !opts_.permissive) false + // kEof (if script_input_.Close() != 0 && !opts_.permissive) false + // kEof (if script_input_.Close() == 0 || opts_.permissive) true + // NOTE(review): for kUninitialized, kError and kFileStart the IsOpen() check + // above reports KALDI_ERR first; confirm whether the kError rows are reachable. + } + + virtual ~SequentialTableReaderScriptImpl() { + if (this->IsOpen() && !Close()) + KALDI_ERR << "TableReader: reading script file failed: from scp " + << PrintableRxfilename(script_rxfilename_); + } + private: + + // Function EnsureObjectLoaded() ensures that we have fully loaded any object + // (including object range) associated with the current key, and returns true + // on success (i.e. we have the object) and false on failure. + // + // Possible entry states: kHaveScpLine, kHaveObject, kHaveRange + // + // Possible exit states: kHaveScpLine, kHaveObject, kHaveRange. + // + // Note: the return status has information that cannot be deduced from + // just the exit state. If the object could not be loaded we go to state + // kHaveScpLine but return false; and if the range was requested but + // could not be extracted, we go to state kHaveObject but return false. + bool EnsureObjectLoaded() { + if (!(state_ == kHaveScpLine || state_ == kHaveObject || + state_ == kHaveRange)) + KALDI_ERR << "Invalid state (code error)"; + + if (state_ == kHaveScpLine) { // need to load the object into holder_. + bool ans; + // note, NULL means it doesn't read the binary-mode header + if (Holder::IsReadInBinary()) { + ans = data_input_.Open(data_rxfilename_, NULL); + } else { + ans = data_input_.OpenTextMode(data_rxfilename_); + } + if (!ans) { + KALDI_WARN << "Failed to open file " + << PrintableRxfilename(data_rxfilename_); + return false; + } else { + if (holder_.Read(data_input_.Stream())) { + state_ = kHaveObject; + } else { // holder_ will not contain data.
+ KALDI_WARN << "Failed to load object from " + << PrintableRxfilename(data_rxfilename_); + return false; + } + } + } + // OK, at this point the state must be either + // kHaveObject or kHaveRange. + if (range_.empty()) { + // if range_ is the empty string, we should not be in the state + // kHaveRange. + KALDI_ASSERT(state_ == kHaveObject); + return true; + } + // range_ is nonempty. + if (state_ == kHaveRange) { + // range was already extracted, so there nothing to do. + return true; + } + // OK, range_ is nonempty and state_ is kHaveObject. We attempt to extract + // the range object. Note: ExtractRange() will throw with KALDI_ERR if the + // object type doesn't support ranges. + if (!range_holder_.ExtractRange(holder_, range_)) { + KALDI_WARN << "Failed to load object from " + << PrintableRxfilename(data_rxfilename_) + << "[" << range_ << "]"; + return false; + } else { + state_ = kHaveRange; + return true; + } + } + + void SetErrorState() { + state_ = kError; + script_input_.Close(); + data_input_.Close(); + holder_.Clear(); + range_holder_.Clear(); + } + + // Reads the next line in the script file. + // Possible entry states: kHaveObject, kHaveRange, kHaveScpLine, kFileStart. + // Possible exit states: kEof, kError, kHaveScpLine, kHaveObject. + void NextScpLine() { + switch (state_) { // Check and simplify the state. + case kHaveRange: + range_holder_.Clear(); + state_ = kHaveObject; + break; + case kHaveScpLine: case kHaveObject: case kFileStart: break; + default: + // No other states are valid to call Next() from. + KALDI_ERR << "Reading script file: Next called wrongly."; + } + // at this point the state will be kHaveObject, kHaveScpLine, or kFileStart. + std::string line; + if (getline(script_input_.Stream(), line)) { + // After extracting "key" from "line", we put the rest + // of "line" into "rest", and then extract data_rxfilename_ + // (e.g. 1.ark:100) and possibly the range_ specifer + // (e.g. [1:2,2:10]) from "rest". 
+ std::string data_rxfilename, rest; + SplitStringOnFirstSpace(line, &key_, &rest); + if (!key_.empty() && !rest.empty()) { + // Got a valid line. + if (rest[rest.size()-1] == ']') { + if(!ExtractRangeSpecifier(rest, &data_rxfilename, &range_)) { + KALDI_WARN << "Reading rspecifier '" << rspecifier_ + << ", cannot make sense of scp line " + << line; + SetErrorState(); + return; + } + } else { + data_rxfilename = rest; + range_ = ""; + } + bool filenames_equal = (data_rxfilename_ == data_rxfilename); + if (!filenames_equal) + data_rxfilename_ = data_rxfilename; + if (state_ == kHaveObject) { + if (!filenames_equal) { + holder_.Clear(); + state_ = kHaveScpLine; + } + // else leave state_ at kHaveObject and leave the object in the + // holder. + } else { + state_ = kHaveScpLine; + } + } else { + KALDI_WARN << "We got an invalid line in the scp file. " + << "It should look like: some_key 1.ark:10, got: " + << line; + SetErrorState(); + } + } else { + state_ = kEof; // there is nothing more in the scp file. Might as well + // close input streams as we don't need them. + script_input_.Close(); + if (data_input_.IsOpen()) + data_input_.Close(); + holder_.Clear(); // clear the holder if it was nonempty. + range_holder_.Clear(); // clear the range holder if it was nonempty. + } + } + + std::string rspecifier_; // the rspecifier that this class was opened with. + RspecifierOptions opts_; // options. + std::string script_rxfilename_; // rxfilename of the script file. + + Input script_input_; // Input object for the .scp file + Input data_input_; // Input object for the entries in the script file; + // we make this a class member instead of a local variable, + // so that rspecifiers of the form filename:byte-offset, + // e.g. foo.ark:12345, can be handled using fseek(). + + Holder holder_; // Holds the object. + Holder range_holder_; // Holds the partial object corresponding to the object + // range specifier 'range_'; this is only used when + // 'range_' is specified, i.e. 
when the .scp file + // contains lines of the form rspecifier[range], like + // foo.ark:242[0:9] (representing a row range of a + // matrix). + + + std::string key_; // the key of the current scp line we're processing + std::string data_rxfilename_; // the rxfilename corresponding to the current key + std::string range_; // the range of object corresponding to the current key, if an + // object range was specified in the script file, else "". + + enum StateType { + // Summary of the states this object can be in (state_). + // + // (*) Does holder_ contain the object corresponding to + // data_rxfilename_ ? + // (*) Does range_holder_ contain a range object? + // (*) is script_input_ open? + // (*) are key_, data_rxfilename_ and range_ [if applicable] set? + // + kUninitialized, // no no no no Uninitialized or closed object. + kFileStart, // no no yes no We just opened the .scp file (we'll never be in this + // state when a user-visible function is called.) + kEof, // no no no no We did Next() and found eof in script file. + kError, // no no no no Error reading or parsing script file. + kHaveScpLine, // no no yes yes Have a line of the script file but nothing else. + kHaveObject, // yes no yes yes holder_ contains an object but range_holder_ does not. + kHaveRange, // yes yes yes yes we have the range object in range_holder_ (implies + // range_ nonempty). + } state_; + + +}; + + +// This is the implementation for SequentialTableReader +// when it's an archive. Note that the archive format is: +// key1 [space] object1 key2 [space] +// object2 ... eof. +// "object1" is the output of the Holder::Write function and will +// typically contain a binary header (in binary mode) and then +// the output of object.Write(os, binary). +// The archive itself does not care whether it is in binary +// or text mode, for reading purposes. 
+ +template class SequentialTableReaderArchiveImpl: + public SequentialTableReaderImplBase { + public: + typedef typename Holder::T T; + + SequentialTableReaderArchiveImpl(): state_(kUninitialized) { } + + virtual bool Open(const std::string &rspecifier) { + if (state_ != kUninitialized) { + if (!Close()) { // call Close() yourself to suppress this exception. + if (opts_.permissive) + KALDI_WARN << "Error closing previous input " + "(only warning, since permissive mode)."; + else + KALDI_ERR << "Error closing previous input."; + } + } + rspecifier_ = rspecifier; + RspecifierType rs = ClassifyRspecifier(rspecifier, + &archive_rxfilename_, + &opts_); + KALDI_ASSERT(rs == kArchiveRspecifier); + + bool ans; + // NULL means don't expect binary-mode header + if (Holder::IsReadInBinary()) + ans = input_.Open(archive_rxfilename_, NULL); + else + ans = input_.OpenTextMode(archive_rxfilename_); + if (!ans) { // header. + KALDI_WARN << "Failed to open stream " + << PrintableRxfilename(archive_rxfilename_); + state_ = kUninitialized; // Failure on Open + return false; // User should print the error message. + } + state_ = kFileStart; + Next(); + if (state_ == kError) { + KALDI_WARN << "Error beginning to read archive file (wrong filename?): " + << PrintableRxfilename(archive_rxfilename_); + input_.Close(); + state_ = kUninitialized; + return false; + } + KALDI_ASSERT(state_ == kHaveObject || state_ == kEof); + return true; + } + + virtual void Next() { + switch (state_) { + case kHaveObject: + holder_.Clear(); + break; + case kFileStart: case kFreedObject: + break; + default: + KALDI_ERR << "Next() called wrongly."; + } + std::istream &is = input_.Stream(); + is.clear(); // Clear any fail bits that may have been set... just in case + // this happened in the Read function. + is >> key_; // This eats up any leading whitespace and gets the string. + if (is.eof()) { + state_ = kEof; + return; + } + if (is.fail()) { // This shouldn't really happen, barring file-system + // errors. 
+ KALDI_WARN << "Error reading archive " + << PrintableRxfilename(archive_rxfilename_); + state_ = kError; + return; + } + int c; + if ((c = is.peek()) != ' ' && c != '\t' && c != '\n') { // We expect a + // space ' ' after the key. + // We also allow tab [which is consumed] and newline [which is not], just + // so we can read archives generated by scripts that may not be fully + // aware of how this format works. + KALDI_WARN << "Invalid archive file format: expected space after key " + << key_ << ", got character " + << CharToString(static_cast(is.peek())) << ", reading " + << PrintableRxfilename(archive_rxfilename_); + state_ = kError; + return; + } + if (c != '\n') is.get(); // Consume the space or tab. + if (holder_.Read(is)) { + state_ = kHaveObject; + return; + } else { + KALDI_WARN << "Object read failed, reading archive " + << PrintableRxfilename(archive_rxfilename_); + state_ = kError; + return; + } + } + + virtual bool IsOpen() const { + switch (state_) { + case kEof: case kError: case kHaveObject: case kFreedObject: return true; + case kUninitialized: return false; + default: KALDI_ERR << "IsOpen() called on invalid object."; // kFileStart + // is not valid state for user to call something on. + return false; + } + } + + virtual bool Done() const { + switch (state_) { + case kHaveObject: + return false; + case kEof: case kError: + return true; // Error-state counts as Done(), but destructor + // will fail (unless you check the status with Close()). + default: + KALDI_ERR << "Done() called on TableReader object at the wrong time."; + return false; + } + } + + virtual std::string Key() { + // Valid to call this whenever Done() returns false + switch (state_) { + case kHaveObject: break; // only valid case. + default: + // coding error. + KALDI_ERR << "Key() called on TableReader object at the wrong time."; + } + return key_; + } + + T &Value() { + switch (state_) { + case kHaveObject: + break; // only valid case. + default: + // coding error. 
+ KALDI_ERR << "Value() called on TableReader object at the wrong time."; + } + return holder_.Value(); + } + + virtual void FreeCurrent() { + if (state_ == kHaveObject) { + holder_.Clear(); + state_ = kFreedObject; + } else { + KALDI_WARN << "FreeCurrent called at the wrong time."; + } + } + + void SwapHolder(Holder *other_holder) { + // call Value() to ensure we have a value, and ignore its return value while + // suppressing compiler warnings by casting to void. + (void) Value(); + if (state_ == kHaveObject) { + holder_.Swap(other_holder); + state_ = kFreedObject; + } else { + KALDI_ERR << "SwapHolder called at the wrong time " + "(error related to ',bg' modifier)."; + } + } + + virtual bool Close() { + // To clean up, Close() also closes the Input object if + // it's open. It will succeed if the stream was not in an error state, + // and the Input object isn't in an error state we've found eof in the archive. + if (!this->IsOpen()) + KALDI_ERR << "Close() called on TableReader twice or otherwise wrongly."; + int32 status = 0; + if (input_.IsOpen()) + status = input_.Close(); + if (state_ == kHaveObject) + holder_.Clear(); + StateType old_state = state_; + state_ = kUninitialized; + if (old_state == kError || (old_state == kEof && status != 0)) { + if (opts_.permissive) { + KALDI_WARN << "Error detected closing TableReader for archive " + << PrintableRxfilename(archive_rxfilename_) + << " but ignoring " + << "it as permissive mode specified."; + return true; + } else { + return false; + } + } else { + return true; + } + } + + virtual ~SequentialTableReaderArchiveImpl() { + if (this->IsOpen() && !Close()) + KALDI_ERR << "TableReader: error detected closing archive " + << PrintableRxfilename(archive_rxfilename_); + } + private: + Input input_; // Input object for the archive + Holder holder_; // Holds the object. 
+ std::string key_; + std::string rspecifier_; + std::string archive_rxfilename_; + RspecifierOptions opts_; + enum StateType { // [The state of the reading process] [does holder_ [is input_ + // have object] open] + kUninitialized, // Uninitialized or closed. no no + kFileStart, // [state we use internally: just opened.] no yes + kEof, // We did Next() and found eof in archive no no + kError, // Some other error no no + kHaveObject, // We read the key and the object after it. yes yes + kFreedObject, // The user called FreeCurrent(). no yes + } state_; +}; + +// this is for when someone adds the 'bg' modifier; it wraps around the basic +// implementation and allows it to do the reading in a background thread. +template +class SequentialTableReaderBackgroundImpl: + public SequentialTableReaderImplBase { + public: + typedef typename Holder::T T; + + SequentialTableReaderBackgroundImpl( + SequentialTableReaderImplBase *base_reader): + base_reader_(base_reader) {} + + // This function ignores the rxfilename argument. + // We use the same function signature as the regular Open(), + // for convenience. + virtual bool Open(const std::string &rxfilename) { + KALDI_ASSERT(base_reader_ != NULL && + base_reader_->IsOpen()); // or code error. + { + thread_ = std::thread(SequentialTableReaderBackgroundImpl::run, + this); + } + + if (!base_reader_->Done()) + Next(); + return true; + } + + virtual bool IsOpen() const { + // Close() sets base_reader_ to NULL, and we never initialize this object + // with a non-open base_reader_, so no need to check if it's open. + return base_reader_ != NULL; + } + + void RunInBackground() { + try { + // This function is called in the background thread. The whole point of + // the background thread is that we don't want to do the actual reading + // (inside Next()) in the foreground. + while (base_reader_ != NULL && !base_reader_->Done()) { + consumer_sem_.Signal(); + // Here is where the consumer process (parent thread) gets to do its + // stuff.
Principally it calls SwapHolder()-- a shallow swap that is + // cheap. + producer_sem_.Wait(); + // we check that base_reader_ is not NULL in case Close() was + // called in the main thread. + if (base_reader_ != NULL) + base_reader_->Next(); // here is where the work happens. + } + // this signal will be waited on in the Next() function of the foreground + // thread if it is still running, or Close() otherwise. + consumer_sem_.Signal(); + // this signal may be waited on in Close(). + consumer_sem_.Signal(); + } catch (...) { + // There is nothing we called above that could potentially throw due to + // user data. So we treat reaching this point as a code-error condition. + // Closing base_reader_ will trigger an exception in Next() in the main + // thread when it checks that base_reader_->IsOpen(). + if (base_reader_->IsOpen()) { + base_reader_->Close(); + delete base_reader_; + base_reader_ = NULL; + } + consumer_sem_.Signal(); + return; + } + } + static void run(SequentialTableReaderBackgroundImpl *object) { + object->RunInBackground(); + } + virtual bool Done() const { + return key_.empty(); + } + virtual std::string Key() { + if (key_.empty()) + KALDI_ERR << "Calling Key() at the wrong time."; + return key_; + } + virtual T &Value() { + if (key_.empty()) + KALDI_ERR << "Calling Value() at the wrong time."; + return holder_.Value(); + } + void SwapHolder(Holder *other_holder) { + KALDI_ERR << "SwapHolder() should not be called on this class."; + } + virtual void FreeCurrent() { + if (key_.empty()) + KALDI_ERR << "Calling FreeCurrent() at the wrong time."; + // note: ideally a call to Value() should crash if you have just called + // FreeCurrent(). For typical holders such as KaldiObjectHolder this will + // happen inside the holder_.Value() call. This won't be the case for all + // holders, but it's not a great loss (just a missed opportunity to spot a + // code error). 
+ holder_.Clear(); + } + virtual void Next() { + consumer_sem_.Wait(); + if (base_reader_ == NULL || !base_reader_->IsOpen()) + KALDI_ERR << "Error detected (likely code error) in background " + << "reader (',bg' option)"; + if (base_reader_->Done()) { + // there is nothing else to read. + key_ = ""; + } else { + key_ = base_reader_->Key(); + base_reader_->SwapHolder(&holder_); + } + // this Signal() tells the producer thread, in the background, + // that it's now safe to read the next value. + producer_sem_.Signal(); + } + + // note: we can be sure that Close() won't be called twice, as the TableReader + // object will delete this object after calling Close. + virtual bool Close() { + KALDI_ASSERT(base_reader_ != NULL && thread_.joinable()); + // wait until the producer thread is idle. + consumer_sem_.Wait(); + bool ans = true; + try { + ans = base_reader_->Close(); + } catch (...) { + ans = false; + } + delete base_reader_; + // setting base_reader_ to NULL will cause the loop in the producer thread + // to exit. + base_reader_ = NULL; + producer_sem_.Signal(); + + thread_.join(); + return ans; + } + ~SequentialTableReaderBackgroundImpl() { + if (base_reader_) { + if (!Close()) { + KALDI_ERR << "Error detected closing background reader " + << "(relates to ',bg' modifier)"; + } + } + } + private: + std::string key_; + Holder holder_; + // I couldn't figure out what to call these semaphores. consumer_sem_ is the + // one that the consumer (main thread) waits on; producer_sem_ is the one + // that the producer (background thread) waits on. 
+ Semaphore consumer_sem_; + Semaphore producer_sem_; + std::thread thread_; + SequentialTableReaderImplBase *base_reader_; + +}; + +template +SequentialTableReader::SequentialTableReader(const std::string + &rspecifier): impl_(NULL) { + if (rspecifier != "" && !Open(rspecifier)) + KALDI_ERR << "Error constructing TableReader: rspecifier is " << rspecifier; +} + +template +bool SequentialTableReader::Open(const std::string &rspecifier) { + if (IsOpen()) + if (!Close()) + KALDI_ERR << "Could not close previously open object."; + // now impl_ will be NULL. + + RspecifierOptions opts; + RspecifierType wt = ClassifyRspecifier(rspecifier, NULL, &opts); + switch (wt) { + case kArchiveRspecifier: + impl_ = new SequentialTableReaderArchiveImpl(); + break; + case kScriptRspecifier: + impl_ = new SequentialTableReaderScriptImpl(); + break; + case kNoRspecifier: default: + KALDI_WARN << "Invalid rspecifier " << rspecifier; + return false; + } + if (!impl_->Open(rspecifier)) { + delete impl_; + impl_ = NULL; + return false; // sub-object will have printed warnings. + } + if (opts.background) { + impl_ = new SequentialTableReaderBackgroundImpl( + impl_); + if (!impl_->Open("")) { + // the rxfilename is ignored in that Open() call. + // It should only return false on code error. + return false; + } + } + return true; +} + +template +bool SequentialTableReader::Close() { + CheckImpl(); + bool ans = impl_->Close(); + delete impl_; // We don't keep around empty impl_ objects. + impl_ = NULL; + return ans; +} + + +template +bool SequentialTableReader::IsOpen() const { + return (impl_ != NULL); // Because we delete the object whenever + // that object is not open. Thus, the IsOpen functions of the + // Impl objects are not really needed. +} + +template +std::string SequentialTableReader::Key() { + CheckImpl(); + return impl_->Key(); // this call may throw if called wrongly in other ways, + // e.g. eof. 
+} + + +template +void SequentialTableReader::FreeCurrent() { + CheckImpl(); + impl_->FreeCurrent(); +} + + +template +typename SequentialTableReader::T & +SequentialTableReader::Value() { + CheckImpl(); + return impl_->Value(); // This may throw (if EnsureObjectLoaded() returned false you + // are safe.). +} + + +template +void SequentialTableReader::Next() { + CheckImpl(); + impl_->Next(); +} + +template +bool SequentialTableReader::Done() { + CheckImpl(); + return impl_->Done(); +} + + +template +SequentialTableReader::~SequentialTableReader() { + delete impl_; + // Destructor of impl_ may throw. +} + + + +template class TableWriterImplBase { + public: + typedef typename Holder::T T; + + virtual bool Open(const std::string &wspecifier) = 0; + + // Write returns true on success, false on failure, but + // some errors may not be detected until we call Close(). + // It throws (via KALDI_ERR) if called wrongly. We could + // have just thrown on all errors, since this is what + // TableWriter does; it was designed this way because originally + // TableWriter::Write returned an exit status. + virtual bool Write(const std::string &key, const T &value) = 0; + + // Flush will flush any archive; it does not return error status, + // any errors will be reported on the next Write or Close. + virtual void Flush() = 0; + + virtual bool Close() = 0; + + virtual bool IsOpen() const = 0; + + // May throw on write error if Close was not called. + virtual ~TableWriterImplBase() { } + + TableWriterImplBase() { } + private: + KALDI_DISALLOW_COPY_AND_ASSIGN(TableWriterImplBase); +}; + + +// The implementation of TableWriter we use when writing directly +// to an archive with no associated scp. 
+template +class TableWriterArchiveImpl: public TableWriterImplBase { + public: + typedef typename Holder::T T; + + virtual bool Open(const std::string &wspecifier) { + switch (state_) { + case kUninitialized: + break; + case kWriteError: + KALDI_ERR << "Opening stream, already open with write error."; + case kOpen: default: + if (!Close()) // throw because this error may not have been previously + // detected by the user. + KALDI_ERR << "Opening stream, error closing previously open stream."; + } + wspecifier_ = wspecifier; + WspecifierType ws = ClassifyWspecifier(wspecifier, + &archive_wxfilename_, + NULL, + &opts_); + KALDI_ASSERT(ws == kArchiveWspecifier); // or wrongly called. + + if (output_.Open(archive_wxfilename_, opts_.binary, false)) { // false + // means no binary header. + state_ = kOpen; + return true; + } else { + // stream will not be open. User will report this error + // (we return bool), so don't bother printing anything. + state_ = kUninitialized; + return false; + } + } + + virtual bool IsOpen() const { + switch (state_) { + case kUninitialized: return false; + case kOpen: case kWriteError: return true; + default: KALDI_ERR << "IsOpen() called on TableWriter in invalid state."; + } + return false; + } + + // Write returns true on success, false on failure, but + // some errors may not be detected till we call Close(). + virtual bool Write(const std::string &key, const T &value) { + switch (state_) { + case kOpen: break; + case kWriteError: + // user should have known from the last + // call to Write that there was a problem. + KALDI_WARN << "Attempting to write to invalid stream."; + return false; + case kUninitialized: default: + KALDI_ERR << "Write called on invalid stream"; + } + // state is now kOpen or kWriteError. + if (!IsToken(key)) // e.g. empty string or has spaces... 
+ KALDI_ERR << "Using invalid key " << key; + output_.Stream() << key << ' '; + if (!Holder::Write(output_.Stream(), opts_.binary, value)) { + KALDI_WARN << "Write failure to " + << PrintableWxfilename(archive_wxfilename_); + state_ = kWriteError; + return false; + } + if (state_ == kWriteError) return false; // Even if this Write seems to + // have succeeded, we fail because a previous Write failed and the archive + // may be corrupted and unreadable. + + if (opts_.flush) + Flush(); + return true; + } + + // Flush will flush any archive; it does not return error status, + // any errors will be reported on the next Write or Close. + virtual void Flush() { + switch (state_) { + case kWriteError: case kOpen: + output_.Stream().flush(); // Don't check error status. + return; + default: + KALDI_WARN << "Flush called on not-open writer."; + } + } + + virtual bool Close() { + if (!this->IsOpen() || !output_.IsOpen()) + KALDI_ERR << "Close called on a stream that was not open." + << this->IsOpen() << ", " << output_.IsOpen(); + bool close_success = output_.Close(); + if (!close_success) { + KALDI_WARN << "Error closing stream: wspecifier is " << wspecifier_; + state_ = kUninitialized; + return false; + } + if (state_ == kWriteError) { + KALDI_WARN << "Closing writer in error state: wspecifier is " + << wspecifier_; + state_ = kUninitialized; + return false; + } + state_ = kUninitialized; + return true; + } + + TableWriterArchiveImpl(): state_(kUninitialized) {} + + // May throw on write error if Close was not called. + virtual ~TableWriterArchiveImpl() { + if (!IsOpen()) return; + else if (!Close()) + KALDI_ERR << "At TableWriter destructor: Write failed or stream close " + << "failed: wspecifier is "<< wspecifier_; + } + + private: + Output output_; + WspecifierOptions opts_; + std::string wspecifier_; + std::string archive_wxfilename_; + enum { // is stream open? 
+ kUninitialized, // no + kOpen, // yes + kWriteError, // yes + } state_; +}; + + + + +// The implementation of TableWriter we use when writing to +// individual files (more generally, wxfilenames) specified +// in an scp file that we read. + +// Note: the code for this class is similar to +// RandomAccessTableReaderScriptImpl; try to keep them in sync. + +template +class TableWriterScriptImpl: public TableWriterImplBase { + public: + typedef typename Holder::T T; + + TableWriterScriptImpl(): last_found_(0), state_(kUninitialized) {} + + virtual bool Open(const std::string &wspecifier) { + switch (state_) { + case kReadScript: + KALDI_ERR << " Opening already open TableWriter: call Close first."; + case kUninitialized: case kNotReadScript: + break; + } + wspecifier_ = wspecifier; + WspecifierType ws = ClassifyWspecifier(wspecifier, + NULL, + &script_rxfilename_, + &opts_); + KALDI_ASSERT(ws == kScriptWspecifier); // or wrongly called. + KALDI_ASSERT(script_.empty()); // no way it could be nonempty at this point. + + if (!ReadScriptFile(script_rxfilename_, + true, // print any warnings + &script_)) { // error reading script file or invalid + // format + state_ = kNotReadScript; + return false; // no need to print further warnings. user gets the error. + } + std::sort(script_.begin(), script_.end()); + for (size_t i = 0; i+1 < script_.size(); i++) { + if (script_[i].first.compare(script_[i+1].first) >= 0) { + // script[i] not < script[i+1] in lexical order... 
+ KALDI_WARN << "Script file " << PrintableRxfilename(script_rxfilename_) + << " contains duplicate key " << script_[i].first; + state_ = kNotReadScript; + return false; + } + } + state_ = kReadScript; + return true; + } + + virtual bool IsOpen() const { return (state_ == kReadScript); } + + virtual bool Close() { + if (!IsOpen()) + KALDI_ERR << "Close() called on TableWriter that was not open."; + state_ = kUninitialized; + last_found_ = 0; + script_.clear(); + return true; + } + + // Write returns true on success, false on failure, but + // some errors may not be detected till we call Close(). + virtual bool Write(const std::string &key, const T &value) { + if (!IsOpen()) + KALDI_ERR << "Write called on invalid stream"; + + if (!IsToken(key)) // e.g. empty string or has spaces... + KALDI_ERR << "Using invalid key " << key; + + std::string wxfilename; + if (!LookupFilename(key, &wxfilename)) { + if (opts_.permissive) { + return true; // In permissive mode, it's as if we're writing to + // /dev/null for missing keys. + } else { + KALDI_WARN << "Script file " + << PrintableRxfilename(script_rxfilename_) + << " has no entry for key " < pr(key, ""); // Important that "" + // compares less than or equal to any string, so lower_bound points to the + // element that has the same key. + typedef typename std::vector > + ::const_iterator IterType; + IterType iter = std::lower_bound(script_.begin(), script_.end(), pr); + if (iter != script_.end() && iter->first == key) { + last_found_ = iter - script_.begin(); + *wxfilename = iter->second; + return true; + } else { + return false; + } + } + + + WspecifierOptions opts_; + std::string wspecifier_; + std::string script_rxfilename_; + + // the script_ variable contains pairs of (key, filename), sorted using + // std::sort. This can be used with binary_search to look up filenames for + // writing. 
If this becomes inefficient we can use std::unordered_map (but I + // suspect this wouldn't be significantly faster & would use more memory). + // If memory becomes a problem here, the user should probably be passing + // only the relevant part of the scp file rather than expecting us to get too + // clever in the code. + std::vector > script_; + size_t last_found_; // This is for an optimization used in LookupFilename. + + enum { + kUninitialized, + kReadScript, + kNotReadScript, // read of script failed. + } state_; +}; + + +// The implementation of TableWriter we use when writing directly +// to an archive plus an associated scp. +template +class TableWriterBothImpl: public TableWriterImplBase { + public: + typedef typename Holder::T T; + + virtual bool Open(const std::string &wspecifier) { + switch (state_) { + case kUninitialized: + break; + case kWriteError: + KALDI_ERR << "Opening stream, already open with write error."; + case kOpen: default: + if (!Close()) // throw because this error may not have been previously + // detected by user. + KALDI_ERR << "Opening stream, error closing previously open stream."; + } + wspecifier_ = wspecifier; + WspecifierType ws = ClassifyWspecifier(wspecifier, + &archive_wxfilename_, + &script_wxfilename_, + &opts_); + KALDI_ASSERT(ws == kBothWspecifier); // or wrongly called. + if (ClassifyWxfilename(archive_wxfilename_) != kFileOutput) + KALDI_WARN << "When writing to both archive and script, the script file " + "will generally not be interpreted correctly unless the archive is " + "an actual file: wspecifier = " << wspecifier; + + if (!archive_output_.Open(archive_wxfilename_, opts_.binary, false)) { + // false means no binary header. + state_ = kUninitialized; + return false; + } + if (!script_output_.Open(script_wxfilename_, false, false)) { // first + // false means text mode: script files always text-mode. second false + // means don't write header (doesn't matter for text mode). 
+ archive_output_.Close(); // Don't care about status: error anyway. + state_ = kUninitialized; + return false; + } + state_ = kOpen; + return true; + } + + virtual bool IsOpen() const { + switch (state_) { + case kUninitialized: return false; + case kOpen: case kWriteError: return true; + default: KALDI_ERR << "IsOpen() called on TableWriter in invalid state."; + } + return false; + } + + void MakeFilename(typename std::ostream::pos_type streampos, + std::string *output) const { + std::ostringstream ss; + ss << ':' << streampos; + KALDI_ASSERT(ss.str() != ":-1"); + *output = archive_wxfilename_ + ss.str(); + + // e.g. /some/file:12302. + // Note that we warned if archive_wxfilename_ is not an actual filename; + // the philosophy is we give the user rope and if they want to hang + // themselves, with it, fine. + } + + // Write returns true on success, false on failure, but + // some errors may not be detected till we call Close(). + virtual bool Write(const std::string &key, const T &value) { + switch (state_) { + case kOpen: break; + case kWriteError: + // user should have known from the last + // call to Write that there was a problem. Warn about it. + KALDI_WARN << "Writing to non-open TableWriter object."; + return false; + case kUninitialized: default: + KALDI_ERR << "Write called on invalid stream"; + } + // state is now kOpen or kWriteError. + if (!IsToken(key)) // e.g. empty string or has spaces... + KALDI_ERR << "Using invalid key " << key; + std::ostream &archive_os = archive_output_.Stream(); + archive_os << key << ' '; + typename std::ostream::pos_type archive_os_pos = archive_os.tellp(); + // position at start of Write() to archive. We will record this in the + // script file. + std::string offset_rxfilename; // rxfilename with offset into the archive, + // e.g. some_archive_name.ark:431541423 + MakeFilename(archive_os_pos, &offset_rxfilename); + + // Write to the script file first. 
+ // The idea is that we want to get all the information possible into the + // script file, to make it easier to unwind errors later. + std::ostream &script_os = script_output_.Stream(); + script_output_.Stream() << key << ' ' << offset_rxfilename << '\n'; + + if (!Holder::Write(archive_output_.Stream(), opts_.binary, value)) { + KALDI_WARN << "Write failure to" + << PrintableWxfilename(archive_wxfilename_); + state_ = kWriteError; + return false; + } + + if (script_os.fail()) { + KALDI_WARN << "Write failure to script file detected: " + << PrintableWxfilename(script_wxfilename_); + state_ = kWriteError; + return false; + } + + if (archive_os.fail()) { + KALDI_WARN << "Write failure to archive file detected: " + << PrintableWxfilename(archive_wxfilename_); + state_ = kWriteError; + return false; + } + + if (state_ == kWriteError) return false; // Even if this Write seems to + // have succeeded, we fail because a previous Write failed and the archive + // may be corrupted and unreadable. + + if (opts_.flush) + Flush(); + return true; + } + + // Flush will flush any archive; it does not return error status, + // any errors will be reported on the next Write or Close. + virtual void Flush() { + switch (state_) { + case kWriteError: case kOpen: + archive_output_.Stream().flush(); // Don't check error status. + script_output_.Stream().flush(); // Don't check error status. + return; + default: + KALDI_WARN << "Flush called on not-open writer."; + } + } + + virtual bool Close() { + if (!this->IsOpen()) + KALDI_ERR << "Close called on a stream that was not open."; + bool close_success = true; + if (archive_output_.IsOpen()) + if (!archive_output_.Close()) close_success = false; + if (script_output_.IsOpen()) + if (!script_output_.Close()) close_success = false; + bool ans = close_success && (state_ != kWriteError); + state_ = kUninitialized; + return ans; + } + + TableWriterBothImpl(): state_(kUninitialized) {} + + // May throw on write error if Close() was not called. 
+ // User can get the error status by calling Close(). + virtual ~TableWriterBothImpl() { + if (!IsOpen()) return; + else if (!Close()) + KALDI_ERR << "Write failed or stream close failed: " + << wspecifier_; + } + + private: + Output archive_output_; + Output script_output_; + WspecifierOptions opts_; + std::string archive_wxfilename_; + std::string script_wxfilename_; + std::string wspecifier_; + enum { // is stream open? + kUninitialized, // no + kOpen, // yes + kWriteError, // yes + } state_; +}; + + +template +TableWriter::TableWriter(const std::string &wspecifier): impl_(NULL) { + if (wspecifier != "" && !Open(wspecifier)) + KALDI_ERR << "Failed to open table for writing with wspecifier: " << wspecifier + << ": errno (in case it's relevant) is: " << strerror(errno); +} + +template +bool TableWriter::IsOpen() const { + return (impl_ != NULL); +} + + +template +bool TableWriter::Open(const std::string &wspecifier) { + if (IsOpen()) { + if (!Close()) // call Close() yourself to suppress this exception. + KALDI_ERR << "Failed to close previously open writer."; + } + KALDI_ASSERT(impl_ == NULL); + WspecifierType wtype = ClassifyWspecifier(wspecifier, NULL, NULL, NULL); + switch (wtype) { + case kBothWspecifier: + impl_ = new TableWriterBothImpl(); + break; + case kArchiveWspecifier: + impl_ = new TableWriterArchiveImpl(); + break; + case kScriptWspecifier: + impl_ = new TableWriterScriptImpl(); + break; + case kNoWspecifier: default: + KALDI_WARN << "ClassifyWspecifier: invalid wspecifier " << wspecifier; + return false; + } + if (impl_->Open(wspecifier)) { + return true; + } else { // The class will have printed a more specific warning. + delete impl_; + impl_ = NULL; + return false; + } +} + +template +void TableWriter::Write(const std::string &key, + const T &value) const { + CheckImpl(); + if (!impl_->Write(key, value)) + KALDI_ERR << "Error in TableWriter::Write"; + // More specific warning will have + // been printed in the Write function. 
+} + +template +void TableWriter::Flush() { + CheckImpl(); + impl_->Flush(); +} + +template +bool TableWriter::Close() { + CheckImpl(); + bool ans = impl_->Close(); + delete impl_; // We don't keep around non-open impl_ objects + // [c.f. definition of IsOpen()] + impl_ = NULL; + return ans; +} + +template +TableWriter::~TableWriter() { + if (IsOpen() && !Close()) { + KALDI_ERR << "Error closing TableWriter [in destructor]."; + } +} + + +// Types of RandomAccessTableReader: +// In principle, we would like to have four types of RandomAccessTableReader: +// the 4 combinations [scp, archive], [seekable, not-seekable], +// where if something is seekable we only store a file offset. However, +// it seems sufficient for now to only implement two of these, in both +// cases assuming it's not seekable so we never store file offsets and always +// store either the scp line or the data in the archive. The reasons are: +// (1) +// For scp files, storing the actual entry is not that much more expensive +// than storing the file offsets (since the entries are just filenames), and +// avoids a lot of fseek operations that might be expensive. +// (2) +// For archive files, there is no real reason, if you have the archive file +// on disk somewhere, why you wouldn't access it via its associated scp. +// [i.e. write it as ark, scp]. The main reason to read archives directly +// is if they are part of a pipe, and in this case it's not seekable, so +// we implement only this case. +// +// Note that we will rarely in practice have to keep in memory everything in +// the archive, as long as things are only read once from the archive (the +// "o, " or "once" option) and as long as we keep our keys in sorted order; +// to take advantage of this we need the "s, " (sorted) option, so we would +// read archives as e.g. "s, o, ark:-" (this is the rspecifier we would use if +// it was the standard input and these conditions held). 
+ +template class RandomAccessTableReaderImplBase { + public: + typedef typename Holder::T T; + + virtual bool Open(const std::string &rspecifier) = 0; + + virtual bool HasKey(const std::string &key) = 0; + + virtual const T &Value(const std::string &key) = 0; + + virtual bool Close() = 0; + + virtual ~RandomAccessTableReaderImplBase() {} +}; + + +// Implementation of RandomAccessTableReader for a script file; for simplicity +// we just read it in all in one go, as it's unlikely someone would generate +// this from a pipe. In principle we could read it on-demand as for the +// archives, but this would probably be overkill. + +// Note: the code for this this class is similar to TableWriterScriptImpl: +// try to keep them in sync. +template +class RandomAccessTableReaderScriptImpl: + public RandomAccessTableReaderImplBase { + public: + typedef typename Holder::T T; + + RandomAccessTableReaderScriptImpl(): last_found_(0), state_(kUninitialized) {} + + virtual bool Open(const std::string &rspecifier) { + switch (state_) { + case kNotHaveObject: case kHaveObject: case kHaveRange: + KALDI_ERR << " Opening already open RandomAccessTableReader:" + " call Close first."; + case kUninitialized: case kNotReadScript: + break; + } + rspecifier_ = rspecifier; + RspecifierType rs = ClassifyRspecifier(rspecifier, + &script_rxfilename_, + &opts_); + KALDI_ASSERT(rs == kScriptRspecifier); // or wrongly called. + KALDI_ASSERT(script_.empty()); // no way it could be nonempty at this point + + if (!ReadScriptFile(script_rxfilename_, + true, // print any warnings + &script_)) { // error reading script file or invalid + // format + state_ = kNotReadScript; + return false; // no need to print further warnings. user gets the error. + } + + rspecifier_ = rspecifier; + // If opts_.sorted, the user has asserted that the keys are already sorted. + // Although we could easily sort them, we want to let the user know of this + // mistake. 
This same mistake could have serious effects if used with an + // archive rather than a script. + if (!opts_.sorted) + std::sort(script_.begin(), script_.end()); + for (size_t i = 0; i + 1 < script_.size(); i++) { + if (script_[i].first.compare(script_[i+1].first) >= 0) { + // script[i] not < script[i+1] in lexical order... + bool same = (script_[i].first == script_[i+1].first); + KALDI_WARN << "Script file " << PrintableRxfilename(script_rxfilename_) + << (same ? " contains duplicate key: " : + " is not sorted (remove s, option or add ns, option):" + " key is ") << script_[i].first; + state_ = kNotReadScript; + return false; + } + } + state_ = kNotHaveObject; + key_ = ""; // make sure we don't have a key set + return true; + } + + virtual bool IsOpen() const { + return (state_ == kNotHaveObject || state_ == kHaveObject || + state_ == kHaveRange); + } + + virtual bool Close() { + if (!IsOpen()) + KALDI_ERR << "Close() called on RandomAccessTableReader that was not" + " open."; + holder_.Clear(); + range_holder_.Clear(); + state_ = kUninitialized; + last_found_ = 0; + script_.clear(); + key_ = ""; + range_ = ""; + data_rxfilename_ = ""; + // This cannot fail because any errors of a "global" nature would have been + // detected when we did Open(). With archives it's different. + return true; + } + + virtual bool HasKey(const std::string &key) { + bool preload = opts_.permissive; + // In permissive mode, we have to check that we can read + // the scp entry before we assert that the key is there. + return HasKeyInternal(key, preload); + } + + + // Write returns true on success, false on failure, but + // some errors may not be detected till we call Close(). + virtual const T& Value(const std::string &key) { + if (!HasKeyInternal(key, true)) // true == preload. 
+ KALDI_ERR << "Could not get item for key " << key + << ", rspecifier is " << rspecifier_ << " [to ignore this, " + << "add the p, (permissive) option to the rspecifier."; + KALDI_ASSERT(key_ == key); + if (state_ == kHaveObject) { + return holder_.Value(); + } else { + KALDI_ASSERT(state_ == kHaveRange); + return range_holder_.Value(); + } + } + + virtual ~RandomAccessTableReaderScriptImpl() { } + + private: + + // HasKeyInternal when called with preload == false just tells us whether the + // key is in the scp. With preload == true, which happens when the ,p + // (permissive) option is given in the rspecifier (or when called from + // Value()), it will also check that we can preload the object from disk + // (loading from the rxfilename in the scp), and only return true if we can. + // This function is called both from HasKey and from Value(). + virtual bool HasKeyInternal(const std::string &key, bool preload) { + switch (state_) { + case kUninitialized: case kNotReadScript: + KALDI_ERR << "HasKey called on RandomAccessTableReader object that is" + " not open."; + case kHaveObject: + if (key == key_ && range_.empty()) + return true; + break; + case kHaveRange: + if (key == key_) + return true; + break; + case kNotHaveObject: default: break; + } + KALDI_ASSERT(IsToken(key)); + size_t key_pos = 0; + if (!LookupKey(key, &key_pos)) { + return false; + } else { + if (!preload) { + return true; // we have the key, and were not asked to verify that the + // object could be read. + } else { // preload specified, so we have to attempt to pre-load the + // object before returning. + std::string data_rxfilename, range; // We will split + // script_[key_pos].second (e.g. "1.ark:100[0:2]" into data_rxfilename + // (e.g. "1.ark:100") and range (if any), e.g. "0:2". 
+ if (script_[key_pos].second[script_[key_pos].second.size()-1] == ']') { + if(!ExtractRangeSpecifier(script_[key_pos].second, + &data_rxfilename, + &range)) { + KALDI_ERR << "TableReader: failed to parse range in '" + << script_[key_pos].second << "'"; + } + } else { + data_rxfilename = script_[key_pos].second; + } + if (state_ == kHaveRange) { + if (data_rxfilename_ == data_rxfilename && range_ == range) { + // the odd situation where two keys had the same rxfilename and range: + // just change the key and keep the object. + key_ = key; + return true; + } else { + range_holder_.Clear(); + state_ = kHaveObject; + } + } + // OK, at this point the state will be kHaveObject or kNotHaveObject. + if (state_ == kHaveObject) { + if (data_rxfilename_ != data_rxfilename) { + // clear out the object. + state_ = kNotHaveObject; + holder_.Clear(); + } + } + // At this point we can safely switch to the new key, data_rxfilename + // and range, and we know that if we have an object, it will already be + // the correct one. The state is now kHaveObject or kNotHaveObject. + key_ = key; + data_rxfilename_ = data_rxfilename; + range_ = range; + if (state_ == kNotHaveObject) { + // we need to read the object. + if (!input_.Open(data_rxfilename)) { + KALDI_WARN << "Error opening stream " + << PrintableRxfilename(data_rxfilename); + return false; + } else { + if (holder_.Read(input_.Stream())) { + state_ = kHaveObject; + } else { + KALDI_WARN << "Error reading object from " + "stream " << PrintableRxfilename(data_rxfilename); + return false; + } + } + } + // At this point the state is kHaveObject. + if (range.empty()) + return true; // we're done: no range was requested. + if (range_holder_.ExtractRange(holder_, range)) { + state_ = kHaveRange; + return true; + } else { + KALDI_WARN << "Failed to load object from " + << PrintableRxfilename(data_rxfilename) + << "[" << range << "]"; + // leave state at kHaveObject. 
+ return false; + } + } + } + } + + // This function attempts to look up the key "key" in the sorted array + // script_. If it was found it returns true and puts the array offset into + // 'script_offset'; otherwise it returns false. + bool LookupKey(const std::string &key, size_t *script_offset) { + // First, an optimization: if we're going consecutively, this will + // make the lookup very fast. Since we may call HasKey and then + // Value(), which both may look up the key, we test if either the + // current or next position are correct. + if (last_found_ < script_.size() && script_[last_found_].first == key) { + *script_offset = last_found_; + return true; + } + last_found_++; + if (last_found_ < script_.size() && script_[last_found_].first == key) { + *script_offset = last_found_; + return true; + } + std::pair pr(key, ""); // Important that "" + // compares less than or equal to any string, so lower_bound points to the + // element that has the same key. + typedef typename std::vector > + ::const_iterator IterType; + IterType iter = std::lower_bound(script_.begin(), script_.end(), pr); + if (iter != script_.end() && iter->first == key) { + last_found_ = *script_offset = iter - script_.begin(); + return true; + } else { + return false; + } + } + + + Input input_; // Use the same input_ object for reading each file, in case + // the scp specifies offsets in an archive so we can keep the + // same file open. + RspecifierOptions opts_; + std::string rspecifier_; // rspecifier used to open this object; used in + // debug messages + std::string script_rxfilename_; // rxfilename of script file that we read. + + std::string key_; // The current key of the object that we have, but see the + // notes regarding states_ for more explanation of the + // semantics. + + Holder holder_; + Holder range_holder_; // Holds the partial object corresponding to the object + // range specifier 'range_'. this is only used when + // 'range_' is specified. 
+ std::string range_; // range within which we read the object from holder_. + // If key_ is set, always correspond to the key. + std::string data_rxfilename_; // the rxfilename corresponding to key_, + // always set when key_ is set. + + + // the script_ variable contains pairs of (key, filename), sorted using + // std::sort. This can be used with binary_search to look up filenames for + // writing. If this becomes inefficient we can use std::unordered_map (but I + // suspect this wouldn't be significantly faster & would use more memory). + // If memory becomes a problem here, the user should probably be passing + // only the relevant part of the scp file rather than expecting us to get too + // clever in the code. + std::vector > script_; + size_t last_found_; // This is for an optimization used in FindFilename. + + enum { + // (*) is script_ set up? + // (*) does holder_ contain an object? + // (*) does range_holder_ contain and object? + // + // + kUninitialized, // no no no + kNotReadScript, // no no no + kNotHaveObject, // yes no no + kHaveObject, // yes yes no + kHaveRange, // yes yes yes + + // If we are in a state where holder_ contains an object, it always contains + // the object from 'key_', and the corresponding rxfilename is always + // 'data_rxfilename_'. If range_holder_ contains an object, it always + // corresponds to the range 'range_' of the object in 'holder_', and always + // corresponds to the current key. + } state_; +}; + + + + +// This is the base-class (with some implemented functions) for the +// implementations of RandomAccessTableReader when it's an archive. This +// base-class handles opening the files, storing the state of the reading +// process, and loading objects. This is the only case in which we have +// an intermediate class in the hierarchy between the virtual ImplBase +// class and the actual Impl classes. +// The child classes vary in the assumptions regarding sorting, etc. 
+ +template +class RandomAccessTableReaderArchiveImplBase: + public RandomAccessTableReaderImplBase { + public: + typedef typename Holder::T T; + + RandomAccessTableReaderArchiveImplBase(): holder_(NULL), + state_(kUninitialized) { } + + virtual bool Open(const std::string &rspecifier) { + if (state_ != kUninitialized) { + if (!this->Close()) // call Close() yourself to suppress this exception. + KALDI_ERR << "Error closing previous input."; + } + rspecifier_ = rspecifier; + RspecifierType rs = ClassifyRspecifier(rspecifier, &archive_rxfilename_, + &opts_); + KALDI_ASSERT(rs == kArchiveRspecifier); + + // NULL means don't expect binary-mode header + bool ans; + if (Holder::IsReadInBinary()) + ans = input_.Open(archive_rxfilename_, NULL); + else + ans = input_.OpenTextMode(archive_rxfilename_); + if (!ans) { // header. + KALDI_WARN << "Failed to open stream " + << PrintableRxfilename(archive_rxfilename_); + state_ = kUninitialized; // Failure on Open + return false; // User should print the error message. + } else { + state_ = kNoObject; + } + return true; + } + + // ReadNextObject() requires that the state be kNoObject, + // and it will try read the next object. If it succeeds, + // it sets the state to kHaveObject, and + // cur_key_ and holder_ have the key and value. If it fails, + // it sets the state to kError or kEof. + void ReadNextObject() { + if (state_ != kNoObject) + KALDI_ERR << "ReadNextObject() called from wrong state."; + // Code error somewhere in this class or a child class. + std::istream &is = input_.Stream(); + is.clear(); // Clear any fail bits that may have been set... just in case + // this happened in the Read function. + is >> cur_key_; // This eats up any leading whitespace and gets the string. + if (is.eof()) { + state_ = kEof; + return; + } + if (is.fail()) { // This shouldn't really happen, barring file-system + // errors. 
+ KALDI_WARN << "Error reading archive: rspecifier is " << rspecifier_; + state_ = kError; + return; + } + int c; + if ((c = is.peek()) != ' ' && c != '\t' && c != '\n') { // We expect a + // space ' ' after the key. + // We also allow tab, just so we can read archives generated by scripts + // that may not be fully aware of how this format works. + KALDI_WARN << "Invalid archive file format: expected space after key " + <(is.peek())) + << ", reading archive " + << PrintableRxfilename(archive_rxfilename_); + state_ = kError; + return; + } + if (c != '\n') is.get(); // Consume the space or tab. + holder_ = new Holder; + if (holder_->Read(is)) { + state_ = kHaveObject; + return; + } else { + KALDI_WARN << "Object read failed, reading archive " + << PrintableRxfilename(archive_rxfilename_); + state_ = kError; + delete holder_; + holder_ = NULL; + return; + } + } + + virtual bool IsOpen() const { + switch (state_) { + case kEof: case kError: case kHaveObject: case kNoObject: return true; + case kUninitialized: return false; + default: KALDI_ERR << "IsOpen() called on invalid object."; + return false; + } + } + + // Called by the child-class virutal Close() functions; does the + // shared parts of the cleanup. + bool CloseInternal() { + if (!this->IsOpen()) + KALDI_ERR << "Close() called on TableReader twice or otherwise wrongly."; + if (input_.IsOpen()) + input_.Close(); + if (state_ == kHaveObject) { + KALDI_ASSERT(holder_ != NULL); + delete holder_; + holder_ = NULL; + } else { + KALDI_ASSERT(holder_ == NULL); + } + bool ans = (state_ != kError); + state_ = kUninitialized; + if (!ans && opts_.permissive) { + KALDI_WARN << "Error state detected closing reader. " + << "Ignoring it because you specified permissive mode."; + return true; + } + return ans; + } + + ~RandomAccessTableReaderArchiveImplBase() { + // The child class has the responsibility to call CloseInternal(). 
+ KALDI_ASSERT(state_ == kUninitialized && holder_ == NULL); + } + private: + Input input_; // Input object for the archive + protected: + // The variables below are accessed by child classes. + + std::string cur_key_; // current key (if state == kHaveObject). + Holder *holder_; // Holds the object we just read (if state == kHaveObject) + + std::string rspecifier_; + std::string archive_rxfilename_; + RspecifierOptions opts_; + + enum { // [The state of the reading process] [does holder_ [is input_ + // have object] open] + kUninitialized, // Uninitialized or closed no no + kNoObject, // Do not have object in holder_ no yes + kHaveObject, // Have object in holder_ yes yes + kEof, // End of file no yes + kError, // Some kind of error-state in the reading. no yes + } state_; +}; + + +// RandomAccessTableReaderDSortedArchiveImpl (DSorted for "doubly sorted") is +// the implementation for random-access reading of archives when both the +// archive, and the calling code, are in sorted order (i.e. we ask for the keys +// in sorted order). This is when the s and cs options are both given. It only +// ever has to keep one object in memory. 
It inherits from +// RandomAccessTableReaderArchiveImplBase which implements the common parts of +// RandomAccessTableReader that are used when it's an archive we're reading from + +template +class RandomAccessTableReaderDSortedArchiveImpl: + public RandomAccessTableReaderArchiveImplBase { + using RandomAccessTableReaderArchiveImplBase::kUninitialized; + using RandomAccessTableReaderArchiveImplBase::kHaveObject; + using RandomAccessTableReaderArchiveImplBase::kNoObject; + using RandomAccessTableReaderArchiveImplBase::kEof; + using RandomAccessTableReaderArchiveImplBase::kError; + using RandomAccessTableReaderArchiveImplBase::state_; + using RandomAccessTableReaderArchiveImplBase::opts_; + using RandomAccessTableReaderArchiveImplBase::cur_key_; + using RandomAccessTableReaderArchiveImplBase::holder_; + using RandomAccessTableReaderArchiveImplBase::rspecifier_; + using RandomAccessTableReaderArchiveImplBase::archive_rxfilename_; + using RandomAccessTableReaderArchiveImplBase::ReadNextObject; + public: + typedef typename Holder::T T; + + RandomAccessTableReaderDSortedArchiveImpl() { } + + virtual bool Close() { + // We don't have anything additional to clean up, so just + // call generic base-class one. + return this->CloseInternal(); + } + + virtual bool HasKey(const std::string &key) { + return FindKeyInternal(key); + } + virtual const T & Value(const std::string &key) { + if (!FindKeyInternal(key)) { + KALDI_ERR << "Value() called but no such key " << key + << " in archive " << PrintableRxfilename(archive_rxfilename_); + } + KALDI_ASSERT(this->state_ == kHaveObject && key == this->cur_key_ + && holder_ != NULL); + return this->holder_->Value(); + } + + virtual ~RandomAccessTableReaderDSortedArchiveImpl() { + if (this->IsOpen()) + if (!Close()) // more specific warning will already have been printed. + // we are in some kind of error state & user did not find out by + // calling Close(). 
+ KALDI_ERR << "Error closing RandomAccessTableReader: rspecifier is " + << rspecifier_; + } + private: + // FindKeyInternal tries to find the key by calling "ReadNextObject()" + // as many times as necessary till we get to it. It is called from + // both FindKey and Value(). + bool FindKeyInternal(const std::string &key) { + // First check that the user is calling us right: should be + // in sorted order. If not, error. + if (!last_requested_key_.empty()) { + if (key.compare(last_requested_key_) < 0) { // key < last_requested_key_ + KALDI_ERR << "You provided the \"cs\" option " + << "but are not calling with keys in sorted order: " + << key << " < " << last_requested_key_ << ": rspecifier is " + << rspecifier_; + } + } + // last_requested_key_ is just for debugging of order of calling. + last_requested_key_ = key; + + if (state_ == kNoObject) + ReadNextObject(); // This can only happen + // once, the first time someone calls HasKey() or Value(). We don't + // do it in the initializer to stop the program hanging too soon, + // if reading from a pipe. + + if (state_ == kEof || state_ == kError) return false; + + if (state_ == kUninitialized) + KALDI_ERR << "Trying to access a RandomAccessTableReader object that is" + " not open."; + + std::string last_key_; // To check that + // the archive we're reading is in sorted order. + while (1) { + KALDI_ASSERT(state_ == kHaveObject); + int compare = key.compare(cur_key_); + if (compare == 0) { // key == key_ + return true; // we got it.. + } else if (compare < 0) { // key < cur_key_, so we already read past the + // place where we want to be. This implies that we will never find it + // [due to the sorting etc., this means it just isn't in the archive]. + return false; + } else { // compare > 0, key > cur_key_. We need to read further ahead. + last_key_ = cur_key_; + // read next object.. we have to set state to kNoObject first. 
+ KALDI_ASSERT(holder_ != NULL); + delete holder_; + holder_ = NULL; + state_ = kNoObject; + ReadNextObject(); + if (state_ != kHaveObject) + return false; // eof or read error. + if (cur_key_.compare(last_key_) <= 0) { + KALDI_ERR << "You provided the \"s\" option " + << " (sorted order), but keys are out of order or" + " duplicated: " + << last_key_ << " is followed by " << cur_key_ + << ": rspecifier is " << rspecifier_; + } + } + } + } + + /// Last string provided to HasKey() or Value(); + std::string last_requested_key_; +}; + +// RandomAccessTableReaderSortedArchiveImpl is for random-access reading of +// archives when the user specified the sorted (s) option but not the +// called-sorted (cs) options. +template +class RandomAccessTableReaderSortedArchiveImpl: + public RandomAccessTableReaderArchiveImplBase { + using RandomAccessTableReaderArchiveImplBase::kUninitialized; + using RandomAccessTableReaderArchiveImplBase::kHaveObject; + using RandomAccessTableReaderArchiveImplBase::kNoObject; + using RandomAccessTableReaderArchiveImplBase::kEof; + using RandomAccessTableReaderArchiveImplBase::kError; + using RandomAccessTableReaderArchiveImplBase::state_; + using RandomAccessTableReaderArchiveImplBase::opts_; + using RandomAccessTableReaderArchiveImplBase::cur_key_; + using RandomAccessTableReaderArchiveImplBase::holder_; + using RandomAccessTableReaderArchiveImplBase::rspecifier_; + using RandomAccessTableReaderArchiveImplBase::archive_rxfilename_; + using RandomAccessTableReaderArchiveImplBase::ReadNextObject; + + public: + typedef typename Holder::T T; + + RandomAccessTableReaderSortedArchiveImpl(): + last_found_index_(static_cast(-1)), + pending_delete_(static_cast(-1)) { } + + virtual bool Close() { + for (size_t i = 0; i < seen_pairs_.size(); i++) + delete seen_pairs_[i].second; + seen_pairs_.clear(); + + pending_delete_ = static_cast(-1); + last_found_index_ = static_cast(-1); + + return this->CloseInternal(); + } + virtual bool HasKey(const std::string 
&key) { + HandlePendingDelete(); + size_t index; + bool ans = FindKeyInternal(key, &index); + if (ans && opts_.once && seen_pairs_[index].second == NULL) { + // Just do a check RE the once option. "&&opts_.once" is for + // efficiency since this can only happen in that case. + KALDI_ERR << "Error: HasKey called after Value() already called for " + << " that key, and once (o) option specified: rspecifier is " + << rspecifier_; + } + return ans; + } + virtual const T & Value(const std::string &key) { + HandlePendingDelete(); + size_t index; + if (!FindKeyInternal(key, &index)) { + KALDI_ERR << "Value() called but no such key " << key + << " in archive " << PrintableRxfilename(archive_rxfilename_); + } + if (seen_pairs_[index].second == NULL) { // can happen if opts.once_ + KALDI_ERR << "Error: Value() called more than once for key " + << key << " and once (o) option specified: rspecifier is " + << rspecifier_; + } + if (opts_.once) + pending_delete_ = index; // mark this index to be deleted on next call. + return seen_pairs_[index].second->Value(); + } + virtual ~RandomAccessTableReaderSortedArchiveImpl() { + if (this->IsOpen()) + if (!Close()) // more specific warning will already have been printed. + // we are in some kind of error state & user did not find out by + // calling Close(). + KALDI_ERR << "Error closing RandomAccessTableReader: rspecifier is " + << rspecifier_; + } + private: + void HandlePendingDelete() { + const size_t npos = static_cast(-1); + if (pending_delete_ != npos) { + KALDI_ASSERT(pending_delete_ < seen_pairs_.size()); + KALDI_ASSERT(seen_pairs_[pending_delete_].second != NULL); + delete seen_pairs_[pending_delete_].second; + seen_pairs_[pending_delete_].second = NULL; + pending_delete_ = npos; + } + } + + // FindKeyInternal tries to find the key in the array "seen_pairs_". + // If it is not already there, it reads ahead as far as necessary + // to determine whether we have the key or not. 
On success it returns + // true and puts the index into the array seen_pairs_, into "index"; + // on failure it returns false. + // It will leave the state as either kNoObject, kEof or kError. + // FindKeyInternal does not do any checking about whether you are asking + // about a key that has been already given (with the "once" option). + // That is the user's responsibility. + + bool FindKeyInternal(const std::string &key, size_t *index) { + // First, an optimization in case the previous call was for the + // same key, and we found it. + if (last_found_index_ < seen_pairs_.size() + && seen_pairs_[last_found_index_].first == key) { + *index = last_found_index_; + return true; + } + + if (state_ == kUninitialized) + KALDI_ERR << "Trying to access a RandomAccessTableReader object that is" + " not open."; + + // Step one is to see whether we have to read ahead for the object.. + // Note, the possible states right now are kNoObject, kEof or kError. + // We are never in the state kHaveObject except just after calling + // ReadNextObject(). + bool looped = false; + while (state_ == kNoObject && + (seen_pairs_.empty() || key.compare(seen_pairs_.back().first) > 0)) { + looped = true; + // Read this as: + // while ( the stream is potentially good for reading && + // ([got no keys] || key > most_recent_key) ) { ... + // Try to read a new object. + // Note that the keys in seen_pairs_ are ordered from least to greatest. + ReadNextObject(); + if (state_ == kHaveObject) { // Successfully read object. + if (!seen_pairs_.empty() && // This is just a check. + cur_key_.compare(seen_pairs_.back().first) <= 0) { + // read the expression above as: !( cur_key_ > previous_key). + // it means we are not in sorted order [the user specified that we + // are, or we would not be using this implementation]. 
+ KALDI_ERR << "You provided the sorted (s) option but keys in archive " + << PrintableRxfilename(archive_rxfilename_) << " are not " + << "in sorted order: " << seen_pairs_.back().first + << " is followed by " << cur_key_; + } + KALDI_ASSERT(holder_ != NULL); + seen_pairs_.push_back(std::make_pair(cur_key_, holder_)); + holder_ = NULL; + state_ = kNoObject; + } + } + if (looped) { // We only need to check the last element of the seen_pairs_ + // array, since we would not have read more after getting "key". + if (!seen_pairs_.empty() && seen_pairs_.back().first == key) { + last_found_index_ = *index = seen_pairs_.size() - 1; + return true; + } else { + return false; + } + } + // Now we have do an actual binary search in the seen_pairs_ array. + std::pair pr(key, static_cast(NULL)); + typename std::vector >::iterator + iter = std::lower_bound(seen_pairs_.begin(), seen_pairs_.end(), + pr, PairCompare()); + if (iter != seen_pairs_.end() && + key == iter->first) { + last_found_index_ = *index = (iter - seen_pairs_.begin()); + return true; + } else { + return false; + } + } + + // These are the pairs of (key, object) we have read. We keep all the keys we + // have read but the actual objects (if they are stored with pointers inside + // the Holder object) may be deallocated if once == true, and the Holder + // pointer set to NULL. + std::vector > seen_pairs_; + size_t last_found_index_; // An optimization s.t. if FindKeyInternal called + // twice with same key (as it often will), it doesn't have to do the key + // search twice. + size_t pending_delete_; // If opts_.once == true, this is the index of + // element of seen_pairs_ that is pending deletion. + struct PairCompare { + // PairCompare is the Less-than operator for the pairs of(key, Holder). + // compares the keys. 
+ inline bool operator() (const std::pair &pr1, + const std::pair &pr2) { + return (pr1.first.compare(pr2.first) < 0); + } + }; +}; + + + +// RandomAccessTableReaderUnsortedArchiveImpl is for random-access reading of +// archives when the user does not specify the sorted (s) option (in this case +// the called-sorted, or "cs" option, is ignored). This is the least efficient +// of the random access archive readers, in general, but it can be as efficient +// as the others, in speed, memory and latency, if the "once" option is +// specified and it happens that the keys of the archive are the same as the +// keys the code is called with (to HasKey() and Value()), and in the same +// order. However, if you ask it for a key that's not present it will have to +// read the archive till the end and store it all in memory. + +template +class RandomAccessTableReaderUnsortedArchiveImpl: + public RandomAccessTableReaderArchiveImplBase { + using RandomAccessTableReaderArchiveImplBase::kUninitialized; + using RandomAccessTableReaderArchiveImplBase::kHaveObject; + using RandomAccessTableReaderArchiveImplBase::kNoObject; + using RandomAccessTableReaderArchiveImplBase::kEof; + using RandomAccessTableReaderArchiveImplBase::kError; + using RandomAccessTableReaderArchiveImplBase::state_; + using RandomAccessTableReaderArchiveImplBase::opts_; + using RandomAccessTableReaderArchiveImplBase::cur_key_; + using RandomAccessTableReaderArchiveImplBase::holder_; + using RandomAccessTableReaderArchiveImplBase::rspecifier_; + using RandomAccessTableReaderArchiveImplBase::archive_rxfilename_; + using RandomAccessTableReaderArchiveImplBase::ReadNextObject; + + typedef typename Holder::T T; + + public: + RandomAccessTableReaderUnsortedArchiveImpl(): to_delete_iter_(map_.end()), + to_delete_iter_valid_(false) { + map_.max_load_factor(0.5); // make it quite empty -> quite efficient. + // default seems to be 1. 
+ } + + virtual bool Close() { + for (typename MapType::iterator iter = map_.begin(); + iter != map_.end(); + ++iter) { + delete iter->second; + } + map_.clear(); + first_deleted_string_ = ""; + to_delete_iter_valid_ = false; + return this->CloseInternal(); + } + + virtual bool HasKey(const std::string &key) { + HandlePendingDelete(); + return FindKeyInternal(key, NULL); + } + virtual const T & Value(const std::string &key) { + HandlePendingDelete(); + const T *ans_ptr = NULL; + if (!FindKeyInternal(key, &ans_ptr)) + KALDI_ERR << "Value() called but no such key " << key + << " in archive " << PrintableRxfilename(archive_rxfilename_); + return *ans_ptr; + } + virtual ~RandomAccessTableReaderUnsortedArchiveImpl() { + if (this->IsOpen()) + if (!Close()) // more specific warning will already have been printed. + // we are in some kind of error state & user did not find out by + // calling Close(). + KALDI_ERR << "Error closing RandomAccessTableReader: rspecifier is " + << rspecifier_; + } + private: + void HandlePendingDelete() { + if (to_delete_iter_valid_) { + to_delete_iter_valid_ = false; + delete to_delete_iter_->second; // Delete Holder object. + if (first_deleted_string_.length() == 0) + first_deleted_string_ = to_delete_iter_->first; + map_.erase(to_delete_iter_); // delete that element. + } + } + + // FindKeyInternal tries to find the key in the map "map_" + // If it is not already there, it reads ahead either until it finds the + // key, or until end of file. If called with value_ptr == NULL, + // it assumes it's called from HasKey() and just returns true or false + // and doesn't otherwise have side effects. If called with value_ptr != + // NULL, it assumes it's called from Value(). Thus, it will crash + // if it cannot find the key. If it can find it it puts its address in + // *value_ptr, and if opts_once == true it will mark that element of the + // map to be deleted. 
+ + bool FindKeyInternal(const std::string &key, const T **value_ptr = NULL) { + typename MapType::iterator iter = map_.find(key); + if (iter != map_.end()) { // Found in the map... + if (value_ptr == NULL) { // called from HasKey + return true; // this is all we have to do. + } else { + *value_ptr = &(iter->second->Value()); + if (opts_.once) { // value won't be needed again, so mark + // for deletion. + to_delete_iter_ = iter; // pending delete. + KALDI_ASSERT(!to_delete_iter_valid_); + to_delete_iter_valid_ = true; + } + return true; + } + } + while (state_ == kNoObject) { + ReadNextObject(); + if (state_ == kHaveObject) { // Successfully read object. + state_ = kNoObject; // we are about to transfer ownership + // of the object in holder_ to map_. + // Insert it into map_. + std::pair pr = + map_.insert(typename MapType::value_type(cur_key_, holder_)); + + if (!pr.second) { // Was not inserted-- previous element w/ same key + delete holder_; // map was not changed, no ownership transferred. + holder_ = NULL; + KALDI_ERR << "Error in RandomAccessTableReader: duplicate key " + << cur_key_ << " in archive " << archive_rxfilename_; + } + holder_ = NULL; // ownership transferred to map_. + if (cur_key_ == key) { // the one we wanted.. + if (value_ptr == NULL) { // called from HasKey + return true; + } else { // called from Value() + *value_ptr = &(pr.first->second->Value()); // this gives us the + // Value() from the Holder in the map. + if (opts_.once) { // mark for deletion, as won't be needed again. + to_delete_iter_ = pr.first; + KALDI_ASSERT(!to_delete_iter_valid_); + to_delete_iter_valid_ = true; + } + return true; + } + } + } + } + if (opts_.once && key == first_deleted_string_) { + KALDI_ERR << "You specified the once (o) option but " + << "you are calling using key " << key + << " more than once: rspecifier is " << rspecifier_; + } + return false; // We read the entire archive (or got to error state) and + // didn't find it. 
+ } + + typedef unordered_map MapType; + MapType map_; + + typename MapType::iterator to_delete_iter_; + bool to_delete_iter_valid_; + + std::string first_deleted_string_; // keep the first string we deleted + // from map_ (if opts_.once == true). It's for an inexact spot-check that the + // "once" option isn't being used incorrectly. +}; + + + + + +template +RandomAccessTableReader::RandomAccessTableReader(const + std::string &rspecifier): + impl_(NULL) { + if (rspecifier != "" && !Open(rspecifier)) + KALDI_ERR << "Error opening RandomAccessTableReader object " + " (rspecifier is: " << rspecifier << ")"; +} + +template +bool RandomAccessTableReader::Open(const std::string &rspecifier) { + if (IsOpen()) + KALDI_ERR << "Already open."; + RspecifierOptions opts; + RspecifierType rs = ClassifyRspecifier(rspecifier, NULL, &opts); + switch (rs) { + case kScriptRspecifier: + impl_ = new RandomAccessTableReaderScriptImpl(); + break; + case kArchiveRspecifier: + if (opts.sorted) { + if (opts.called_sorted) // "doubly" sorted case. + impl_ = new RandomAccessTableReaderDSortedArchiveImpl(); + else + impl_ = new RandomAccessTableReaderSortedArchiveImpl(); + } else { + impl_ = new RandomAccessTableReaderUnsortedArchiveImpl(); + } + break; + case kNoRspecifier: default: + KALDI_WARN << "Invalid rspecifier: " + << rspecifier; + return false; + } + if (!impl_->Open(rspecifier)) { + // A warning will already have been printed. 
+ delete impl_; + impl_ = NULL; + return false; + } + return true; +} + +template +bool RandomAccessTableReader::HasKey(const std::string &key) { + CheckImpl(); + if (!IsToken(key)) + KALDI_ERR << "Invalid key \"" << key << '"'; + return impl_->HasKey(key); +} + + +template +const typename RandomAccessTableReader::T& +RandomAccessTableReader::Value(const std::string &key) { + CheckImpl(); + return impl_->Value(key); +} + +template +bool RandomAccessTableReader::Close() { + CheckImpl(); + bool ans =impl_->Close(); + delete impl_; + impl_ = NULL; + return ans; +} + +template +RandomAccessTableReader::~RandomAccessTableReader() { + if (IsOpen() && !Close()) // call Close() yourself to stop this being thrown. + KALDI_ERR << "failure detected in destructor."; +} + +template +void SequentialTableReader::CheckImpl() const { + if (!impl_) { + KALDI_ERR << "Trying to use empty SequentialTableReader (perhaps you " + << "passed the empty string as an argument to a program?)"; + } +} + +template +void RandomAccessTableReader::CheckImpl() const { + if (!impl_) { + KALDI_ERR << "Trying to use empty RandomAccessTableReader (perhaps you " + << "passed the empty string as an argument to a program?)"; + } +} + +template +void TableWriter::CheckImpl() const { + if (!impl_) { + KALDI_ERR << "Trying to use empty TableWriter (perhaps you " + << "passed the empty string as an argument to a program?)"; + } +} + +template +RandomAccessTableReaderMapped::RandomAccessTableReaderMapped( + const std::string &table_rxfilename, + const std::string &utt2spk_rxfilename): + reader_(table_rxfilename), token_reader_(table_rxfilename.empty() ? 
"" : + utt2spk_rxfilename), + utt2spk_rxfilename_(utt2spk_rxfilename) { } + +template +bool RandomAccessTableReaderMapped::Open( + const std::string &table_rxfilename, + const std::string &utt2spk_rxfilename) { + if (reader_.IsOpen()) reader_.Close(); + if (token_reader_.IsOpen()) token_reader_.Close(); + KALDI_ASSERT(!table_rxfilename.empty()); + if (!reader_.Open(table_rxfilename)) return false; // will have printed + // warning internally, probably. + if (!utt2spk_rxfilename.empty()) { + if (!token_reader_.Open(utt2spk_rxfilename)) { + reader_.Close(); + return false; + } + } + return true; +} + + +template +bool RandomAccessTableReaderMapped::HasKey(const std::string &utt) { + // We don't check IsOpen, we let the call go through to the member variable + // (reader_), which will crash with a more informative error message than + // we can give here, as we don't any longer know the rxfilename. + if (token_reader_.IsOpen()) { // We need to map the key from utt to spk. + if (!token_reader_.HasKey(utt)) + KALDI_ERR << "Attempting to read key " << utt << ", which is not present " + << "in utt2spk map or similar map being read from " + << PrintableRxfilename(utt2spk_rxfilename_); + const std::string &spk = token_reader_.Value(utt); + return reader_.HasKey(spk); + } else { + return reader_.HasKey(utt); + } +} + +template +const typename Holder::T& RandomAccessTableReaderMapped::Value( + const std::string &utt) { + if (token_reader_.IsOpen()) { // We need to map the key from utt to spk. 
+ if (!token_reader_.HasKey(utt)) + KALDI_ERR << "Attempting to read key " << utt << ", which is not present " + << "in utt2spk map or similar map being read from " + << PrintableRxfilename(utt2spk_rxfilename_); + const std::string &spk = token_reader_.Value(utt); + return reader_.Value(spk); + } else { + return reader_.Value(utt); + } +} + + + +/// @} + +} // end namespace kaldi + + + +#endif // KALDI_UTIL_KALDI_TABLE_INL_H_ diff --git a/speechx/speechx/kaldi/util/kaldi-table.cc b/speechx/speechx/kaldi/util/kaldi-table.cc new file mode 100644 index 00000000..1aeceb2b --- /dev/null +++ b/speechx/speechx/kaldi/util/kaldi-table.cc @@ -0,0 +1,321 @@ +// util/kaldi-table.cc + +// Copyright 2009-2011 Microsoft Corporation + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ +#include "util/kaldi-table.h" +#include "util/text-utils.h" + +namespace kaldi { + + +bool ReadScriptFile(const std::string &rxfilename, + bool warn, + std::vector > + *script_out) { + bool is_binary; + Input input; + + if (!input.Open(rxfilename, &is_binary)) { + if (warn) KALDI_WARN << "Error opening script file: " << + PrintableRxfilename(rxfilename); + return false; + } + if (is_binary) { + if (warn) KALDI_WARN << "Error: script file appears to be binary: " << + PrintableRxfilename(rxfilename); + return false; + } + + bool ans = ReadScriptFile(input.Stream(), warn, script_out); + if (warn && !ans) + KALDI_WARN << "[script file was: " << PrintableRxfilename(rxfilename) << + "]"; + return ans; +} + +bool ReadScriptFile(std::istream &is, + bool warn, + std::vector > + *script_out) { + KALDI_ASSERT(script_out != NULL); + std::string line; + int line_number = 0; + while (getline(is, line)) { + line_number++; + const char *c = line.c_str(); + if (*c == '\0') { + if (warn) + KALDI_WARN << "Empty " << line_number << "'th line in script file"; + return false; // Empty line so invalid scp file format.. 
+ } + + std::string key, rest; + SplitStringOnFirstSpace(line, &key, &rest); + + if (key.empty() || rest.empty()) { + if (warn) + KALDI_WARN << "Invalid " << line_number << "'th line in script file" + <<":\"" << line << '"'; + return false; + } + script_out->resize(script_out->size()+1); + script_out->back().first = key; + script_out->back().second = rest; + } + return true; +} + +bool WriteScriptFile(std::ostream &os, + const std::vector > + &script) { + if (!os.good()) { + KALDI_WARN << "WriteScriptFile: attempting to write to invalid stream."; + return false; + } + std::vector >::const_iterator iter; + for (iter = script.begin(); iter != script.end(); ++iter) { + if (!IsToken(iter->first)) { + KALDI_WARN << "WriteScriptFile: using invalid token \"" << iter->first << + '"'; + return false; + } + if (iter->second.find('\n') != std::string::npos || + (iter->second.length() != 0 && + (isspace(iter->second[0]) || + isspace(iter->second[iter->second.length()-1])))) { + // second part contains newline or leading or trailing space. + KALDI_WARN << "WriteScriptFile: attempting to write invalid line \"" << + iter->second << '"'; + return false; + } + os << iter->first << ' ' << iter->second << '\n'; + } + if (!os.good()) { + KALDI_WARN << "WriteScriptFile: stream in error state."; + return false; + } + return true; +} + +bool WriteScriptFile(const std::string &wxfilename, + const std::vector > + &script) { + Output output; + if (!output.Open(wxfilename, false, false)) { // false, false means not + // binary, no binary-mode header. 
+ KALDI_ERR << "Error opening output stream for script file: " + << PrintableWxfilename(wxfilename); + return false; + } + if (!WriteScriptFile(output.Stream(), script)) { + KALDI_ERR << "Error writing script file to stream " + << PrintableWxfilename(wxfilename); + return false; + } + return true; +} + + + +WspecifierType ClassifyWspecifier(const std::string &wspecifier, + std::string *archive_wxfilename, + std::string *script_wxfilename, + WspecifierOptions *opts) { + // Examples: + // ark,t:wxfilename -> kArchiveWspecifier + // ark,b:wxfilename -> kArchiveWspecifier + // scp,t:rxfilename -> kScriptWspecifier + // scp,t:rxfilename -> kScriptWspecifier + // ark,scp,t:filename, wxfilename -> kBothWspecifier + // ark,scp:filename, wxfilename -> kBothWspecifier + // Note we can include the flush option (f) or no-flush (nf) + // anywhere: e.g. + // ark,scp,f:filename, wxfilename -> kBothWspecifier + // or: + // scp,t,nf:rxfilename -> kScriptWspecifier + + if (archive_wxfilename) archive_wxfilename->clear(); + if (script_wxfilename) script_wxfilename->clear(); + + size_t pos = wspecifier.find(':'); + if (pos == std::string::npos) return kNoWspecifier; + if (isspace(*(wspecifier.rbegin()))) return kNoWspecifier; // Trailing space + // disallowed. + + std::string before_colon(wspecifier, 0, pos), after_colon(wspecifier, pos+1); + + std::vector split_first_part; // Split part before ':' on ', '. + SplitStringToVector(before_colon, ", ", false, &split_first_part); // false== + // don't omit empty strings between commas. + + WspecifierType ws = kNoWspecifier; + + if (opts != NULL) + *opts = WspecifierOptions(); // Make sure all the defaults are as in the + // default constructor of the options class. + + for (size_t i = 0; i < split_first_part.size(); i++) { + const std::string &str = split_first_part[i]; // e.g. "b", "t", "f", "ark", + // "scp". 
+ const char *c = str.c_str(); + if (!strcmp(c, "b")) { + if (opts) opts->binary = true; + } else if (!strcmp(c, "f")) { + if (opts) opts->flush = true; + } else if (!strcmp(c, "nf")) { + if (opts) opts->flush = false; + } else if (!strcmp(c, "t")) { + if (opts) opts->binary = false; + } else if (!strcmp(c, "p")) { + if (opts) opts->permissive = true; + } else if (!strcmp(c, "ark")) { + if (ws == kNoWspecifier) ws = kArchiveWspecifier; + else + return kNoWspecifier; // We do not allow "scp, ark", only "ark, + // scp". + } else if (!strcmp(c, "scp")) { + if (ws == kNoWspecifier) ws = kScriptWspecifier; + else if (ws == kArchiveWspecifier) ws = kBothWspecifier; + else + return kNoWspecifier; // repeated "scp" option: invalid. + } else { + return kNoWspecifier; // Could not interpret this option. + } + } + + switch (ws) { + case kArchiveWspecifier: + if (archive_wxfilename) + *archive_wxfilename = after_colon; + break; + case kScriptWspecifier: + if (script_wxfilename) + *script_wxfilename = after_colon; + break; + case kBothWspecifier: + pos = after_colon.find(','); // first comma. + if (pos == std::string::npos) return kNoWspecifier; + if (archive_wxfilename) + *archive_wxfilename = std::string(after_colon, 0, pos); + if (script_wxfilename) + *script_wxfilename = std::string(after_colon, pos+1); + break; + case kNoWspecifier: default: break; + } + return ws; +} + + + +RspecifierType ClassifyRspecifier(const std::string &rspecifier, + std::string *rxfilename, + RspecifierOptions *opts) { + // Examples + // ark:rxfilename -> kArchiveRspecifier + // scp:rxfilename -> kScriptRspecifier + // + // We also allow the meaningless prefixes b, and t, + // plus the options o (once), no (not-once), + // s (sorted) and ns (not-sorted), p (permissive) + // and np (not-permissive). 
+ // so the following would be valid: + // + // f, o, b, np, ark:rxfilename -> kArchiveRspecifier + // + // Examples: + // + // b, ark:rxfilename -> kArchiveRspecifier + // t, ark:rxfilename -> kArchiveRspecifier + // b, scp:rxfilename -> kScriptRspecifier + // t, no, s, scp:rxfilename -> kScriptRspecifier + // t, ns, scp:rxfilename -> kScriptRspecifier + + // Improperly formed Rspecifiers will be classified as kNoRspecifier. + + if (rxfilename) rxfilename->clear(); + + if (opts != NULL) + *opts = RspecifierOptions(); // Make sure all the defaults are as in the + // default constructor of the options class. + + size_t pos = rspecifier.find(':'); + if (pos == std::string::npos) return kNoRspecifier; + + if (isspace(*(rspecifier.rbegin()))) return kNoRspecifier; // Trailing space + // disallowed. + + std::string before_colon(rspecifier, 0, pos), + after_colon(rspecifier, pos+1); + + std::vector split_first_part; // Split part before ':' on ', '. + SplitStringToVector(before_colon, ", ", false, &split_first_part); // false== + // don't omit empty strings between commas. + + RspecifierType rs = kNoRspecifier; + + for (size_t i = 0; i < split_first_part.size(); i++) { + const std::string &str = split_first_part[i]; // e.g. "b", "t", "f", "ark", + // "scp". + const char *c = str.c_str(); + if (!strcmp(c, "b")); // Ignore this option. It's so we can use the same + // specifiers for rspecifiers and wspecifiers. + else if (!strcmp(c, "t")); // Ignore this option too. 
+ else if (!strcmp(c, "o")) { + if (opts) opts->once = true; + } else if (!strcmp(c, "no")) { + if (opts) opts->once = false; + } else if (!strcmp(c, "p")) { + if (opts) opts->permissive = true; + } else if (!strcmp(c, "np")) { + if (opts) opts->permissive = false; + } else if (!strcmp(c, "s")) { + if (opts) opts->sorted = true; + } else if (!strcmp(c, "ns")) { + if (opts) opts->sorted = false; + } else if (!strcmp(c, "cs")) { + if (opts) opts->called_sorted = true; + } else if (!strcmp(c, "ncs")) { + if (opts) opts->called_sorted = false; + } else if (!strcmp(c, "bg")) { + if (opts) opts->background = true; + } else if (!strcmp(c, "ark")) { + if (rs == kNoRspecifier) rs = kArchiveRspecifier; + else + return kNoRspecifier; // Repeated or combined ark and scp options + // invalid. + } else if (!strcmp(c, "scp")) { + if (rs == kNoRspecifier) rs = kScriptRspecifier; + else + return kNoRspecifier; // Repeated or combined ark and scp options + // invalid. + } else { + return kNoRspecifier; // Could not interpret this option. + } + } + if ((rs == kArchiveRspecifier || rs == kScriptRspecifier) + && rxfilename != NULL) + *rxfilename = after_colon; + return rs; +} + + + + + + +} // end namespace kaldi diff --git a/speechx/speechx/kaldi/util/kaldi-table.h b/speechx/speechx/kaldi/util/kaldi-table.h new file mode 100644 index 00000000..6865cea1 --- /dev/null +++ b/speechx/speechx/kaldi/util/kaldi-table.h @@ -0,0 +1,471 @@ +// util/kaldi-table.h + +// Copyright 2009-2011 Microsoft Corporation +// 2013 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_UTIL_KALDI_TABLE_H_ +#define KALDI_UTIL_KALDI_TABLE_H_ + +#include +#include +#include + +#include "base/kaldi-common.h" +#include "util/kaldi-holder.h" + +namespace kaldi { + +// Forward declarations +template class RandomAccessTableReaderImplBase; +template class SequentialTableReaderImplBase; +template class TableWriterImplBase; + +/// \addtogroup table_group +/// @{ + +// This header defines the Table classes (RandomAccessTableReader, +// SequentialTableReader and TableWriter) and explains what the Holder classes, +// which the Table class requires as a template argument, are like. It also +// explains the "rspecifier" and "wspecifier" concepts (these are strings that +// explain how to read/write objects via archives or scp files. A table is +// conceptually a collection of objects of a particular type T indexed by keys +// of type std::string (these Keys additionally have an order within +// each table). +// The Table classes are templated on a type (call it Holder) such that +// Holder::T is a typedef equal to T. + +// see kaldi-holder.h for detail on the Holder classes. + +typedef std::vector KeyList; + +// Documentation for "wspecifier" +// "wspecifier" describes how we write a set of objects indexed by keys. +// The basic, unadorned wspecifiers are as follows: +// +// ark:wxfilename +// scp:rxfilename +// ark,scp:filename,wxfilename +// ark,scp:filename,wxfilename +// +// +// We also allow the following modifiers: +// t means text mode. +// b means binary mode. 
+// f means flush the stream after writing each entry. +// (nf means don't flush, and the default is not to flush). +// p means permissive mode, when writing to an "scp" file only: will ignore +// missing scp entries, i.e. won't write anything for those files but will +// return success status). +// +// So the following are valid wspecifiers: +// ark,b,f:foo +// "ark,b,b:| gzip -c > foo" +// "ark,scp,t,nf:foo.ark,|gzip -c > foo.scp.gz" +// ark,b:- +// +// The meanings of rxfilename and wxfilename are as described in +// kaldi-io.h (they are filenames but include pipes, stdin/stdout +// and so on; filename is a regular filename. +// + +// The ark:wxfilename type of wspecifier instructs the class to +// write directly to an archive. For small objects (e.g. lists of ints), +// the text archive format will generally be human readable with one line +// per entry in the archive. +// +// The type "scp:xfilename" refers to an scp file which should +// already exist on disk, and tells us where to write the data for +// each key (usually an actual file); each line of the scp file +// would be: +// key xfilename +// +// The type ark,scp:filename,wxfilename means +// we write both an archive and an scp file that specifies offsets into the +// archive, with lines like: +// key filename:12407 +// where the number is the byte offset into the file. +// In this case we restrict the archive-filename to be an actual filename, +// as we can't see a situation where an extended filename would make sense +// for this (we can't fseek() in pipes). + +enum WspecifierType { + kNoWspecifier, + kArchiveWspecifier, + kScriptWspecifier, + kBothWspecifier +}; + +struct WspecifierOptions { + bool binary; + bool flush; + bool permissive; // will ignore absent scp entries. 
+ WspecifierOptions(): binary(true), flush(false), permissive(false) { } +}; + +// ClassifyWspecifier returns the type of the wspecifier string, +// and (if pointers are non-NULL) outputs the extra information +// about the options, and the script and archive +// filenames. +WspecifierType ClassifyWspecifier(const std::string &wspecifier, + std::string *archive_wxfilename, + std::string *script_wxfilename, + WspecifierOptions *opts); + +// ReadScriptFile reads an .scp file in its entirety, and appends it +// (in order as it was in the scp file) in script_out_, which contains +// pairs of (key, xfilename). The .scp +// file format is: on each line, key xfilename +// where xfilename means rxfilename or wxfilename, and may contain internal +// spaces (we trim away any leading or trailing space). The key is space-free. +// ReadScriptFile returns true if the format was valid (empty files +// are valid). +// If 'print_warnings', it will print out warning messages that explain what +// kind of error there was. +bool ReadScriptFile(const std::string &rxfilename, + bool print_warnings, + std::vector > + *script_out); + +// This version of ReadScriptFile works from an istream. +bool ReadScriptFile(std::istream &is, + bool print_warnings, + std::vector > + *script_out); + +// Writes, for each entry in script, the first element, then ' ', then the +// second element then '\n'. Checks that the keys (first elements of pairs) are +// valid tokens (nonempty, no whitespace), and the values (second elements of +// pairs) are newline-free and contain no leading or trailing space. Returns +// true on success. +bool WriteScriptFile(const std::string &wxfilename, + const std::vector > + &script); + +// This version writes to an ostream. +bool WriteScriptFile(std::ostream &os, + const std::vector > + &script); + +// Documentation for "rspecifier" +// "rspecifier" describes how we read a set of objects indexed by keys. 
+// The possibilities are: +// +// ark:rxfilename +// scp:rxfilename +// +// We also allow various modifiers: +// o means the program will only ask for each key once, which enables +// the reader to discard already-asked-for values. +// s means the keys are sorted on input (means we don't have to read till +// eof if someone asked for a key that wasn't there). +// cs means that it is called in sorted order (we are generally asserting +// this based on knowledge of how the program works). +// p means "permissive", and causes it to skip over keys whose corresponding +// scp-file entries cannot be read. [and to ignore errors in archives and +// script files, and just consider the "good" entries]. +// We allow the negation of the options above, as in no, ns, np, +// but these aren't currently very useful (just equivalent to omitting the +// corresponding option). +// [any of the above options can be prefixed by n to negate them, e.g. no, +// ns, ncs, np; but these aren't currently useful as you could just omit +// the option]. +// bg means "background". It currently has no effect for random-access readers, +// but for sequential readers it will cause it to "read ahead" to the next +// value, in a background thread. Recommended when reading larger objects +// such as neural-net training examples, especially when you want to +// maximize GPU usage. +// +// b is ignored [for scripting convenience] +// t is ignored [for scripting convenience] +// +// +// So for instance the following would be a valid rspecifier: +// +// "o, s, p, ark:gunzip -c foo.gz|" + +struct RspecifierOptions { + // These options only make a difference for the RandomAccessTableReader class. + bool once; // we assert that the program will only ask for each key once. + bool sorted; // we assert that the keys are sorted. + bool called_sorted; // we assert that the (HasKey(), Value() functions will + // also be called in sorted order. [this implies "once" but not vice versa]. 
+ bool permissive; // If "permissive", when reading from scp files it treats + // scp files that can't be read as if the corresponding key were not there. + // For archive files it will suppress errors getting thrown if the archive + // is corrupted and can't be read to the end. + bool background; // For sequential readers, if the background option ("bg") + // is provided, it will read ahead to the next object in a + // background thread. + RspecifierOptions(): once(false), sorted(false), + called_sorted(false), permissive(false), + background(false) { } +}; + +enum RspecifierType { + kNoRspecifier, + kArchiveRspecifier, + kScriptRspecifier +}; + +RspecifierType ClassifyRspecifier(const std::string &rspecifier, + std::string *rxfilename, + RspecifierOptions *opts); + + +/// Allows random access to a collection +/// of objects in an archive or script file; see \ref io_sec_tables. +template +class RandomAccessTableReader { + public: + typedef typename Holder::T T; + + RandomAccessTableReader(): impl_(NULL) { } + + // This constructor is equivalent to default constructor + "open", but + // throws on error. + explicit RandomAccessTableReader(const std::string &rspecifier); + + // Opens the table. + bool Open(const std::string &rspecifier); + + // Returns true if table is open. + bool IsOpen() const { return (impl_ != NULL); } + + // Close() will close the table [throws if it was not open], + // and returns true on success (false if we were reading an + // archive and we discovered an error in the archive). + bool Close(); + + // Says if it has this key. + // If you are using the "permissive" (p) read option, + // it will return false for keys whose corresponding entry + // in the scp file cannot be read. + + bool HasKey(const std::string &key); + + // Value() may throw if you are reading an scp file, you + // do not have the "permissive" (p) option, and an entry + // in the scp file cannot be read. Typically you won't + // want to catch this error. 
+ const T &Value(const std::string &key); + + ~RandomAccessTableReader(); + + // Allow copy-constructor only for non-opened readers (needed for inclusion in + // stl vector) + RandomAccessTableReader(const RandomAccessTableReader + &other): + impl_(NULL) { KALDI_ASSERT(other.impl_ == NULL); } + private: + // Disallow assignment. + RandomAccessTableReader &operator=(const RandomAccessTableReader&); + void CheckImpl() const; // Checks that impl_ is non-NULL; prints an error + // message and dies (with KALDI_ERR) if NULL. + RandomAccessTableReaderImplBase *impl_; +}; + + + +/// A templated class for reading objects sequentially from an archive or script +/// file; see \ref io_sec_tables. +template +class SequentialTableReader { + public: + typedef typename Holder::T T; + + SequentialTableReader(): impl_(NULL) { } + + // This constructor equivalent to default constructor + "open", but + // throws on error. + explicit SequentialTableReader(const std::string &rspecifier); + + // Opens the table. Returns exit status; but does throw if previously open + // stream was in error state. You can call Close to prevent this; anyway, + // calling Open more than once is not usually needed. + bool Open(const std::string &rspecifier); + + // Returns true if we're done. It will also return true if there's some kind + // of error and we can't read any more; in this case, you can detect the + // error by calling Close and checking the return status; otherwise + // the destructor will throw. + inline bool Done(); + + // Only valid to call Key() if Done() returned false. + inline std::string Key(); + + // FreeCurrent() is provided as an optimization to save memory, for large + // objects. It instructs the class to deallocate the current value. The + // reference Value() will be invalidated by this. + void FreeCurrent(); + + // Return reference to the current value. It's only valid to call this if + // Done() returned false. The reference is valid till next call to this + // object. 
It will throw if you are reading an scp file, did not specify the + // "permissive" (p) option and the file cannot be read. [The permissive + // option makes it behave as if that key does not even exist, if the + // corresponding file cannot be read.] You probably wouldn't want to catch + // this exception; the user can just specify the p option in the rspecifier. + // We make this non-const to enable things like shallow swap on the held + // object in situations where this would avoid making a redundant copy. + T &Value(); + + // Next goes to the next key. It will not throw; any error will + // result in Done() returning true, and then the destructor will + // throw unless you call Close(). + void Next(); + + // Returns true if table is open for reading (does not imply + // stream is in good state). + bool IsOpen() const; + + // Close() will return false (failure) if Done() became true + // because of an error/ condition rather than because we are + // really done [e.g. because of an error or early termination + // in the archive]. + // If there is an error and you don't call Close(), the destructor + // will fail. + // Close() + bool Close(); + + // The destructor may throw. This is the desired behaviour, as it's the way + // we signal the error to the user (to detect it, call Close(). The issue is + // that otherwise the user has no way to tell whether Done() returned true + // because we reached the end of the archive or script, or because there was + // an error that prevented further reading. + ~SequentialTableReader(); + + // Allow copy-constructor only for non-opened readers (needed for inclusion in + // stl vector) + SequentialTableReader(const SequentialTableReader &other): + impl_(NULL) { KALDI_ASSERT(other.impl_ == NULL); } + private: + // Disallow assignment. + SequentialTableReader &operator = (const SequentialTableReader&); + void CheckImpl() const; // Checks that impl_ is non-NULL; prints an error + // message and dies (with KALDI_ERR) if NULL. 
+ SequentialTableReaderImplBase *impl_; +}; + + +/// A templated class for writing objects to an +/// archive or script file; see \ref io_sec_tables. +template +class TableWriter { + public: + typedef typename Holder::T T; + + TableWriter(): impl_(NULL) { } + + // This constructor equivalent to default constructor + // + "open", but throws on error. See docs for + // wspecifier above. + explicit TableWriter(const std::string &wspecifier); + + // Opens the table. See docs for wspecifier above. + // If it returns true, it is open. + bool Open(const std::string &wspecifier); + + // Returns true if open for writing. + bool IsOpen() const; + + // Write the object. Throws KaldiFatalError on error via the KALDI_ERR macro. + inline void Write(const std::string &key, const T &value) const; + + + // Flush will flush any archive; it does not return error status + // or throw, any errors will be reported on the next Write or Close. + // Useful if we may be writing to a command in a pipe and want + // to ensure good CPU utilization. + void Flush(); + + // Close() is not necessary to call, as the destructor + // closes it; it's mainly useful if you want to handle + // error states because the destructor will throw on + // error if you do not call Close(). + bool Close(); + + ~TableWriter(); + + // Allow copy-constructor only for non-opened writers (needed for inclusion in + // stl vector) + TableWriter(const TableWriter &other): impl_(NULL) { + KALDI_ASSERT(other.impl_ == NULL); + } + private: + TableWriter &operator = (const TableWriter&); // Disallow assignment. + + void CheckImpl() const; // Checks that impl_ is non-NULL; prints an error + // message and dies (with KALDI_ERR) if NULL. + TableWriterImplBase *impl_; +}; + + +/// This class is for when you are reading something in random access, but +/// it may actually be stored per-speaker (or something similar) but the +/// keys you're using are per utterance. 
So you also provide an "rxfilename" +/// for a file containing lines like +/// utt1 spk1 +/// utt2 spk1 +/// utt3 spk1 +/// and so on. Note: this is optional; if it is an empty string, we just won't +/// do the mapping. Also, "table_rxfilename" may be the empty string (as for +/// a regular table), in which case the table just won't be opened. +/// We provide only the most frequently used of the functions of +/// RandomAccessTableReader. + +template +class RandomAccessTableReaderMapped { + public: + typedef typename Holder::T T; + /// Note: "utt2spk_rxfilename" will in the normal case be an rxfilename + /// for an utterance to speaker map, but this code is general; it accepts + /// a generic map. + RandomAccessTableReaderMapped(const std::string &table_rxfilename, + const std::string &utt2spk_rxfilename); + + RandomAccessTableReaderMapped() {} + + /// Note: when calling Open, utt2spk_rxfilename may be empty. + bool Open(const std::string &table_rxfilename, + const std::string &utt2spk_rxfilename); + + bool HasKey(const std::string &key); + const T &Value(const std::string &key); + inline bool IsOpen() const { return reader_.IsOpen(); } + inline bool Close() { return reader_.Close(); } + + + + // The default copy-constructor will do what we want: it will crash for + // already-opened readers, by calling the member-variable copy-constructors. + private: + // Disallow assignment. + RandomAccessTableReaderMapped &operator = + (const RandomAccessTableReaderMapped&); + RandomAccessTableReader reader_; + RandomAccessTableReader token_reader_; + std::string utt2spk_rxfilename_; // Used only in diagnostic messages. 
+}; + + +/// @} end "addtogroup table_group" +} // end namespace kaldi + +#include "util/kaldi-table-inl.h" + +#endif // KALDI_UTIL_KALDI_TABLE_H_ diff --git a/speechx/speechx/kaldi/util/kaldi-thread.cc b/speechx/speechx/kaldi/util/kaldi-thread.cc new file mode 100644 index 00000000..2405d01f --- /dev/null +++ b/speechx/speechx/kaldi/util/kaldi-thread.cc @@ -0,0 +1,33 @@ +// util/kaldi-thread.cc + +// Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +// Frantisek Skala + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "base/kaldi-common.h" +#include "util/kaldi-thread.h" + +namespace kaldi { +int32 g_num_threads = 8; // Initialize this global variable. 
+ +MultiThreadable::~MultiThreadable() { + // default implementation does nothing +} + + + +} // end namespace kaldi diff --git a/speechx/speechx/kaldi/util/kaldi-thread.h b/speechx/speechx/kaldi/util/kaldi-thread.h new file mode 100644 index 00000000..50bf7dac --- /dev/null +++ b/speechx/speechx/kaldi/util/kaldi-thread.h @@ -0,0 +1,284 @@ +// util/kaldi-thread.h + +// Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +// Frantisek Skala +// 2017 University of Southern California (Author: Dogan Can) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_THREAD_KALDI_THREAD_H_ +#define KALDI_THREAD_KALDI_THREAD_H_ 1 + +#include +#include "util/options-itf.h" +#include "util/kaldi-semaphore.h" + +// This header provides convenient mechanisms for parallelization. +// +// The class MultiThreader, and the function RunMultiThreaded provide a +// mechanism to run a specified number of jobs in parellel and wait for them +// all to finish. They accept objects of some class C that derives from the +// base class MultiThreadable. C needs to define the operator () that takes +// no arguments. See ExampleClass below. +// +// The class TaskSequencer addresses a different problem typically encountered +// in Kaldi command-line programs that process a sequence of items. The items +// to be processed are coming in. 
They are all of different sizes, e.g. +// utterances with different numbers of frames. We would like them to be +// processed in parallel to make good use of the threads available but they +// must be output in the same order they came in. Here, we again accept objects +// of some class C with an operator () that takes no arguments. C may also have +// a destructor with side effects (typically some kind of output). +// TaskSequencer is responsible for running the jobs in parallel. It has a +// function Run() that will accept a new object of class C; this will block +// until a thread is free, at which time it will spawn a thread that starts +// running the operator () of the object. When threads are finished running, +// the objects will be deleted. TaskSequencer guarantees that the destructors +// will be called sequentially (not in parallel) and in the same order the +// objects were given to the Run() function, so that it is safe for the +// destructor to have side effects such as outputting data. +// Note: the destructor of TaskSequencer will wait for any remaining jobs that +// are still running and will call the destructors. + + +namespace kaldi { + +extern int32 g_num_threads; // Maximum number of threads (for programs that +// use threads, which is not many of them, e.g. the SGMM update program does. +// This is 8 by default. You can change this on the command line, where +// used, with --num-threads. 
Programs that think they will use threads +// should register it with their ParseOptions, as something like: +// po.Register("num-threads", &g_num_threads, "Number of threads to use."); + +class MultiThreadable { + // To create a function object that does part of the job, inherit from this + // class, implement a copy constructor calling the default copy constructor + // of this base class (so that thread_id_ and num_threads_ are copied to new + // instances), and finally implement the operator() that does part of the job + // based on thread_id_ and num_threads_ variables. + // Note: example implementations are in util/kaldi-thread-test.cc + public: + virtual void operator() () = 0; + // Does the main function of the class + // Subclasses have to redefine this + virtual ~MultiThreadable(); + // Optional destructor. Note: the destructor of the object passed by the user + // will also be called, so watch out. + + public: + // Do not redeclare thread_id_ and num_threads_ in derived classes. + int32 thread_id_; // 0 <= thread_id_ < num_threads_ + int32 num_threads_; + + private: + // Have additional member variables as needed. +}; + + +class ExampleClass: public MultiThreadable { + public: + ExampleClass(int32 *foo); // Typically there will be an initializer that + // takes arguments. + + ExampleClass(const ExampleClass &other); // A copy constructor is also needed; + // some example classes use the default version of this. + + void operator() () { + // Does the main function of the class. This + // function will typically want to look at the values of the + // member variables thread_id_ and num_threads_, inherited + // from MultiThreadable. + } + ~ExampleClass() { + // Optional destructor. Sometimes useful things happen here, + // for example summing up of certain quantities. See code + // that uses RunMultiThreaded for examples. + } + private: + // Have additional member variables as needed. 
+}; + + +template +class MultiThreader { + public: + MultiThreader(int32 num_threads, const C &c_in) : + threads_(std::max(1, num_threads)), + cvec_(std::max(1, num_threads), c_in) { + if (num_threads == 0) { + // This is a special case with num_threads == 0, which behaves like with + // num_threads == 1 but without creating extra threads. This can be + // useful in GPU computations where threads cannot be used. + cvec_[0].thread_id_ = 0; + cvec_[0].num_threads_ = 1; + (cvec_[0])(); + } else { + for (int32 i = 0; i < threads_.size(); i++) { + cvec_[i].thread_id_ = i; + cvec_[i].num_threads_ = threads_.size(); + threads_[i] = std::thread(std::ref(cvec_[i])); + } + } + } + ~MultiThreader() { + for (size_t i = 0; i < threads_.size(); i++) + if (threads_[i].joinable()) + threads_[i].join(); + } + private: + std::vector threads_; + std::vector cvec_; +}; + +/// Here, class C should inherit from MultiThreadable. Note: if you want to +/// control the number of threads yourself, or need to do something in the main +/// thread of the program while the objects exist, just initialize the +/// MultiThreader object yourself. +template void RunMultiThreaded(const C &c_in) { + MultiThreader m(g_num_threads, c_in); +} + + +struct TaskSequencerConfig { + int32 num_threads; + int32 num_threads_total; + TaskSequencerConfig(): num_threads(1), num_threads_total(0) { } + void Register(OptionsItf *opts) { + opts->Register("num-threads", &num_threads, "Number of actively processing " + "threads to run in parallel"); + opts->Register("num-threads-total", &num_threads_total, "Total number of " + "threads, including those that are waiting on other threads " + "to produce their output. Controls memory use. If <= 0, " + "defaults to --num-threads plus 20. 
Otherwise, must " + "be >= num-threads."); + } +}; + +// C should have an operator () taking no arguments, that does some kind +// of computation, and a destructor that produces some kind of output (the +// destructors will be run sequentially in the same order Run as called. +template +class TaskSequencer { + public: + TaskSequencer(const TaskSequencerConfig &config): + num_threads_(config.num_threads), + threads_avail_(config.num_threads), + tot_threads_avail_(config.num_threads_total > 0 ? config.num_threads_total : + config.num_threads + 20), + thread_list_(NULL) { + KALDI_ASSERT((config.num_threads_total <= 0 || + config.num_threads_total >= config.num_threads) && + "num-threads-total, if specified, must be >= num-threads"); + } + + /// This function takes ownership of the pointer "c", and will delete it + /// in the same sequence as Run was called on the jobs. + void Run(C *c) { + // run in main thread + if (num_threads_ == 0) { + (*c)(); + delete c; + return; + } + + threads_avail_.Wait(); // wait till we have a thread for computation free. + tot_threads_avail_.Wait(); // this ensures we don't have too many threads + // waiting on I/O, and consume too much memory. + + // put the new RunTaskArgsList object at head of the singly + // linked list thread_list_. + thread_list_ = new RunTaskArgsList(this, c, thread_list_); + thread_list_->thread = std::thread(TaskSequencer::RunTask, + thread_list_); + } + + void Wait() { // You call this at the end if it's more convenient + // than waiting for the destructor. It waits for all tasks to finish. + if (thread_list_ != NULL) { + thread_list_->thread.join(); + KALDI_ASSERT(thread_list_->tail == NULL); // thread would not + // have exited without setting tail to NULL. + delete thread_list_; + thread_list_ = NULL; + } + } + + /// The destructor waits for the last thread to exit. + ~TaskSequencer() { + Wait(); + } + private: + struct RunTaskArgsList { + TaskSequencer *me; // Think of this as a "this" pointer. 
+ C *c; // Clist element of the task we're expected + std::thread thread; + RunTaskArgsList *tail; + RunTaskArgsList(TaskSequencer *me, C *c, RunTaskArgsList *tail): + me(me), c(c), tail(tail) {} + }; + // This static function gets run in the threads that we create. + static void RunTask(RunTaskArgsList *args) { + // (1) run the job. + (*(args->c))(); // call operator () on args->c, which does the computation. + args->me->threads_avail_.Signal(); // Signal that the compute-intensive + // part of the thread is done (we want to run no more than + // config_.num_threads of these.) + + // (2) we want to destroy the object "c" now, by deleting it. But for + // correct sequencing (this is the whole point of this class, it + // is intended to ensure the output of the program is in correct order), + // we first wait till the previous thread, whose details will be in "tail", + // is finished. + if (args->tail != NULL) { + args->tail->thread.join(); + } + + delete args->c; // delete the object "c". This may cause some output, + // e.g. to a stream. We don't need to worry about concurrent access to + // the output stream, because each thread waits for the previous thread + // to be done, before doing this. So there is no risk of concurrent + // access. + args->c = NULL; + + if (args->tail != NULL) { + KALDI_ASSERT(args->tail->tail == NULL); // Because we already + // did join on args->tail->thread, which means that + // thread was done, and before it exited, it would have + // deleted and set to NULL its tail (which is the next line of code). + delete args->tail; + args->tail = NULL; + } + // At this point we are exiting from the thread. Signal the + // "tot_threads_avail_" semaphore which is used to limit the total number of threads that are alive, including + // not onlhy those that are in active computation in c->operator (), but those + // that are waiting on I/O or other threads. 
+ args->me->tot_threads_avail_.Signal(); + } + + int32 num_threads_; // copy of config.num_threads (since Semaphore doesn't store original count) + + Semaphore threads_avail_; // Initialized to the number of threads we are + // supposed to run with; the function Run() waits on this. + + Semaphore tot_threads_avail_; // We use this semaphore to ensure we don't + // consume too much memory... + RunTaskArgsList *thread_list_; + +}; + +} // namespace kaldi + +#endif // KALDI_THREAD_KALDI_THREAD_H_ diff --git a/speechx/speechx/kaldi/util/options-itf.h b/speechx/speechx/kaldi/util/options-itf.h new file mode 100644 index 00000000..204f46d6 --- /dev/null +++ b/speechx/speechx/kaldi/util/options-itf.h @@ -0,0 +1,49 @@ +// itf/options-itf.h + +// Copyright 2013 Tanel Alumae, Tallinn University of Technology + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef KALDI_ITF_OPTIONS_ITF_H_ +#define KALDI_ITF_OPTIONS_ITF_H_ 1 +#include "base/kaldi-common.h" + +namespace kaldi { + +class OptionsItf { + public: + + virtual void Register(const std::string &name, + bool *ptr, const std::string &doc) = 0; + virtual void Register(const std::string &name, + int32 *ptr, const std::string &doc) = 0; + virtual void Register(const std::string &name, + uint32 *ptr, const std::string &doc) = 0; + virtual void Register(const std::string &name, + float *ptr, const std::string &doc) = 0; + virtual void Register(const std::string &name, + double *ptr, const std::string &doc) = 0; + virtual void Register(const std::string &name, + std::string *ptr, const std::string &doc) = 0; + + virtual ~OptionsItf() {} +}; + +} // namespace Kaldi + +#endif // KALDI_ITF_OPTIONS_ITF_H_ + + diff --git a/speechx/speechx/kaldi/util/parse-options.cc b/speechx/speechx/kaldi/util/parse-options.cc new file mode 100644 index 00000000..4b08ca39 --- /dev/null +++ b/speechx/speechx/kaldi/util/parse-options.cc @@ -0,0 +1,668 @@ +// util/parse-options.cc + +// Copyright 2009-2011 Karel Vesely; Microsoft Corporation; +// Saarland University (Author: Arnab Ghoshal); +// Copyright 2012-2013 Johns Hopkins University (Author: Daniel Povey); +// Frantisek Skala; Arnab Ghoshal +// Copyright 2013 Tanel Alumae +// +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. 
+// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include +#include + +#include "util/parse-options.h" +#include "util/text-utils.h" +#include "base/kaldi-common.h" + +namespace kaldi { + + +ParseOptions::ParseOptions(const std::string &prefix, + OptionsItf *other): + print_args_(false), help_(false), usage_(""), argc_(0), argv_(NULL) { + ParseOptions *po = dynamic_cast(other); + if (po != NULL && po->other_parser_ != NULL) { + // we get here if this constructor is used twice, recursively. + other_parser_ = po->other_parser_; + } else { + other_parser_ = other; + } + if (po != NULL && po->prefix_ != "") { + prefix_ = po->prefix_ + std::string(".") + prefix; + } else { + prefix_ = prefix; + } +} + +void ParseOptions::Register(const std::string &name, + bool *ptr, const std::string &doc) { + RegisterTmpl(name, ptr, doc); +} + +void ParseOptions::Register(const std::string &name, + int32 *ptr, const std::string &doc) { + RegisterTmpl(name, ptr, doc); +} + +void ParseOptions::Register(const std::string &name, + uint32 *ptr, const std::string &doc) { + RegisterTmpl(name, ptr, doc); +} + +void ParseOptions::Register(const std::string &name, + float *ptr, const std::string &doc) { + RegisterTmpl(name, ptr, doc); +} + +void ParseOptions::Register(const std::string &name, + double *ptr, const std::string &doc) { + RegisterTmpl(name, ptr, doc); +} + +void ParseOptions::Register(const std::string &name, + std::string *ptr, const std::string &doc) { + RegisterTmpl(name, ptr, doc); +} + +// old-style, used for registering application-specific parameters +template +void ParseOptions::RegisterTmpl(const std::string &name, T *ptr, + const std::string &doc) { + if (other_parser_ == NULL) { + this->RegisterCommon(name, ptr, doc, false); + } else { + KALDI_ASSERT(prefix_ != "" && + "Cannot use empty prefix when registering with prefix."); + std::string new_name = 
prefix_ + '.' + name; // name becomes prefix.name + other_parser_->Register(new_name, ptr, doc); + } +} + +// does the common part of the job of registering a parameter +template +void ParseOptions::RegisterCommon(const std::string &name, T *ptr, + const std::string &doc, bool is_standard) { + KALDI_ASSERT(ptr != NULL); + std::string idx = name; + NormalizeArgName(&idx); + if (doc_map_.find(idx) != doc_map_.end()) + KALDI_WARN << "Registering option twice, ignoring second time: " << name; + this->RegisterSpecific(name, idx, ptr, doc, is_standard); +} + +// used to register standard parameters (those that are present in all of the +// applications) +template +void ParseOptions::RegisterStandard(const std::string &name, T *ptr, + const std::string &doc) { + this->RegisterCommon(name, ptr, doc, true); +} + +void ParseOptions::RegisterSpecific(const std::string &name, + const std::string &idx, + bool *b, + const std::string &doc, + bool is_standard) { + bool_map_[idx] = b; + doc_map_[idx] = DocInfo(name, doc + " (bool, default = " + + ((*b)? 
"true)" : "false)"), is_standard); +} + +void ParseOptions::RegisterSpecific(const std::string &name, + const std::string &idx, + int32 *i, + const std::string &doc, + bool is_standard) { + int_map_[idx] = i; + std::ostringstream ss; + ss << doc << " (int, default = " << *i << ")"; + doc_map_[idx] = DocInfo(name, ss.str(), is_standard); +} + +void ParseOptions::RegisterSpecific(const std::string &name, + const std::string &idx, + uint32 *u, + const std::string &doc, + bool is_standard) { + uint_map_[idx] = u; + std::ostringstream ss; + ss << doc << " (uint, default = " << *u << ")"; + doc_map_[idx] = DocInfo(name, ss.str(), is_standard); +} + +void ParseOptions::RegisterSpecific(const std::string &name, + const std::string &idx, + float *f, + const std::string &doc, + bool is_standard) { + float_map_[idx] = f; + std::ostringstream ss; + ss << doc << " (float, default = " << *f << ")"; + doc_map_[idx] = DocInfo(name, ss.str(), is_standard); +} + +void ParseOptions::RegisterSpecific(const std::string &name, + const std::string &idx, + double *f, + const std::string &doc, + bool is_standard) { + double_map_[idx] = f; + std::ostringstream ss; + ss << doc << " (double, default = " << *f << ")"; + doc_map_[idx] = DocInfo(name, ss.str(), is_standard); +} + +void ParseOptions::RegisterSpecific(const std::string &name, + const std::string &idx, + std::string *s, + const std::string &doc, + bool is_standard) { + string_map_[idx] = s; + doc_map_[idx] = DocInfo(name, doc + " (string, default = \"" + *s + "\")", + is_standard); +} +void ParseOptions::DisableOption(const std::string &name) { + if (argv_ != NULL) + KALDI_ERR << "DisableOption must not be called after calling Read()."; + if (doc_map_.erase(name) == 0) + KALDI_ERR << "Option " << name + << " was not registered so cannot be disabled: "; + bool_map_.erase(name); + int_map_.erase(name); + uint_map_.erase(name); + float_map_.erase(name); + double_map_.erase(name); + string_map_.erase(name); +} + + +int 
ParseOptions::NumArgs() const { + return positional_args_.size(); +} + +std::string ParseOptions::GetArg(int i) const { + // use KALDI_ERR if code error + if (i < 1 || i > static_cast(positional_args_.size())) + KALDI_ERR << "ParseOptions::GetArg, invalid index " << i; + return positional_args_[i - 1]; +} + +// We currently do not support any other options. +enum ShellType { kBash = 0 }; + +// This can be changed in the code if it ever does need to be changed (as it's +// unlikely that one compilation of this tool-set would use both shells). +static ShellType kShellType = kBash; + +// Returns true if we need to escape a string before putting it into +// a shell (mainly thinking of bash shell, but should work for others) +// This is for the convenience of the user so command-lines that are +// printed out by ParseOptions::Read (with --print-args=true) are +// paste-able into the shell and will run. If you use a different type of +// shell, it might be necessary to change this function. +// But it's mostly a cosmetic issue as it basically affects how +// the program echoes its command-line arguments to the screen. +static bool MustBeQuoted(const std::string &str, ShellType st) { + // Only Bash is supported (for the moment). + KALDI_ASSERT(st == kBash && "Invalid shell type."); + + const char *c = str.c_str(); + if (*c == '\0') { + return true; // Must quote empty string + } else { + const char *ok_chars[2]; + + // These seem not to be interpreted as long as there are no other "bad" + // characters involved (e.g. "," would be interpreted as part of something + // like a{b,c}, but not on its own. + ok_chars[kBash] = "[]~#^_-+=:.,/"; + + // Just want to make sure that a space character doesn't get automatically + // inserted here via an automated style-checking script, like it did before. + KALDI_ASSERT(!strchr(ok_chars[kBash], ' ')); + + for (; *c != '\0'; c++) { + // For non-alphanumeric characters we have a list of characters which + // are OK. 
All others are forbidden (this is easier since the shell + // interprets most non-alphanumeric characters). + if (!isalnum(*c)) { + const char *d; + for (d = ok_chars[st]; *d != '\0'; d++) if (*c == *d) break; + // If not alphanumeric or one of the "ok_chars", it must be escaped. + if (*d == '\0') return true; + } + } + return false; // The string was OK. No quoting or escaping. + } +} + +// Returns a quoted and escaped version of "str" +// which has previously been determined to need escaping. +// Our aim is to print out the command line in such a way that if it's +// pasted into a shell of ShellType "st" (only bash for now), it +// will get passed to the program in the same way. +static std::string QuoteAndEscape(const std::string &str, ShellType st) { + // Only Bash is supported (for the moment). + KALDI_ASSERT(st == kBash && "Invalid shell type."); + + // For now we use the following rules: + // In the normal case, we quote with single-quote "'", and to escape + // a single-quote we use the string: '\'' (interpreted as closing the + // single-quote, putting an escaped single-quote from the shell, and + // then reopening the single quote). + char quote_char = '\''; + const char *escape_str = "'\\''"; // e.g. echo 'a'\''b' returns a'b + + // If the string contains single-quotes that would need escaping this + // way, and we determine that the string could be safely double-quoted + // without requiring any escaping, then we double-quote the string. + // This is the case if the characters "`$\ do not appear in the string. + // e.g. see http://www.redhat.com/mirrors/LDP/LDP/abs/html/quotingvar.html + const char *c_str = str.c_str(); + if (strchr(c_str, '\'') && !strpbrk(c_str, "\"`$\\")) { + quote_char = '"'; + escape_str = "\\\""; // should never be accessed. 
+ } + + char buf[2]; + buf[1] = '\0'; + + buf[0] = quote_char; + std::string ans = buf; + const char *c = str.c_str(); + for (;*c != '\0'; c++) { + if (*c == quote_char) { + ans += escape_str; + } else { + buf[0] = *c; + ans += buf; + } + } + buf[0] = quote_char; + ans += buf; + return ans; +} + +// static function +std::string ParseOptions::Escape(const std::string &str) { + return MustBeQuoted(str, kShellType) ? QuoteAndEscape(str, kShellType) : str; +} + + +int ParseOptions::Read(int argc, const char *const argv[]) { + argc_ = argc; + argv_ = argv; + std::string key, value; + int i; + if (argc > 0) { + // set global "const char*" g_program_name (name of the program) + // so it can be printed out in error messages; + // it's useful because often the stderr of different programs will + // be mixed together in the same log file. +#ifdef _MSC_VER + const char *c = strrchr(argv[0], '\\'); +#else + const char *c = strrchr(argv[0], '/'); +#endif + SetProgramName(c == NULL ? argv[0] : c + 1); + } + // first pass: look for config parameter, look for priority + for (i = 1; i < argc; i++) { + if (std::strncmp(argv[i], "--", 2) == 0) { + if (std::strcmp(argv[i], "--") == 0) { + // a lone "--" marks the end of named options + break; + } + bool has_equal_sign; + SplitLongArg(argv[i], &key, &value, &has_equal_sign); + NormalizeArgName(&key); + Trim(&value); + if (key.compare("config") == 0) { + ReadConfigFile(value); + } + if (key.compare("help") == 0) { + PrintUsage(); + exit(0); + } + } + } + bool double_dash_seen = false; + // second pass: add the command line options + for (i = 1; i < argc; i++) { + if (std::strncmp(argv[i], "--", 2) == 0) { + if (std::strcmp(argv[i], "--") == 0) { + // A lone "--" marks the end of named options. 
+ // Skip that option and break the processing of named options + i += 1; + double_dash_seen = true; + break; + } + bool has_equal_sign; + SplitLongArg(argv[i], &key, &value, &has_equal_sign); + NormalizeArgName(&key); + Trim(&value); + if (!SetOption(key, value, has_equal_sign)) { + PrintUsage(true); + KALDI_ERR << "Invalid option " << argv[i]; + } + } else { + break; + } + } + + // process remaining arguments as positional + for (; i < argc; i++) { + if ((std::strcmp(argv[i], "--") == 0) && !double_dash_seen) { + double_dash_seen = true; + } else { + positional_args_.push_back(std::string(argv[i])); + } + } + + // if the user did not suppress this with --print-args = false.... + if (print_args_) { + std::ostringstream strm; + for (int j = 0; j < argc; j++) + strm << Escape(argv[j]) << " "; + strm << '\n'; + std::cerr << strm.str() << std::flush; + } + return i; +} + + +void ParseOptions::PrintUsage(bool print_command_line) { + std::cerr << '\n' << usage_ << '\n'; + DocMapType::iterator it; + // first we print application-specific options + bool app_specific_header_printed = false; + for (it = doc_map_.begin(); it != doc_map_.end(); ++it) { + if (it->second.is_standard_ == false) { // application-specific option + if (app_specific_header_printed == false) { // header was not yet printed + std::cerr << "Options:" << '\n'; + app_specific_header_printed = true; + } + std::cerr << " --" << std::setw(25) << std::left << it->second.name_ + << " : " << it->second.use_msg_ << '\n'; + } + } + if (app_specific_header_printed == true) { + std::cerr << '\n'; + } + + // then the standard options + std::cerr << "Standard options:" << '\n'; + for (it = doc_map_.begin(); it != doc_map_.end(); ++it) { + if (it->second.is_standard_ == true) { // we have standard option + std::cerr << " --" << std::setw(25) << std::left << it->second.name_ + << " : " << it->second.use_msg_ << '\n'; + } + } + std::cerr << '\n'; + if (print_command_line) { + std::ostringstream strm; + strm << "Command 
line was: "; + for (int j = 0; j < argc_; j++) + strm << Escape(argv_[j]) << " "; + strm << '\n'; + std::cerr << strm.str() << std::flush; + } +} + +void ParseOptions::PrintConfig(std::ostream &os) { + os << '\n' << "[[ Configuration of UI-Registered options ]]" << '\n'; + std::string key; + DocMapType::iterator it; + for (it = doc_map_.begin(); it != doc_map_.end(); ++it) { + key = it->first; + os << it->second.name_ << " = "; + if (bool_map_.end() != bool_map_.find(key)) { + os << (*bool_map_[key] ? "true" : "false"); + } else if (int_map_.end() != int_map_.find(key)) { + os << (*int_map_[key]); + } else if (uint_map_.end() != uint_map_.find(key)) { + os << (*uint_map_[key]); + } else if (float_map_.end() != float_map_.find(key)) { + os << (*float_map_[key]); + } else if (double_map_.end() != double_map_.find(key)) { + os << (*double_map_[key]); + } else if (string_map_.end() != string_map_.find(key)) { + os << "'" << *string_map_[key] << "'"; + } else { + KALDI_ERR << "PrintConfig: unrecognized option " << key << "[code error]"; + } + os << '\n'; + } + os << '\n'; +} + + +void ParseOptions::ReadConfigFile(const std::string &filename) { + std::ifstream is(filename.c_str(), std::ifstream::in); + if (!is.good()) { + KALDI_ERR << "Cannot open config file: " << filename; + } + + std::string line, key, value; + int32 line_number = 0; + while (std::getline(is, line)) { + line_number++; + // trim out the comments + size_t pos; + if ((pos = line.find_first_of('#')) != std::string::npos) { + line.erase(pos); + } + // skip empty lines + Trim(&line); + if (line.length() == 0) continue; + + if (line.substr(0, 2) != "--") { + KALDI_ERR << "Reading config file " << filename + << ": line " << line_number << " does not look like a line " + << "from a Kaldi command-line program's config file: should " + << "be of the form --x=y. 
Note: config files intended to " + << "be sourced by shell scripts lack the '--'."; + } + + // parse option + bool has_equal_sign; + SplitLongArg(line, &key, &value, &has_equal_sign); + NormalizeArgName(&key); + Trim(&value); + if (!SetOption(key, value, has_equal_sign)) { + PrintUsage(true); + KALDI_ERR << "Invalid option " << line << " in config file " << filename; + } + } +} + + + +void ParseOptions::SplitLongArg(const std::string &in, + std::string *key, + std::string *value, + bool *has_equal_sign) { + KALDI_ASSERT(in.substr(0, 2) == "--"); // precondition. + size_t pos = in.find_first_of('=', 0); + if (pos == std::string::npos) { // we allow --option for bools + // defaults to empty. We handle this differently in different cases. + *key = in.substr(2, in.size()-2); // 2 because starts with --. + *value = ""; + *has_equal_sign = false; + } else if (pos == 2) { // we also don't allow empty keys: --=value + PrintUsage(true); + KALDI_ERR << "Invalid option (no key): " << in; + } else { // normal case: --option=value + *key = in.substr(2, pos-2); // 2 because starts with --. 
+ *value = in.substr(pos + 1); + *has_equal_sign = true; + } +} + + +void ParseOptions::NormalizeArgName(std::string *str) { + std::string out; + std::string::iterator it; + + for (it = str->begin(); it != str->end(); ++it) { + if (*it == '_') + out += '-'; // convert _ to - + else + out += std::tolower(*it); + } + *str = out; + + KALDI_ASSERT(str->length() > 0); +} + + + + +bool ParseOptions::SetOption(const std::string &key, + const std::string &value, + bool has_equal_sign) { + if (bool_map_.end() != bool_map_.find(key)) { + if (has_equal_sign && value == "") + KALDI_ERR << "Invalid option --" << key << "="; + *(bool_map_[key]) = ToBool(value); + } else if (int_map_.end() != int_map_.find(key)) { + *(int_map_[key]) = ToInt(value); + } else if (uint_map_.end() != uint_map_.find(key)) { + *(uint_map_[key]) = ToUint(value); + } else if (float_map_.end() != float_map_.find(key)) { + *(float_map_[key]) = ToFloat(value); + } else if (double_map_.end() != double_map_.find(key)) { + *(double_map_[key]) = ToDouble(value); + } else if (string_map_.end() != string_map_.find(key)) { + if (!has_equal_sign) + KALDI_ERR << "Invalid option --" << key + << " (option format is --x=y)."; + *(string_map_[key]) = value; + } else { + return false; + } + return true; +} + + + +bool ParseOptions::ToBool(std::string str) { + std::transform(str.begin(), str.end(), str.begin(), ::tolower); + + // allow "" as a valid option for "true", so that --x is the same as --x=true + if ((str.compare("true") == 0) || (str.compare("t") == 0) + || (str.compare("1") == 0) || (str.compare("") == 0)) { + return true; + } + if ((str.compare("false") == 0) || (str.compare("f") == 0) + || (str.compare("0") == 0)) { + return false; + } + // if it is neither true nor false: + PrintUsage(true); + KALDI_ERR << "Invalid format for boolean argument [expected true or false]: " + << str; + return false; // never reached +} + + +int32 ParseOptions::ToInt(const std::string &str) { + int32 ret; + if 
(!ConvertStringToInteger(str, &ret)) + KALDI_ERR << "Invalid integer option \"" << str << "\""; + return ret; +} + +uint32 ParseOptions::ToUint(const std::string &str) { + uint32 ret; + if (!ConvertStringToInteger(str, &ret)) + KALDI_ERR << "Invalid integer option \"" << str << "\""; + return ret; +} + +float ParseOptions::ToFloat(const std::string &str) { + float ret; + if (!ConvertStringToReal(str, &ret)) + KALDI_ERR << "Invalid floating-point option \"" << str << "\""; + return ret; +} + +double ParseOptions::ToDouble(const std::string &str) { + double ret; + if (!ConvertStringToReal(str, &ret)) + KALDI_ERR << "Invalid floating-point option \"" << str << "\""; + return ret; +} + +// instantiate templates +template void ParseOptions::RegisterTmpl(const std::string &name, bool *ptr, + const std::string &doc); +template void ParseOptions::RegisterTmpl(const std::string &name, int32 *ptr, + const std::string &doc); +template void ParseOptions::RegisterTmpl(const std::string &name, uint32 *ptr, + const std::string &doc); +template void ParseOptions::RegisterTmpl(const std::string &name, float *ptr, + const std::string &doc); +template void ParseOptions::RegisterTmpl(const std::string &name, double *ptr, + const std::string &doc); +template void ParseOptions::RegisterTmpl(const std::string &name, + std::string *ptr, const std::string &doc); + +template void ParseOptions::RegisterStandard(const std::string &name, + bool *ptr, + const std::string &doc); +template void ParseOptions::RegisterStandard(const std::string &name, + int32 *ptr, + const std::string &doc); +template void ParseOptions::RegisterStandard(const std::string &name, + uint32 *ptr, + const std::string &doc); +template void ParseOptions::RegisterStandard(const std::string &name, + float *ptr, + const std::string &doc); +template void ParseOptions::RegisterStandard(const std::string &name, + double *ptr, + const std::string &doc); +template void ParseOptions::RegisterStandard(const std::string &name, + 
std::string *ptr, + const std::string &doc); + +template void ParseOptions::RegisterCommon(const std::string &name, + bool *ptr, + const std::string &doc, bool is_standard); +template void ParseOptions::RegisterCommon(const std::string &name, + int32 *ptr, + const std::string &doc, bool is_standard); +template void ParseOptions::RegisterCommon(const std::string &name, + uint32 *ptr, + const std::string &doc, bool is_standard); +template void ParseOptions::RegisterCommon(const std::string &name, + float *ptr, + const std::string &doc, bool is_standard); +template void ParseOptions::RegisterCommon(const std::string &name, + double *ptr, + const std::string &doc, bool is_standard); +template void ParseOptions::RegisterCommon(const std::string &name, + std::string *ptr, + const std::string &doc, bool is_standard); + +} // namespace kaldi diff --git a/speechx/speechx/kaldi/util/parse-options.h b/speechx/speechx/kaldi/util/parse-options.h new file mode 100644 index 00000000..5e83f996 --- /dev/null +++ b/speechx/speechx/kaldi/util/parse-options.h @@ -0,0 +1,264 @@ +// util/parse-options.h + +// Copyright 2009-2011 Karel Vesely; Microsoft Corporation; +// Saarland University (Author: Arnab Ghoshal); +// Copyright 2012-2013 Frantisek Skala; Arnab Ghoshal + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef KALDI_UTIL_PARSE_OPTIONS_H_ +#define KALDI_UTIL_PARSE_OPTIONS_H_ + +#include +#include +#include + +#include "base/kaldi-common.h" +#include "util/options-itf.h" + +namespace kaldi { + +/// The class ParseOptions is for parsing command-line options; see +/// \ref parse_options for more documentation. +class ParseOptions : public OptionsItf { + public: + explicit ParseOptions(const char *usage) : + print_args_(true), help_(false), usage_(usage), argc_(0), argv_(NULL), + prefix_(""), other_parser_(NULL) { +#if !defined(_MSC_VER) && !defined(__CYGWIN__) // This is just a convenient place to set the stderr to line + setlinebuf(stderr); // buffering mode, since it's called at program start. +#endif // This helps ensure different programs' output is not mixed up. + RegisterStandard("config", &config_, "Configuration file to read (this " + "option may be repeated)"); + RegisterStandard("print-args", &print_args_, + "Print the command line arguments (to stderr)"); + RegisterStandard("help", &help_, "Print out usage message"); + RegisterStandard("verbose", &g_kaldi_verbose_level, + "Verbose level (higher->more logging)"); + } + + /** + This is a constructor for the special case where some options are + registered with a prefix to avoid conflicts. The object thus created will + only be used temporarily to register an options class with the original + options parser (which is passed as the *other pointer) using the given + prefix. It should not be used for any other purpose, and the prefix must + not be the empty string. It seems to be the least bad way of implementing + options with prefixes at this point. + Example of usage is: + ParseOptions po; // original ParseOptions object + ParseOptions po_mfcc("mfcc", &po); // object with prefix. 
+ MfccOptions mfcc_opts; + mfcc_opts.Register(&po_mfcc); + The options will now get registered as, e.g., --mfcc.frame-shift=10.0 + instead of just --frame-shift=10.0 + */ + ParseOptions(const std::string &prefix, OptionsItf *other); + + ~ParseOptions() {} + + // Methods from the interface + void Register(const std::string &name, + bool *ptr, const std::string &doc); + void Register(const std::string &name, + int32 *ptr, const std::string &doc); + void Register(const std::string &name, + uint32 *ptr, const std::string &doc); + void Register(const std::string &name, + float *ptr, const std::string &doc); + void Register(const std::string &name, + double *ptr, const std::string &doc); + void Register(const std::string &name, + std::string *ptr, const std::string &doc); + + /// If called after registering an option and before calling + /// Read(), disables that option from being used. Will crash + /// at runtime if that option had not been registered. + void DisableOption(const std::string &name); + + /// This one is used for registering standard parameters of all the programs + template + void RegisterStandard(const std::string &name, + T *ptr, const std::string &doc); + + /** + Parses the command line options and fills the ParseOptions-registered + variables. This must be called after all the variables were registered!!! + + Initially the variables have implicit values, + then the config file values are set-up, + finally the command line values given. + Returns the first position in argv that was not used. + [typically not useful: use NumParams() and GetParam(). ] + */ + int Read(int argc, const char *const *argv); + + /// Prints the usage documentation [provided in the constructor]. + void PrintUsage(bool print_command_line = false); + /// Prints the actual configuration of all the registered variables + void PrintConfig(std::ostream &os); + + /// Reads the options values from a config file. Must be called after + /// registering all options. 
This is usually used internally after the + /// standard --config option is used, but it may also be called from a + /// program. + void ReadConfigFile(const std::string &filename); + + /// Number of positional parameters (c.f. argc-1). + int NumArgs() const; + + /// Returns one of the positional parameters; 1-based indexing for argc/argv + /// compatibility. Will crash if param is not >=1 and <=NumArgs(). + std::string GetArg(int param) const; + + std::string GetOptArg(int param) const { + return (param <= NumArgs() ? GetArg(param) : ""); + } + + /// The following function will return a possibly quoted and escaped + /// version of "str", according to the current shell. Currently + /// this is just hardwired to bash. It's useful for debug output. + static std::string Escape(const std::string &str); + + private: + /// Template to register various variable types, + /// used for program-specific parameters + template + void RegisterTmpl(const std::string &name, T *ptr, const std::string &doc); + + // Following functions do just the datatype-specific part of the job + /// Register boolean variable + void RegisterSpecific(const std::string &name, const std::string &idx, + bool *b, const std::string &doc, bool is_standard); + /// Register int32 variable + void RegisterSpecific(const std::string &name, const std::string &idx, + int32 *i, const std::string &doc, bool is_standard); + /// Register unsinged int32 variable + void RegisterSpecific(const std::string &name, const std::string &idx, + uint32 *u, + const std::string &doc, bool is_standard); + /// Register float variable + void RegisterSpecific(const std::string &name, const std::string &idx, + float *f, const std::string &doc, bool is_standard); + /// Register double variable [useful as we change BaseFloat type]. 
+ void RegisterSpecific(const std::string &name, const std::string &idx, + double *f, const std::string &doc, bool is_standard); + /// Register string variable + void RegisterSpecific(const std::string &name, const std::string &idx, + std::string *s, const std::string &doc, + bool is_standard); + + /// Does the actual job for both kinds of parameters + /// Does the common part of the job for all datatypes, + /// then calls RegisterSpecific + template + void RegisterCommon(const std::string &name, + T *ptr, const std::string &doc, bool is_standard); + + /// Set option with name "key" to "value"; will crash if can't do it. + /// "has_equal_sign" is used to allow --x for a boolean option x, + /// and --y=, for a string option y. + bool SetOption(const std::string &key, const std::string &value, + bool has_equal_sign); + + bool ToBool(std::string str); + int32 ToInt(const std::string &str); + uint32 ToUint(const std::string &str); + float ToFloat(const std::string &str); + double ToDouble(const std::string &str); + + // maps for option variables + std::map bool_map_; + std::map int_map_; + std::map uint_map_; + std::map float_map_; + std::map double_map_; + std::map string_map_; + + /** + Structure for options' documentation + */ + struct DocInfo { + DocInfo() {} + DocInfo(const std::string &name, const std::string &usemsg) + : name_(name), use_msg_(usemsg), is_standard_(false) {} + DocInfo(const std::string &name, const std::string &usemsg, + bool is_standard) + : name_(name), use_msg_(usemsg), is_standard_(is_standard) {} + + std::string name_; + std::string use_msg_; + bool is_standard_; + }; + typedef std::map DocMapType; + DocMapType doc_map_; ///< map for the documentation + + bool print_args_; ///< variable for the implicit --print-args parameter + bool help_; ///< variable for the implicit --help parameter + std::string config_; ///< variable for the implicit --config parameter + std::vector positional_args_; + const char *usage_; + int argc_; + const char 
*const *argv_; + + /// These members are not normally used. They are only used when the object + /// is constructed with a prefix + std::string prefix_; + OptionsItf *other_parser_; + protected: + /// SplitLongArg parses an argument of the form --a=b, --a=, or --a, + /// and sets "has_equal_sign" to true if an equals-sign was parsed.. + /// this is needed in order to correctly allow --x for a boolean option + /// x, and --y= for a string option y, and to disallow --x= and --y. + void SplitLongArg(const std::string &in, std::string *key, + std::string *value, bool *has_equal_sign); + + void NormalizeArgName(std::string *str); +}; + +/// This template is provided for convenience in reading config classes from +/// files; this is not the standard way to read configuration options, but may +/// occasionally be needed. This function assumes the config has a function +/// "void Register(OptionsItf *opts)" which it can call to register the +/// ParseOptions object. +template void ReadConfigFromFile(const std::string &config_filename, + C *c) { + std::ostringstream usage_str; + usage_str << "Parsing config from " + << "from '" << config_filename << "'"; + ParseOptions po(usage_str.str().c_str()); + c->Register(&po); + po.ReadConfigFile(config_filename); +} + +/// This variant of the template ReadConfigFromFile is for if you need to read +/// two config classes from the same file. 
+template void ReadConfigsFromFile(const std::string &conf, + C1 *c1, C2 *c2) { + std::ostringstream usage_str; + usage_str << "Parsing config from " + << "from '" << conf << "'"; + ParseOptions po(usage_str.str().c_str()); + c1->Register(&po); + c2->Register(&po); + po.ReadConfigFile(conf); +} + + + +} // namespace kaldi + +#endif // KALDI_UTIL_PARSE_OPTIONS_H_ diff --git a/speechx/speechx/kaldi/util/simple-io-funcs.cc b/speechx/speechx/kaldi/util/simple-io-funcs.cc new file mode 100644 index 00000000..cb732a10 --- /dev/null +++ b/speechx/speechx/kaldi/util/simple-io-funcs.cc @@ -0,0 +1,81 @@ +// util/simple-io-funcs.cc + +// Copyright 2009-2011 Microsoft Corporation + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. +#include "util/simple-io-funcs.h" +#include "util/text-utils.h" + +namespace kaldi { + +bool WriteIntegerVectorSimple(const std::string &wxfilename, + const std::vector &list) { + kaldi::Output ko; + // false, false is: text-mode, no Kaldi header. 
+ if (!ko.Open(wxfilename, false, false)) return false; + for (size_t i = 0; i < list.size(); i++) ko.Stream() << list[i] << '\n'; + return ko.Close(); +} + +bool ReadIntegerVectorSimple(const std::string &rxfilename, + std::vector *list) { + kaldi::Input ki; + if (!ki.OpenTextMode(rxfilename)) return false; + std::istream &is = ki.Stream(); + int32 i; + list->clear(); + while ( !(is >> i).fail() ) + list->push_back(i); + is >> std::ws; + return is.eof(); // should be eof, or junk at end of file. +} + +bool WriteIntegerVectorVectorSimple(const std::string &wxfilename, + const std::vector > &list) { + kaldi::Output ko; + // false, false is: text-mode, no Kaldi header. + if (!ko.Open(wxfilename, false, false)) return false; + std::ostream &os = ko.Stream(); + for (size_t i = 0; i < list.size(); i++) { + for (size_t j = 0; j < list[i].size(); j++) { + os << list[i][j]; + if (j+1 < list[i].size()) os << ' '; + } + os << '\n'; + } + return ko.Close(); +} + +bool ReadIntegerVectorVectorSimple(const std::string &rxfilename, + std::vector > *list) { + kaldi::Input ki; + if (!ki.OpenTextMode(rxfilename)) return false; + std::istream &is = ki.Stream(); + list->clear(); + std::string line; + while (std::getline(is, line)) { + std::vector v; + if (!SplitStringToIntegers(line, " \t\r", true, &v)) { + list->clear(); + return false; + } + list->push_back(v); + } + return is.eof(); // if we're not at EOF, something weird happened. 
+} + + +} // end namespace kaldi diff --git a/speechx/speechx/kaldi/util/simple-io-funcs.h b/speechx/speechx/kaldi/util/simple-io-funcs.h new file mode 100644 index 00000000..30b90acb --- /dev/null +++ b/speechx/speechx/kaldi/util/simple-io-funcs.h @@ -0,0 +1,63 @@ +// util/simple-io-funcs.h + +// Copyright 2009-2011 Microsoft Corporation; Jan Silovsky + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. +#ifndef KALDI_UTIL_SIMPLE_IO_FUNCS_H_ +#define KALDI_UTIL_SIMPLE_IO_FUNCS_H_ + +#include +#include +#include "util/kaldi-io.h" + +// This header contains some utilities for reading some common, simple text +// formats:integers in files, one per line, and integers in files, possibly +// multiple per line. these are not really fully native Kaldi formats; they are +// mostly for small files that might be generated by scripts, and can be read +// all at one time. for longer files of this type, we would probably use the +// Table code. + +namespace kaldi { + +/// WriteToList attempts to write this list of integers, one per line, +/// to the given file, in text format. +/// returns true if succeeded. +bool WriteIntegerVectorSimple(const std::string &wxfilename, + const std::vector &v); + +/// ReadFromList attempts to read this list of integers, one per line, +/// from the given file, in text format. +/// returns true if succeeded. 
+bool ReadIntegerVectorSimple(const std::string &rxfilename, + std::vector *v); + +// This is a file format like: +// 1 2 +// 3 +// +// 4 5 6 +// etc. +bool WriteIntegerVectorVectorSimple(const std::string &wxfilename, + const std::vector > &v); + +bool ReadIntegerVectorVectorSimple(const std::string &rxfilename, + std::vector > *v); + + +} // end namespace kaldi. + + +#endif // KALDI_UTIL_SIMPLE_IO_FUNCS_H_ diff --git a/speechx/speechx/kaldi/util/simple-options.cc b/speechx/speechx/kaldi/util/simple-options.cc new file mode 100644 index 00000000..592500e2 --- /dev/null +++ b/speechx/speechx/kaldi/util/simple-options.cc @@ -0,0 +1,184 @@ +// util/simple-options.cc + +// Copyright 2013 Tanel Alumae, Tallinn University of Technology + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ + +#include "util/simple-options.h" + + +namespace kaldi { + +void SimpleOptions::Register(const std::string &name, + bool *value, + const std::string &doc) { + bool_map_[name] = value; + option_info_list_.push_back(std::make_pair(name, OptionInfo(doc, kBool))); +} + +void SimpleOptions::Register(const std::string &name, + int32 *value, + const std::string &doc) { + int_map_[name] = value; + option_info_list_.push_back(std::make_pair(name, OptionInfo(doc, kInt32))); +} + +void SimpleOptions::Register(const std::string &name, + uint32 *value, + const std::string &doc) { + uint_map_[name] = value; + option_info_list_.push_back(std::make_pair(name, OptionInfo(doc, kUint32))); +} + +void SimpleOptions::Register(const std::string &name, + float *value, + const std::string &doc) { + float_map_[name] = value; + option_info_list_.push_back(std::make_pair(name, OptionInfo(doc, kFloat))); +} + +void SimpleOptions::Register(const std::string &name, + double *value, + const std::string &doc) { + double_map_[name] = value; + option_info_list_.push_back(std::make_pair(name, OptionInfo(doc, kDouble))); +} + +void SimpleOptions::Register(const std::string &name, + std::string *value, + const std::string &doc) { + string_map_[name] = value; + option_info_list_.push_back(std::make_pair(name, OptionInfo(doc, kString))); +} + +template +static bool SetOptionImpl(const std::string &key, const T &value, + std::map &some_map) { + if (some_map.end() != some_map.find(key)) { + *(some_map[key]) = value; + return true; + } + return false; +} + +bool SimpleOptions::SetOption(const std::string &key, const bool &value) { + return SetOptionImpl(key, value, bool_map_); +} + +bool SimpleOptions::SetOption(const std::string &key, const int32 &value) { + if (!SetOptionImpl(key, value, int_map_)) { + if (!SetOptionImpl(key, static_cast(value), uint_map_)) { + return false; + } + } + return true; +} + +bool SimpleOptions::SetOption(const std::string &key, const uint32 &value) { + if 
(!SetOptionImpl(key, value, uint_map_)) { + if (!SetOptionImpl(key, static_cast(value), int_map_)) { + return false; + } + } + return true; +} + +bool SimpleOptions::SetOption(const std::string &key, const float &value) { + if (!SetOptionImpl(key, value, float_map_)) { + if (!SetOptionImpl(key, static_cast(value), double_map_)) { + return false; + } + } + return true; +} + +bool SimpleOptions::SetOption(const std::string &key, const double &value) { + if (!SetOptionImpl(key, value, double_map_)) { + if (!SetOptionImpl(key, static_cast(value), float_map_)) { + return false; + } + } + return true; +} + +bool SimpleOptions::SetOption(const std::string &key, + const std::string &value) { + return SetOptionImpl(key, value, string_map_); +} + +bool SimpleOptions::SetOption(const std::string &key, const char *value) { + std::string str_value = std::string(value); + return SetOptionImpl(key, str_value, string_map_); +} + + +template +static bool GetOptionImpl(const std::string &key, T *value, + std::map &some_map) { + typename std::map::iterator it = some_map.find(key); + if (it != some_map.end()) { + *value = *(it->second); + return true; + } + return false; +} + +bool SimpleOptions::GetOption(const std::string &key, bool *value) { + return GetOptionImpl(key, value, bool_map_); +} + +bool SimpleOptions::GetOption(const std::string &key, int32 *value) { + return GetOptionImpl(key, value, int_map_); +} + +bool SimpleOptions::GetOption(const std::string &key, uint32 *value) { + return GetOptionImpl(key, value, uint_map_); +} + +bool SimpleOptions::GetOption(const std::string &key, float *value) { + return GetOptionImpl(key, value, float_map_); +} + +bool SimpleOptions::GetOption(const std::string &key, double *value) { + return GetOptionImpl(key, value, double_map_); +} + +bool SimpleOptions::GetOption(const std::string &key, std::string *value) { + return GetOptionImpl(key, value, string_map_); +} + +std::vector > +SimpleOptions::GetOptionInfoList() { + return 
option_info_list_; +} + +bool SimpleOptions::GetOptionType(const std::string &key, OptionType *type) { + for (std::vector >::iterator dx = option_info_list_.begin(); + dx != option_info_list_.end(); dx++) { + std::pair info_pair = (*dx); + if (info_pair.first == key) { + *type = info_pair.second.type; + return true; + } + } + return false; +} + + + +} // namespace kaldi diff --git a/speechx/speechx/kaldi/util/simple-options.h b/speechx/speechx/kaldi/util/simple-options.h new file mode 100644 index 00000000..f301c7d6 --- /dev/null +++ b/speechx/speechx/kaldi/util/simple-options.h @@ -0,0 +1,113 @@ +// util/simple-options.h + +// Copyright 2013 Tanel Alumae, Tallinn University of Technology + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_UTIL_SIMPLE_OPTIONS_H_ +#define KALDI_UTIL_SIMPLE_OPTIONS_H_ + +#include +#include +#include + +#include "base/kaldi-common.h" +#include "util/options-itf.h" + +namespace kaldi { + + +/// The class SimpleOptions is an implementation of OptionsItf that allows +/// setting and getting option values programmatically, i.e., via getter +/// and setter methods. It doesn't provide any command line parsing +/// functionality. +/// The class ParseOptions should be used for command-line options. 
+class SimpleOptions : public OptionsItf { + public: + SimpleOptions() { + } + + virtual ~SimpleOptions() { + } + + // Methods from the interface + void Register(const std::string &name, bool *ptr, const std::string &doc); + void Register(const std::string &name, int32 *ptr, const std::string &doc); + void Register(const std::string &name, uint32 *ptr, const std::string &doc); + void Register(const std::string &name, float *ptr, const std::string &doc); + void Register(const std::string &name, double *ptr, const std::string &doc); + void Register(const std::string &name, std::string *ptr, + const std::string &doc); + + // set option with the specified key, return true if successful + bool SetOption(const std::string &key, const bool &value); + bool SetOption(const std::string &key, const int32 &value); + bool SetOption(const std::string &key, const uint32 &value); + bool SetOption(const std::string &key, const float &value); + bool SetOption(const std::string &key, const double &value); + bool SetOption(const std::string &key, const std::string &value); + bool SetOption(const std::string &key, const char* value); + + // get option with the specified key and put to 'value', + // return true if successful + bool GetOption(const std::string &key, bool *value); + bool GetOption(const std::string &key, int32 *value); + bool GetOption(const std::string &key, uint32 *value); + bool GetOption(const std::string &key, float *value); + bool GetOption(const std::string &key, double *value); + bool GetOption(const std::string &key, std::string *value); + + enum OptionType { + kBool, + kInt32, + kUint32, + kFloat, + kDouble, + kString + }; + + struct OptionInfo { + OptionInfo(const std::string &doc, OptionType type) : + doc(doc), type(type) { + } + std::string doc; + OptionType type; + }; + + std::vector > GetOptionInfoList(); + + /* + * Puts the type of the option with name 'key' in the argument 'type'. + * Return true if such option is found, false otherwise. 
+ */ + bool GetOptionType(const std::string &key, OptionType *type); + + private: + + std::vector > option_info_list_; + + // maps for option variables + std::map bool_map_; + std::map int_map_; + std::map uint_map_; + std::map float_map_; + std::map double_map_; + std::map string_map_; +}; + +} // namespace kaldi + +#endif // KALDI_UTIL_SIMPLE_OPTIONS_H_ diff --git a/speechx/speechx/kaldi/util/stl-utils.h b/speechx/speechx/kaldi/util/stl-utils.h new file mode 100644 index 00000000..647073a2 --- /dev/null +++ b/speechx/speechx/kaldi/util/stl-utils.h @@ -0,0 +1,317 @@ +// util/stl-utils.h + +// Copyright 2009-2011 Microsoft Corporation; Saarland University + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_UTIL_STL_UTILS_H_ +#define KALDI_UTIL_STL_UTILS_H_ + +#include +#include +using std::unordered_map; +using std::unordered_set; + +#include +#include +#include +#include +#include +#include "base/kaldi-common.h" + +namespace kaldi { + +/// Sorts and uniq's (removes duplicates) from a vector. +template +inline void SortAndUniq(std::vector *vec) { + std::sort(vec->begin(), vec->end()); + vec->erase(std::unique(vec->begin(), vec->end()), vec->end()); +} + + +/// Returns true if the vector is sorted. 
+template +inline bool IsSorted(const std::vector &vec) { + typename std::vector::const_iterator iter = vec.begin(), end = vec.end(); + if (iter == end) return true; + while (1) { + typename std::vector::const_iterator next_iter = iter; + ++next_iter; + if (next_iter == end) return true; // end of loop and nothing out of order + if (*next_iter < *iter) return false; + iter = next_iter; + } +} + + +/// Returns true if the vector is sorted and contains each element +/// only once. +template +inline bool IsSortedAndUniq(const std::vector &vec) { + typename std::vector::const_iterator iter = vec.begin(), end = vec.end(); + if (iter == end) return true; + while (1) { + typename std::vector::const_iterator next_iter = iter; + ++next_iter; + if (next_iter == end) return true; // end of loop and nothing out of order + if (*next_iter <= *iter) return false; + iter = next_iter; + } +} + + +/// Removes duplicate elements from a sorted list. +template +inline void Uniq(std::vector *vec) { // must be already sorted. + KALDI_PARANOID_ASSERT(IsSorted(*vec)); + KALDI_ASSERT(vec); + vec->erase(std::unique(vec->begin(), vec->end()), vec->end()); +} + +/// Copies the elements of a set to a vector. +template +void CopySetToVector(const std::set &s, std::vector *v) { + // copies members of s into v, in sorted order from lowest to highest + // (because the set was in sorted order). 
+ KALDI_ASSERT(v != NULL); + v->resize(s.size()); + typename std::set::const_iterator siter = s.begin(), send = s.end(); + typename std::vector::iterator viter = v->begin(); + for (; siter != send; ++siter, ++viter) { + *viter = *siter; + } +} + +template +void CopySetToVector(const unordered_set &s, std::vector *v) { + KALDI_ASSERT(v != NULL); + v->resize(s.size()); + typename unordered_set::const_iterator siter = s.begin(), send = s.end(); + typename std::vector::iterator viter = v->begin(); + for (; siter != send; ++siter, ++viter) { + *viter = *siter; + } +} + + +/// Copies the (key, value) pairs in a map to a vector of pairs. +template +void CopyMapToVector(const std::map &m, + std::vector > *v) { + KALDI_ASSERT(v != NULL); + v->resize(m.size()); + typename std::map::const_iterator miter = m.begin(), mend = m.end(); + typename std::vector >::iterator viter = v->begin(); + for (; miter != mend; ++miter, ++viter) { + *viter = std::make_pair(miter->first, miter->second); + // do it like this because of const casting. + } +} + +/// Copies the keys in a map to a vector. +template +void CopyMapKeysToVector(const std::map &m, std::vector *v) { + KALDI_ASSERT(v != NULL); + v->resize(m.size()); + typename std::map::const_iterator miter = m.begin(), mend = m.end(); + typename std::vector::iterator viter = v->begin(); + for (; miter != mend; ++miter, ++viter) { + *viter = miter->first; + } +} + +/// Copies the values in a map to a vector. +template +void CopyMapValuesToVector(const std::map &m, std::vector *v) { + KALDI_ASSERT(v != NULL); + v->resize(m.size()); + typename std::map::const_iterator miter = m.begin(), mend = m.end(); + typename std::vector::iterator viter = v->begin(); + for (; miter != mend; ++miter, ++viter) { + *viter = miter->second; + } +} + +/// Copies the keys in a map to a set. 
+template +void CopyMapKeysToSet(const std::map &m, std::set *s) { + KALDI_ASSERT(s != NULL); + s->clear(); + typename std::map::const_iterator miter = m.begin(), mend = m.end(); + for (; miter != mend; ++miter) { + s->insert(s->end(), miter->first); + } +} + +/// Copies the values in a map to a set. +template +void CopyMapValuesToSet(const std::map &m, std::set *s) { + KALDI_ASSERT(s != NULL); + s->clear(); + typename std::map::const_iterator miter = m.begin(), mend = m.end(); + for (; miter != mend; ++miter) + s->insert(s->end(), miter->second); +} + + +/// Copies the contents of a vector to a set. +template +void CopyVectorToSet(const std::vector &v, std::set *s) { + KALDI_ASSERT(s != NULL); + s->clear(); + typename std::vector::const_iterator iter = v.begin(), end = v.end(); + for (; iter != end; ++iter) + s->insert(s->end(), *iter); + // s->end() is a hint in case v was sorted. will work regardless. +} + +/// Deletes any non-NULL pointers in the vector v, and sets +/// the corresponding entries of v to NULL +template +void DeletePointers(std::vector *v) { + KALDI_ASSERT(v != NULL); + typename std::vector::iterator iter = v->begin(), end = v->end(); + for (; iter != end; ++iter) { + if (*iter != NULL) { + delete *iter; + *iter = NULL; // set to NULL for extra safety. + } + } +} + +/// Returns true if the vector of pointers contains NULL pointers. +template +bool ContainsNullPointers(const std::vector &v) { + typename std::vector::const_iterator iter = v.begin(), end = v.end(); + for (; iter != end; ++iter) + if (*iter == static_cast (NULL)) return true; + return false; +} + +/// Copies the contents a vector of one type to a vector +/// of another type. +template +void CopyVectorToVector(const std::vector &vec_in, std::vector *vec_out) { + KALDI_ASSERT(vec_out != NULL); + vec_out->resize(vec_in.size()); + for (size_t i = 0; i < vec_in.size(); i++) + (*vec_out)[i] = static_cast (vec_in[i]); +} + +/// A hashing function-object for vectors. 
+template +struct VectorHasher { // hashing function for vector. + size_t operator()(const std::vector &x) const noexcept { + size_t ans = 0; + typename std::vector::const_iterator iter = x.begin(), end = x.end(); + for (; iter != end; ++iter) { + ans *= kPrime; + ans += *iter; + } + return ans; + } + VectorHasher() { // Check we're instantiated with an integer type. + KALDI_ASSERT_IS_INTEGER_TYPE(Int); + } + private: + static const int kPrime = 7853; +}; + +/// A hashing function-object for pairs of ints +template +struct PairHasher { // hashing function for pair + size_t operator()(const std::pair &x) const noexcept { + // 7853 was chosen at random from a list of primes. + return x.first + x.second * 7853; + } + PairHasher() { // Check we're instantiated with an integer type. + KALDI_ASSERT_IS_INTEGER_TYPE(Int1); + KALDI_ASSERT_IS_INTEGER_TYPE(Int2); + } +}; + + +/// A hashing function object for strings. +struct StringHasher { // hashing function for std::string + size_t operator()(const std::string &str) const noexcept { + size_t ans = 0, len = str.length(); + const char *c = str.c_str(), *end = c + len; + for (; c != end; c++) { + ans *= kPrime; + ans += *c; + } + return ans; + } + private: + static const int kPrime = 7853; +}; + +/// Reverses the contents of a vector. +template +inline void ReverseVector(std::vector *vec) { + KALDI_ASSERT(vec != NULL); + size_t sz = vec->size(); + for (size_t i = 0; i < sz/2; i++) + std::swap( (*vec)[i], (*vec)[sz-1-i]); +} + + +/// Comparator object for pairs that compares only the first pair. 
+template +struct CompareFirstMemberOfPair { + inline bool operator() (const std::pair &p1, + const std::pair &p2) { + return p1.first < p2.first; + } +}; + +/// For a vector of pair where I is an integer and F a floating-point or +/// integer type, this function sorts a vector of type vector > on +/// the I value and then merges elements with equal I values, summing these over +/// the F component and then removing any F component with zero value. This +/// is for where the vector of pairs represents a map from the integer to float +/// component, with an "adding" type of semantics for combining the elements. +template +inline void MergePairVectorSumming(std::vector > *vec) { + KALDI_ASSERT_IS_INTEGER_TYPE(I); + CompareFirstMemberOfPair c; + std::sort(vec->begin(), vec->end(), c); // sort on 1st element. + typename std::vector >::iterator out = vec->begin(), + in = vec->begin(), end = vec->end(); + // special case: while there is nothing to be changed, skip over + // initial input (avoids unnecessary copying). + while (in + 1 < end && in[0].first != in[1].first && in[0].second != 0.0) { + in++; + out++; + } + while (in < end) { + // We reach this point only at the first element of + // each stretch of identical .first elements. + *out = *in; + ++in; + while (in < end && in->first == out->first) { + out->second += in->second; // this is the merge operation. + ++in; + } + if (out->second != static_cast(0)) // Don't keep zero elements. 
+ out++; + } + vec->erase(out, end); +} + +} // namespace kaldi + +#endif // KALDI_UTIL_STL_UTILS_H_ diff --git a/speechx/speechx/kaldi/util/table-types.h b/speechx/speechx/kaldi/util/table-types.h new file mode 100644 index 00000000..efcdf1b5 --- /dev/null +++ b/speechx/speechx/kaldi/util/table-types.h @@ -0,0 +1,192 @@ +// util/table-types.h + +// Copyright 2009-2011 Microsoft Corporation + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_UTIL_TABLE_TYPES_H_ +#define KALDI_UTIL_TABLE_TYPES_H_ +#include "base/kaldi-common.h" +#include "util/kaldi-table.h" +#include "util/kaldi-holder.h" +#include "matrix/matrix-lib.h" + +namespace kaldi { + +// This header defines typedefs that are specific instantiations of +// the Table types. 
+ +/// \addtogroup table_types +/// @{ + +typedef TableWriter > > + BaseFloatMatrixWriter; +typedef SequentialTableReader > > + SequentialBaseFloatMatrixReader; +typedef RandomAccessTableReader > > + RandomAccessBaseFloatMatrixReader; +typedef RandomAccessTableReaderMapped > > + RandomAccessBaseFloatMatrixReaderMapped; + +typedef TableWriter > > + DoubleMatrixWriter; +typedef SequentialTableReader > > + SequentialDoubleMatrixReader; +typedef RandomAccessTableReader > > + RandomAccessDoubleMatrixReader; +typedef RandomAccessTableReaderMapped > > + RandomAccessDoubleMatrixReaderMapped; + +typedef TableWriter > + CompressedMatrixWriter; + +typedef TableWriter > > + BaseFloatVectorWriter; +typedef SequentialTableReader > > + SequentialBaseFloatVectorReader; +typedef RandomAccessTableReader > > + RandomAccessBaseFloatVectorReader; +typedef RandomAccessTableReaderMapped > > + RandomAccessBaseFloatVectorReaderMapped; + +typedef TableWriter > > + DoubleVectorWriter; +typedef SequentialTableReader > > + SequentialDoubleVectorReader; +typedef RandomAccessTableReader > > + RandomAccessDoubleVectorReader; + +typedef TableWriter > > + BaseFloatCuMatrixWriter; +typedef SequentialTableReader > > + SequentialBaseFloatCuMatrixReader; +typedef RandomAccessTableReader > > + RandomAccessBaseFloatCuMatrixReader; +typedef RandomAccessTableReaderMapped > > + RandomAccessBaseFloatCuMatrixReaderMapped; + +typedef TableWriter > > + DoubleCuMatrixWriter; +typedef SequentialTableReader > > + SequentialDoubleCuMatrixReader; +typedef RandomAccessTableReader > > + RandomAccessDoubleCuMatrixReader; +typedef RandomAccessTableReaderMapped > > + RandomAccessDoubleCuMatrixReaderMapped; + +typedef TableWriter > > + BaseFloatCuVectorWriter; +typedef SequentialTableReader > > + SequentialBaseFloatCuVectorReader; +typedef RandomAccessTableReader > > + RandomAccessBaseFloatCuVectorReader; +typedef RandomAccessTableReaderMapped > > + RandomAccessBaseFloatCuVectorReaderMapped; + +typedef TableWriter > > + 
DoubleCuVectorWriter; +typedef SequentialTableReader > > + SequentialDoubleCuVectorReader; +typedef RandomAccessTableReader > > + RandomAccessDoubleCuVectorReader; + + +typedef TableWriter > Int32Writer; +typedef SequentialTableReader > SequentialInt32Reader; +typedef RandomAccessTableReader > RandomAccessInt32Reader; + +typedef TableWriter > Int32VectorWriter; +typedef SequentialTableReader > + SequentialInt32VectorReader; +typedef RandomAccessTableReader > + RandomAccessInt32VectorReader; + +typedef TableWriter > Int32VectorVectorWriter; +typedef SequentialTableReader > + SequentialInt32VectorVectorReader; +typedef RandomAccessTableReader > + RandomAccessInt32VectorVectorReader; + +typedef TableWriter > Int32PairVectorWriter; +typedef SequentialTableReader > + SequentialInt32PairVectorReader; +typedef RandomAccessTableReader > + RandomAccessInt32PairVectorReader; + +typedef TableWriter > + BaseFloatPairVectorWriter; +typedef SequentialTableReader > + SequentialBaseFloatPairVectorReader; +typedef RandomAccessTableReader > + RandomAccessBaseFloatPairVectorReader; + +typedef TableWriter > BaseFloatWriter; +typedef SequentialTableReader > + SequentialBaseFloatReader; +typedef RandomAccessTableReader > + RandomAccessBaseFloatReader; +typedef RandomAccessTableReaderMapped > + RandomAccessBaseFloatReaderMapped; + +typedef TableWriter > DoubleWriter; +typedef SequentialTableReader > SequentialDoubleReader; +typedef RandomAccessTableReader > RandomAccessDoubleReader; + +typedef TableWriter > BoolWriter; +typedef SequentialTableReader > SequentialBoolReader; +typedef RandomAccessTableReader > RandomAccessBoolReader; + + + +/// TokenWriter is a writer specialized for std::string where the strings +/// are nonempty and whitespace-free. 
T == std::string +typedef TableWriter TokenWriter; +typedef SequentialTableReader SequentialTokenReader; +typedef RandomAccessTableReader RandomAccessTokenReader; + + +/// TokenVectorWriter is a writer specialized for sequences of +/// std::string where the strings are nonempty and whitespace-free. +/// T == std::vector +typedef TableWriter TokenVectorWriter; +// Ditto for SequentialTokenVectorReader. +typedef SequentialTableReader SequentialTokenVectorReader; +typedef RandomAccessTableReader + RandomAccessTokenVectorReader; + + +typedef TableWriter > + GeneralMatrixWriter; +typedef SequentialTableReader > + SequentialGeneralMatrixReader; +typedef RandomAccessTableReader > + RandomAccessGeneralMatrixReader; +typedef RandomAccessTableReaderMapped > + RandomAccessGeneralMatrixReaderMapped; + + + +/// @} + +// Note: for FST reader/writer, see ../fstext/fstext-utils.h +// [not done yet]. + +} // end namespace kaldi + + + +#endif // KALDI_UTIL_TABLE_TYPES_H_ diff --git a/speechx/speechx/kaldi/util/text-utils.cc b/speechx/speechx/kaldi/util/text-utils.cc new file mode 100644 index 00000000..bbf38ecc --- /dev/null +++ b/speechx/speechx/kaldi/util/text-utils.cc @@ -0,0 +1,591 @@ +// util/text-utils.cc + +// Copyright 2009-2011 Saarland University; Microsoft Corporation + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ +#include "util/text-utils.h" +#include +#include +#include +#include "base/kaldi-common.h" + +namespace kaldi { + + +template +bool SplitStringToFloats(const std::string &full, + const char *delim, + bool omit_empty_strings, // typically false + std::vector *out) { + KALDI_ASSERT(out != NULL); + if (*(full.c_str()) == '\0') { + out->clear(); + return true; + } + std::vector split; + SplitStringToVector(full, delim, omit_empty_strings, &split); + out->resize(split.size()); + for (size_t i = 0; i < split.size(); i++) { + F f = 0; + if (!ConvertStringToReal(split[i], &f)) + return false; + (*out)[i] = f; + } + return true; +} + +// Instantiate the template above for float and double. +template +bool SplitStringToFloats(const std::string &full, + const char *delim, + bool omit_empty_strings, + std::vector *out); +template +bool SplitStringToFloats(const std::string &full, + const char *delim, + bool omit_empty_strings, + std::vector *out); + +void SplitStringToVector(const std::string &full, const char *delim, + bool omit_empty_strings, + std::vector *out) { + size_t start = 0, found = 0, end = full.size(); + out->clear(); + while (found != std::string::npos) { + found = full.find_first_of(delim, start); + // start != end condition is for when the delimiter is at the end + if (!omit_empty_strings || (found != start && start != end)) + out->push_back(full.substr(start, found - start)); + start = found + 1; + } +} + +void JoinVectorToString(const std::vector &vec_in, + const char *delim, bool omit_empty_strings, + std::string *str_out) { + std::string tmp_str; + for (size_t i = 0; i < vec_in.size(); i++) { + if (!omit_empty_strings || !vec_in[i].empty()) { + tmp_str.append(vec_in[i]); + if (i < vec_in.size() - 1) + if (!omit_empty_strings || !vec_in[i+1].empty()) + tmp_str.append(delim); + } + } + str_out->swap(tmp_str); +} + +void Trim(std::string *str) { + const char *white_chars = " \t\n\r\f\v"; + + std::string::size_type pos = str->find_last_not_of(white_chars); + 
if (pos != std::string::npos) { + str->erase(pos + 1); + pos = str->find_first_not_of(white_chars); + if (pos != std::string::npos) str->erase(0, pos); + } else { + str->erase(str->begin(), str->end()); + } +} + +bool IsToken(const std::string &token) { + size_t l = token.length(); + if (l == 0) return false; + for (size_t i = 0; i < l; i++) { + unsigned char c = token[i]; + if ((!isprint(c) || isspace(c)) && (isascii(c) || c == (unsigned char)255)) + return false; + // The "&& (isascii(c) || c == 255)" was added so that we won't reject + // non-ASCII characters such as French characters with accents [except for + // 255 which is "nbsp", a form of space]. + } + return true; +} + + +void SplitStringOnFirstSpace(const std::string &str, + std::string *first, + std::string *rest) { + const char *white_chars = " \t\n\r\f\v"; + typedef std::string::size_type I; + const I npos = std::string::npos; + I first_nonwhite = str.find_first_not_of(white_chars); + if (first_nonwhite == npos) { + first->clear(); + rest->clear(); + return; + } + // next_white is first whitespace after first nonwhitespace. + I next_white = str.find_first_of(white_chars, first_nonwhite); + + if (next_white == npos) { // no more whitespace... + *first = std::string(str, first_nonwhite); + rest->clear(); + return; + } + I next_nonwhite = str.find_first_not_of(white_chars, next_white); + if (next_nonwhite == npos) { + *first = std::string(str, first_nonwhite, next_white-first_nonwhite); + rest->clear(); + return; + } + + I last_nonwhite = str.find_last_not_of(white_chars); + KALDI_ASSERT(last_nonwhite != npos); // or coding error. 
+ + *first = std::string(str, first_nonwhite, next_white-first_nonwhite); + *rest = std::string(str, next_nonwhite, last_nonwhite+1-next_nonwhite); +} + +bool IsLine(const std::string &line) { + if (line.find('\n') != std::string::npos) return false; + if (line.empty()) return true; + if (isspace(*(line.begin()))) return false; + if (isspace(*(line.rbegin()))) return false; + std::string::const_iterator iter = line.begin(), end = line.end(); + for (; iter != end; iter++) + if (!isprint(*iter)) return false; + return true; +} + +template +class NumberIstream{ + public: + explicit NumberIstream(std::istream &i) : in_(i) {} + + NumberIstream & operator >> (T &x) { + if (!in_.good()) return *this; + in_ >> x; + if (!in_.fail() && RemainderIsOnlySpaces()) return *this; + return ParseOnFail(&x); + } + + private: + std::istream &in_; + + bool RemainderIsOnlySpaces() { + if (in_.tellg() != std::istream::pos_type(-1)) { + std::string rem; + in_ >> rem; + + if (rem.find_first_not_of(' ') != std::string::npos) { + // there is not only spaces + return false; + } + } + + in_.clear(); + return true; + } + + NumberIstream & ParseOnFail(T *x) { + std::string str; + in_.clear(); + in_.seekg(0); + // If the stream is broken even before trying + // to read from it or if there are many tokens, + // it's pointless to try. + if (!(in_ >> str) || !RemainderIsOnlySpaces()) { + in_.setstate(std::ios_base::failbit); + return *this; + } + + std::map inf_nan_map; + // we'll keep just uppercase values. 
+ inf_nan_map["INF"] = std::numeric_limits::infinity(); + inf_nan_map["+INF"] = std::numeric_limits::infinity(); + inf_nan_map["-INF"] = - std::numeric_limits::infinity(); + inf_nan_map["INFINITY"] = std::numeric_limits::infinity(); + inf_nan_map["+INFINITY"] = std::numeric_limits::infinity(); + inf_nan_map["-INFINITY"] = - std::numeric_limits::infinity(); + inf_nan_map["NAN"] = std::numeric_limits::quiet_NaN(); + inf_nan_map["+NAN"] = std::numeric_limits::quiet_NaN(); + inf_nan_map["-NAN"] = - std::numeric_limits::quiet_NaN(); + // MSVC + inf_nan_map["1.#INF"] = std::numeric_limits::infinity(); + inf_nan_map["-1.#INF"] = - std::numeric_limits::infinity(); + inf_nan_map["1.#QNAN"] = std::numeric_limits::quiet_NaN(); + inf_nan_map["-1.#QNAN"] = - std::numeric_limits::quiet_NaN(); + + std::transform(str.begin(), str.end(), str.begin(), ::toupper); + + if (inf_nan_map.find(str) != inf_nan_map.end()) { + *x = inf_nan_map[str]; + } else { + in_.setstate(std::ios_base::failbit); + } + + return *this; + } +}; + + +template +bool ConvertStringToReal(const std::string &str, + T *out) { + std::istringstream iss(str); + + NumberIstream i(iss); + + i >> *out; + + if (iss.fail()) { + // Number conversion failed. + return false; + } + + return true; +} + +template +bool ConvertStringToReal(const std::string &str, + float *out); +template +bool ConvertStringToReal(const std::string &str, + double *out); + + + +/* + This function is a helper function of StringsApproxEqual. It should be + thought of as a recursive function-- it was designed that way-- but rather + than actually recursing (which would cause problems with stack overflow), we + just set the args and return to the start. + + The 'decimal_places_tolerance' argument is just passed in from outside, + see the documentation for StringsApproxEqual in text-utils.h to see an + explanation. The argument 'places_into_number' provides some information + about the strings 'a' and 'b' that precedes the current pointers. 
+ For purposes of this comment, let's define the 'decimal' of a number + as the part that comes after the decimal point, e.g. in '99.123', + '123' would be the decimal. If 'places_into_number' is -1, it means + we're not currently inside some place like that (i.e. it's not the + case that we're pointing to the '1' or the '2' or the '3'). + If it's 0, then we'd be pointing to the first place after the decimal, + '1' in this case. Note if one of the numbers is shorter than the + other, like '99.123' versus '99.1234' and 'a' points to the first '3' + while 'b' points to the second '4', 'places_into_number' referes to the + shorter of the two, i.e. it would be 2 in this example. + + + */ +bool StringsApproxEqualInternal(const char *a, const char *b, + int32 decimal_places_tolerance, + int32 places_into_number) { +start: + char ca = *a, cb = *b; + if (ca == cb) { + if (ca == '\0') { + return true; + } else { + if (places_into_number >= 0) { + if (isdigit(ca)) { + places_into_number++; + } else { + places_into_number = -1; + } + } else { + if (ca == '.') { + places_into_number = 0; + } + } + a++; + b++; + goto start; + } + } else { + if (places_into_number >= decimal_places_tolerance && + (isdigit(ca) || isdigit(cb))) { + // we're potentially willing to accept this difference between the + // strings. + if (isdigit(ca)) a++; + if (isdigit(cb)) b++; + // we'll have advanced at least one of the two strings. + goto start; + } else if (places_into_number >= 0 && + ((ca == '0' && !isdigit(cb)) || (cb == '0' && !isdigit(ca)))) { + // this clause is designed to ensure that, for example, + // "0.1" would count the same as "0.100001". 
+ if (ca == '0') a++; + else b++; + places_into_number++; + goto start; + } else { + return false; + } + } + +} + + +bool StringsApproxEqual(const std::string &a, + const std::string &b, + int32 decimal_places_tolerance) { + return StringsApproxEqualInternal(a.c_str(), b.c_str(), + decimal_places_tolerance, -1); +} + + +bool ConfigLine::ParseLine(const std::string &line) { + data_.clear(); + whole_line_ = line; + if (line.size() == 0) return false; // Empty line + size_t pos = 0, size = line.size(); + while (isspace(line[pos]) && pos < size) pos++; + if (pos == size) + return false; // whitespace-only line + size_t first_token_start_pos = pos; + // first get first_token_. + while (!isspace(line[pos]) && pos < size) { + if (line[pos] == '=') { + // If the first block of non-whitespace looks like "foo-bar=...", + // then we ignore it: there is no initial token, and FirstToken() + // is empty. + pos = first_token_start_pos; + break; + } + pos++; + } + first_token_ = std::string(line, first_token_start_pos, pos - first_token_start_pos); + // first_token_ is expected to be either empty or something like + // "component-node", which actually is a slightly more restrictive set of + // strings than IsValidName() checks for this is a convenient way to check it. + if (!first_token_.empty() && !IsValidName(first_token_)) + return false; + + while (pos < size) { + if (isspace(line[pos])) { + pos++; + continue; + } + + // OK, at this point we know that we are pointing at nonspace. + size_t next_equals_sign = line.find_first_of("=", pos); + if (next_equals_sign == pos || next_equals_sign == std::string::npos) { + // we're looking for something like 'key=value'. If there is no equals sign, + // or it's not preceded by something, it's a parsing failure. + return false; + } + std::string key(line, pos, next_equals_sign - pos); + if (!IsValidName(key)) return false; + + // handle any quotes. we support key='blah blah' or key="foo bar". + // no escaping is supported. 
+ if (line[next_equals_sign+1] == '\'' || line[next_equals_sign+1] == '"') { + char my_quote = line[next_equals_sign+1]; + size_t next_quote = line.find_first_of(my_quote, next_equals_sign + 2); + if (next_quote == std::string::npos) { // no matching quote was found. + KALDI_WARN << "No matching quote for " << my_quote << " in config line '" + << line << "'"; + return false; + } else { + std::string value(line, next_equals_sign + 2, + next_quote - next_equals_sign - 2); + data_.insert(std::make_pair(key, std::make_pair(value, false))); + pos = next_quote + 1; + continue; + } + } else { + // we want to be able to parse something like "... input=Offset(a, -1) foo=bar": + // in general, config values with spaces in them, even without quoting. + + size_t next_next_equals_sign = line.find_first_of("=", next_equals_sign + 1), + terminating_space = size; + + if (next_next_equals_sign != std::string::npos) { // found a later equals sign. + size_t preceding_space = line.find_last_of(" \t", next_next_equals_sign); + if (preceding_space != std::string::npos && + preceding_space > next_equals_sign) + terminating_space = preceding_space; + } + while (isspace(line[terminating_space - 1]) && terminating_space > 0) + terminating_space--; + + std::string value(line, next_equals_sign + 1, + terminating_space - (next_equals_sign + 1)); + data_.insert(std::make_pair(key, std::make_pair(value, false))); + pos = terminating_space; + } + } + return true; +} + +bool ConfigLine::GetValue(const std::string &key, std::string *value) { + KALDI_ASSERT(value != NULL); + std::map >::iterator it = data_.begin(); + for (; it != data_.end(); ++it) { + if (it->first == key) { + *value = (it->second).first; + (it->second).second = true; + return true; + } + } + return false; +} + +bool ConfigLine::GetValue(const std::string &key, BaseFloat *value) { + KALDI_ASSERT(value != NULL); + std::map >::iterator it = data_.begin(); + for (; it != data_.end(); ++it) { + if (it->first == key) { + if 
(!ConvertStringToReal((it->second).first, value)) + return false; + (it->second).second = true; + return true; + } + } + return false; +} + +bool ConfigLine::GetValue(const std::string &key, int32 *value) { + KALDI_ASSERT(value != NULL); + std::map >::iterator it = data_.begin(); + for (; it != data_.end(); ++it) { + if (it->first == key) { + if (!ConvertStringToInteger((it->second).first, value)) + return false; + (it->second).second = true; + return true; + } + } + return false; +} + +bool ConfigLine::GetValue(const std::string &key, std::vector *value) { + KALDI_ASSERT(value != NULL); + value->clear(); + std::map >::iterator it = data_.begin(); + for (; it != data_.end(); ++it) { + if (it->first == key) { + if (!SplitStringToIntegers((it->second).first, ":,", true, value)) { + // KALDI_WARN << "Bad option " << (it->second).first; + return false; + } + (it->second).second = true; + return true; + } + } + return false; +} + +bool ConfigLine::GetValue(const std::string &key, bool *value) { + KALDI_ASSERT(value != NULL); + std::map >::iterator it = data_.begin(); + for (; it != data_.end(); ++it) { + if (it->first == key) { + if ((it->second).first.size() == 0) return false; + switch (((it->second).first)[0]) { + case 'F': + case 'f': + *value = false; + break; + case 'T': + case 't': + *value = true; + break; + default: + return false; + } + (it->second).second = true; + return true; + } + } + return false; +} + +bool ConfigLine::HasUnusedValues() const { + std::map >::const_iterator it = data_.begin(); + for (; it != data_.end(); ++it) { + if (!(it->second).second) return true; + } + return false; +} + +std::string ConfigLine::UnusedValues() const { + std::string unused_str; + std::map >::const_iterator it = data_.begin(); + for (; it != data_.end(); ++it) { + if (!(it->second).second) { + if (unused_str == "") + unused_str = it->first + "=" + (it->second).first; + else + unused_str += " " + it->first + "=" + (it->second).first; + } + } + return unused_str; +} + 
+// This is like ExpectToken but for two tokens, and it +// will either accept token1 and then token2, or just token2. +// This is useful in Read functions where the first token +// may already have been consumed. +void ExpectOneOrTwoTokens(std::istream &is, bool binary, + const std::string &token1, + const std::string &token2) { + KALDI_ASSERT(token1 != token2); + std::string temp; + ReadToken(is, binary, &temp); + if (temp == token1) { + ExpectToken(is, binary, token2); + } else { + if (temp != token2) { + KALDI_ERR << "Expecting token " << token1 << " or " << token2 + << " but got " << temp; + } + } +} + + +bool IsValidName(const std::string &name) { + if (name.size() == 0) return false; + for (size_t i = 0; i < name.size(); i++) { + if (i == 0 && !isalpha(name[i]) && name[i] != '_') + return false; + if (!isalnum(name[i]) && name[i] != '_' && name[i] != '-' && name[i] != '.') + return false; + } + return true; +} + +void ReadConfigLines(std::istream &is, + std::vector *lines) { + KALDI_ASSERT(lines != NULL); + std::string line; + while (std::getline(is, line)) { + if (line.size() == 0) continue; + size_t start = line.find_first_not_of(" \t"); + size_t end = line.find_first_of('#'); + if (start == std::string::npos || start == end) continue; + end = line.find_last_not_of(" \t", end - 1); + KALDI_ASSERT(end >= start); + lines->push_back(line.substr(start, end - start + 1)); + } +} + +void ParseConfigLines(const std::vector &lines, + std::vector *config_lines) { + config_lines->resize(lines.size()); + for (size_t i = 0; i < lines.size(); i++) { + bool ret = (*config_lines)[i].ParseLine(lines[i]); + if (!ret) { + KALDI_ERR << "Error parsing config line: " << lines[i]; + } + } +} + + +} // end namespace kaldi diff --git a/speechx/speechx/kaldi/util/text-utils.h b/speechx/speechx/kaldi/util/text-utils.h new file mode 100644 index 00000000..02f4bf48 --- /dev/null +++ b/speechx/speechx/kaldi/util/text-utils.h @@ -0,0 +1,281 @@ +// util/text-utils.h + +// Copyright 
2009-2011 Saarland University; Microsoft Corporation + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_UTIL_TEXT_UTILS_H_ +#define KALDI_UTIL_TEXT_UTILS_H_ + +#include +#include +#include +#include +#include +#include +#include +#include "base/kaldi-common.h" + + +namespace kaldi { + +/// Split a string using any of the single character delimiters. +/// If omit_empty_strings == true, the output will contain any +/// nonempty strings after splitting on any of the +/// characters in the delimiter. If omit_empty_strings == false, +/// the output will contain n+1 strings if there are n characters +/// in the set "delim" within the input string. In this case +/// the empty string is split to a single empty string. +void SplitStringToVector(const std::string &full, const char *delim, + bool omit_empty_strings, + std::vector *out); + +/// Joins the elements of a vector of strings into a single string using +/// "delim" as the delimiter. If omit_empty_strings == true, any empty strings +/// in the vector are skipped. A vector of empty strings results in an empty +/// string on the output. +void JoinVectorToString(const std::vector &vec_in, + const char *delim, bool omit_empty_strings, + std::string *str_out); + +/** + \brief Split a string (e.g. 1:2:3) into a vector of integers. 
+ + \param [in] delim String containing a list of characters, any of which + is allowed as a delimiter. + \param [in] omit_empty_strings If true, empty strings between delimiters are + allowed and will not produce an output integer; if false, + instances of characters in 'delim' that are consecutive or + at the start or end of the string would be an error. + You'll normally want this to be true if 'delim' consists + of spaces, and false otherwise. + \param [out] out The output list of integers. +*/ +template +bool SplitStringToIntegers(const std::string &full, + const char *delim, + bool omit_empty_strings, // typically false [but + // should probably be true + // if "delim" is spaces]. + std::vector *out) { + KALDI_ASSERT(out != NULL); + KALDI_ASSERT_IS_INTEGER_TYPE(I); + if (*(full.c_str()) == '\0') { + out->clear(); + return true; + } + std::vector split; + SplitStringToVector(full, delim, omit_empty_strings, &split); + out->resize(split.size()); + for (size_t i = 0; i < split.size(); i++) { + const char *this_str = split[i].c_str(); + char *end = NULL; + int64 j = 0; + j = KALDI_STRTOLL(this_str, &end); + if (end == this_str || *end != '\0') { + out->clear(); + return false; + } else { + I jI = static_cast(j); + if (static_cast(jI) != j) { + // output type cannot fit this integer. + out->clear(); + return false; + } + (*out)[i] = jI; + } + } + return true; +} + +// This is defined for F = float and double. +template +bool SplitStringToFloats(const std::string &full, + const char *delim, + bool omit_empty_strings, // typically false + std::vector *out); + + +/// Converts a string into an integer via strtoll and returns false if there was +/// any kind of problem (i.e. the string was not an integer or contained extra +/// non-whitespace junk, or the integer was too large to fit into the type it is +/// being converted into). Only sets *out if everything was OK and it returns +/// true. 
+template +bool ConvertStringToInteger(const std::string &str, + Int *out) { + KALDI_ASSERT_IS_INTEGER_TYPE(Int); + const char *this_str = str.c_str(); + char *end = NULL; + errno = 0; + int64 i = KALDI_STRTOLL(this_str, &end); + if (end != this_str) + while (isspace(*end)) end++; + if (end == this_str || *end != '\0' || errno != 0) + return false; + Int iInt = static_cast(i); + if (static_cast(iInt) != i || + (i < 0 && !std::numeric_limits::is_signed)) { + return false; + } + *out = iInt; + return true; +} + + +/// ConvertStringToReal converts a string into either float or double +/// and returns false if there was any kind of problem (i.e. the string +/// was not a floating point number or contained extra non-whitespace junk). +/// Be careful- this function will successfully read inf's or nan's. +template +bool ConvertStringToReal(const std::string &str, + T *out); + +/// Removes the beginning and trailing whitespaces from a string +void Trim(std::string *str); + + +/// Removes leading and trailing white space from the string, then splits on the +/// first section of whitespace found (if present), putting the part before the +/// whitespace in "first" and the rest in "rest". If there is no such space, +/// everything that remains after removing leading and trailing whitespace goes +/// in "first". +void SplitStringOnFirstSpace(const std::string &line, + std::string *first, + std::string *rest); + + +/// Returns true if "token" is nonempty, and all characters are +/// printable and whitespace-free. +bool IsToken(const std::string &token); + + +/// Returns true if "line" is free of \n characters and unprintable +/// characters, and does not contain leading or trailing whitespace. +bool IsLine(const std::string &line); + + + +/** + This function returns true when two text strings are approximately equal, and + false when they are not. 
The definition of 'equal' is normal string + equality, except that two substrings like "0.31134" and "0.311341" would be + considered equal. 'decimal_places_tolerance' controls how many digits after + the '.' have to match up. + E.g. StringsApproxEqual("hello 0.23 there", "hello 0.24 there", 2) would + return false because there is a difference in the 2nd decimal, but with + an argument of 1 it would return true. + */ +bool StringsApproxEqual(const std::string &a, + const std::string &b, + int32 decimal_places_check = 2); + +/** + This class is responsible for parsing input like + hi-there xx=yyy a=b c empty= f-oo=Append(bar, sss) ba_z=123 bing='a b c' baz="a b c d='a b' e" + and giving you access to the fields, in this case + + FirstToken() == "hi-there", and key->value pairs: + + xx->yyy, a->"b c", empty->"", f-oo->"Append(bar, sss)", ba_z->"123", + bing->"a b c", baz->"a b c d='a b' e" + + The first token is optional, if the line started with a key-value pair then + FirstValue() will be empty. + + Note: it can parse value fields with space inside them only if they are free of the '=' + character. If values are going to contain the '=' character, you need to quote them + with either single or double quotes. + + Key values may contain -_a-zA-Z0-9, but must begin with a-zA-Z_. + */ +class ConfigLine { + public: + // Tries to parse the line as a config-file line. Returns false + // if it could not for some reason, e.g. parsing failure. In most cases + // prints no warnings; the user should do this. Does not expect comments. + bool ParseLine(const std::string &line); + + // the GetValue functions are overloaded for various types. They return true + // if the key exists with value that can be converted to that type, and false + // otherwise. They also mark the key-value pair as having been read. It is + // not an error to read values twice. 
+ bool GetValue(const std::string &key, std::string *value); + bool GetValue(const std::string &key, BaseFloat *value); + bool GetValue(const std::string &key, int32 *value); + // Values may be separated by ":" or by ",". + bool GetValue(const std::string &key, std::vector *value); + bool GetValue(const std::string &key, bool *value); + + bool HasUnusedValues() const; + /// returns e.g. foo=bar xxx=yyy if foo and xxx were not consumed by one + /// of the GetValue() functions. + std::string UnusedValues() const; + + const std::string &FirstToken() const { return first_token_; } + + const std::string WholeLine() { return whole_line_; } + // use default assignment operator and copy constructor. + private: + std::string whole_line_; + // the first token of the line, e.g. if line is + // foo-bar baz=bing + // then first_token_ would be "foo-bar". + std::string first_token_; + + // data_ maps from key to (value, is-this-value-consumed?). + std::map > data_; + +}; + +/// This function is like ExpectToken but for two tokens, and it will either +/// accept token1 and then token2, or just token2. This is useful in Read +/// functions where the first token may already have been consumed. +void ExpectOneOrTwoTokens(std::istream &is, bool binary, + const std::string &token1, + const std::string &token2); + + +/** + This function reads in a config file and *appends* its contents to a vector of + lines; it is responsible for removing comments (anything after '#') and + stripping out any lines that contain only whitespace after comment removal. + */ +void ReadConfigLines(std::istream &is, + std::vector *lines); + + +/** + This function converts config-lines from a simple sequence of strings + as output by ReadConfigLines(), into a sequence of first-tokens and + name-value pairs. The general format is: + "command-type bar=baz xx=yyy" + etc., although there are subtleties as to what exactly is allowed, see + documentation for class ConfigLine for details. 
+ This function will die if there was a parsing failure. + */ +void ParseConfigLines(const std::vector &lines, + std::vector *config_lines); + + +/// Returns true if 'name' would be a valid name for a component or node in a +/// nnet3Nnet. This is a nonempty string beginning with A-Za-z_, and containing only +/// '-', '_', '.', A-Z, a-z, or 0-9. +bool IsValidName(const std::string &name); + +} // namespace kaldi + +#endif // KALDI_UTIL_TEXT_UTILS_H_ diff --git a/speechx/speechx/model/CMakeLists.txt b/speechx/speechx/model/CMakeLists.txt new file mode 100644 index 00000000..e69de29b diff --git a/speechx/speechx/protocol/CMakeLists.txt b/speechx/speechx/protocol/CMakeLists.txt new file mode 100644 index 00000000..e69de29b diff --git a/speechx/speechx/third_party/CMakeLists.txt b/speechx/speechx/third_party/CMakeLists.txt new file mode 100644 index 00000000..e69de29b diff --git a/speechx/speechx/third_party/README.md b/speechx/speechx/third_party/README.md new file mode 100644 index 00000000..2d620335 --- /dev/null +++ b/speechx/speechx/third_party/README.md @@ -0,0 +1,4 @@ +# third party + +Those libs copied and developed from third pary opensource software projects. +For all of these things, the official websites are the best place to go. diff --git a/speechx/speechx/utils/CMakeLists.txt b/speechx/speechx/utils/CMakeLists.txt new file mode 100644 index 00000000..e69de29b diff --git a/tests/benchmark/conformer/README.md b/tests/benchmark/conformer/README.md index 22e0009d..72242bad 100644 --- a/tests/benchmark/conformer/README.md +++ b/tests/benchmark/conformer/README.md @@ -1,5 +1,5 @@ ### Prepare the environment -Please follow the instructions shown in [here](../../docs/source/install.md) to install the Deepspeech first. +Please follow the instructions shown in [here](../../../docs/source/install.md) to install the Deepspeech first. 
### File list └── benchmark # 模型名 diff --git a/tests/test_tipc/benchmark_train.sh b/tests/test_tipc/benchmark_train.sh new file mode 100644 index 00000000..cc61567e --- /dev/null +++ b/tests/test_tipc/benchmark_train.sh @@ -0,0 +1,258 @@ +#!/bin/bash +source test_tipc/common_func.sh + +# set env +python=python +export model_branch=`git symbolic-ref HEAD 2>/dev/null | cut -d"/" -f 3` +export model_commit=$(git log|head -n1|awk '{print $2}') +export str_tmp=$(echo `pip list|grep paddlepaddle-gpu|awk -F ' ' '{print $2}'`) +export frame_version=${str_tmp%%.post*} +export frame_commit=$(echo `${python} -c "import paddle;print(paddle.version.commit)"`) + +# run benchmark sh +# Usage: +# bash run_benchmark_train.sh config.txt params +# or +# bash run_benchmark_train.sh config.txt + +function func_parser_params(){ + strs=$1 + IFS="=" + array=(${strs}) + tmp=${array[1]} + echo ${tmp} +} + +function func_sed_params(){ + filename=$1 + line=$2 + param_value=$3 + params=`sed -n "${line}p" $filename` + IFS=":" + array=(${params}) + key=${array[0]} + value=${array[1]} + if [[ $value =~ 'benchmark_train' ]];then + IFS='=' + _val=(${value}) + param_value="${_val[0]}=${param_value}" + fi + new_params="${key}:${param_value}" + IFS=";" + cmd="sed -i '${line}s/.*/${new_params}/' '${filename}'" + eval $cmd +} + +function set_gpu_id(){ + string=$1 + _str=${string:1:6} + IFS="C" + arr=(${_str}) + M=${arr[0]} + P=${arr[1]} + gn=`expr $P - 1` + gpu_num=`expr $gn / $M` + seq=`seq -s "," 0 $gpu_num` + echo $seq +} + +function get_repo_name(){ + IFS=";" + cur_dir=$(pwd) + IFS="/" + arr=(${cur_dir}) + echo ${arr[-1]} +} + +FILENAME=$1 +# copy FILENAME as new +new_filename="./test_tipc/benchmark_train.txt" +cmd=`yes|cp $FILENAME $new_filename` +FILENAME=$new_filename +# MODE must be one of ['benchmark_train'] +MODE=$2 +PARAMS=$3 +# bash test_tipc/benchmark_train.sh test_tipc/configs/det_mv3_db_v2_0/train_benchmark.txt benchmark_train dynamic_bs8_null_DP_N1C1 +IFS=$'\n' +# parser params from 
train_benchmark.txt +dataline=`cat $FILENAME` +# parser params +IFS=$'\n' +lines=(${dataline}) +model_name=$(func_parser_value "${lines[1]}") + +# 获取benchmark_params所在的行数 +line_num=`grep -n "train_benchmark_params" $FILENAME | cut -d ":" -f 1` +# for train log parser +batch_size=$(func_parser_value "${lines[line_num]}") +line_num=`expr $line_num + 1` +fp_items=$(func_parser_value "${lines[line_num]}") +line_num=`expr $line_num + 1` +epoch=$(func_parser_value "${lines[line_num]}") + +line_num=`expr $line_num + 1` +profile_option_key=$(func_parser_key "${lines[line_num]}") +profile_option_params=$(func_parser_value "${lines[line_num]}") +profile_option="${profile_option_key}:${profile_option_params}" + +line_num=`expr $line_num + 1` +flags_value=$(func_parser_value "${lines[line_num]}") +# set flags +IFS=";" +flags_list=(${flags_value}) +for _flag in ${flags_list[*]}; do + cmd="export ${_flag}" + eval $cmd +done + +# set log_name +repo_name=$(get_repo_name ) +SAVE_LOG=${BENCHMARK_LOG_DIR:-$(pwd)} # */benchmark_log +mkdir -p "${SAVE_LOG}/benchmark_log/" +status_log="${SAVE_LOG}/benchmark_log/results.log" + +# The number of lines in which train params can be replaced. +line_python=3 +line_gpuid=4 +line_precision=6 +line_epoch=7 +line_batchsize=9 +line_profile=13 +line_eval_py=24 +line_export_py=30 + +func_sed_params "$FILENAME" "${line_eval_py}" "null" +func_sed_params "$FILENAME" "${line_export_py}" "null" +func_sed_params "$FILENAME" "${line_python}" "$python" + +# if params +if [ ! -n "$PARAMS" ] ;then + # PARAMS input is not a word. 
+ IFS="|" + batch_size_list=(${batch_size}) + fp_items_list=(${fp_items}) + device_num_list=(N1C4) + run_mode="DP" +else + # parser params from input: modeltype_bs${bs_item}_${fp_item}_${run_mode}_${device_num} + IFS="_" + params_list=(${PARAMS}) + model_type=${params_list[0]} + batch_size=${params_list[1]} + batch_size=`echo ${batch_size} | tr -cd "[0-9]" ` + precision=${params_list[2]} + # run_process_type=${params_list[3]} + run_mode=${params_list[3]} + device_num=${params_list[4]} + IFS=";" + + if [ ${precision} = "null" ];then + precision="fp32" + fi + + fp_items_list=($precision) + batch_size_list=($batch_size) + device_num_list=($device_num) +fi + +IFS="|" +for batch_size in ${batch_size_list[*]}; do + for precision in ${fp_items_list[*]}; do + for device_num in ${device_num_list[*]}; do + # sed batchsize and precision + func_sed_params "$FILENAME" "${line_precision}" "$precision" + func_sed_params "$FILENAME" "${line_batchsize}" "$MODE=$batch_size" + func_sed_params "$FILENAME" "${line_epoch}" "$MODE=$epoch" + gpu_id=$(set_gpu_id $device_num) + + if [ ${#gpu_id} -le 1 ];then + run_process_type="SingleP" + log_path="$SAVE_LOG/profiling_log" + mkdir -p $log_path + log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_profiling" + func_sed_params "$FILENAME" "${line_gpuid}" "0" # sed used gpu_id + # set profile_option params + tmp=`sed -i "${line_profile}s/.*/${profile_option}/" "${FILENAME}"` + + # run test_train_inference_python.sh + cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 " + echo $cmd + eval $cmd + eval "cat ${log_path}/${log_name}" + + # without profile + log_path="$SAVE_LOG/train_log" + speed_log_path="$SAVE_LOG/index" + mkdir -p $log_path + mkdir -p $speed_log_path + log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_log" + 
speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_speed" + func_sed_params "$FILENAME" "${line_profile}" "null" # sed profile_id as null + cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 " + echo $cmd + job_bt=`date '+%s'` # epoch seconds, so job_et - job_bt below is the elapsed time in seconds + eval $cmd + job_et=`date '+%s'` + export model_run_time=$((${job_et}-${job_bt})) + eval "cat ${log_path}/${log_name}" + + # parser log + _model_name="${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}" + cmd="${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \ + --speed_log_file '${speed_log_path}/${speed_log_name}' \ + --model_name ${_model_name} \ + --base_batch_size ${batch_size} \ + --run_mode ${run_mode} \ + --run_process_type ${run_process_type} \ + --fp_item ${precision} \ + --keyword ips: \ + --skip_steps 2 \ + --device_num ${device_num} \ + --speed_unit samples/s \ + --convergence_key loss: " + echo $cmd + eval $cmd + last_status=${PIPESTATUS[0]} + status_check $last_status "${cmd}" "${status_log}" + else + IFS=";" + unset CUDA_VISIBLE_DEVICES # run in this shell; inside a command substitution it would only affect a subshell + run_process_type="MultiP" + log_path="$SAVE_LOG/train_log" + speed_log_path="$SAVE_LOG/index" + mkdir -p $log_path + mkdir -p $speed_log_path + log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_log" + speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_speed" + func_sed_params "$FILENAME" "${line_gpuid}" "$gpu_id" # sed used gpu_id + func_sed_params "$FILENAME" "${line_profile}" "null" # sed --profile_option as null + cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 " + echo $cmd + job_bt=`date '+%s'` # epoch seconds; model_run_time is computed as job_et - job_bt + eval $cmd + job_et=`date '+%s'` + export 
model_run_time=$((${job_et}-${job_bt})) + eval "cat ${log_path}/${log_name}" + # parser log + _model_name="${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}" + + cmd="${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \ + --speed_log_file '${speed_log_path}/${speed_log_name}' \ + --model_name ${_model_name} \ + --base_batch_size ${batch_size} \ + --run_mode ${run_mode} \ + --run_process_type ${run_process_type} \ + --fp_item ${precision} \ + --keyword ips: \ + --skip_steps 2 \ + --device_num ${device_num} \ + --speed_unit samples/s \ + --convergence_key loss: " + echo $cmd + eval $cmd + last_status=${PIPESTATUS[0]} + status_check $last_status "${cmd}" "${status_log}" + fi + done + done +done \ No newline at end of file diff --git a/tests/test_tipc/common_func.sh b/tests/test_tipc/common_func.sh new file mode 100644 index 00000000..e2ff5c4d --- /dev/null +++ b/tests/test_tipc/common_func.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +function func_parser_key(){ + strs=$1 + IFS=":" + array=(${strs}) + tmp=${array[0]} + echo ${tmp} +} + +function func_parser_value(){ + strs=$1 + IFS=":" + array=(${strs}) + tmp=${array[1]} + echo ${tmp} +} + +function func_set_params(){ + key=$1 + value=$2 + if [ ${key}x = "null"x ];then + echo " " + elif [[ ${value} = "null" ]] || [[ ${value} = " " ]] || [ ${#value} -le 0 ];then + echo " " + else + echo "${key}=${value}" + fi +} + +function func_parser_params(){ + strs=$1 + MODE=$2 + IFS=":" + array=(${strs}) + key=${array[0]} + tmp=${array[1]} + IFS="|" + res="" + for _params in ${tmp[*]}; do + IFS="=" + array=(${_params}) + mode=${array[0]} + value=${array[1]} + if [[ ${mode} = ${MODE} ]]; then + IFS="|" + #echo $(func_set_params "${mode}" "${value}") + echo $value + break + fi + IFS="|" + done + echo ${res} +} + +function status_check(){ + last_status=$1 # the exit code + run_command=$2 + run_log=$3 + if [ $last_status -eq 0 ]; then + echo -e "\033[33m Run successfully with command - 
${run_command}! \033[0m" | tee -a ${run_log} + else + echo -e "\033[33m Run failed with command - ${run_command}! \033[0m" | tee -a ${run_log} + fi +} \ No newline at end of file diff --git a/tests/test_tipc/configs/conformer/train_benchmark.txt b/tests/test_tipc/configs/conformer/train_benchmark.txt new file mode 100644 index 00000000..3833f144 --- /dev/null +++ b/tests/test_tipc/configs/conformer/train_benchmark.txt @@ -0,0 +1,57 @@ +===========================train_params=========================== +model_name:conformer +python:python3.7 +gpu_list:0|0,1 +null:null +null:null +--benchmark-max-step:50 +null:null +--benchmark-batch-size:16 +null:null +null:null +null:null +null:null +## +trainer:norm_train +norm_train: ../paddlespeech/s2t/exps/u2/bin/train.py --config test_tipc/conformer/benchmark_train/conf/conformer.yaml --output test_tipc/conformer/benchmark_train/outputs --seed 1024 +pact_train:null +fpgm_train:null +distill_train:null +null:null +null:null +## +===========================eval_params=========================== +eval:null +null:null +## +===========================infer_params=========================== +null:null +null:null +norm_export: null +quant_export:null +fpgm_export:null +distill_export:null +export1:null +export2:null +null:null +infer_model:null +infer_export:null +infer_quant:null +inference:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +===========================train_benchmark_params========================== +batch_size:16|30 +fp_items:fp32 +iteration:50 +--profiler-options:"batch_range=[10,35];state=GPU;tracer_option=Default;profile_path=model.profile" +flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096 diff --git a/tests/test_tipc/configs/pwgan/train_benchmark.txt b/tests/test_tipc/configs/pwgan/train_benchmark.txt new file mode 100644 index 00000000..e936da3c --- /dev/null +++ 
b/tests/test_tipc/configs/pwgan/train_benchmark.txt @@ -0,0 +1,57 @@ +===========================train_params=========================== +model_name:pwgan +python:python3.7 +gpu_list:0|0,1 +null:null +null:null +--max-iter:100 +null:null +--batch-size:6 +null:null +null:null +null:null +null:null +## +trainer:norm_train +norm_train: ../paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py --ngpu=1 --train-metadata=dump/train/norm/metadata.jsonl --dev-metadata=dump/dev/norm/metadata.jsonl --config=../examples/csmsc/voc1/conf/default.yaml --output-dir=exp/default --run-benchmark=true --max-iter 10 +pact_train:null +fpgm_train:null +distill_train:null +null:null +null:null +## +===========================eval_params=========================== +eval:null +null:null +## +===========================infer_params=========================== +null:null +null:null +norm_export: null +quant_export:null +fpgm_export:null +distill_export:null +export1:null +export2:null +null:null +infer_model:null +infer_export:null +infer_quant:null +inference:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +===========================train_benchmark_params========================== +batch_size:6|16 +fp_items:fp32 +iteration:50 +--profiler_options:"batch_range=[10,35];state=GPU;tracer_option=Default;profile_path=model.profile" +flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096 diff --git a/tests/test_tipc/conformer/scripts/aishell_tiny.py b/tests/test_tipc/conformer/scripts/aishell_tiny.py new file mode 100644 index 00000000..14f09f17 --- /dev/null +++ b/tests/test_tipc/conformer/scripts/aishell_tiny.py @@ -0,0 +1,159 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Prepare Aishell mandarin dataset + +Download, unpack and create manifest files. +Manifest file is a json-format file with each line containing the +meta data (i.e. audio filepath, transcript and audio duration) +of each audio file in the data set. +""" +import argparse +import codecs +import json +import os +from pathlib import Path + +import soundfile + +from utils.utility import download +from utils.utility import unpack + +DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech') + +URL_ROOT_TAG +DATA_URL = URL_ROOT + '/data_aishell_tiny.tgz' +MD5_DATA = '337b1b1ea016761d4fd3225c5b8799b4' +RESOURCE_URL = URL_ROOT + '/resource_aishell.tgz' +MD5_RESOURCE = '957d480a0fcac85fc18e550756f624e5' + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + "--target_dir", + default=DATA_HOME + "/Aishell", + type=str, + help="Directory to save the dataset. (default: %(default)s)") +parser.add_argument( + "--manifest_prefix", + default="manifest", + type=str, + help="Filepath prefix for output manifests. (default: %(default)s)") +args = parser.parse_args() + + +def create_manifest(data_dir, manifest_path_prefix): + print("Creating manifest %s ..." 
% manifest_path_prefix) + json_lines = [] + transcript_path = os.path.join(data_dir, 'transcript', + 'aishell_transcript_v0.8.txt') + transcript_dict = {} + for line in codecs.open(transcript_path, 'r', 'utf-8'): + line = line.strip() + if line == '': + continue + audio_id, text = line.split(' ', 1) + # remove withespace, charactor text + text = ''.join(text.split()) + transcript_dict[audio_id] = text + + data_types = ['train', 'dev', 'test'] + for dtype in data_types: + del json_lines[:] + total_sec = 0.0 + total_text = 0.0 + total_num = 0 + + audio_dir = os.path.join(data_dir, 'wav', dtype) + for subfolder, _, filelist in sorted(os.walk(audio_dir)): + for fname in filelist: + audio_path = os.path.abspath(os.path.join(subfolder, fname)) + audio_id = os.path.basename(fname)[:-4] + # if no transcription for audio then skipped + if audio_id not in transcript_dict: + continue + + utt2spk = Path(audio_path).parent.name + audio_data, samplerate = soundfile.read(audio_path) + duration = float(len(audio_data) / samplerate) + text = transcript_dict[audio_id] + json_lines.append( + json.dumps( + { + 'utt': audio_id, + 'utt2spk': str(utt2spk), + 'feat': audio_path, + 'feat_shape': (duration, ), # second + 'text': text + }, + ensure_ascii=False)) + + total_sec += duration + total_text += len(text) + total_num += 1 + + manifest_path = manifest_path_prefix + '.' 
+ dtype + with codecs.open(manifest_path, 'w', 'utf-8') as fout: + for line in json_lines: + fout.write(line + '\n') + + manifest_dir = os.path.dirname(manifest_path_prefix) + meta_path = os.path.join(manifest_dir, dtype) + '.meta' + with open(meta_path, 'w') as f: + print(f"{dtype}:", file=f) + print(f"{total_num} utts", file=f) + print(f"{total_sec / (60*60)} h", file=f) + print(f"{total_text} text", file=f) + print(f"{total_text / total_sec} text/sec", file=f) + print(f"{total_sec / total_num} sec/utt", file=f) + + +def prepare_dataset(url, md5sum, target_dir, manifest_path=None): + """Download, unpack and create manifest file.""" + data_dir = os.path.join(target_dir, 'data_aishell_tiny') + if not os.path.exists(data_dir): + filepath = download(url, md5sum, target_dir) + unpack(filepath, target_dir) + # unpack all audio tar files + audio_dir = os.path.join(data_dir, 'wav') + for subfolder, _, filelist in sorted(os.walk(audio_dir)): + for ftar in filelist: + unpack(os.path.join(subfolder, ftar), subfolder, True) + else: + print("Skip downloading and unpacking. Data already exists in %s." % + target_dir) + + if manifest_path: + create_manifest(data_dir, manifest_path) + + +def main(): + if args.target_dir.startswith('~'): + args.target_dir = os.path.expanduser(args.target_dir) + + prepare_dataset( + url=DATA_URL, + md5sum=MD5_DATA, + target_dir=args.target_dir, + manifest_path=args.manifest_prefix) + + prepare_dataset( + url=RESOURCE_URL, + md5sum=MD5_RESOURCE, + target_dir=args.target_dir, + manifest_path=None) + + print("Data download and manifest prepare done!") + + +if __name__ == '__main__': + main() diff --git a/tests/test_tipc/docs/benchmark_train.md b/tests/test_tipc/docs/benchmark_train.md new file mode 100644 index 00000000..af61f597 --- /dev/null +++ b/tests/test_tipc/docs/benchmark_train.md @@ -0,0 +1,53 @@ +# TIPC Linux端Benchmark测试文档 + + 该文档为Benchmark测试说明,Benchmark预测功能测试的主程序为`benchmark_train.sh`,用于验证监控模型训练的性能。 + + + # 1. 
测试流程 + ## 1.1 准备数据和环境安装 +请在 repo根目录/tests 下运行 +运行`test_tipc/prepare.sh`,完成训练数据准备和安装环境流程。 + + ```shell + # 运行格式:bash test_tipc/prepare.sh train_benchmark.txt mode + bash test_tipc/prepare.sh test_tipc/configs/conformer/train_benchmark.txt benchmark_train + ``` + + ## 1.2 功能测试 + 执行`test_tipc/benchmark_train.sh`,完成模型训练和日志解析 + + ```shell + # 运行格式:bash test_tipc/benchmark_train.sh train_benchmark.txt mode + bash test_tipc/benchmark_train.sh test_tipc/configs/conformer/train_benchmark.txt benchmark_train + ``` + + `test_tipc/benchmark_train.sh`支持根据传入的第三个参数实现只运行某一个训练配置,如下: + ```shell + # 运行格式:bash test_tipc/benchmark_train.sh train_benchmark.txt mode + bash test_tipc/benchmark_train.sh test_tipc/configs/conformer/train_benchmark.txt benchmark_train dynamic_bs16_fp32_DP_N1C1 + ``` + dynamic_bs16_fp32_DP_N1C1为test_tipc/benchmark_train.sh传入的参数,格式如下: + `${modeltype}_${batch_size}_${fp_item}_${run_mode}_${device_num}` + 包含的信息有:模型类型、batchsize大小、训练精度如fp32,fp16等、分布式运行模式以及分布式训练使用的机器信息如单机单卡(N1C1)。 + + + ## 2. 
日志输出 + + 运行后将保存模型的训练日志和解析日志,使用 `test_tipc/configs/conformer/train_benchmark.txt` 参数文件的训练日志解析结果是: + + ``` + {"model_branch": "dygaph", "model_commit": "", "model_name": "conformer_bs16_fp32_SingleP_DP", "batch_size": 16, "fp_item": "fp32", "run_process_type": "SingleP", "run_mode": "DP", "convergence_value": "", "convergence_key": "loss:", "ips": , "speed_unit": "samples/s", "device_num": "N1C1", "model_run_time": "0", "frame_commit": "", "frame_version": "0.0.0"} + ``` + + 训练日志和日志解析结果保存在test目录下,文件组织格式如下: + ``` + test/ + ├── index + │   ├── tests_conformer_bs16_fp32_SingleP_DP_N1C1_speed + │   └── tests_conformer_bs16_fp32_SingleP_DP_N1C8_speed + ├── profiling_log + │   └── tests_conformer_bs16_fp32_SingleP_DP_N1C1_profiling + └── train_log + ├── tests_conformer_bs16_fp32_SingleP_DP_N1C1_log + └── tests_conformer_bs16_fp32_SingleP_DP_N1C8_log + ``` diff --git a/tests/test_tipc/prepare.sh b/tests/test_tipc/prepare.sh new file mode 100644 index 00000000..0280e5d4 --- /dev/null +++ b/tests/test_tipc/prepare.sh @@ -0,0 +1,76 @@ +#!/bin/bash +source test_tipc/common_func.sh + +FILENAME=$1 + +# MODE be one of ['benchmark_train_lite_infer' 'benchmark_train_whole_infer' 'whole_train_whole_infer', +# 'whole_infer', 'klquant_whole_infer', +# 'cpp_infer', 'serving_infer', 'benchmark_train'] + + +MODE=$2 + +dataline=$(cat ${FILENAME}) + +# parser params +IFS=$'\n' +lines=(${dataline}) + +# The training params +model_name=$(func_parser_value "${lines[1]}") + +echo "model_name:"${model_name} +trainer_list=$(func_parser_value "${lines[14]}") + +if [ ${MODE} = "benchmark_train" ];then + curPath=$(readlink -f "$(dirname "$0")") + echo "curPath:"${curPath} + cd ${curPath}/../.. + pip install . 
+ cd - + if [ ${model_name} == "conformer" ]; then + # set the URL for aishell_tiny dataset + URL='None' + echo "URL:"${URL} + if [ ${URL} == 'None' ];then + echo "please contact author to get the URL." + exit 1 # fail explicitly: a bare exit would return echo's status (0) and hide the missing dataset + fi + sed -i "s#^URL_ROOT_TAG#URL_ROOT = '${URL}'#g" ${curPath}/conformer/scripts/aishell_tiny.py + cp ${curPath}/conformer/scripts/aishell_tiny.py ${curPath}/../../dataset/aishell/ + cd ${curPath}/../../examples/aishell/asr1 + source path.sh + # download audio data + sed -i "s#aishell.py#aishell_tiny.py#g" ./local/data.sh + bash ./local/data.sh || exit -1 + if [ $? -ne 0 ]; then + exit 1 + fi + mkdir -p ${curPath}/conformer/benchmark_train/ + cp -rf conf ${curPath}/conformer/benchmark_train/ + cp -rf data ${curPath}/conformer/benchmark_train/ + cd ${curPath} + + sed -i "s#accum_grad: 2#accum_grad: 1#g" ${curPath}/conformer/benchmark_train/conf/conformer.yaml + sed -i "s#data/#test_tipc/conformer/benchmark_train/data/#g" ${curPath}/conformer/benchmark_train/conf/conformer.yaml + sed -i "s#conf/#test_tipc/conformer/benchmark_train/conf/#g" ${curPath}/conformer/benchmark_train/conf/conformer.yaml + sed -i "s#data/#test_tipc/conformer/benchmark_train/data/#g" ${curPath}/conformer/benchmark_train/conf/tuning/decode.yaml + sed -i "s#data/#test_tipc/conformer/benchmark_train/data/#g" ${curPath}/conformer/benchmark_train/conf/preprocess.yaml + + fi + + if [ ${model_name} == "pwgan" ]; then + # download and extract the csmsc dataset + wget -nc https://weixinxcxdb.oss-cn-beijing.aliyuncs.com/gwYinPinKu/BZNSYP.rar + mkdir -p BZNSYP + unrar x BZNSYP.rar BZNSYP + wget -nc https://paddlespeech.bj.bcebos.com/Parakeet/benchmark/durations.txt + # data preprocessing + python ../paddlespeech/t2s/exps/gan_vocoder/preprocess.py --rootdir=BZNSYP/ --dumpdir=dump --num-cpu=20 --cut-sil=True --dur-file=durations.txt --config=../examples/csmsc/voc1/conf/default.yaml + python ../utils/compute_statistics.py --metadata=dump/train/raw/metadata.jsonl --field-name="feats" + python 
../paddlespeech/t2s/exps/gan_vocoder/normalize.py --metadata=dump/train/raw/metadata.jsonl --dumpdir=dump/train/norm --stats=dump/train/feats_stats.npy + python ../paddlespeech/t2s/exps/gan_vocoder/normalize.py --metadata=dump/dev/raw/metadata.jsonl --dumpdir=dump/dev/norm --stats=dump/train/feats_stats.npy + python ../paddlespeech/t2s/exps/gan_vocoder/normalize.py --metadata=dump/test/raw/metadata.jsonl --dumpdir=dump/test/norm --stats=dump/train/feats_stats.npy + fi + +fi \ No newline at end of file diff --git a/tests/test_tipc/test_train_inference_python.sh b/tests/test_tipc/test_train_inference_python.sh new file mode 100644 index 00000000..ef5747b4 --- /dev/null +++ b/tests/test_tipc/test_train_inference_python.sh @@ -0,0 +1,377 @@ +#!/bin/bash +source test_tipc/common_func.sh + +FILENAME=$1 +# MODE be one of ['lite_train_lite_infer' 'lite_train_whole_infer' 'whole_train_whole_infer', 'whole_infer', 'klquant_whole_infer'] +MODE=$2 + +dataline=$(awk 'NR==1, NR==51{print}' $FILENAME) + +# parser params +IFS=$'\n' +lines=(${dataline}) + +# The training params +model_name=$(func_parser_value "${lines[1]}") +python=$(func_parser_value "${lines[2]}") +gpu_list=$(func_parser_value "${lines[3]}") +train_use_gpu_key=$(func_parser_key "${lines[4]}") +train_use_gpu_value=$(func_parser_value "${lines[4]}") +autocast_list=$(func_parser_value "${lines[5]}") +autocast_key=$(func_parser_key "${lines[5]}") +epoch_key=$(func_parser_key "${lines[6]}") +epoch_num=$(func_parser_params "${lines[6]}" "${MODE}") +save_model_key=$(func_parser_key "${lines[7]}") +train_batch_key=$(func_parser_key "${lines[8]}") +train_batch_value=$(func_parser_params "${lines[8]}" "${MODE}") +pretrain_model_key=$(func_parser_key "${lines[9]}") +pretrain_model_value=$(func_parser_value "${lines[9]}") +train_model_name=$(func_parser_value "${lines[10]}") +train_infer_img_dir=$(func_parser_value "${lines[11]}") +train_param_key1=$(func_parser_key "${lines[12]}") +train_param_value1=$(func_parser_value 
"${lines[12]}") + +trainer_list=$(func_parser_value "${lines[14]}") +trainer_norm=$(func_parser_key "${lines[15]}") +norm_trainer=$(func_parser_value "${lines[15]}") +pact_key=$(func_parser_key "${lines[16]}") +pact_trainer=$(func_parser_value "${lines[16]}") +fpgm_key=$(func_parser_key "${lines[17]}") +fpgm_trainer=$(func_parser_value "${lines[17]}") +distill_key=$(func_parser_key "${lines[18]}") +distill_trainer=$(func_parser_value "${lines[18]}") +trainer_key1=$(func_parser_key "${lines[19]}") +trainer_value1=$(func_parser_value "${lines[19]}") +trainer_key2=$(func_parser_key "${lines[20]}") +trainer_value2=$(func_parser_value "${lines[20]}") + +eval_py=$(func_parser_value "${lines[23]}") +eval_key1=$(func_parser_key "${lines[24]}") +eval_value1=$(func_parser_value "${lines[24]}") + +save_infer_key=$(func_parser_key "${lines[27]}") +export_weight=$(func_parser_key "${lines[28]}") +norm_export=$(func_parser_value "${lines[29]}") +pact_export=$(func_parser_value "${lines[30]}") +fpgm_export=$(func_parser_value "${lines[31]}") +distill_export=$(func_parser_value "${lines[32]}") +export_key1=$(func_parser_key "${lines[33]}") +export_value1=$(func_parser_value "${lines[33]}") +export_key2=$(func_parser_key "${lines[34]}") +export_value2=$(func_parser_value "${lines[34]}") +inference_dir=$(func_parser_value "${lines[35]}") + +# parser inference model +infer_model_dir_list=$(func_parser_value "${lines[36]}") +infer_export_list=$(func_parser_value "${lines[37]}") +infer_is_quant=$(func_parser_value "${lines[38]}") +# parser inference +inference_py=$(func_parser_value "${lines[39]}") +use_gpu_key=$(func_parser_key "${lines[40]}") +use_gpu_list=$(func_parser_value "${lines[40]}") +use_mkldnn_key=$(func_parser_key "${lines[41]}") +use_mkldnn_list=$(func_parser_value "${lines[41]}") +cpu_threads_key=$(func_parser_key "${lines[42]}") +cpu_threads_list=$(func_parser_value "${lines[42]}") +batch_size_key=$(func_parser_key "${lines[43]}") +batch_size_list=$(func_parser_value 
"${lines[43]}") +use_trt_key=$(func_parser_key "${lines[44]}") +use_trt_list=$(func_parser_value "${lines[44]}") +precision_key=$(func_parser_key "${lines[45]}") +precision_list=$(func_parser_value "${lines[45]}") +infer_model_key=$(func_parser_key "${lines[46]}") +image_dir_key=$(func_parser_key "${lines[47]}") +infer_img_dir=$(func_parser_value "${lines[47]}") +save_log_key=$(func_parser_key "${lines[48]}") +benchmark_key=$(func_parser_key "${lines[49]}") +benchmark_value=$(func_parser_value "${lines[49]}") +infer_key1=$(func_parser_key "${lines[50]}") +infer_value1=$(func_parser_value "${lines[50]}") + +# parser klquant_infer +if [ ${MODE} = "klquant_whole_infer" ]; then + dataline=$(awk 'NR==1, NR==17{print}' $FILENAME) + lines=(${dataline}) + model_name=$(func_parser_value "${lines[1]}") + python=$(func_parser_value "${lines[2]}") + export_weight=$(func_parser_key "${lines[3]}") + save_infer_key=$(func_parser_key "${lines[4]}") + # parser inference model + infer_model_dir_list=$(func_parser_value "${lines[5]}") + infer_export_list=$(func_parser_value "${lines[6]}") + infer_is_quant=$(func_parser_value "${lines[7]}") + # parser inference + inference_py=$(func_parser_value "${lines[8]}") + use_gpu_key=$(func_parser_key "${lines[9]}") + use_gpu_list=$(func_parser_value "${lines[9]}") + use_mkldnn_key=$(func_parser_key "${lines[10]}") + use_mkldnn_list=$(func_parser_value "${lines[10]}") + cpu_threads_key=$(func_parser_key "${lines[11]}") + cpu_threads_list=$(func_parser_value "${lines[11]}") + batch_size_key=$(func_parser_key "${lines[12]}") + batch_size_list=$(func_parser_value "${lines[12]}") + use_trt_key=$(func_parser_key "${lines[13]}") + use_trt_list=$(func_parser_value "${lines[13]}") + precision_key=$(func_parser_key "${lines[14]}") + precision_list=$(func_parser_value "${lines[14]}") + infer_model_key=$(func_parser_key "${lines[15]}") + image_dir_key=$(func_parser_key "${lines[16]}") + infer_img_dir=$(func_parser_value "${lines[16]}") + 
save_log_key=$(func_parser_key "${lines[17]}") + save_log_value=$(func_parser_value "${lines[17]}") + benchmark_key=$(func_parser_key "${lines[18]}") + benchmark_value=$(func_parser_value "${lines[18]}") + infer_key1=$(func_parser_key "${lines[19]}") + infer_value1=$(func_parser_value "${lines[19]}") +fi + +LOG_PATH="./test_tipc/output" +mkdir -p ${LOG_PATH} +status_log="${LOG_PATH}/results_python.log" + + +function func_inference(){ + IFS='|' + _python=$1 + _script=$2 + _model_dir=$3 + _log_path=$4 + _img_dir=$5 + _flag_quant=$6 + # inference + for use_gpu in ${use_gpu_list[*]}; do + if [ ${use_gpu} = "False" ] || [ ${use_gpu} = "cpu" ]; then + for use_mkldnn in ${use_mkldnn_list[*]}; do + if [ ${use_mkldnn} = "False" ] && [ ${_flag_quant} = "True" ]; then + continue + fi + for threads in ${cpu_threads_list[*]}; do + for batch_size in ${batch_size_list[*]}; do + for precision in ${precision_list[*]}; do + if [ ${use_mkldnn} = "False" ] && [ ${precision} = "fp16" ]; then + continue + fi # skip when enable fp16 but disable mkldnn + if [ ${_flag_quant} = "True" ] && [ ${precision} != "int8" ]; then + continue + fi # skip when quant model inference but precision is not int8 + set_precision=$(func_set_params "${precision_key}" "${precision}") + + _save_log_path="${_log_path}/python_infer_cpu_usemkldnn_${use_mkldnn}_threads_${threads}_precision_${precision}_batchsize_${batch_size}.log" + set_infer_data=$(func_set_params "${image_dir_key}" "${_img_dir}") + set_benchmark=$(func_set_params "${benchmark_key}" "${benchmark_value}") + set_batchsize=$(func_set_params "${batch_size_key}" "${batch_size}") + set_mkldnn=$(func_set_params "${use_mkldnn_key}" "${use_mkldnn}") + set_cpu_threads=$(func_set_params "${cpu_threads_key}" "${threads}") + set_model_dir=$(func_set_params "${infer_model_key}" "${_model_dir}") + set_infer_params0=$(func_set_params "${save_log_key}" "${save_log_value}") + set_infer_params1=$(func_set_params "${infer_key1}" "${infer_value1}") + 
command="${_python} ${_script} ${use_gpu_key}=${use_gpu} ${set_mkldnn} ${set_cpu_threads} ${set_model_dir} ${set_batchsize} ${set_infer_params0} ${set_infer_data} ${set_benchmark} ${set_precision} ${set_infer_params1} > ${_save_log_path} 2>&1 " + eval $command + last_status=${PIPESTATUS[0]} + eval "cat ${_save_log_path}" + status_check $last_status "${command}" "${status_log}" + done + done + done + done + elif [ ${use_gpu} = "True" ] || [ ${use_gpu} = "gpu" ]; then + for use_trt in ${use_trt_list[*]}; do + for precision in ${precision_list[*]}; do + if [[ ${_flag_quant} = "False" ]] && [[ ${precision} =~ "int8" ]]; then + continue + fi + if [[ ${precision} =~ "fp16" || ${precision} =~ "int8" ]] && [ ${use_trt} = "False" ]; then + continue + fi + if [[ ${use_trt} = "False" && ${precision} =~ "int8" ]] && [ ${_flag_quant} = "True" ]; then + continue + fi + for batch_size in ${batch_size_list[*]}; do + _save_log_path="${_log_path}/python_infer_gpu_usetrt_${use_trt}_precision_${precision}_batchsize_${batch_size}.log" + set_infer_data=$(func_set_params "${image_dir_key}" "${_img_dir}") + set_benchmark=$(func_set_params "${benchmark_key}" "${benchmark_value}") + set_batchsize=$(func_set_params "${batch_size_key}" "${batch_size}") + set_tensorrt=$(func_set_params "${use_trt_key}" "${use_trt}") + set_precision=$(func_set_params "${precision_key}" "${precision}") + set_model_dir=$(func_set_params "${infer_model_key}" "${_model_dir}") + set_infer_params0=$(func_set_params "${save_log_key}" "${save_log_value}") + set_infer_params1=$(func_set_params "${infer_key1}" "${infer_value1}") + command="${_python} ${_script} ${use_gpu_key}=${use_gpu} ${set_tensorrt} ${set_precision} ${set_model_dir} ${set_batchsize} ${set_infer_data} ${set_benchmark} ${set_infer_params1} ${set_infer_params0} > ${_save_log_path} 2>&1 " + eval $command + last_status=${PIPESTATUS[0]} + eval "cat ${_save_log_path}" + status_check $last_status "${command}" "${status_log}" + + done + done + done + else + 
echo "Does not support hardware other than CPU and GPU Currently!" + fi + done +} + +if [ ${MODE} = "whole_infer" ] || [ ${MODE} = "klquant_whole_infer" ]; then + GPUID=$3 + if [ ${#GPUID} -le 0 ];then + env=" " + else + env="export CUDA_VISIBLE_DEVICES=${GPUID}" + fi + # set CUDA_VISIBLE_DEVICES + eval $env + export Count=0 + IFS="|" + infer_run_exports=(${infer_export_list}) + infer_quant_flag=(${infer_is_quant}) + for infer_model in ${infer_model_dir_list[*]}; do + # run export + if [ ${infer_run_exports[Count]} != "null" ];then + if [ ${MODE} = "klquant_whole_infer" ]; then + save_infer_dir="${infer_model}_klquant" + fi + if [ ${MODE} = "whole_infer" ]; then + save_infer_dir="${infer_model}" + fi + set_export_weight=$(func_set_params "${export_weight}" "${infer_model}") + set_save_infer_key=$(func_set_params "${save_infer_key}" "${save_infer_dir}") + export_cmd="${python} ${infer_run_exports[Count]} ${set_export_weight} ${set_save_infer_key}" + echo ${infer_run_exports[Count]} + echo $export_cmd + eval $export_cmd + status_export=$? 
+ status_check $status_export "${export_cmd}" "${status_log}" + else + save_infer_dir=${infer_model} + fi + #run inference + is_quant=${infer_quant_flag[Count]} + if [ ${MODE} = "klquant_whole_infer" ]; then + is_quant="True" + fi + func_inference "${python}" "${inference_py}" "${save_infer_dir}" "${LOG_PATH}" "${infer_img_dir}" ${is_quant} + Count=$(($Count + 1)) + done +else + IFS="|" + export Count=0 + USE_GPU_KEY=(${train_use_gpu_value}) + for gpu in ${gpu_list[*]}; do + train_use_gpu=${USE_GPU_KEY[Count]} + Count=$(($Count + 1)) + ips="" + if [ ${gpu} = "-1" ];then + env="" + elif [ ${#gpu} -le 1 ];then + env="export CUDA_VISIBLE_DEVICES=${gpu}" + elif [ ${#gpu} -le 15 ];then + IFS="," + array=(${gpu}) + env="export CUDA_VISIBLE_DEVICES=${array[0]}" + IFS="|" + else + IFS=";" + array=(${gpu}) + ips=${array[0]} + gpu=${array[1]} + IFS="|" + env=" " + fi + for autocast in ${autocast_list[*]}; do + if [ ${autocast} = "amp" ]; then + set_amp_config="Global.use_amp=True Global.scale_loss=1024.0 Global.use_dynamic_loss_scaling=True" + else + set_amp_config=" " + fi + for trainer in ${trainer_list[*]}; do + flag_quant=False + if [ ${trainer} = ${pact_key} ]; then + run_train=${pact_trainer} + run_export=${pact_export} + flag_quant=True + elif [ ${trainer} = "${fpgm_key}" ]; then + run_train=${fpgm_trainer} + run_export=${fpgm_export} + elif [ ${trainer} = "${distill_key}" ]; then + run_train=${distill_trainer} + run_export=${distill_export} + elif [ ${trainer} = ${trainer_key1} ]; then + run_train=${trainer_value1} + run_export=${export_value1} + elif [[ ${trainer} = ${trainer_key2} ]]; then + run_train=${trainer_value2} + run_export=${export_value2} + else + run_train=${norm_trainer} + run_export=${norm_export} + fi + + if [ ${run_train} = "null" ]; then + continue + fi + set_autocast=$(func_set_params "${autocast_key}" "${autocast}") + set_epoch=$(func_set_params "${epoch_key}" "${epoch_num}") + set_pretrain=$(func_set_params "${pretrain_model_key}" 
"${pretrain_model_value}") + set_batchsize=$(func_set_params "${train_batch_key}" "${train_batch_value}") + set_train_params1=$(func_set_params "${train_param_key1}" "${train_param_value1}") + set_use_gpu=$(func_set_params "${train_use_gpu_key}" "${train_use_gpu}") + if [ ${#ips} -le 26 ];then + save_log="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}" + nodes=1 + else + IFS="," + ips_array=(${ips}) + IFS="|" + nodes=${#ips_array[@]} + save_log="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}_nodes_${nodes}" + fi + + + set_save_model=$(func_set_params "${save_model_key}" "${save_log}") + if [ ${#gpu} -le 2 ];then # train with cpu or single gpu + cmd="${python} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config} " + elif [ ${#ips} -le 26 ];then # train with multi-gpu + cmd="${python} -m paddle.distributed.launch --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config}" + else # train with multi-machine + cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config}" + fi + # run train + eval $cmd + status_check $? "${cmd}" "${status_log}" + + set_eval_pretrain=$(func_set_params "${pretrain_model_key}" "${save_log}/${train_model_name}") + + # run eval + if [ ${eval_py} != "null" ]; then + eval ${env} + set_eval_params1=$(func_set_params "${eval_key1}" "${eval_value1}") + eval_cmd="${python} ${eval_py} ${set_eval_pretrain} ${set_use_gpu} ${set_eval_params1}" + eval $eval_cmd + status_check $? 
"${eval_cmd}" "${status_log}" + fi + # run export model + if [ ${run_export} != "null" ]; then + # run export model + save_infer_path="${save_log}" + set_export_weight=$(func_set_params "${export_weight}" "${save_log}/${train_model_name}") + set_save_infer_key=$(func_set_params "${save_infer_key}" "${save_infer_path}") + export_cmd="${python} ${run_export} ${set_export_weight} ${set_save_infer_key}" + eval $export_cmd + status_check $? "${export_cmd}" "${status_log}" + + #run inference + eval $env + save_infer_path="${save_log}" + if [[ ${inference_dir} != "null" ]] && [[ ${inference_dir} != '##' ]]; then + infer_model_dir="${save_infer_path}/${inference_dir}" + else + infer_model_dir=${save_infer_path} + fi + func_inference "${python}" "${inference_py}" "${infer_model_dir}" "${LOG_PATH}" "${train_infer_img_dir}" "${flag_quant}" + + eval "unset CUDA_VISIBLE_DEVICES" + fi + done # done with: for trainer in ${trainer_list[*]}; do + done # done with: for autocast in ${autocast_list[*]}; do + done # done with: for gpu in ${gpu_list[*]}; do +fi # end if [ ${MODE} = "infer" ]; then \ No newline at end of file diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh index 845c5d6a..748e5608 100755 --- a/tests/unit/cli/test_cli.sh +++ b/tests/unit/cli/test_cli.sh @@ -19,8 +19,12 @@ paddlespeech tts --voc mb_melgan_csmsc --input "你好,欢迎使用百度飞 paddlespeech tts --voc style_melgan_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" paddlespeech tts --voc hifigan_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" paddlespeech tts --am fastspeech2_aishell3 --voc pwgan_aishell3 --input "你好,欢迎使用百度飞桨深度学习框架!" --spk_id 0 -paddlespeech tts --am fastspeech2_ljspeech --voc pwgan_ljspeech --lang en --input "hello world" -paddlespeech tts --am fastspeech2_vctk --voc pwgan_vctk --input "hello, boys" --lang en --spk_id 0 +paddlespeech tts --am fastspeech2_ljspeech --voc pwgan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get." 
+paddlespeech tts --am fastspeech2_vctk --voc pwgan_vctk --input "Life was like a box of chocolates, you never know what you're gonna get." --lang en --spk_id 0 +paddlespeech tts --am tacotron2_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" +paddlespeech tts --am tacotron2_csmsc --voc wavernn_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" +paddlespeech tts --am tacotron2_ljspeech --voc pwgan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get." + # Speech Translation (only support linux) paddlespeech st --input ./en.wav diff --git a/third_party/ctc_decoders/ctc_beam_search_decoder.cpp b/third_party/ctc_decoders/ctc_beam_search_decoder.cpp index db742fbb..ebea5c22 100644 --- a/third_party/ctc_decoders/ctc_beam_search_decoder.cpp +++ b/third_party/ctc_decoders/ctc_beam_search_decoder.cpp @@ -29,7 +29,8 @@ using FSTMATCH = fst::SortedMatcher; -std::vector> ctc_beam_search_decoder( + +std::vector> ctc_beam_search_decoding( const std::vector> &probs_seq, const std::vector &vocabulary, size_t beam_size, @@ -46,6 +47,8 @@ std::vector> ctc_beam_search_decoder( "The shape of probs_seq does not match with " "the shape of the vocabulary"); } + + // assign space id auto it = std::find(vocabulary.begin(), vocabulary.end(), kSPACE); int space_id = it - vocabulary.begin(); @@ -206,7 +209,7 @@ std::vector> ctc_beam_search_decoder( std::vector>> -ctc_beam_search_decoder_batch( +ctc_beam_search_decoding_batch( const std::vector>> &probs_split, const std::vector &vocabulary, size_t beam_size, @@ -224,7 +227,7 @@ ctc_beam_search_decoder_batch( // enqueue the tasks of decoding std::vector>>> res; for (size_t i = 0; i < batch_size; ++i) { - res.emplace_back(pool.enqueue(ctc_beam_search_decoder, + res.emplace_back(pool.enqueue(ctc_beam_search_decoding, probs_split[i], vocabulary, beam_size, @@ -241,3 +244,364 @@ ctc_beam_search_decoder_batch( } return batch_results; } + +void ctc_beam_search_decode_chunk_begin(PathTrie *root, Scorer *ext_scorer) { + if (ext_scorer 
!= nullptr && !ext_scorer->is_character_based()) { + auto fst_dict = + static_cast(ext_scorer->dictionary); + fst::StdVectorFst *dict_ptr = fst_dict->Copy(true); + root->set_dictionary(dict_ptr); + auto matcher = std::make_shared(*dict_ptr, fst::MATCH_INPUT); + root->set_matcher(matcher); + } +} + +void ctc_beam_search_decode_chunk( + PathTrie *root, + std::vector &prefixes, + const std::vector> &probs_seq, + const std::vector &vocabulary, + size_t beam_size, + double cutoff_prob, + size_t cutoff_top_n, + Scorer *ext_scorer, + size_t blank_id) { + // dimension check + size_t num_time_steps = probs_seq.size(); + for (size_t i = 0; i < num_time_steps; ++i) { + VALID_CHECK_EQ(probs_seq[i].size(), + // vocabulary.size() + 1, + vocabulary.size(), + "The shape of probs_seq does not match with " + "the shape of the vocabulary"); + } + + // assign space id + auto it = std::find(vocabulary.begin(), vocabulary.end(), kSPACE); + int space_id = it - vocabulary.begin(); + // if no space in vocabulary + if ((size_t)space_id >= vocabulary.size()) { + space_id = -2; + } + // init prefixes' root + // + // prefix search over time + for (size_t time_step = 0; time_step < num_time_steps; ++time_step) { + auto &prob = probs_seq[time_step]; + + float min_cutoff = -NUM_FLT_INF; + bool full_beam = false; + if (ext_scorer != nullptr) { + size_t num_prefixes = std::min(prefixes.size(), beam_size); + std::sort(prefixes.begin(), + prefixes.begin() + num_prefixes, + prefix_compare); + min_cutoff = prefixes[num_prefixes - 1]->score + + std::log(prob[blank_id]) - + std::max(0.0, ext_scorer->beta); + full_beam = (num_prefixes == beam_size); + } + + std::vector> log_prob_idx = + get_pruned_log_probs(prob, cutoff_prob, cutoff_top_n); + // loop over chars + for (size_t index = 0; index < log_prob_idx.size(); index++) { + auto c = log_prob_idx[index].first; + auto log_prob_c = log_prob_idx[index].second; + + for (size_t i = 0; i < prefixes.size() && i < beam_size; ++i) { + auto prefix = prefixes[i]; 
+ if (full_beam && log_prob_c + prefix->score < min_cutoff) { + break; + } + // blank + if (c == blank_id) { + prefix->log_prob_b_cur = log_sum_exp( + prefix->log_prob_b_cur, log_prob_c + prefix->score); + continue; + } + // repeated character + if (c == prefix->character) { + prefix->log_prob_nb_cur = + log_sum_exp(prefix->log_prob_nb_cur, + log_prob_c + prefix->log_prob_nb_prev); + } + // get new prefix + auto prefix_new = prefix->get_path_trie(c); + + if (prefix_new != nullptr) { + float log_p = -NUM_FLT_INF; + + if (c == prefix->character && + prefix->log_prob_b_prev > -NUM_FLT_INF) { + log_p = log_prob_c + prefix->log_prob_b_prev; + } else if (c != prefix->character) { + log_p = log_prob_c + prefix->score; + } + + // language model scoring + if (ext_scorer != nullptr && + (c == space_id || ext_scorer->is_character_based())) { + PathTrie *prefix_to_score = nullptr; + // skip scoring the space + if (ext_scorer->is_character_based()) { + prefix_to_score = prefix_new; + } else { + prefix_to_score = prefix; + } + + float score = 0.0; + std::vector ngram; + ngram = ext_scorer->make_ngram(prefix_to_score); + score = ext_scorer->get_log_cond_prob(ngram) * + ext_scorer->alpha; + log_p += score; + log_p += ext_scorer->beta; + } + prefix_new->log_prob_nb_cur = + log_sum_exp(prefix_new->log_prob_nb_cur, log_p); + } + } // end of loop over prefix + } // end of loop over vocabulary + + prefixes.clear(); + // update log probs + + root->iterate_to_vec(prefixes); + + // only preserve top beam_size prefixes + if (prefixes.size() >= beam_size) { + std::nth_element(prefixes.begin(), + prefixes.begin() + beam_size, + prefixes.end(), + prefix_compare); + for (size_t i = beam_size; i < prefixes.size(); ++i) { + prefixes[i]->remove(); + } + } + } // end of loop over time + + return; +} + + +std::vector> get_decode_result( + std::vector &prefixes, + const std::vector &vocabulary, + size_t beam_size, + Scorer *ext_scorer) { + auto it = std::find(vocabulary.begin(), vocabulary.end(), 
kSPACE); + int space_id = it - vocabulary.begin(); + // if no space in vocabulary + if ((size_t)space_id >= vocabulary.size()) { + space_id = -2; + } + // score the last word of each prefix that doesn't end with space + if (ext_scorer != nullptr && !ext_scorer->is_character_based()) { + for (size_t i = 0; i < beam_size && i < prefixes.size(); ++i) { + auto prefix = prefixes[i]; + if (!prefix->is_empty() && prefix->character != space_id) { + float score = 0.0; + std::vector ngram = ext_scorer->make_ngram(prefix); + score = + ext_scorer->get_log_cond_prob(ngram) * ext_scorer->alpha; + score += ext_scorer->beta; + prefix->score += score; + } + } + } + + size_t num_prefixes = std::min(prefixes.size(), beam_size); + std::sort( + prefixes.begin(), prefixes.begin() + num_prefixes, prefix_compare); + + // compute aproximate ctc score as the return score, without affecting the + // return order of decoding result. To delete when decoder gets stable. + for (size_t i = 0; i < beam_size && i < prefixes.size(); ++i) { + double approx_ctc = prefixes[i]->score; + if (ext_scorer != nullptr) { + std::vector output; + prefixes[i]->get_path_vec(output); + auto prefix_length = output.size(); + auto words = ext_scorer->split_labels(output); + // remove word insert + approx_ctc = approx_ctc - prefix_length * ext_scorer->beta; + // remove language model weight: + approx_ctc -= + (ext_scorer->get_sent_log_prob(words)) * ext_scorer->alpha; + } + prefixes[i]->approx_ctc = approx_ctc; + } + + std::vector> res = + get_beam_search_result(prefixes, vocabulary, beam_size); + + // pay back the last word of each prefix that doesn't end with space (for + // decoding by chunk) + if (ext_scorer != nullptr && !ext_scorer->is_character_based()) { + for (size_t i = 0; i < beam_size && i < prefixes.size(); ++i) { + auto prefix = prefixes[i]; + if (!prefix->is_empty() && prefix->character != space_id) { + float score = 0.0; + std::vector ngram = ext_scorer->make_ngram(prefix); + score = + 
ext_scorer->get_log_cond_prob(ngram) * ext_scorer->alpha; + score += ext_scorer->beta; + prefix->score -= score; + } + } + } + return res; +} + + +void free_storage(std::unique_ptr &storage) { + storage = nullptr; +} + + +CtcBeamSearchDecoderBatch::~CtcBeamSearchDecoderBatch() {} + +CtcBeamSearchDecoderBatch::CtcBeamSearchDecoderBatch( + const std::vector &vocabulary, + size_t batch_size, + size_t beam_size, + size_t num_processes, + double cutoff_prob, + size_t cutoff_top_n, + Scorer *ext_scorer, + size_t blank_id) + : batch_size(batch_size), + beam_size(beam_size), + num_processes(num_processes), + cutoff_prob(cutoff_prob), + cutoff_top_n(cutoff_top_n), + ext_scorer(ext_scorer), + blank_id(blank_id) { + VALID_CHECK_GT(this->beam_size, 0, "beam_size must be greater than 0!"); + VALID_CHECK_GT( + this->num_processes, 0, "num_processes must be nonnegative!"); + this->vocabulary = vocabulary; + for (size_t i = 0; i < batch_size; i++) { + this->decoder_storage_vector.push_back( + std::unique_ptr( + new CtcBeamSearchDecoderStorage())); + ctc_beam_search_decode_chunk_begin( + this->decoder_storage_vector[i]->root, ext_scorer); + } +}; + +/** + * Input + * probs_split: shape [B, T, D] + */ +void CtcBeamSearchDecoderBatch::next( + const std::vector>> &probs_split, + const std::vector &has_value) { + VALID_CHECK_GT(num_processes, 0, "num_processes must be nonnegative!"); + // thread pool + size_t num_has_value = 0; + for (int i = 0; i < has_value.size(); i++) + if (has_value[i] == "true") num_has_value += 1; + ThreadPool pool(std::min(num_processes, num_has_value)); + // number of samples + size_t probs_num = probs_split.size(); + VALID_CHECK_EQ(this->batch_size, + probs_num, + "The batch size of the current input data should be same " + "with the input data before"); + + // enqueue the tasks of decoding + std::vector> res; + for (size_t i = 0; i < batch_size; ++i) { + if (has_value[i] == "true") { + res.emplace_back(pool.enqueue( + ctc_beam_search_decode_chunk, + 
std::ref(this->decoder_storage_vector[i]->root), + std::ref(this->decoder_storage_vector[i]->prefixes), + probs_split[i], + this->vocabulary, + this->beam_size, + this->cutoff_prob, + this->cutoff_top_n, + this->ext_scorer, + this->blank_id)); + } + } + + for (size_t i = 0; i < batch_size; ++i) { + res[i].get(); + } + return; +}; + +/** + * Return + * batch_result: shape[B, beam_size,(-approx_ctc score, string)] + */ +std::vector>> +CtcBeamSearchDecoderBatch::decode() { + VALID_CHECK_GT( + this->num_processes, 0, "num_processes must be nonnegative!"); + // thread pool + ThreadPool pool(this->num_processes); + // number of samples + // enqueue the tasks of decoding + std::vector>>> res; + for (size_t i = 0; i < this->batch_size; ++i) { + res.emplace_back( + pool.enqueue(get_decode_result, + std::ref(this->decoder_storage_vector[i]->prefixes), + this->vocabulary, + this->beam_size, + this->ext_scorer)); + } + // get decoding results + std::vector>> batch_results; + for (size_t i = 0; i < this->batch_size; ++i) { + batch_results.emplace_back(res[i].get()); + } + return batch_results; +} + + +/** + * reset the state of ctcBeamSearchDecoderBatch + */ +void CtcBeamSearchDecoderBatch::reset_state(size_t batch_size, + size_t beam_size, + size_t num_processes, + double cutoff_prob, + size_t cutoff_top_n) { + this->batch_size = batch_size; + this->beam_size = beam_size; + this->num_processes = num_processes; + this->cutoff_prob = cutoff_prob; + this->cutoff_top_n = cutoff_top_n; + + VALID_CHECK_GT(this->beam_size, 0, "beam_size must be greater than 0!"); + VALID_CHECK_GT( + this->num_processes, 0, "num_processes must be nonnegative!"); + // thread pool + ThreadPool pool(this->num_processes); + // number of samples + // enqueue the tasks of decoding + std::vector> res; + size_t storage_size = decoder_storage_vector.size(); + for (size_t i = 0; i < storage_size; i++) { + res.emplace_back(pool.enqueue( + free_storage, std::ref(this->decoder_storage_vector[i]))); + } + for 
(size_t i = 0; i < storage_size; ++i) { + res[i].get(); + } + std::vector>().swap( + decoder_storage_vector); + for (size_t i = 0; i < this->batch_size; i++) { + this->decoder_storage_vector.push_back( + std::unique_ptr( + new CtcBeamSearchDecoderStorage())); + ctc_beam_search_decode_chunk_begin( + this->decoder_storage_vector[i]->root, this->ext_scorer); + } +} \ No newline at end of file diff --git a/third_party/ctc_decoders/ctc_beam_search_decoder.h b/third_party/ctc_decoders/ctc_beam_search_decoder.h index 58422657..92d2b855 100644 --- a/third_party/ctc_decoders/ctc_beam_search_decoder.h +++ b/third_party/ctc_decoders/ctc_beam_search_decoder.h @@ -37,7 +37,7 @@ * A vector that each element is a pair of score and decoding result, * in desending order. */ -std::vector> ctc_beam_search_decoder( +std::vector> ctc_beam_search_decoding( const std::vector> &probs_seq, const std::vector &vocabulary, size_t beam_size, @@ -46,6 +46,7 @@ std::vector> ctc_beam_search_decoder( Scorer *ext_scorer = nullptr, size_t blank_id = 0); + /* CTC Beam Search Decoder for batch data * Parameters: @@ -64,7 +65,7 @@ std::vector> ctc_beam_search_decoder( * result for one audio sample. 
*/ std::vector>> -ctc_beam_search_decoder_batch( +ctc_beam_search_decoding_batch( const std::vector>> &probs_split, const std::vector &vocabulary, size_t beam_size, @@ -74,4 +75,101 @@ ctc_beam_search_decoder_batch( Scorer *ext_scorer = nullptr, size_t blank_id = 0); +/** + * Store the root and prefixes for decoder + */ + +class CtcBeamSearchDecoderStorage { + public: + PathTrie *root = nullptr; + std::vector prefixes; + + CtcBeamSearchDecoderStorage() { + // init prefixes' root + this->root = new PathTrie(); + this->root->log_prob_b_prev = 0.0; + // The score of root is in log scale.Since the prob=1.0, the prob score + // in log scale is 0.0 + this->root->score = root->log_prob_b_prev; + // std::vector prefixes; + this->prefixes.push_back(root); + }; + + ~CtcBeamSearchDecoderStorage() { + if (root != nullptr) { + delete root; + root = nullptr; + } + }; +}; + +/** + * The ctc beam search decoder, support batchsize >= 1 + */ +class CtcBeamSearchDecoderBatch { + public: + CtcBeamSearchDecoderBatch(const std::vector &vocabulary, + size_t batch_size, + size_t beam_size, + size_t num_processes, + double cutoff_prob, + size_t cutoff_top_n, + Scorer *ext_scorer, + size_t blank_id); + + ~CtcBeamSearchDecoderBatch(); + void next(const std::vector>> &probs_split, + const std::vector &has_value); + + std::vector>> decode(); + + void reset_state(size_t batch_size, + size_t beam_size, + size_t num_processes, + double cutoff_prob, + size_t cutoff_top_n); + + private: + std::vector vocabulary; + size_t batch_size; + size_t beam_size; + size_t num_processes; + double cutoff_prob; + size_t cutoff_top_n; + Scorer *ext_scorer; + size_t blank_id; + std::vector> + decoder_storage_vector; +}; + +/** + * function for chunk decoding + */ +void ctc_beam_search_decode_chunk( + PathTrie *root, + std::vector &prefixes, + const std::vector> &probs_seq, + const std::vector &vocabulary, + size_t beam_size, + double cutoff_prob, + size_t cutoff_top_n, + Scorer *ext_scorer, + size_t blank_id); + 
+std::vector> get_decode_result( + std::vector &prefixes, + const std::vector &vocabulary, + size_t beam_size, + Scorer *ext_scorer); + +/** + * free the CtcBeamSearchDecoderStorage + */ +void free_storage(std::unique_ptr &storage); + +/** + * initialize the root + */ +void ctc_beam_search_decode_chunk_begin(PathTrie *root, Scorer *ext_scorer); + #endif // CTC_BEAM_SEARCH_DECODER_H_ diff --git a/third_party/ctc_decoders/ctc_greedy_decoder.cpp b/third_party/ctc_decoders/ctc_greedy_decoder.cpp index a178c673..6aa3c996 100644 --- a/third_party/ctc_decoders/ctc_greedy_decoder.cpp +++ b/third_party/ctc_decoders/ctc_greedy_decoder.cpp @@ -15,7 +15,7 @@ #include "ctc_greedy_decoder.h" #include "decoder_utils.h" -std::string ctc_greedy_decoder( +std::string ctc_greedy_decoding( const std::vector> &probs_seq, const std::vector &vocabulary, size_t blank_id) { diff --git a/third_party/ctc_decoders/ctc_greedy_decoder.h b/third_party/ctc_decoders/ctc_greedy_decoder.h index 4d60beaf..4451600d 100644 --- a/third_party/ctc_decoders/ctc_greedy_decoder.h +++ b/third_party/ctc_decoders/ctc_greedy_decoder.h @@ -27,7 +27,7 @@ * Return: * The decoding result in string */ -std::string ctc_greedy_decoder( +std::string ctc_greedy_decoding( const std::vector>& probs_seq, const std::vector& vocabulary, size_t blank_id); diff --git a/third_party/ctc_decoders/decoders.i b/third_party/ctc_decoders/decoders.i index 4227d4a3..8fe3b279 100644 --- a/third_party/ctc_decoders/decoders.i +++ b/third_party/ctc_decoders/decoders.i @@ -1,4 +1,4 @@ -%module swig_decoders +%module paddlespeech_ctcdecoders %{ #include "scorer.h" #include "ctc_greedy_decoder.h" diff --git a/third_party/ctc_decoders/path_trie.cpp b/third_party/ctc_decoders/path_trie.cpp index a5e7dd3d..777ca052 100644 --- a/third_party/ctc_decoders/path_trie.cpp +++ b/third_party/ctc_decoders/path_trie.cpp @@ -44,6 +44,7 @@ PathTrie::PathTrie() { PathTrie::~PathTrie() { for (auto child : children_) { delete child.second; + child.second = 
nullptr; } } @@ -131,26 +132,26 @@ void PathTrie::iterate_to_vec(std::vector& output) { void PathTrie::remove() { exists_ = false; - if (children_.size() == 0) { - auto child = parent->children_.begin(); - for (child = parent->children_.begin(); - child != parent->children_.end(); - ++child) { - if (child->first == character) { - parent->children_.erase(child); - break; + if (parent != nullptr) { + auto child = parent->children_.begin(); + for (child = parent->children_.begin(); + child != parent->children_.end(); + ++child) { + if (child->first == character) { + parent->children_.erase(child); + break; + } + } + if (parent->children_.size() == 0 && !parent->exists_) { + parent->remove(); } } - - if (parent->children_.size() == 0 && !parent->exists_) { - parent->remove(); - } - delete this; } } + void PathTrie::set_dictionary(fst::StdVectorFst* dictionary) { dictionary_ = dictionary; dictionary_state_ = dictionary->Start(); diff --git a/third_party/ctc_decoders/scorer.cpp b/third_party/ctc_decoders/scorer.cpp index 977112d1..6c1d96be 100644 --- a/third_party/ctc_decoders/scorer.cpp +++ b/third_party/ctc_decoders/scorer.cpp @@ -1,4 +1,5 @@ -// Licensed under GNU Lesser General Public License v3 (LGPLv3) (LGPL-3) (the "COPYING.LESSER.3"); +// Licensed under GNU Lesser General Public License v3 (LGPLv3) (LGPL-3) (the +// "COPYING.LESSER.3"); #include "scorer.h" diff --git a/third_party/ctc_decoders/scorer.h b/third_party/ctc_decoders/scorer.h index 5739339d..08e109b7 100644 --- a/third_party/ctc_decoders/scorer.h +++ b/third_party/ctc_decoders/scorer.h @@ -1,4 +1,5 @@ -// Licensed under GNU Lesser General Public License v3 (LGPLv3) (LGPL-3) (the "COPYING.LESSER.3"); +// Licensed under GNU Lesser General Public License v3 (LGPLv3) (LGPL-3) (the +// "COPYING.LESSER.3"); #ifndef SCORER_H_ #define SCORER_H_ diff --git a/third_party/ctc_decoders/setup.py b/third_party/ctc_decoders/setup.py index 6484b87c..4a11b890 100644 --- a/third_party/ctc_decoders/setup.py +++ 
b/third_party/ctc_decoders/setup.py @@ -112,7 +112,7 @@ os.system('swig -python -c++ ./decoders.i') decoders_module = [ Extension( - name='_swig_decoders', + name='_paddlespeech_ctcdecoders', sources=FILES + glob.glob('*.cxx') + glob.glob('*.cpp'), language='c++', include_dirs=[ @@ -134,4 +134,4 @@ setup( url="https://github.com/PaddlePaddle/PaddleSpeech", license='Apache 2.0, GNU Lesser General Public License v3 (LGPLv3) (LGPL-3)', ext_modules=decoders_module, - py_modules=['swig_decoders']) + py_modules=['paddlespeech_ctcdecoders']) diff --git a/utils/DER.py b/utils/DER.py new file mode 100755 index 00000000..5b62094d --- /dev/null +++ b/utils/DER.py @@ -0,0 +1,192 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Calculates Diarization Error Rate (DER) which is the sum of Missed Speaker (MS), +False Alarm (FA), and Speaker Error Rate (SER) using md-eval-22.pl from NIST RT Evaluation. 
+ +Authors + * Neville Ryant 2018 + * Nauman Dawalatabad 2020 + * qingenz123@126.com (Qingen ZHAO) 2022 + +Credits + This code is adapted from https://github.com/nryant/dscore +""" +import argparse +from distutils.util import strtobool +import os +import re +import subprocess +import numpy as np + +FILE_IDS = re.compile(r"(?<=Speaker Diarization for).+(?=\*\*\*)") +SCORED_SPEAKER_TIME = re.compile(r"(?<=SCORED SPEAKER TIME =)[\d.]+") +MISS_SPEAKER_TIME = re.compile(r"(?<=MISSED SPEAKER TIME =)[\d.]+") +FA_SPEAKER_TIME = re.compile(r"(?<=FALARM SPEAKER TIME =)[\d.]+") +ERROR_SPEAKER_TIME = re.compile(r"(?<=SPEAKER ERROR TIME =)[\d.]+") + + +def rectify(arr): + """Corrects corner cases and converts scores into percentage. + """ + # Numerator and denominator both 0. + arr[np.isnan(arr)] = 0 + + # Numerator > 0, but denominator = 0. + arr[np.isinf(arr)] = 1 + arr *= 100.0 + + return arr + + +def DER( + ref_rttm, + sys_rttm, + ignore_overlap=False, + collar=0.25, + individual_file_scores=False, ): + """Computes Missed Speaker percentage (MS), False Alarm (FA), + Speaker Error Rate (SER), and Diarization Error Rate (DER). + + Arguments + --------- + ref_rttm : str + The path of reference/groundtruth RTTM file. + sys_rttm : str + The path of the system generated RTTM file. + individual_file : bool + If True, returns scores for each file in order. + collar : float + Forgiveness collar. + ignore_overlap : bool + If True, ignores overlapping speech during evaluation. + + Returns + ------- + MS : float array + Missed Speech. + FA : float array + False Alarms. + SER : float array + Speaker Error Rates. + DER : float array + Diarization Error Rates. 
+ + Example + ------- + >>> import pytest + >>> pytest.skip('Skipping because of Perl dependency') + >>> ref_rttm = "../../samples/rttm_samples/ref_rttm/ES2014c.rttm" + >>> sys_rttm = "../../samples/rttm_samples/sys_rttm/ES2014c.rttm" + >>> ignore_overlap = True + >>> collar = 0.25 + >>> individual_file_scores = True + >>> Scores = DER(ref_rttm, sys_rttm, ignore_overlap, collar, individual_file_scores) + >>> print (Scores) + (array([0., 0.]), array([0., 0.]), array([7.16923618, 7.16923618]), array([7.16923618, 7.16923618])) + """ + + curr = os.path.abspath(os.path.dirname(__file__)) + mdEval = os.path.join(curr, "./md-eval.pl") + + cmd = [ + mdEval, + "-af", + "-r", + ref_rttm, + "-s", + sys_rttm, + "-c", + str(collar), + ] + if ignore_overlap: + cmd.append("-1") + + try: + stdout = subprocess.check_output(cmd, stderr=subprocess.STDOUT) + + except subprocess.CalledProcessError as ex: + stdout = ex.output + + else: + stdout = stdout.decode("utf-8") + + # Get all recording IDs + file_ids = [m.strip() for m in FILE_IDS.findall(stdout)] + file_ids = [ + file_id[2:] if file_id.startswith("f=") else file_id + for file_id in file_ids + ] + + scored_speaker_times = np.array( + [float(m) for m in SCORED_SPEAKER_TIME.findall(stdout)]) + + miss_speaker_times = np.array( + [float(m) for m in MISS_SPEAKER_TIME.findall(stdout)]) + + fa_speaker_times = np.array( + [float(m) for m in FA_SPEAKER_TIME.findall(stdout)]) + + error_speaker_times = np.array( + [float(m) for m in ERROR_SPEAKER_TIME.findall(stdout)]) + + with np.errstate(invalid="ignore", divide="ignore"): + tot_error_times = ( + miss_speaker_times + fa_speaker_times + error_speaker_times) + miss_speaker_frac = miss_speaker_times / scored_speaker_times + fa_speaker_frac = fa_speaker_times / scored_speaker_times + sers_frac = error_speaker_times / scored_speaker_times + ders_frac = tot_error_times / scored_speaker_times + + # Values in percentage of scored_speaker_time + miss_speaker = rectify(miss_speaker_frac) + 
fa_speaker = rectify(fa_speaker_frac) + sers = rectify(sers_frac) + ders = rectify(ders_frac) + + if individual_file_scores: + return miss_speaker, fa_speaker, sers, ders + else: + return miss_speaker[-1], fa_speaker[-1], sers[-1], ders[-1] + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser( + description='Compute Diarization Error Rate') + parser.add_argument( + '--ref_rttm', + required=True, + help='the path of reference/groundtruth RTTM file') + parser.add_argument( + '--sys_rttm', + required=True, + help='the path of the system generated RTTM file') + parser.add_argument( + '--individual_file', + default=False, + type=strtobool, + help='if True, returns scores for each file in order') + parser.add_argument( + '--collar', default=0.25, type=float, help='forgiveness collar') + parser.add_argument( + '--ignore_overlap', + default=False, + type=strtobool, + help='if True, ignores overlapping speech during evaluation') + args = parser.parse_args() + print(args) + + der = DER(args.ref_rttm, args.sys_rttm) + print("miss_speaker: %.3f%% fa_speaker: %.3f%% sers: %.3f%% ders: %.3f%%" % + (der[0], der[1], der[2], der[-1])) diff --git a/utils/compute_statistics.py b/utils/compute_statistics.py index e8021c19..5b2a5606 100755 --- a/utils/compute_statistics.py +++ b/utils/compute_statistics.py @@ -22,6 +22,7 @@ from sklearn.preprocessing import StandardScaler from tqdm import tqdm from paddlespeech.t2s.datasets.data_table import DataTable +from paddlespeech.t2s.utils import str2bool def main(): @@ -41,9 +42,6 @@ def main(): help="path to save statistics. 
if not provided, " "stats will be saved in the above root directory with name stats.npy") - def str2bool(str): - return True if str.lower() == 'true' else False - parser.add_argument( "--use-relative-path", type=str2bool, diff --git a/utils/generate_infer_yaml.py b/utils/generate_infer_yaml.py index a2eb28c7..b8a797ad 100644 --- a/utils/generate_infer_yaml.py +++ b/utils/generate_infer_yaml.py @@ -1,21 +1,19 @@ #!/usr/bin/env python3 # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - ''' Merge training configs into a single inference config. The single inference config is for CLI, which only takes a single config to do inferencing. The trainig configs includes: model config, preprocess config, decode config, vocab file and cmvn file. ''' - -import yaml -import json -import os import argparse +import json import math +import os +from contextlib import redirect_stdout + from yacs.config import CfgNode from paddlespeech.s2t.frontend.utility import load_dict -from contextlib import redirect_stdout def save(save_path, config): @@ -29,18 +27,21 @@ def load(save_path): config.merge_from_file(save_path) return config + def load_json(json_path): with open(json_path) as f: json_content = json.load(f) return json_content + def remove_config_part(config, key_list): if len(key_list) == 0: return - for i in range(len(key_list) -1): + for i in range(len(key_list) - 1): config = config[key_list[i]] config.pop(key_list[-1]) + def load_cmvn_from_json(cmvn_stats): means = cmvn_stats['mean_stat'] variance = cmvn_stats['var_stat'] @@ -51,17 +52,17 @@ def load_cmvn_from_json(cmvn_stats): if variance[i] < 1.0e-20: variance[i] = 1.0e-20 variance[i] = 1.0 / math.sqrt(variance[i]) - cmvn_stats = {"mean":means, "istd":variance} + cmvn_stats = {"mean": means, "istd": variance} return cmvn_stats + def merge_configs( - conf_path = "conf/conformer.yaml", - preprocess_path = "conf/preprocess.yaml", - decode_path = "conf/tuning/decode.yaml", - vocab_path = "data/vocab.txt", - cmvn_path = 
"data/mean_std.json", - save_path = "conf/conformer_infer.yaml", - ): + conf_path="conf/conformer.yaml", + preprocess_path="conf/preprocess.yaml", + decode_path="conf/tuning/decode.yaml", + vocab_path="data/vocab.txt", + cmvn_path="data/mean_std.json", + save_path="conf/conformer_infer.yaml", ): # Load the configs config = load(conf_path) @@ -72,17 +73,16 @@ def merge_configs( if cmvn_path.split(".")[-1] == 'json': cmvn_stats = load_json(cmvn_path) if os.path.exists(preprocess_path): - preprocess_config = load(preprocess_path) + preprocess_config = load(preprocess_path) for idx, process in enumerate(preprocess_config["process"]): if process['type'] == "cmvn_json": - preprocess_config["process"][idx][ - "cmvn_path"] = cmvn_stats + preprocess_config["process"][idx]["cmvn_path"] = cmvn_stats break config.preprocess_config = preprocess_config else: cmvn_stats = load_cmvn_from_json(cmvn_stats) - config.mean_std_filepath = [{"cmvn_stats":cmvn_stats}] + config.mean_std_filepath = [{"cmvn_stats": cmvn_stats}] config.augmentation_config = '' # the cmvn file is end with .ark else: @@ -95,7 +95,8 @@ def merge_configs( # Remove some parts of the config if os.path.exists(preprocess_path): - remove_train_list = ["train_manifest", + remove_train_list = [ + "train_manifest", "dev_manifest", "test_manifest", "n_epoch", @@ -124,9 +125,10 @@ def merge_configs( "batch_size", "maxlen_in", "maxlen_out", - ] + ] else: - remove_train_list = ["train_manifest", + remove_train_list = [ + "train_manifest", "dev_manifest", "test_manifest", "n_epoch", @@ -141,43 +143,41 @@ def merge_configs( "weight_decay", "sortagrad", "num_workers", - ] + ] for item in remove_train_list: try: remove_config_part(config, [item]) except: - print ( item + " " +"can not be removed") + print(item + " " + "can not be removed") # Save the config save(save_path, config) - if __name__ == "__main__": - parser = argparse.ArgumentParser( - prog='Config merge', add_help=True) + parser = argparse.ArgumentParser(prog='Config 
merge', add_help=True) parser.add_argument( - '--cfg_pth', type=str, default = 'conf/transformer.yaml', help='origin config file') + '--cfg_pth', + type=str, + default='conf/transformer.yaml', + help='origin config file') parser.add_argument( - '--pre_pth', type=str, default= "conf/preprocess.yaml", help='') + '--pre_pth', type=str, default="conf/preprocess.yaml", help='') parser.add_argument( - '--dcd_pth', type=str, default= "conf/tuninig/decode.yaml", help='') + '--dcd_pth', type=str, default="conf/tuninig/decode.yaml", help='') parser.add_argument( - '--vb_pth', type=str, default= "data/lang_char/vocab.txt", help='') + '--vb_pth', type=str, default="data/lang_char/vocab.txt", help='') parser.add_argument( - '--cmvn_pth', type=str, default= "data/mean_std.json", help='') + '--cmvn_pth', type=str, default="data/mean_std.json", help='') parser.add_argument( - '--save_pth', type=str, default= "conf/transformer_infer.yaml", help='') + '--save_pth', type=str, default="conf/transformer_infer.yaml", help='') parser_args = parser.parse_args() merge_configs( - conf_path = parser_args.cfg_pth, - decode_path = parser_args.dcd_pth, - preprocess_path = parser_args.pre_pth, - vocab_path = parser_args.vb_pth, - cmvn_path = parser_args.cmvn_pth, - save_path = parser_args.save_pth, - ) - - + conf_path=parser_args.cfg_pth, + decode_path=parser_args.dcd_pth, + preprocess_path=parser_args.pre_pth, + vocab_path=parser_args.vb_pth, + cmvn_path=parser_args.cmvn_pth, + save_path=parser_args.save_pth, ) diff --git a/utils/md-eval.pl b/utils/md-eval.pl new file mode 100755 index 00000000..0356b927 --- /dev/null +++ b/utils/md-eval.pl @@ -0,0 +1,2938 @@ +#!/usr/bin/perl -w +################################# +# NIST. (2009). The 2009 (RT-09) Rich Transcription Meeting Recognition Evaluation Plan. 
+# https://web.archive.org/web/20100606041157if_/http://www.itl.nist.gov/iad/mig/tests/rt/2009/docs/rt09-meeting-eval-plan-v2.pdf +# Source (dscore): https://github.com/nryant/dscore/blob/master/scorelib/md-eval-22.pl +################################# +# BSD 2-Clause License +# +# Copyright (c) 2018, Neville Ryant +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +################################# + +use strict; + +my $version = "22"; + +################################# +# History: +# +# version 22: * JGF: added an option '-m FILE' to hold a CSV speaker map file. +# +# version 21: * JGF: added a flag '-n' to not remove the directory paths from the source +# files in the UEM file. 
+# +# version 20: * change metadata discard rule: rather than discard if the midpoint +# (or endpoint) of the metadata object lies in a no-eval zone, discard +# if there is ANY overlap whatsoever between the metadata object and +# a no-eval zone. This holds for system output objects only if the +# system output metadata object is not mapped to a ref object. +# * optimize IP and SU mapping by giving a secondary bonus mapping score +# to candidate ref-sys MD map pairs if the end-words of both coincide. +# +# version 19: * bug fix in subroutine speakers_match +# * bug fix in tag_ref_words_with_metadata_info +# +# version 18: * cosmetic fix to error message in eval_condition +# * added conditional output options for word coverage performance +# * added secondary MD word coverage optimization to word alignment +# * further optimize word alignment by considering MD subtypes +# * further optimize MD alignment by considering MD subtypes +# * add a new SU discard rule: discard if TEND in no-eval zone +# * enforce legal values for su_extent_limit +# +# version 17: create_speaker_segs modified to accommodate the same speaker +# having multiple overlapping speaker segments. (This is an +# error and pathological condition, but the system must either +# disallow (abort on) the condition, or perform properly under +# the pathological condition. The second option is chosen.) +# +# version 16: * If neither -w nor -W is specified, suppress warnings about +# ref SPEAKER records subsuming no lexemes. +# * Output the overall speaker diarization stats after the +# stats for the individual files +# * Do not alter the case of alphabetic characters in the filename +# field from the ref rttm file +# * Made the format of the overall speaker error line more similar to +# the corresponding line of output from SpkrSegEval, to facilitate +# use of existing "grep" commands in existing scripts. 
+# +# version 15: * bug fix in create_speaker_segs to accommodate +# contiguous same-speaker segments +# * added conditional file/channel scoring to +# speaker diarization evaluation +# +# version 14: bug fix in md_score +# +# version 13: add DISCOURSE_RESPONSE as a FILLER subtype +# +# version 12: make REF LEXEMES optional if they aren't required +# +# version 11: change default for noscore MD regions +# +# version 10: bug fix +# +# version 09: +# * avoid crash when metadata discard yields no metadata +# * make evaluated ref_wds sensitive to metadata type +# * defer discarding of system output metadata until after +# metadata mapping, then discard only unmapped events. +# * extend 1-speaker scoring inhibition to metadata +# * eliminate demand for SPKR-INFO subtype for speakers +# * correct ref count of IP and SU exact boundary words +# * add official RT-04F scores +# * add conditional analyses for file/chnl/spkr/gender +# +# version 08: +# * bug fixes speaker diarization scoring +# - count of EVAL_WORDS corrected +# - no-score extended to nearest SPEAKER boundary +# +# version 07: +# * warning issued when discarding metadata events +# that cover LEXEMEs in the evaluation region +# +# version 06: +# * eliminated unused speakers from speaker scoring +# * changed discard algorithm for unannotated SU's and +# complex EDIT's to discard sys SU's and EDIT's when +# their midpoints overlap (rather than ANY overlap). 
+# * fixed display_metadata_mapping +# +# version 05: +# * upgraded display_metadata_mapping +# +# version 04: +# * diagnostic metadata mapping output added +# * uem_from_rttm bug fix +# +# version 03: +# * adjusted times used for speaker diarization +# * changed usage of max_extend to agree with cookbook +# +# version 02: speaker diarization evaluation added +# +# version 01: a merged version of df-eval-v14 and su-eval-v16 +# +################################# + +#global data +my $epsilon = 1E-8; +my $miss_name = " MISS"; +my $fa_name = " FALSE ALARM"; +my %rttm_datatypes = (SEGMENT => {eval => 1, "" => 1}, + NOSCORE => {"" => 1}, + NO_RT_METADATA => {"" => 1}, + LEXEME => {lex => 1, fp => 1, frag => 1, "un-lex" => 1, + "for-lex" => 1, alpha => 1, acronym => 1, + interjection => 1, propernoun => 1, other => 1}, + "NON-LEX" => {laugh => 1, breath => 1, lipsmack => 1, + cough => 1, sneeze => 1, other => 1}, + "NON-SPEECH" => {noise => 1, music => 1, other => 1}, + FILLER => {filled_pause => 1, discourse_marker => 1, + discourse_response => 1, explicit_editing_term => 1, + other => 1}, + EDIT => {repetition => 1, restart => 1, revision => 1, + simple => 1, complex => 1, other => 1}, + IP => {edit => 1, filler => 1, "edit&filler" => 1, + other => 1}, + SU => {statement => 1, backchannel => 1, question => 1, + incomplete => 1, unannotated => 1, other => 1}, + CB => {coordinating => 1, clausal => 1, other => 1}, + "A/P" => {"" => 1}, + SPEAKER => {"" => 1}, + "SPKR-INFO" => {adult_male => 1, adult_female => 1, child => 1, unknown => 1}); +my %md_subtypes = (FILLER => $rttm_datatypes{FILLER}, + EDIT => $rttm_datatypes{EDIT}, + IP => $rttm_datatypes{IP}, + SU => $rttm_datatypes{SU}); +my %spkr_subtypes = (adult_male => 1, adult_female => 1, child => 1, unknown => 1); + +my $noeval_mds = { + DEFAULT => { + NOSCORE => {"" => 1}, + NO_RT_METADATA => {"" => 1}, + }, +}; +my $noscore_mds = { + DEFAULT => { + NOSCORE => {"" => 1}, + LEXEME => {"un-lex" => 1}, + SU => 
{unannotated => 1}, + }, + MIN => { + NOSCORE => {"" => 1}, + SU => {unannotated => 1}, + }, + FRAG_UNLEX => { + NOSCORE => {"" => 1}, + LEXEME => {frag => 1, "un-lex" => 1}, + SU => {unannotated => 1}, + }, + FRAG => { + NOSCORE => {"" => 1}, + LEXEME => {frag => 1}, + SU => {unannotated => 1}, + }, + NONE => { + }, +}; +my $noeval_sds = { + DEFAULT => { + NOSCORE => {"" => 1}, + }, +}; +my $noscore_sds = { + DEFAULT => { + NOSCORE => {"" => 1}, + "NON-LEX" => {laugh => 1, breath => 1, lipsmack => 1, + cough => 1, sneeze => 1, other => 1}, + }, +}; + +my %speaker_map; + +my $default_extend = 0.50; #the maximum time (in seconds) to extend a no-score zone +my $default_collar = 0.00; #the no-score collar (in +/- seconds) to attach to SPEAKER boundaries +my $default_tgap = 1.00; #the max gap (in seconds) between matching ref/sys words +my $default_Tgap = 1.00; #the max gap (in seconds) between matching ref/sys metadata events +my $default_Wgap = 0.10; #the max gap (in words) between matching ref/sys metadata events +my $default_su_time_limit = 0.50; #the max extent (in seconds) to match for SU's +my $default_su_word_limit = 2.00; #the max extent (in words) to match for SU's +my $default_word_delta_score = 10.0; #the max delta score for word-based DP alignment of ref/sys words +my $default_time_delta_score = 1.00; #the max delta score for time-based DP alignment of ref/sys words + +my $usage = "\n\nUsage: $0 [-h] -r -s \n\n". + "Description: md-eval evaluates EARS metadata detection performance\n". + " by comparing system metadata output data with reference data\n". + "INPUT:\n". + " -R A file containing a list of the reference metadata files\n". + " being evaluated, in RTTM format. If the word-mediated alignment\n". + " option is used then this data must include reference STT data\n". + " in addition to the metadata being evaluated.\n". + " OR\n". + " -r A file containing reference metadata, in RTTM format\n\n". 
+ " -S A file containing a list of the system output metadata\n". + " files to be evaluated, in RTTM format. If the word-mediated\n". + " alignment option is used then this data must include system STT\n". + " output data in addition to the metadata to be evaluated.\n". + " OR\n". + " -s A file containing system output metadata, in RTTM format\n\n". + " input options:\n". + " -x to include complex edits in the analysis and scoring.\n". + " -w for word-mediated alignment.\n". + " * The default (time-mediated) alignment aligns ref and sys metadata\n". + " according to the time overlap of the original ref and sys metadata\n". + " time intervals.\n". + " * Word-mediated alignment aligns ref and sys metadata according to\n". + " the alignment of the words that are subsumed within the metadata\n". + " time intervals.\n". + " -W for word-optimized mapping.\n". + " * The default (time-optimized) mapping maps ref and sys metadata\n". + " so as to maximize the time overlap of mapped metadata events.\n". + " * Word-optimized mapping maps ref and sys metadata so as to\n". + " maximize the overlap in terms of the number of reference words\n". + " that are subsumed within the overlapping time interval.\n". + " -a Conditional analysis options for metadata detection performance:\n". + " c for performance versus channel,\n". + " f for performance versus file,\n". + " g for performance versus gender, and\n". + " s for performance versus speaker.\n". + " -A Conditional analysis options for word coverage performance:\n". + " c for performance versus channel,\n". + " f for performance versus file,\n". + " -t